In [None]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error
import gc

In [None]:
# Location of the 2019 survey results CSV, relative to the working directory.
file_name = 'Data/survey_results_public_2019.csv'
# Read the whole file into a DataFrame.
df = pd.read_csv(filepath_or_buffer=file_name)

In [None]:
# Preview the first five rows to sanity-check the load.
df.head(n=5)

In [None]:
# 'ConvertedComp' is the regression target, so rows where it is missing
# cannot be used; keep only the rows that have a value for it.
has_target = df['ConvertedComp'].notna()
df = df.loc[has_target]

In [None]:
# Columns that must not be part of the feature matrix: the target itself plus
# the columns listed alongside it (presumably identifiers, the raw compensation
# figure and survey-feedback fields -- confirm against the survey schema).
non_feature_cols = ['Respondent', 'CompTotal', 'ConvertedComp', 'SurveyLength', 'SurveyEase']

# Pull out the target before removing it from the features.
y = df['ConvertedComp']
df = df.drop(columns=non_feature_cols)

Missing values in numerical columns are filled with the mean of that column.

In [None]:
# Impute the missing entries of every numeric column with that column's mean.
# The means are computed once and passed to DataFrame.fillna as a Series, which
# fills column-by-column by label and leaves all other columns untouched.
# Filling through `df[col].fillna(..., inplace=True)` would act on a temporary
# Series: pandas warns about that pattern, and with Copy-on-Write enabled the
# parent frame is not modified at all.
num_vars = df.select_dtypes(include=['float', 'int']).columns
df = df.fillna(df[num_vars].mean())

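Categorical columns are one-hot encoded: each one is replaced by indicator (dummy) columns, one per category except the first (`drop_first=True`), so the number of added columns grows with the number of categories. Missing values do not get an indicator column of their own; a missing entry is encoded as zeros in all of that column's indicators.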

In [None]:
# One-hot encode every object-dtype column in a single get_dummies call.
# Each encoded column is replaced by indicator columns named
# '<column>_<category>', with the first category dropped (drop_first=True);
# all other columns are kept as they are and stay in front of the indicators.
# Encoding everything at once avoids rebuilding the whole, ever-wider frame
# with pd.concat once per categorical column.
cat_vars = df.select_dtypes(include=['object']).columns
df = pd.get_dummies(df, columns=list(cat_vars), prefix_sep='_', drop_first=True)

In [None]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    df,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Fit an ordinary least-squares model on the training split, then predict on
# both splits so train and test performance can be compared.
# LinearRegression's `normalize` argument was deprecated in scikit-learn 1.0
# and removed in 1.2, so passing it raises a TypeError on current releases.
# If feature scaling is wanted, apply it as an explicit preprocessing step
# (e.g. a StandardScaler in a Pipeline) before the regressor.
lm_model = LinearRegression()
lm_model.fit(X_train, y_train)

y_test_preds = lm_model.predict(X_test)
y_train_preds = lm_model.predict(X_train)


In [None]:
# R^2 of the fitted model on each split, computed from the stored predictions.
train_score = r2_score(y_train, y_train_preds)
test_score = r2_score(y_test, y_test_preds)

# Report the held-out score first, then the training score.
test_template = 'The test r2 score is: {}'
train_template = 'The train r2 score is: {}'
print(test_template.format(test_score))
print(train_template.format(train_score))