###### Importing the libraries

In [57]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn import linear_model
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import LassoLars
from sklearn.linear_model import PoissonRegressor
from sklearn.linear_model import GammaRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OneHotEncoder
from sklearn import preprocessing
from sklearn.svm import SVR
from sklearn.svm import NuSVR
from sklearn.svm import LinearSVR
from sklearn.kernel_ridge import KernelRidge
from sklearn.linear_model import SGDRegressor
from sklearn.model_selection import GridSearchCV

###### Loading the data and defining numerical and discrete variables 

In [58]:
df = pd.read_csv('FinalProject.csv')

numerical_variables = ['A3.2', 'A4.2', 'B1_Age', 'B6_GPA']

discrete_variables = ['RemoteTrad', 'Subject Code', 'Class', 'Quarter', 'Year', 'Section',
       'A1_Status', 'A2_Major', 'A3.1', 'A4.1', 'A5.1', 'B2_Gender', 'B3.1_USStatus', 
       'B3.2_Country', 'B4.1_Race', 'B5_Income', 'B7_MotherEd', 'B8_FatherEd', 'B9_SocioClass',
       'B10_MajorSelection', 'B11.1', 'B11.2', 'B11.3', 'B11.4', 'B11.5', 'B11.6', 'B12.1', 'B12.2', 
       'B12.3', 'B12.4', 'B12.5', 'B12.6', 'B12.7']

###### Handling missing values, missing discrete value to nan and missing numerical value to 0

In [59]:
for column in discrete_variables:
    df = df.astype({column:'str'})
            
for column in numerical_variables:
    df[column] = pd.to_numeric(df[column], errors='coerce')
    
for row in range(df.shape[0]):
    for column in numerical_variables:
        if np.isnan(df.loc[row,column]):
            df.loc[row,column] = 0

###### Splitting the data into train, validation and test

In [49]:
train = []
validation = []
test = []
for row in range(df.shape[0]):
    if(row % 5 == 3):
        validation.append(df.loc[row,:].values)
    elif(row % 5 == 4):
        test.append(df.loc[row,:].values)
    else:
        train.append(df.loc[row,:].values)
df_train = pd.DataFrame(train, columns = df.columns)
df_validation = pd.DataFrame(validation, columns = df.columns)
df_test = pd.DataFrame(test, columns = df.columns)
Y_train = df_train.loc[:,'Effort'].to_numpy()
Y_validation = df_validation.loc[:,'Effort'].to_numpy()
Y_test = df_test.loc[:,'Effort'].to_numpy()
print('\nDataFrame train :', df_train.shape)
print('\nDataFrame validation :', df_validation.shape)
print('\nDataFrame test :', df_test.shape)


DataFrame train : (795, 44)

DataFrame validation : (265, 44)

DataFrame test : (264, 44)


###### Onehot encoding for discrete features

In [50]:
# define one hot encoding
encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)

# transform data
onehot = pd.DataFrame(encoder.fit_transform(df_train.loc[:,discrete_variables]))
X_train_numerical = df_train.loc[:,numerical_variables]
X_train_joint = X_train_numerical.join(onehot)

# transform validation data
onehot_validation = pd.DataFrame(encoder.transform(df_validation.loc[:,discrete_variables]))
X_validation_numerical = df_validation.loc[:,numerical_variables]
X_validation_joint = X_validation_numerical.join(onehot_validation)

# transform test data
onehot_test = pd.DataFrame(encoder.transform(df_test.loc[:,discrete_variables]))
X_test_numerical = df_test.loc[:,numerical_variables]
X_test_joint = X_test_numerical.join(onehot_test)

###### Sandard scaling

In [51]:
# normalize the features by subtracting the mean and dividing by the standard deviation
scaler = preprocessing.StandardScaler().fit(X_train_joint)
X_train_scaled = scaler.transform(X_train_joint)
X_validation_scaled = scaler.transform(X_validation_joint)
X_test_scaled = scaler.transform(X_test_joint)

###### Defining Ordinary Least Squares linear regression model as baseline

In [52]:
reg = LinearRegression()

# train
reg.fit(X_train_scaled, Y_train)

# predict
Y_pred = reg.predict(X_validation_scaled)

# The mean squared error
print('MSE: %.4f'% mean_squared_error(Y_validation, Y_pred))

MSE: 0.6931


##### Ridge regression addresses some of the problems of Ordinary Least Squares by imposing a penalty on the size of the coefficients.

In [53]:
alphas = np.logspace(-3, 4, 8);
evaluate_model(alphas, 'ridge')

alpha = 10000.0  with MSE: 0.6696


In [56]:
Ridge = GridSearchCV(Ridge(),
                       param_grid={"alpha": np.logspace(-3, 4, 8)})
# train
Ridge.fit(X_train_scaled, Y_train)

# predict
Y_pred = Ridge.predict(X_validation_scaled)

# The mean squared error
print('MSE: %.4f'% mean_squared_error(Y_validation, Y_pred))

plt.figure()
plt.plot(alphas, error_v, color='red', label='validation', marker = '.')
plt.plot(alphas, error_t, color='blue', label='training', marker = '.')
plt.title('MSE vs Alpha Value')
plt.xlabel('Alpha Value')
plt.ylabel('MSE')
plt.legend(loc='best')
plt.show()


MSE: 0.6696


NameError: name 'error_v' is not defined

<Figure size 432x288 with 0 Axes>

###### The Lasso is a linear model that estimates sparse coefficients.

In [19]:
lasso = GridSearchCV(Lasso(),
                       param_grid={"alpha": np.logspace(0, 4, 5)})
# train
lasso.fit(X_train_scaled, Y_train)

# predict
Y_pred = lasso.predict(X_validation_scaled)

# The mean squared error
print('MSE: %.4f'% mean_squared_error(Y_validation, Y_pred))

MSE: 0.6723


###### ElasticNet is a linear regression model trained with both L1 and L2 norm regularization of the coefficients.

In [10]:
elastic = GridSearchCV(ElasticNet(),
                       param_grid={"alpha": np.logspace(-2, 3, 6)})
# train
elastic.fit(X_train_scaled, Y_train)

# predict
Y_pred = elastic.predict(X_validation_scaled)

# The mean squared error
print('MSE: %.4f'% mean_squared_error(Y_validation, Y_pred))

MSE: 0.6665


###### LassoLars is a lasso model implemented using the LARS algorithm

In [11]:
alphas = np.logspace(-2, 3, 6);
evaluate_model(alphas, 'lars')

alpha = 0.01  with MSE: 0.6718


In [26]:
LassoLars = GridSearchCV(LassoLars(),
                       param_grid={"alpha": np.logspace(-2, 3, 6)})
# train
LassoLars.fit(X_train_scaled, Y_train)

# predict
Y_pred = LassoLars.predict(X_validation_scaled)

# The mean squared error
print('MSE: %.4f'% mean_squared_error(Y_validation, Y_pred))

MSE: 0.6723


###### Generalized Linear Regression, assuming Poisson distribution

In [12]:
alphas = np.logspace(-2, 3, 6);
evaluate_model(alphas, 'poisson')

alpha = 100.0  with MSE: 0.6715


In [27]:
PoissonRegressor = GridSearchCV(PoissonRegressor(),
                       param_grid={"alpha": np.logspace(-2, 3, 6)})
# train
PoissonRegressor.fit(X_train_scaled, Y_train)

# predict
Y_pred = PoissonRegressor.predict(X_validation_scaled)

# The mean squared error
print('MSE: %.4f'% mean_squared_error(Y_validation, Y_pred))

MSE: 0.6705


###### Generalized Linear Regression, assuming Gamma distribution

In [13]:
alphas = np.logspace(-2, 3, 6);
evaluate_model(alphas, 'gamma')


alpha = 100.0  with MSE: 0.6717


In [28]:
GammaRegressor = GridSearchCV(GammaRegressor(),
                       param_grid={"alpha": np.logspace(-2, 3, 6)})
# train
GammaRegressor.fit(X_train_scaled, Y_train)

# predict
Y_pred = GammaRegressor.predict(X_validation_scaled)

# The mean squared error
print('MSE: %.4f'% mean_squared_error(Y_validation, Y_pred))

MSE: 0.6723


###### Epsilon-Support Vector Regression

In [14]:
svr = GridSearchCV(SVR(kernel='rbf', gamma=0.1),
                   param_grid={"C": [1e0, 1e1, 1e2, 1e3],
                               "epsilon": np.logspace(-2, 2, 5)})
# train
svr.fit(X_train_scaled, Y_train)

# predict
Y_pred = svr.predict(X_validation_scaled)

# The mean squared error
print('MSE: %.4f'% mean_squared_error(Y_validation, Y_pred))

MSE: 0.6733


###### Nu Support Vector Regression

In [15]:
nusvr = GridSearchCV(NuSVR(kernel='rbf', gamma=0.1),
                   param_grid={"C": [1e0, 1e1, 1e2, 1e3],
                               "nu": [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8]})
# train
nusvr.fit(X_train_scaled, Y_train)

# predict
Y_pred = nusvr.predict(X_validation_scaled)

# The mean squared error
print('MSE: %.4f'% mean_squared_error(Y_validation, Y_pred))

MSE: 0.6720


###### Stochastic Gradient Descent

In [16]:
sgd = GridSearchCV(SGDRegressor(max_iter=1000, tol=1e-3),
                   param_grid={"alpha": np.logspace(-5, 2, 8)})
# train
sgd.fit(X_train_scaled, Y_train)

# predict
Y_pred = sgd.predict(X_validation_scaled)

# The mean squared error
print('MSE: %.4f'% mean_squared_error(Y_validation, Y_pred))

MSE: 0.7006


###### Use test data for the best model found (ElasticNet)

In [20]:
reg = linear_model.ElasticNet(alpha=0.1)
reg.fit(X_train_scaled, Y_train)
Y_pred_test = reg.predict(X_test_scaled)
# The mean squared error
print('MSE: %.4f'% mean_squared_error(Y_test, Y_pred_test))

MSE: 0.6521
