In [14]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

seed = 42
np.random.seed(seed)

dataset = pd.read_csv('../data/student-por.csv', sep=';')
imp_features = dataset.drop(['school', 'sex', 'reason'], axis=1)

address_mapping = {"U":0.5, "R":1}
famsize_mapping = {"LE3":0.5,"GT3":1}
Pstatus_mapping = {"T":0.5,"A":1}
Mjob_mapping = {"teacher":0.2,"health":0.4,"services":0.6,"at_home":0.8,"other":1.0}
Fjob_mapping = {"teacher":0.2,"health":0.4,"services":0.6,"at_home":0.8,"other":1.0}
schoolsup_mapping = {"yes":0.5,"no":1}
famsup_mapping = {"yes":0.5,"no":1}
paid_mapping = {"yes":0.5,"no":1}
activities_mapping = {"yes":0.5,"no":1}
nursery_mapping = {"yes":0.5,"no":1}
higher_mapping = {"yes":0.5,"no":1}
internet_mapping = {"yes":0.5,"no":1}
romantic_mapping = {"yes":0.5,"no":1}

guardian_mapping = {"mother":0.33,"father":0.66,"other":1}
numeric_features = imp_features
numeric_features['address'] = imp_features['address'].map(address_mapping)
numeric_features['famsize'] = imp_features['famsize'].map(famsize_mapping)
numeric_features['Pstatus'] = imp_features['Pstatus'].map(Pstatus_mapping)
numeric_features['Mjob'] = imp_features['Mjob'].map(Mjob_mapping)
numeric_features['Fjob'] = imp_features['Fjob'].map(Fjob_mapping)
numeric_features['schoolsup'] = imp_features['schoolsup'].map(schoolsup_mapping)
numeric_features['famsup'] = imp_features['famsup'].map(famsup_mapping)
numeric_features['paid'] = imp_features['paid'].map(paid_mapping)
numeric_features['activities'] = imp_features['activities'].map(activities_mapping)
numeric_features['nursery'] = imp_features['nursery'].map(nursery_mapping)
numeric_features['higher'] = imp_features['higher'].map(higher_mapping)
numeric_features['internet'] = imp_features['internet'].map(internet_mapping)
numeric_features['romantic'] = imp_features['romantic'].map(romantic_mapping)
numeric_features['guardian'] = imp_features['guardian'].map(guardian_mapping)

data_np_array = numeric_features.values

X = data_np_array[:, :-1]
Y = data_np_array[:,-1]

X_trval, X_test, y_trval, Y_test = train_test_split(X, Y, test_size=0.1, random_state=seed)
X_train, X_val, Y_train, y_val = train_test_split(X_trval, y_trval, test_size=0.11, random_state=seed)

# X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

In [15]:
#SVR
from sklearn import svm
clf = svm.SVR()
clf.fit(X_train, Y_train)
pred=clf.predict(X_test)
print(clf.score(X_test,Y_test)*100)
print(mean_absolute_error(Y_test, pred))

86.52094011118521
0.8185852870884354




from sklearn import linear_model
clf = linear_model.SGDRegressor(max_iter=1000, tol=1e-3)
clf.fit(X_train, Y_train)
print(clf.score(X_test,Y_test)*100)

In [16]:
from sklearn.neighbors import KNeighborsRegressor
clf = KNeighborsRegressor(n_neighbors=5)
clf.fit(X_train, Y_train)
pred=clf.predict(X_test)
print(clf.score(X_test,Y_test)*100)
print(mean_absolute_error(Y_test, pred))

86.28636845926174
0.8338461538461538


In [17]:
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import (RBF, Matern, DotProduct, WhiteKernel)


kernels = [1.0 * RBF(length_scale=1.0, length_scale_bounds=(1e-1, 10.0)), 1.0 * Matern(length_scale=1.0, length_scale_bounds=(1e-1, 10.0), nu=1.5),
DotProduct() + WhiteKernel()]
for kernel in kernels:
    gpr = GaussianProcessRegressor(kernel=kernel, random_state=seed).fit(X_train, Y_train)
    gpr.fit(X_train, Y_train)
    pred=gpr.predict(X_test)
    print(gpr.score(X_test,Y_test)*100)
    print(mean_absolute_error(Y_test, pred))

84.11430984112113
0.9754425457727259
89.08625814552649
0.761383885329988
91.21968979130169
0.6743210504310985


In [18]:
from sklearn.ensemble import RandomForestRegressor
random_forest = RandomForestRegressor(n_estimators=300, random_state=seed)
random_forest.fit(X_train, Y_train)
pred=random_forest.predict(X_test)
print(random_forest.score(X_test,Y_test)*100)
print(mean_absolute_error(Y_test, pred))

90.98504774669124
0.6818461538461538


In [19]:
from sklearn.tree import DecisionTreeRegressor
clf = DecisionTreeRegressor(random_state=seed)
clf.fit(X_train, Y_train)
pred=clf.predict(X_test)
print(clf.score(X_test,Y_test)*100)
print(mean_absolute_error(Y_test, pred))

82.39515273356788
0.9384615384615385


In [20]:
from sklearn.linear_model import LinearRegression
clf = LinearRegression()
clf.fit(X_train, Y_train)
pred=clf.predict(X_test)
print(clf.score(X_test,Y_test)*100)
print(mean_absolute_error(Y_test, pred))

90.97811649078139
0.6857373514828898


In [21]:
from sklearn.neural_network import MLPRegressor
clf = MLPRegressor(hidden_layer_sizes=(20,), max_iter=1000, random_state=seed)
clf.fit(X_train, Y_train)
pred=clf.predict(X_test)
print(clf.score(X_test,Y_test)*100)
print(mean_absolute_error(Y_test, pred))

91.16548298781412
0.682962762302918


In [22]:
from sklearn.ensemble import GradientBoostingRegressor
clf = GradientBoostingRegressor(n_estimators=100, learning_rate=1.0,random_state=seed)
clf.fit(X_train, Y_train)
pred=clf.predict(X_test)
print(clf.score(X_test,Y_test)*100)
print(mean_absolute_error(Y_test, pred))

85.4027172069433
0.8580844328701303


In [23]:
from sklearn.ensemble import ExtraTreesRegressor
clf = ExtraTreesRegressor(random_state=seed)
clf.fit(X_train, Y_train)
pred=clf.predict(X_test)
print(clf.score(X_test,Y_test)*100)
print(mean_absolute_error(Y_test, pred))

88.58951806556095
0.7553846153846155




In [24]:
from sklearn.ensemble import AdaBoostRegressor
clf = ExtraTreesRegressor(random_state=seed)
clf.fit(X_train, Y_train)
pred=clf.predict(X_test)
print(clf.score(X_test,Y_test)*100)
print(mean_absolute_error(Y_test, pred))

88.58951806556095
0.7553846153846155


