In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
from lightgbm import LGBMRegressor

In [3]:
# Load the two course tables. The files use different delimiters:
# the Portuguese export is semicolon-separated, the math export comma-separated.
df_por = pd.read_csv('student-por.csv', sep=';')
df_math = pd.read_csv('student-mat.csv', sep=',')


In [4]:
# Report (rows, columns) of each raw table as a quick parse check.
print('Structure of portuguese data is: ', df_por.shape)
print('Structure of math data is: ', df_math.shape)

Structure of portuguese data is:  (649, 33)
Structure of math data is:  (395, 33)


In [5]:
# Correlation of every numeric math-course column with the final grade G3,
# listed from most positive to most negative.
(
    df_math
    .corr(numeric_only=True)
    .loc[:, 'G3']
    .sort_values(ascending=False)
)

G3            1.000000
G2            0.904868
G1            0.801468
Medu          0.217147
Fedu          0.152457
studytime     0.097820
famrel        0.051363
absences      0.034247
freetime      0.011307
Walc         -0.051939
Dalc         -0.054660
health       -0.061335
traveltime   -0.117142
goout        -0.132791
age          -0.161579
failures     -0.360415
Name: G3, dtype: float64

In [6]:
# Cross-course setup: fit on the Portuguese table, evaluate on the math table.
# Only the target G3 is removed; every other column stays in the feature set.
# NOTE(review): that appears to include the earlier grades G1/G2 — confirm this is intended.
X_train, y_train = df_por.drop(columns='G3'), df_por['G3']
x_test, y_test = df_math.drop(columns='G3'), df_math['G3']

In [7]:
# Partition the feature columns into numeric vs. everything else, so that every
# column lands in exactly one list. Listing specific dtypes (object / int64 /
# float64) would leave any other dtype (e.g. int32, bool, category, string) in
# neither list, and a ColumnTransformer built from these lists without a
# `remainder` would then drop that column without warning.
num_cols = X_train.select_dtypes(include='number').columns.tolist()
cat_cols = X_train.select_dtypes(exclude='number').columns.tolist()

In [8]:
# Shared preprocessing: standardise the numeric columns and one-hot encode the
# categorical ones (first level of each category dropped).
# NOTE(review): with drop='first' together with handle_unknown='ignore', a
# category not seen during fit is encoded as all zeros, i.e. the same encoding as
# the dropped first level (scikit-learn >= 1.0) — confirm this is acceptable.
numeric_step = ('num', StandardScaler(), num_cols)
categorical_step = (
    'cat',
    OneHotEncoder(drop='first', handle_unknown='ignore'),
    cat_cols,
)
preprocessor = ColumnTransformer(transformers=[numeric_step, categorical_step])

In [8]:
# Baseline 1: gradient-boosted trees (XGBoost) behind the shared preprocessor.
pipe = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        (
            'model',
            XGBRegressor(
                n_estimators=300,
                learning_rate=0.1,
                max_depth=4,
                random_state=42,
            ),
        ),
    ]
)


In [9]:
# Fit the XGBoost pipeline on the training table and predict the test table.
pipe.fit(X_train, y_train)
y_pred = pipe.predict(x_test)

# Test-set error metrics; RMSE is the square root of the MSE.
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

# Pipeline scores on both tables (reported as R2 below) to expose the train/test gap.
r2_train = pipe.score(X_train, y_train)
r2_test = pipe.score(x_test, y_test)

print('Mean Absolute Error: ', mae)
print('Root Mean Squared Error: ', rmse)
print('R2-score for training data: ', r2_train)
print('R2-score for testing data: ', r2_test)

Mean Absolute Error:  1.295174241065979
Root Mean Squared Error:  2.1654512101716894
R2-score for training data:  0.9924877285957336
R2-score for testing data:  0.7760282754898071


In [None]:
# Baseline 2: shallow random forest behind the shared preprocessor.
pipe2 = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        (
            'model',
            RandomForestRegressor(n_estimators=300, max_depth=4, random_state=42),
        ),
    ]
)

In [11]:
# Fit the random-forest pipeline on the training table and predict the test table.
pipe2.fit(X_train, y_train)
y_pred2 = pipe2.predict(x_test)

# Test-set error metrics; RMSE is the square root of the MSE.
mae = mean_absolute_error(y_test, y_pred2)
mse = mean_squared_error(y_test, y_pred2)
rmse = np.sqrt(mse)

# Pipeline scores on both tables (reported as R2 below).
r2_train = pipe2.score(X_train, y_train)
r2_test = pipe2.score(x_test, y_test)

print('Mean Absolute Error: ', mae)
print('Root Mean Squared Error: ', rmse)
print('R2-score for training data: ', r2_train)
print('R2-score for testing data: ', r2_test)

Mean Absolute Error:  1.1800029506999272
Root Mean Squared Error:  2.001951905615434
R2-score for training data:  0.891352807604267
R2-score for testing data:  0.8085727977839489


In [17]:
from sklearn.svm import SVR

In [13]:
# Baseline 3: RBF-kernel support vector regressor behind the shared preprocessor.
pipe3 = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('model', SVR(kernel='rbf', C=10, epsilon=0.1, gamma='scale')),
    ]
)

In [14]:
# Fit the SVR pipeline on the training table and predict the test table.
pipe3.fit(X_train, y_train)
y_pred3 = pipe3.predict(x_test)

# Test-set error metrics; RMSE is the square root of the MSE.
mae = mean_absolute_error(y_test, y_pred3)
mse = mean_squared_error(y_test, y_pred3)
rmse = np.sqrt(mse)

# Pipeline scores on both tables (reported as R2 below).
r2_train = pipe3.score(X_train, y_train)
r2_test = pipe3.score(x_test, y_test)

print('Mean Absolute Error: ', mae)
print('Root Mean Squared Error: ', rmse)
print('R2-score for training data: ', r2_train)
print('R2-score for testing data: ', r2_test)

Mean Absolute Error:  1.5582031010800492
Root Mean Squared Error:  2.566297758767793
R2-score for training data:  0.973457974960195
R2-score for testing data:  0.6854349549034273


In [15]:
# Baseline 4: LightGBM gradient boosting behind the shared preprocessor.
pipe4 = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        (
            'model',
            LGBMRegressor(
                n_estimators=300,
                learning_rate=0.05,
                max_depth=4,
                random_state=42,
            ),
        ),
    ]
)

In [16]:
# Fit the LightGBM pipeline on the training table and predict the test table.
pipe4.fit(X_train, y_train)
y_pred4 = pipe4.predict(x_test)

# Test-set error metrics; RMSE is the square root of the MSE.
mae = mean_absolute_error(y_test, y_pred4)
mse = mean_squared_error(y_test, y_pred4)
rmse = np.sqrt(mse)

# Pipeline scores on both tables (reported as R2 below).
r2_train = pipe4.score(X_train, y_train)
r2_test = pipe4.score(x_test, y_test)

print('Mean Absolute Error: ', mae)
print('Root Mean Squared Error: ', rmse)
print('R2-score for training data: ', r2_train)
print('R2-score for testing data: ', r2_test)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001208 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 177
[LightGBM] [Info] Number of data points in the train set: 649, number of used features: 41
[LightGBM] [Info] Start training from score 11.906009
Mean Absolute Error:  1.3041438398784455
Root Mean Squared Error:  2.0922176826887444
R2-score for training data:  0.95321506162561
R2-score for testing data:  0.7909211470578876




##### Apply Hyperparameter tuning

In [13]:
from sklearn.model_selection import GridSearchCV,StratifiedKFold

In [9]:
# XGBoost pipeline plus the hyperparameter grid to search over.
# Keys use the `model__` prefix so they are routed to the pipeline's 'model' step.
pipe_xgb = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('model', XGBRegressor()),
    ]
)
param_xgb = {
    'model__n_estimators': [100, 150, 200, 300],
    'model__max_depth': [3, 4, 5, 6],
    # ten evenly spaced learning rates between 0.02 and 0.15
    'model__learning_rate': np.linspace(0.02, 0.15, 10).tolist(),
}

In [10]:
# Sanity check before tuning: both feature frames should have the same column count.
print(X_train.shape)
print(x_test.shape)

(649, 32)
(395, 32)


In [27]:
# Exhaustive cross-validated search over every combination in param_xgb,
# using GridSearchCV's default scoring and fold settings.
xgb = GridSearchCV(estimator=pipe_xgb, param_grid=param_xgb)
xgb.fit(X_train, y_train)

In [31]:
# Predict the test table with the tuned XGBoost search object.
xgb_pred = xgb.predict(x_test)

# Test-set error metrics; RMSE is the square root of the MSE.
mae = mean_absolute_error(y_test, xgb_pred)
mse = mean_squared_error(y_test, xgb_pred)
rmse = np.sqrt(mse)

# Search-object scores on both tables (reported as R2 below).
r2_train = xgb.score(X_train, y_train)
r2_test = xgb.score(x_test, y_test)

print('Mean Absolute Error: ', mae)
print('Root Mean Squared Error: ', rmse)
print('R2-score for training data: ', r2_train)
print('R2-score for testing data: ', r2_test)

Mean Absolute Error:  1.213630199432373
Root Mean Squared Error:  2.03438197543447
R2-score for training data:  0.8917340636253357
R2-score for testing data:  0.8023205995559692


In [11]:
# Random-forest pipeline plus the hyperparameter grid to search over.
# The forest is seeded so that the grid search, the selected parameters and the
# reported scores are reproducible across kernel restarts; an unseeded forest
# draws different bootstrap samples / feature subsets on every run.
pipe_rf = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('model', RandomForestRegressor(random_state=42)),
    ]
)
# Keys use the `model__` prefix so they are routed to the pipeline's 'model' step.
param_rf = {
    'model__n_estimators': [100, 150, 200, 300],
    'model__max_features': ["sqrt", "log2", 0.6, 0.8, 1.0],
    'model__max_depth': [3, 4, 5, 6],
}


In [14]:
# Exhaustive cross-validated search over param_rf (4 x 5 x 4 = 80 candidates).
# n_jobs=-1 evaluates candidates in parallel on all available cores; it only
# affects wall-clock time, not which parameters are selected.
rf = GridSearchCV(pipe_rf, param_grid=param_rf, n_jobs=-1)
rf.fit(X_train, y_train)

In [15]:
# Predict the test table with the tuned random-forest search object.
rf_pred = rf.predict(x_test)

# Test-set error metrics; RMSE is the square root of the MSE.
mae = mean_absolute_error(y_test, rf_pred)
mse = mean_squared_error(y_test, rf_pred)
rmse = np.sqrt(mse)

# Search-object scores on both tables (reported as R2 below).
r2_train = rf.score(X_train, y_train)
r2_test = rf.score(x_test, y_test)

print('Mean Absolute Error: ', mae)
print('Root Mean Squared Error: ', rmse)
print('R2-score for training data: ', r2_train)
print('R2-score for testing data: ', r2_test)

Mean Absolute Error:  1.2013591328564852
Root Mean Squared Error:  2.0209437291337196
R2-score for training data:  0.8934469040151798
R2-score for testing data:  0.804923563014916


In [18]:
# SVR pipeline plus the hyperparameter grid to search over.
pipe_svr = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('model', SVR()),
    ]
)
# C and gamma are sampled on a log scale, epsilon on a linear scale.
param_svr = {
    "model__C": np.logspace(-2, 3, 12).tolist(),
    "model__gamma": np.logspace(-4, 0, 10).tolist(),
    "model__epsilon": np.linspace(0.05, 1.0, 10).tolist(),
}

In [19]:
# Exhaustive cross-validated search over param_svr (12 x 10 x 10 = 1200 candidates).
# n_jobs=-1 evaluates candidates in parallel on all available cores; it only
# affects wall-clock time, not which parameters are selected.
svr = GridSearchCV(pipe_svr, param_grid=param_svr, n_jobs=-1)
svr.fit(X_train, y_train)

In [None]:
# Predict the test table with the tuned SVR search object.
svr_pred = svr.predict(x_test)

# Test-set error metrics; RMSE is the square root of the MSE.
mae = mean_absolute_error(y_test, svr_pred)
mse = mean_squared_error(y_test, svr_pred)
rmse = np.sqrt(mse)

# Search-object scores on both tables (reported as R2 below).
r2_train = svr.score(X_train, y_train)
r2_test = svr.score(x_test, y_test)

print('Mean Absolute Error: ', mae)
print('Root Mean Squared Error: ', rmse)
print('R2-score for training data: ', r2_train)
print('R2-score for testing data: ', r2_test)

Mean Absolute Error:  1.171301942529404
Root Mean Squared Error:  2.139536913402301
R2-score for training data:  0.846030818776712
R2-score for testing data:  0.7813568177171779
