In [1]:
#imports
import pandas as pd  
import numpy as np  
import matplotlib.pyplot as plt  

from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.model_selection import train_test_split
from sklearn import tree

from sklearn.tree import export_graphviz 
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_absolute_error
# output of plotting commands is displayed inline, directly below the code cell that produced it
%matplotlib inline 

In [2]:
df = pd.read_csv("Math(1).csv")

In [3]:
df_new = df.copy()

In [4]:
df_new.head()

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
0,GP,F,18,U,GT3,A,4,4,at_home,teacher,...,4,3,4,1,1,3,6,5,6,6
1,GP,F,17,U,GT3,T,1,1,at_home,other,...,5,3,3,1,1,3,4,5,5,6
2,GP,F,15,U,LE3,T,1,1,at_home,other,...,4,3,2,2,3,3,10,7,8,10
3,GP,F,15,U,GT3,T,4,2,health,services,...,3,2,2,1,1,5,2,15,14,15
4,GP,F,16,U,GT3,T,3,3,other,other,...,4,3,2,1,2,5,4,6,10,10


In [5]:
# Cleaning the dataset
df_new['sex'] = df_new['sex'].map({'F':0, 'M':1}).astype(int)
df_new['school'] = df_new['school'].map({'GP':1, 'MS':2}).astype(int)
df_new['address'] = df_new['address'].map({'R':1, 'U':2}).astype(int)
df_new['famsize'] = df_new['famsize'].map({'LE3':1, 'GT3':2}).astype(int)
df_new['Pstatus'] = df_new['Pstatus'].map({'A':0, 'T':1}).astype(int)
df_new['Mjob'] = df_new['Mjob'].map({'teacher':1, 'health':2, 'services':3, 'at_home':4, 'other':5}).astype(int)
df_new['Fjob'] = df_new['Fjob'].map({'teacher':1, 'health':2, 'services':3, 'at_home':4, 'other':5}).astype(int)
df_new['reason'] = df_new['reason'].map({'home':1, 'reputation':2, 'course':3, 'other':4}).astype(int)
df_new['guardian'] = df_new['guardian'].map({'father':1, 'mother':2, 'other':2}).astype(int)
df_new['schoolsup'] = df_new['schoolsup'].map({'yes':1, 'no':2}).astype(int)
df_new['famsup'] = df_new['famsup'].map({'yes':1, 'no':2}).astype(int)
df_new['paid'] = df_new['paid'].map({'yes':1, 'no':2}).astype(int)
df_new['activities'] = df_new['activities'].map({'yes':1, 'no':2}).astype(int)
df_new['nursery'] = df_new['nursery'].map({'yes':1, 'no':2}).astype(int)
df_new['higher'] = df_new['higher'].map({'yes':1, 'no':2}).astype(int)
df_new['internet'] = df_new['internet'].map({'yes':1, 'no':2}).astype(int)
df_new['romantic'] = df_new['romantic'].map({'yes':1, 'no':2}).astype(int)

In [6]:
# Transform data
df_new.loc[df['G3'] <= 9, 'G3'] = 0
df_new.loc[(df['G3'] >= 10) & (df['G3']<=11),'G3'] = 1
df_new.loc[(df['G3'] >= 12) & (df['G3']<=13),'G3'] = 2
df_new.loc[(df['G3'] >= 14) & (df['G3']<=15),'G3'] = 3
df_new.loc[df['G3'] > 15, 'G3'] = 4

In [7]:
# Load Data and Split into train & test data
dfX = df_new.iloc[:,0:32]
dfy = df_new.G3
X = dfX.values
y = dfy.values
print("X shape is: {} and type is {}".format(X.shape,type(X)))
print("y shape is: {} and type is {}".format(y.shape,type(y)))

X shape is: (395, 32) and type is <class 'numpy.ndarray'>
y shape is: (395,) and type is <class 'numpy.ndarray'>


In [8]:
# Splitting data into train and test data sets
X_train,X_test,y_train,y_test=train_test_split(X, y, test_size=0.2, random_state=2)

In [9]:
# Create Decision Tree Model
tree_reg = tree.DecisionTreeRegressor(max_depth=2, random_state=2)
tree_reg.fit(X_train, y_train)

DecisionTreeRegressor(criterion='mse', max_depth=2, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=2, splitter='best')

In [10]:
#training mse
train_mse = mean_squared_error(tree_reg.predict(X_train), y_train)
print(mean_absolute_error(tree_reg.predict(X_train), y_train))
print('the training mean squared error is: ', train_mse)

0.3236831564162473
the training mean squared error is:  0.23222009955962816


In [11]:
#testing mse
test_mse = mean_squared_error(tree_reg.predict(X_test), y_test)
print(mean_absolute_error(tree_reg.predict(X_test), y_test))
print('the testing mean squared error is: ',test_mse)

0.3493045013921644
the testing mean squared error is:  0.24989710889381708


In [12]:
#the score function for regressor is calculating R Squared Value of the regression model
# R Squared value rangs from 0 to 1 and the higher the better
train_R2 = tree_reg.score(X_train, y_train)
print('training R^2 value is: ', train_R2)

test_R2 = tree_reg.score(X_test, y_test)
print('testing R^2 value is: ', test_R2)

training R^2 value is:  0.8726203684754359
testing R^2 value is:  0.8598231299113507


In [13]:
# Evaluate the Model using K-folds Cross validation
results  = cross_validate(tree_reg, X, y.ravel(), scoring='neg_mean_squared_error', cv=20, return_train_score = True)
print('train_score: ', results['train_score'])
print('test_score: ', results['test_score'])

train_score:  [-0.24527831 -0.22800284 -0.23467448 -0.23308611 -0.24629751 -0.24732367
 -0.24631485 -0.24362566 -0.24072922 -0.23014944 -0.24533855 -0.24341694
 -0.24290334 -0.24112396 -0.23937048 -0.23429241 -0.24789581 -0.23975749
 -0.24384569 -0.24709519]
test_score:  [-0.19494533 -0.3726988  -0.39616172 -0.42174738 -0.17450887 -0.16132283
 -0.17422618 -0.22295466 -0.27964463 -0.32891721 -0.19014437 -0.22750249
 -0.24415007 -0.27053545 -0.3083675  -0.40758806 -0.13799995 -0.30444628
 -0.21754821 -0.15328856]


In [14]:
print('cross val training mean_squared_error is:', sum(-results['train_score'])/len(results['train_score']))
print('cross val testing mean_squared_error is:', sum(-results['test_score'])/len(results['test_score']))

cross val training mean_squared_error is: 0.24102609818983473
cross val testing mean_squared_error is: 0.25943492841488514


In [15]:
# Use GridSearch to find the best combination of model hyperparameters

criteriaList = np.arange(2,21)

param_grid = { "criterion" : ["mse", "mae"], "min_samples_leaf" : criteriaList, "min_samples_split" : criteriaList, 'max_depth' : criteriaList}

gs = GridSearchCV(tree_reg,param_grid=param_grid, scoring="neg_mean_squared_error",cv=20,n_jobs=-1)
gs = gs.fit(X_train, y_train)

print(gs.best_score_)
print(gs.best_params_)



-0.1882348020786646
{'criterion': 'mse', 'max_depth': 3, 'min_samples_leaf': 17, 'min_samples_split': 2}


In [16]:
# Create the final Decision Tree Regressor using the best hyperparameters
tree_reg = tree.DecisionTreeRegressor(criterion = 'mse', max_depth = 3, min_samples_leaf = 17, min_samples_split = 2)
tree_reg.fit(X_train,y_train)

DecisionTreeRegressor(criterion='mse', max_depth=3, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=17,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=None, splitter='best')

In [17]:
#Evaluate the Model by calculating training mse and testing mse
train_mse=mean_squared_error(tree_reg.predict(X_train),y_train)
print("The training mean squared error is: ",train_mse)
test_mse=mean_squared_error(tree_reg.predict(X_test),y_test)
print("The testing mean squared error is: ",test_mse)

The training mean squared error is:  0.17466599481518108
The testing mean squared error is:  0.1800623083775602


In [18]:
#the score function for regressor is calculating R Squared Value of the regression model
# R Squared value rangs from 0 to 1 and the higher the better
train_R2 = tree_reg.score(X_train, y_train)
print('training R^2 value is: ', train_R2)

test_R2 = tree_reg.score(X_test, y_test)
print('testing R^2 value is: ', test_R2)

training R^2 value is:  0.9041905067579379
testing R^2 value is:  0.8989961471702002


In [19]:
pd.concat((pd.DataFrame(df.iloc[:, 0:32].columns, columns = ['variable']), 
           pd.DataFrame(tree_reg.feature_importances_, columns = ['importance'])), 
          axis = 1).sort_values(by='importance', ascending = False)

Unnamed: 0,variable,importance
31,G2,0.996561
30,G1,0.003439
1,sex,0.0
29,absences,0.0
28,health,0.0
27,Walc,0.0
26,Dalc,0.0
25,goout,0.0
24,freetime,0.0
23,famrel,0.0


In [20]:
dfX_final = df_new[["G2", "G1"]]
dfy_final = df_new.G3
X_final = dfX_final.values
y_final = dfy_final.values

In [21]:
# Splitting data into train and test data sets
X_train,X_test,y_train,y_test=train_test_split(X_final, y_final, test_size=0.2, random_state=2)

In [22]:
# Create Decision Tree Model
tree_reg_final = tree.DecisionTreeRegressor(max_depth=2, random_state=2)
tree_reg_final.fit(X_train, y_train)

DecisionTreeRegressor(criterion='mse', max_depth=2, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=2, splitter='best')

In [23]:
#training mse
train_mse = mean_squared_error(tree_reg_final.predict(X_train), y_train)
print('the training mean squared error is: ', train_mse)

the training mean squared error is:  0.23222009955962816


In [24]:
#testing mse
test_mse = mean_squared_error(tree_reg_final.predict(X_test), y_test)
print('the testing mean squared error is: ',test_mse)

the testing mean squared error is:  0.24989710889381708


In [25]:
#the score function for regressor is calculating R Squared Value of the regression model
# R Squared value rangs from 0 to 1 and the higher the better
train_R2 = tree_reg_final.score(X_train, y_train)
print('training R^2 value is: ', train_R2)

test_R2 = tree_reg_final.score(X_test, y_test)
print('testing R^2 value is: ', test_R2)

training R^2 value is:  0.8726203684754359
testing R^2 value is:  0.8598231299113507


In [26]:
# Evaluate the Model using K-folds Cross validation
results  = cross_validate(tree_reg_final, X_final, y_final.ravel(), scoring='neg_mean_squared_error', cv=20, return_train_score = True)
print('train_score: ', results['train_score'])
print('test_score: ', results['test_score'])

train_score:  [-0.24527831 -0.22800284 -0.23467448 -0.23308611 -0.24629751 -0.24732367
 -0.24631485 -0.24362566 -0.24072922 -0.23014944 -0.24533855 -0.24341694
 -0.24290334 -0.24112396 -0.23937048 -0.23429241 -0.24789581 -0.23975749
 -0.24384569 -0.24709519]
test_score:  [-0.19494533 -0.3726988  -0.39616172 -0.42174738 -0.17450887 -0.16132283
 -0.17422618 -0.22295466 -0.27964463 -0.32891721 -0.19014437 -0.22750249
 -0.24415007 -0.27053545 -0.3083675  -0.40758806 -0.13799995 -0.30444628
 -0.21754821 -0.15328856]


In [27]:
print('cross val training mean_squared_error is:', sum(-results['train_score'])/len(results['train_score']))
print('cross val testing mean_squared_error is:', sum(-results['test_score'])/len(results['test_score']))

cross val training mean_squared_error is: 0.24102609818983473
cross val testing mean_squared_error is: 0.25943492841488514


In [28]:
# Use GridSearch to find the best combination of model hyperparameters

criteriaList = np.arange(2,21)

param_grid = { "criterion" : ["mse", "mae"], "min_samples_leaf" : criteriaList, "min_samples_split" : criteriaList, 'max_depth' : criteriaList}

gs = GridSearchCV(tree_reg_final,param_grid=param_grid, scoring="neg_mean_squared_error",cv=20,n_jobs=-1)
gs = gs.fit(X_train, y_train)

print(gs.best_score_)
print(gs.best_params_)



-0.1762995917380907
{'criterion': 'mse', 'max_depth': 4, 'min_samples_leaf': 8, 'min_samples_split': 20}


In [29]:
# Create the final Decision Tree Regressor using the best hyperparameters
tree_reg_final = tree.DecisionTreeRegressor(criterion = 'mse', max_depth = 4, min_samples_leaf = 8, min_samples_split = 20)
tree_reg_final.fit(X_train,y_train)

DecisionTreeRegressor(criterion='mse', max_depth=4, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=8,
           min_samples_split=20, min_weight_fraction_leaf=0.0,
           presort=False, random_state=None, splitter='best')

In [40]:
#Evaluate the Model by calculating training mse and testing mse
train_mse=mean_squared_error(tree_reg_final.predict(X_train),y_train)
print("The training mean squared error is: ",train_mse)
test_mse=mean_squared_error(tree_reg_final.predict(X_test),y_test)
print("The testing mean squared error is: ",test_mse)

The training mean squared error is:  0.15481096507048542
The testing mean squared error is:  0.1863943081977777


In [31]:
#the score function for regressor is calculating R Squared Value of the regression model
# R Squared value rangs from 0 to 1 and the higher the better
train_R2 = tree_reg_final.score(X_train, y_train)
print('training R^2 value is: ', train_R2)

test_R2 = tree_reg_final.score(X_test, y_test)
print('testing R^2 value is: ', test_R2)

training R^2 value is:  0.9150815811205132
testing R^2 value is:  0.8954442856855716


In [32]:
results  = cross_validate(tree_reg_final, X_final, y_final.ravel(), scoring='neg_mean_squared_error', cv=20, return_train_score = True)
print('train_score: ', results['train_score'])
print('test_score: ', results['test_score'])

train_score:  [-0.1626837  -0.15066057 -0.15716867 -0.15272219 -0.16596815 -0.16038707
 -0.16540564 -0.15914265 -0.15632326 -0.15678953 -0.16407576 -0.15837523
 -0.16413472 -0.16250685 -0.15462506 -0.15791396 -0.16394052 -0.15703916
 -0.15917563 -0.16486951]
test_score:  [-0.107438   -0.34196165 -0.21375054 -0.29377614 -0.0826195  -0.19495771
 -0.09503577 -0.22908471 -0.27537388 -0.21649057 -0.11458268 -0.22843905
 -0.12256128 -0.14249427 -0.31433577 -0.25987467 -0.12084847 -0.26092828
 -0.17931265 -0.09411926]


In [33]:
print('cross val training mean_squared_error is:', sum(-results['train_score'])/len(results['train_score']))
print('cross val testing mean_squared_error is:', sum(-results['test_score'])/len(results['test_score']))

cross val training mean_squared_error is: 0.15969539113923664
cross val testing mean_squared_error is: 0.19439924301054406
