In [1]:
#imports
import pandas as pd  
import numpy as np  
import matplotlib.pyplot as plt  

from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.model_selection import train_test_split
from sklearn import tree

from sklearn.tree import export_graphviz 
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV

# output of plotting commands is displayed inline, directly below the code cell that produced it
%matplotlib inline 

In [2]:
df = pd.read_csv("Math(1).csv")

In [3]:
df_new = df.copy()

In [4]:
df_new.head()

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
0,GP,F,18,U,GT3,A,4,4,at_home,teacher,...,4,3,4,1,1,3,6,5,6,6
1,GP,F,17,U,GT3,T,1,1,at_home,other,...,5,3,3,1,1,3,4,5,5,6
2,GP,F,15,U,LE3,T,1,1,at_home,other,...,4,3,2,2,3,3,10,7,8,10
3,GP,F,15,U,GT3,T,4,2,health,services,...,3,2,2,1,1,5,2,15,14,15
4,GP,F,16,U,GT3,T,3,3,other,other,...,4,3,2,1,2,5,4,6,10,10


In [5]:
# Cleaning the dataset
df_new['sex'] = df_new['sex'].map({'F':0, 'M':1}).astype(int)
df_new['school'] = df_new['school'].map({'GP':1, 'MS':2}).astype(int)
df_new['address'] = df_new['address'].map({'R':1, 'U':2}).astype(int)
df_new['famsize'] = df_new['famsize'].map({'LE3':1, 'GT3':2}).astype(int)
df_new['Pstatus'] = df_new['Pstatus'].map({'A':0, 'T':1}).astype(int)
df_new['Mjob'] = df_new['Mjob'].map({'teacher':1, 'health':2, 'services':3, 'at_home':4, 'other':5}).astype(int)
df_new['Fjob'] = df_new['Fjob'].map({'teacher':1, 'health':2, 'services':3, 'at_home':4, 'other':5}).astype(int)
df_new['reason'] = df_new['reason'].map({'home':1, 'reputation':2, 'course':3, 'other':4}).astype(int)
df_new['guardian'] = df_new['guardian'].map({'father':1, 'mother':2, 'other':2}).astype(int)
df_new['schoolsup'] = df_new['schoolsup'].map({'yes':1, 'no':2}).astype(int)
df_new['famsup'] = df_new['famsup'].map({'yes':1, 'no':2}).astype(int)
df_new['paid'] = df_new['paid'].map({'yes':1, 'no':2}).astype(int)
df_new['activities'] = df_new['activities'].map({'yes':1, 'no':2}).astype(int)
df_new['nursery'] = df_new['nursery'].map({'yes':1, 'no':2}).astype(int)
df_new['higher'] = df_new['higher'].map({'yes':1, 'no':2}).astype(int)
df_new['internet'] = df_new['internet'].map({'yes':1, 'no':2}).astype(int)
df_new['romantic'] = df_new['romantic'].map({'yes':1, 'no':2}).astype(int)

In [6]:
# Transform data
df_new.loc[df['G3'] <= 9, 'G3'] = 0
df_new.loc[(df['G3'] >= 10) & (df['G3']<=11),'G3'] = 1
df_new.loc[(df['G3'] >= 12) & (df['G3']<=13),'G3'] = 2
df_new.loc[(df['G3'] >= 14) & (df['G3']<=15),'G3'] = 3
df_new.loc[df['G3'] > 15, 'G3'] = 4

In [7]:
# Load Data and Split into train & test data
dfX = df_new.iloc[:,0:30]
dfy = df_new.G3
X = dfX.values
y = dfy.values
print("X shape is: {} and type is {}".format(X.shape,type(X)))
print("y shape is: {} and type is {}".format(y.shape,type(y)))

X shape is: (395, 30) and type is <class 'numpy.ndarray'>
y shape is: (395,) and type is <class 'numpy.ndarray'>


In [8]:
# Splitting data into train and test data sets
X_train,X_test,y_train,y_test=train_test_split(X, y, test_size=0.2, random_state=2)

In [9]:
# Create Decision Tree Model
tree_reg = tree.DecisionTreeRegressor(max_depth=2, random_state=2)
tree_reg.fit(X_train, y_train)

DecisionTreeRegressor(criterion='mse', max_depth=2, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=2, splitter='best')

In [10]:
#training mse
train_mse = mean_squared_error(tree_reg.predict(X_train), y_train)
print('the training mean squared error is: ', train_mse)

the training mean squared error is:  1.5346497683803728


In [11]:
#testing mse
test_mse = mean_squared_error(tree_reg.predict(X_test), y_test)
print('the testing mean squared error is: ',test_mse)

the testing mean squared error is:  1.6070618654533675


In [12]:
#the score function for regressor is calculating R Squared Value of the regression model
# R Squared value rangs from 0 to 1 and the higher the better
train_R2 = tree_reg.score(X_train, y_train)
print('training R^2 value is: ', train_R2)

test_R2 = tree_reg.score(X_test, y_test)
print('testing R^2 value is: ', test_R2)

training R^2 value is:  0.15819895699702535
testing R^2 value is:  0.09853738070335549


In [13]:
# Evaluate the Model using K-folds Cross validation
results  = cross_validate(tree_reg, X, y.ravel(), scoring='neg_mean_squared_error', cv=20, return_train_score = True)
print('train_score: ', results['train_score'])
print('test_score: ', results['test_score'])

train_score:  [-1.54432317 -1.53060749 -1.56027548 -1.55199353 -1.53388262 -1.49855767
 -1.54061944 -1.57704515 -1.5367022  -1.50468982 -1.54251755 -1.55469279
 -1.55987622 -1.5219821  -1.52814443 -1.54416424 -1.51694911 -1.54933464
 -1.55860041 -1.53001079]
test_score:  [-1.50716968 -1.7871996  -1.22147099 -1.7591524  -1.71068878 -2.42388607
 -1.57482814 -1.08136043 -1.665731   -2.38043287 -1.58048102 -1.32872927
 -1.21635774 -1.93435689 -1.80767941 -1.52203484 -2.04891014 -1.41871068
 -1.22760648 -1.82462674]


In [14]:
print('cross val training mean_squared_error is:', sum(-results['train_score'])/len(results['train_score']))
print('cross val testing mean_squared_error is:', sum(-results['test_score'])/len(results['test_score']))

cross val training mean_squared_error is: 1.539248442712533
cross val testing mean_squared_error is: 1.6510706573481801


In [15]:
# Use GridSearch to find the best combination of model hyperparameters

criteriaList = np.arange(2,21)

param_grid = { "criterion" : ["mse", "mae"], "min_samples_leaf" : criteriaList, "min_samples_split" : criteriaList, 'max_depth' : criteriaList}

gs = GridSearchCV(tree_reg,param_grid=param_grid, scoring="neg_mean_squared_error",cv=20,n_jobs=-1)
gs = gs.fit(X_train, y_train)

print(gs.best_score_)
print(gs.best_params_)



-1.707158685133505
{'criterion': 'mse', 'max_depth': 2, 'min_samples_leaf': 19, 'min_samples_split': 2}


In [16]:
# Create the final Decision Tree Regressor using the best hyperparameters
tree_reg = tree.DecisionTreeRegressor(criterion = 'mse', max_depth = 2, min_samples_leaf = 19, min_samples_split = 2)
tree_reg.fit(X_train,y_train)

DecisionTreeRegressor(criterion='mse', max_depth=2, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=19,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=None, splitter='best')

In [17]:
#Evaluate the Model by calculating training mse and testing mse
train_mse=mean_squared_error(tree_reg.predict(X_train),y_train)
print("The training mean squared error is: ",train_mse)
test_mse=mean_squared_error(tree_reg.predict(X_test),y_test)
print("The testing mean squared error is: ",test_mse)

The training mean squared error is:  1.5346497683803728
The testing mean squared error is:  1.6070618654533675


In [18]:
#the score function for regressor is calculating R Squared Value of the regression model
# R Squared value rangs from 0 to 1 and the higher the better
train_R2 = tree_reg.score(X_train, y_train)
print('training R^2 value is: ', train_R2)

test_R2 = tree_reg.score(X_test, y_test)
print('testing R^2 value is: ', test_R2)

training R^2 value is:  0.15819895699702535
testing R^2 value is:  0.09853738070335549


In [19]:
pd.concat((pd.DataFrame(df.iloc[:, 0:32].columns, columns = ['variable']), 
           pd.DataFrame(tree_reg.feature_importances_, columns = ['importance'])), 
          axis = 1).sort_values(by='importance', ascending = False)

Unnamed: 0,variable,importance
14,failures,0.703117
15,schoolsup,0.25725
24,freetime,0.039633
16,famsup,0.0
28,health,0.0
27,Walc,0.0
26,Dalc,0.0
25,goout,0.0
23,famrel,0.0
22,romantic,0.0


In [20]:
dfX_final = df_new[["failures", "schoolsup", "freetime"]]
dfy_final = df_new.G3
X_final = dfX_final.values
y_final = dfy_final.values

In [21]:
# Splitting data into train and test data sets
X_train,X_test,y_train,y_test=train_test_split(X_final, y_final, test_size=0.2, random_state=2)

In [22]:
# Create Decision Tree Model
tree_reg_final = tree.DecisionTreeRegressor(max_depth=2, random_state=2)
tree_reg_final.fit(X_train, y_train)

DecisionTreeRegressor(criterion='mse', max_depth=2, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=2, splitter='best')

In [23]:
#training mse
train_mse = mean_squared_error(tree_reg_final.predict(X_train), y_train)
print('the training mean squared error is: ', train_mse)

the training mean squared error is:  1.5346497683803728


In [24]:
#testing mse
test_mse = mean_squared_error(tree_reg_final.predict(X_test), y_test)
print('the testing mean squared error is: ',test_mse)

the testing mean squared error is:  1.6070618654533675


In [25]:
#the score function for regressor is calculating R Squared Value of the regression model
# R Squared value rangs from 0 to 1 and the higher the better
train_R2 = tree_reg_final.score(X_train, y_train)
print('training R^2 value is: ', train_R2)

test_R2 = tree_reg_final.score(X_test, y_test)
print('testing R^2 value is: ', test_R2)

training R^2 value is:  0.15819895699702535
testing R^2 value is:  0.09853738070335549


In [26]:
# Evaluate the Model using K-folds Cross validation
results  = cross_validate(tree_reg_final, X_final, y_final.ravel(), scoring='neg_mean_squared_error', cv=20, return_train_score = True)
print('train_score: ', results['train_score'])
print('test_score: ', results['test_score'])

train_score:  [-1.54536289 -1.53282361 -1.56093155 -1.57314325 -1.53688034 -1.50099704
 -1.54696887 -1.57755497 -1.53707405 -1.50468982 -1.54667989 -1.55469279
 -1.56003807 -1.52413291 -1.52920341 -1.54498149 -1.51797284 -1.54933464
 -1.56287677 -1.53001079]
test_score:  [-1.50362399 -1.76121506 -1.2256318  -0.98933403 -1.6593651  -2.3930897
 -1.47246283 -0.905326   -1.66451995 -2.38043287 -1.49388159 -1.32872927
 -1.23174663 -1.90855915 -1.80391393 -1.51592157 -2.04558779 -1.41871068
 -1.14539931 -1.82462674]


In [27]:
print('cross val training mean_squared_error is:', sum(-results['train_score'])/len(results['train_score']))
print('cross val testing mean_squared_error is:', sum(-results['test_score'])/len(results['test_score']))

cross val training mean_squared_error is: 1.5418174997521348
cross val testing mean_squared_error is: 1.583603898615764


In [28]:
# Use GridSearch to find the best combination of model hyperparameters

criteriaList = np.arange(2,21)

param_grid = { "criterion" : ["mse", "mae"], "min_samples_leaf" : criteriaList, "min_samples_split" : criteriaList, 'max_depth' : criteriaList}

gs = GridSearchCV(tree_reg_final,param_grid=param_grid, scoring="neg_mean_squared_error",cv=20,n_jobs=-1)
gs = gs.fit(X_train, y_train)

print(gs.best_score_)
print(gs.best_params_)

-1.540764135976031
{'criterion': 'mse', 'max_depth': 4, 'min_samples_leaf': 11, 'min_samples_split': 2}




In [29]:
# Create the final Decision Tree Regressor using the best hyperparameters
tree_reg_final = tree.DecisionTreeRegressor(criterion = 'mse', max_depth = 4, min_samples_leaf = 11, min_samples_split = 2)
tree_reg_final.fit(X_train,y_train)

DecisionTreeRegressor(criterion='mse', max_depth=4, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=11,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=None, splitter='best')

In [31]:
#Evaluate the Model by calculating training mse and testing mse
train_mse=mean_squared_error(tree_reg_final.predict(X_train),y_train)
print("The training mean squared error is: ",train_mse)
test_mse=mean_squared_error(tree_reg_final.predict(X_test),y_test)
print("The testing mean squared error is: ",test_mse)

The training mean squared error is:  1.4749847095383646
The testing mean squared error is:  1.5690635376552415


In [33]:
#the score function for regressor is calculating R Squared Value of the regression model
# R Squared value rangs from 0 to 1 and the higher the better
train_R2 = tree_reg_final.score(X_train, y_train)
print('training R^2 value is: ', train_R2)

test_R2 = tree_reg_final.score(X_test, y_test)
print('testing R^2 value is: ', test_R2)

training R^2 value is:  0.19092701638808995
testing R^2 value is:  0.11985209972080146
