## Model Trainig

In [2]:
#import libraries
import pandas as pd
import numpy as np
import seaborn as sns 
import matplotlib.pyplot as plt
%matplotlib inline

#ml libraries
from sklearn.model_selection import train_test_split,RandomizedSearchCV
from sklearn.linear_model import LogisticRegression,Ridge, Lasso
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler, MinMaxScaler,OneHotEncoder, LabelEncoder
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, f1_score,mean_absolute_error,mean_squared_error,r2_score

#boosting libraries
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor

In [3]:
#read data
data=pd.read_csv("data/StudentsPerformance.csv")
data.head(3)

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93


In [4]:
data['total_score'] = data['math score'] + data['reading score'] + data['writing score']
data['average'] = data['total_score']/3
data.head(3)

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score,total_score,average
0,female,group B,bachelor's degree,standard,none,72,72,74,218,72.666667
1,female,group C,some college,standard,completed,69,90,88,247,82.333333
2,female,group B,master's degree,standard,none,90,95,93,278,92.666667


In [5]:
#rename columns to short names
data.rename(columns = {'race/ethnicity':'race_ethnicity','parental level of education':'parental_level_of_education','test preparation course':'test_preparation_course'}, inplace = True)
data.head(3)

Unnamed: 0,gender,race_ethnicity,parental_level_of_education,lunch,test_preparation_course,math score,reading score,writing score,total_score,average
0,female,group B,bachelor's degree,standard,none,72,72,74,218,72.666667
1,female,group C,some college,standard,completed,69,90,88,247,82.333333
2,female,group B,master's degree,standard,none,90,95,93,278,92.666667


In [6]:
#target and features
x=data.drop('average',axis=1)
y=data['average']

In [7]:
data.nunique()

gender                           2
race_ethnicity                   5
parental_level_of_education      6
lunch                            2
test_preparation_course          2
math score                      81
reading score                   72
writing score                   77
total_score                    194
average                        194
dtype: int64

In [8]:
data.dtypes

gender                          object
race_ethnicity                  object
parental_level_of_education     object
lunch                           object
test_preparation_course         object
math score                       int64
reading score                    int64
writing score                    int64
total_score                      int64
average                        float64
dtype: object

In [9]:
#lable for numerical data and one hot for categorical data
numeric_data =x.select_dtypes(exclude='object').columns
cat_data =x.select_dtypes(include='object').columns


from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

sc = StandardScaler()
ohe = OneHotEncoder()


preprosessing = ColumnTransformer(
    [
        ('stdcaler', sc, numeric_data),
        ('onehot', ohe, cat_data)
    ])

In [10]:
#fit and transform
x = preprosessing.fit_transform(x)

In [11]:
x

array([[ 0.39002351,  0.19399858,  0.39149181, ...,  1.        ,
         0.        ,  1.        ],
       [ 0.19207553,  1.42747598,  1.31326868, ...,  1.        ,
         1.        ,  0.        ],
       [ 1.57771141,  1.77010859,  1.64247471, ...,  1.        ,
         0.        ,  1.        ],
       ...,
       [-0.46775108,  0.12547206, -0.20107904, ...,  0.        ,
         1.        ,  0.        ],
       [ 0.12609287,  0.60515772,  0.58901542, ...,  1.        ,
         1.        ,  0.        ],
       [ 0.71993682,  1.15336989,  1.18158627, ...,  0.        ,
         0.        ,  1.        ]])

In [12]:
#split data
x_train,x_test,y_train,y_test=train_test_split(x,y,train_size=0.8,random_state=42)

x_train.shape,x_test.shape,y_train.shape,y_test.shape

((800, 21), (200, 21), (800,), (200,))

In [13]:
from sklearn.linear_model import LinearRegression,LogisticRegression
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.svm import SVR
from sklearn.ensemble import GradientBoostingRegressor
#train model
models= {
        "LinearRegression":LinearRegression(),
        "KNeighborsRegressor":KNeighborsRegressor(),
        "RandomForestRegressor":RandomForestRegressor(),
        "AdaBoostClassifier" : AdaBoostRegressor(),
        "XGBRegressor":XGBRegressor(),
        "LGBMRegressor":LGBMRegressor(),
        "CatBoostRegressor":CatBoostRegressor(),
        "Ridge":Ridge(),
        "Lasso":Lasso(),
        "SVR":SVR(),
        "DecisionTreeRegressor":DecisionTreeRegressor(),    
}


In [None]:
#train model
for name,model in models.items():
    model.fit(x_train,y_train)
    print(name,'trained')
    

In [15]:
models.keys()

dict_keys(['LinearRegression', 'KNeighborsRegressor', 'RandomForestRegressor', 'AdaBoostClassifier', 'XGBRegressor', 'LGBMRegressor', 'CatBoostRegressor', 'Ridge', 'Lasso', 'SVR', 'DecisionTreeRegressor'])

In [16]:
#make predictions
y_pred={}
for name,model in models.items():
    y_pred[name]=model.predict(x_test)

In [17]:
#evaluate model all metrics
from sklearn.metrics import mean_absolute_error,mean_squared_error,r2_score

for name,model in models.items():
    print(name)
    print('MAE:',mean_absolute_error(y_test,y_pred[name]))
    print('MSE:',mean_squared_error(y_test,y_pred[name]))
    print('RMSE:',np.sqrt(mean_squared_error(y_test,y_pred[name])))
    print('R2:',r2_score(y_test,y_pred[name]))
    print('-------------------------------------------')

LinearRegression
MAE: 1.2878587085651816e-14
MSE: 2.6600389724052517e-28
RMSE: 1.630962590743654e-14
R2: 1.0
-------------------------------------------
KNeighborsRegressor
MAE: 1.923
MSE: 7.108022222222221
RMSE: 2.666087437092456
R2: 0.9668417406093228
-------------------------------------------
RandomForestRegressor
MAE: 0.11596666666666584
MSE: 0.7676997777777753
RMSE: 0.8761847851782039
R2: 0.9964187522815928
-------------------------------------------
AdaBoostClassifier
MAE: 0.8613769925539478
MSE: 1.707542688196558
RMSE: 1.3067297686195711
R2: 0.9920344729369495
-------------------------------------------
XGBRegressor
MAE: 0.17863410313924127
MSE: 0.5638028925861522
RMSE: 0.7508680926675152
R2: 0.9973699121959496
-------------------------------------------
LGBMRegressor
MAE: 0.39697710148474924
MSE: 3.750474785220587
RMSE: 1.9366142582405477
R2: 0.9825043856253366
-------------------------------------------
CatBoostRegressor
MAE: 0.3137564782584298
MSE: 1.0801078905712282
RMSE: 1

In [18]:
model_list=list(y_pred.keys())
model_list

['LinearRegression',
 'KNeighborsRegressor',
 'RandomForestRegressor',
 'AdaBoostClassifier',
 'XGBRegressor',
 'LGBMRegressor',
 'CatBoostRegressor',
 'Ridge',
 'Lasso',
 'SVR',
 'DecisionTreeRegressor']

In [19]:
#r2 list
r2_list=[r2_score(y_test,y_pred[name]) for name in model_list]
r2_list

[1.0,
 0.9668417406093228,
 0.9964187522815928,
 0.9920344729369495,
 0.9973699121959496,
 0.9825043856253366,
 0.9949613976313257,
 0.9999998064764087,
 0.9948006919534692,
 0.862219155062153,
 0.9975949815142712]

In [20]:
pd.DataFrame(list(zip(model_list,r2_list)),columns=['model_name','r2 score']).sort_values(by='r2 score',ascending=False)

Unnamed: 0,model_name,r2 score
0,LinearRegression,1.0
7,Ridge,1.0
10,DecisionTreeRegressor,0.997595
4,XGBRegressor,0.99737
2,RandomForestRegressor,0.996419
6,CatBoostRegressor,0.994961
8,Lasso,0.994801
3,AdaBoostClassifier,0.992034
5,LGBMRegressor,0.982504
1,KNeighborsRegressor,0.966842
