## Student Performance Indicator

In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


In [3]:
from sklearn.metrics import mean_absolute_error,mean_squared_error,r2_score
from sklearn.linear_model import LinearRegression,Ridge,Lasso
from sklearn.ensemble import AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from catboost import CatBoostRegressor
from xgboost import XGBRegressor

In [16]:
# Load the student-performance records and preview the first few rows.
df = pd.read_csv('./data/stud.csv')
df.head(n=5)

Unnamed: 0,gender,race_ethnicity,parental_level_of_education,lunch,test_preparation_course,math_score,reading_score,writing_score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75


In [None]:
# Target is the math score; every remaining column is used as a feature.
y = df['math_score']
x = df.drop(columns=['math_score'])

Unnamed: 0,gender,race_ethnicity,parental_level_of_education,lunch,test_preparation_course,reading_score,writing_score
0,female,group B,bachelor's degree,standard,none,72,74
1,female,group C,some college,standard,completed,90,88
2,female,group B,master's degree,standard,none,95,93
3,male,group A,associate's degree,free/reduced,none,57,44
4,male,group C,some college,standard,none,78,75


In [20]:
from sklearn.preprocessing import OneHotEncoder,StandardScaler
from sklearn.compose import ColumnTransformer

# Partition the feature columns by dtype: object columns are treated as
# categorical, every other column as numeric.
cat_features = x.select_dtypes(include='object').columns
num_features = x.select_dtypes(exclude='object').columns

# Numeric columns are standardised and categorical columns are one-hot
# encoded; any column in neither group is passed through unchanged.
preproces = ColumnTransformer(
    transformers=[
        ("num_features", StandardScaler(), num_features),
        ('cat_features', OneHotEncoder(), cat_features),
    ],
    remainder='passthrough',
)

In [21]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# identical from run to run.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [22]:
# Display the preprocessing transformer so its configuration can be inspected.
preproces

0,1,2
,transformers,"[('num_features', ...), ('cat_features', ...)]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'


In [27]:
# `preproces` selects its columns by label, so it must be given the original
# DataFrames. Because this cell rebinds `x_train`/`x_test` to the transformed
# arrays, keep a reference to the raw splits the first time through (or after
# the split cell has been re-run) so the cell can be executed repeatedly
# without feeding already-transformed arrays back into the transformer.
if isinstance(x_train, pd.DataFrame):
    x_train_raw, x_test_raw = x_train, x_test

# Fit on the training split only, then reuse the fitted transformer on the
# test split so no test-set statistics leak into preprocessing.
x_train = preproces.fit_transform(x_train_raw)
x_test = preproces.transform(x_test_raw)
pd.DataFrame(x_test)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18
0,1.137866,1.031032,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
1,-0.245978,0.301391,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
2,0.238367,0.235060,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
3,0.515136,0.301391,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0
4,0.930290,0.633046,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,-0.868708,-0.826235,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
196,-0.937901,-1.025228,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
197,0.791905,0.500384,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0
198,0.515136,0.367722,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0


In [35]:
def evaluate_model(true,pred):
    """Score regression predictions against the observed targets.

    Parameters
    ----------
    true : array-like
        Observed target values.
    pred : array-like
        Predicted target values, aligned with ``true``.

    Returns
    -------
    tuple
        ``(mae, rmse, r2)``: the mean absolute error, the root mean squared
        error (square root of the mean squared error) and the R² score.
    """
    root_mse = np.sqrt(mean_squared_error(true, pred))
    return mean_absolute_error(true, pred), root_mse, r2_score(true, pred)

In [37]:
# Single seed shared by every estimator that uses randomness, so a fresh
# "Restart & Run All" reproduces the same scores.
RANDOM_STATE = 42

# Candidate regressors, keyed by the label used in the printed report and in
# the final comparison table.
models = {
    'Lasso': Lasso(),
    'Ridge': Ridge(),
    'RandomForestReg': RandomForestRegressor(random_state=RANDOM_STATE),
    'LinearRegression': LinearRegression(),
    'AdaBoostRegressor': AdaBoostRegressor(random_state=RANDOM_STATE),
    'DecisionTreeRegressor': DecisionTreeRegressor(random_state=RANDOM_STATE),
    'SVR': SVR(),
    'K nearest': KNeighborsRegressor(),
    # verbose=False silences CatBoost's per-iteration training log, which
    # would otherwise flood the cell output.
    'catboost': CatBoostRegressor(random_seed=RANDOM_STATE, verbose=False),
    'Xgboost': XGBRegressor(random_state=RANDOM_STATE)
}

# Parallel lists consumed later to build the model comparison table.
model_list = []
r2_list = []

# Iterate name/estimator pairs directly instead of rebuilding
# list(models.keys()) / list(models.values()) on every pass.
for model_name, model in models.items():
    model.fit(x_train, y_train)

    y_pred_train = model.predict(x_train)
    y_pred_test = model.predict(x_test)

    # Metrics on the data the model was fitted on.
    train_mae, train_rmse, train_r2score = evaluate_model(y_train, y_pred_train)

    # Metrics on the held-out split.
    test_mae, test_rmse, test_r2score = evaluate_model(y_test, y_pred_test)

    print(model_name)
    print("----------------------------------------")
    print('model train accuracy')
    print('mae:', train_mae)
    print("rmse", train_rmse)
    print('r2 score', train_r2score)

    print('model test accuracy ')
    print('mae:', test_mae)
    print("rmse", test_rmse)
    print('r2 score', test_r2score)

    # Only the test-set R² is kept for the ranking.
    model_list.append(model_name)
    r2_list.append(test_r2score)

Lasso
----------------------------------------
model train accuracy
mae: 5.205260274468427
rmse 6.592500298650879
r2 score 0.8072231322208645
model test accuracy 
mae: 5.155701094273798
rmse 6.517328221922997
r2 score 0.8254465092551198
Ridge
----------------------------------------
model train accuracy
mae: 4.265005112727168
rmse 5.323320560617112
r2 score 0.8743044675204545
model test accuracy 
mae: 4.211112826071163
rmse 5.390417574427311
r2 score 0.8805917946912825
RandomForestReg
----------------------------------------
model train accuracy
mae: 1.8323406249999998
rmse 2.298383108181071
r2 score 0.9765685348408074
model test accuracy 
mae: 4.608438095238096
rmse 5.9988848625477305
r2 score 0.8521128617838192
LinearRegression
----------------------------------------
model train accuracy
mae: 4.266711846071956
rmse 5.323050852720514
r2 score 0.8743172040139593
model test accuracy 
mae: 4.21476314247485
rmse 5.393993869732841
r2 score 0.8804332983749565
AdaBoostRegressor
------------

In [39]:
# Pair each model label with its recorded R² score and list the best first.
(
    pd.DataFrame(list(zip(model_list, r2_list)), columns=['model', 'r2_score'])
    .sort_values(by='r2_score', ascending=False)
)

Unnamed: 0,model,r2_score
1,Ridge,0.880592
3,LinearRegression,0.880433
2,RandomForestReg,0.852113
8,catboost,0.851831
4,AdaBoostRegressor,0.845124
0,Lasso,0.825447
9,Xgboost,0.821221
7,K nearest,0.783958
5,DecisionTreeRegressor,0.752978
6,SVR,0.729105
