In [1]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns
# Modelling
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import AdaBoostRegressor, RandomForestRegressor
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.svm import SVR
from sklearn.model_selection import RandomizedSearchCV
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
import warnings

In [2]:
df = pd.read_csv('data/raw.csv')

In [3]:
df.head()

Unnamed: 0,gender,race_ethnicity,parental_level_of_education,lunch,test_preparation_course,math_score,reading_score,writing_score
0,female,group D,master's degree,standard,none,62,70,75
1,female,group C,bachelor's degree,free/reduced,completed,66,83,83
2,female,group D,some college,free/reduced,none,79,89,86
3,male,group C,master's degree,free/reduced,none,61,67,66
4,male,group E,high school,standard,none,73,64,57


In [4]:
X = df.drop(columns = ['math_score'])

In [5]:
X.head()

Unnamed: 0,gender,race_ethnicity,parental_level_of_education,lunch,test_preparation_course,reading_score,writing_score
0,female,group D,master's degree,standard,none,70,75
1,female,group C,bachelor's degree,free/reduced,completed,83,83
2,female,group D,some college,free/reduced,none,89,86
3,male,group C,master's degree,free/reduced,none,67,66
4,male,group E,high school,standard,none,64,57


In [6]:
y=df[['math_score']]

In [7]:
y.head()

Unnamed: 0,math_score
0,62
1,66
2,79
3,61
4,73


In [8]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 7 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   gender                       1000 non-null   object
 1   race_ethnicity               1000 non-null   object
 2   parental_level_of_education  1000 non-null   object
 3   lunch                        1000 non-null   object
 4   test_preparation_course      1000 non-null   object
 5   reading_score                1000 non-null   int64 
 6   writing_score                1000 non-null   int64 
dtypes: int64(2), object(5)
memory usage: 54.8+ KB


In [9]:
df['gender'].dtype

dtype('O')

In [10]:
cat_features = [cols for cols in X.columns if X[cols].dtype == 'O']
num_features = [cols for cols in X.columns if X[cols].dtype  != 'O']

In [11]:
num_features

['reading_score', 'writing_score']

In [12]:
cat_features

['gender',
 'race_ethnicity',
 'parental_level_of_education',
 'lunch',
 'test_preparation_course']

In [13]:
for cols in cat_features:
    print(f"categories in {cols} :", X[cols].unique())

categories in gender : ['female' 'male']
categories in race_ethnicity : ['group D' 'group C' 'group E' 'group B' 'group A']
categories in parental_level_of_education : ["master's degree" "bachelor's degree" 'some college' 'high school'
 "associate's degree" 'some high school']
categories in lunch : ['standard' 'free/reduced']
categories in test_preparation_course : ['none' 'completed']


In [14]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

oh_transformer = OneHotEncoder()
numeric_transformer = StandardScaler()


preprocessor = ColumnTransformer(
    [
        ("OneHotEncoder",oh_transformer,cat_features),
        ("StandardScaler",numeric_transformer,num_features),
    ]
)

In [15]:
X = preprocessor.fit_transform(X)

In [16]:
X.shape

(1000, 19)

In [17]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=42)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((800, 19), (200, 19), (800, 1), (200, 1))

In [18]:
def evaluate_model(true, predicted):
    mae = mean_absolute_error(true, predicted)
    rmse = np.sqrt(mean_squared_error(true, predicted))
    r2_square = r2_score(true, predicted)
    return mae, rmse, r2_square

In [19]:
models = {
    "Linear Regression" : LinearRegression(),
    "Lasso" : Lasso(),
    "Ridge" : Ridge(),
    "K-Neighbours Regressor" : KNeighborsRegressor(),
    "Decision Tree" : DecisionTreeRegressor(),
    "Random_Forest_Regressor" : RandomForestRegressor(),
    "XGBRegressor" : XGBRegressor(),
    "CatBoosting Regressor" : CatBoostRegressor(verbose=True),
    "AdaBoost Regressor" : AdaBoostRegressor()
}

model_list = []
r2_list = []


In [20]:
for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(X_train, y_train) # Train model

    # Make predictions
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)
    
    # Evaluate Train and Test dataset
    model_train_mae , model_train_rmse, model_train_r2 = evaluate_model(y_train, y_train_pred)

    model_test_mae , model_test_rmse, model_test_r2 = evaluate_model(y_test, y_test_pred)

    
    print(list(models.keys())[i])
    model_list.append(list(models.keys())[i])
    
    print('Model performance for Training set')
    print("- Root Mean Squared Error: {:.4f}".format(model_train_rmse))
    print("- Mean Absolute Error: {:.4f}".format(model_train_mae))
    print("- R2 Score: {:.4f}".format(model_train_r2))

    print('----------------------------------')
    
    print('Model performance for Test set')
    print("- Root Mean Squared Error: {:.4f}".format(model_test_rmse))
    print("- Mean Absolute Error: {:.4f}".format(model_test_mae))
    print("- R2 Score: {:.4f}".format(model_test_r2))
    r2_list.append(model_test_r2)
    
    print('='*35)
    print('\n')

Linear Regression
Model performance for Training set
- Root Mean Squared Error: 5.4149
- Mean Absolute Error: 4.3139
- R2 Score: 0.8765
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 5.1307
- Mean Absolute Error: 4.0984
- R2 Score: 0.8674


Lasso
Model performance for Training set
- Root Mean Squared Error: 6.6595
- Mean Absolute Error: 5.2600
- R2 Score: 0.8133
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 5.9043
- Mean Absolute Error: 4.6669
- R2 Score: 0.8244


Ridge
Model performance for Training set
- Root Mean Squared Error: 5.3987
- Mean Absolute Error: 4.2953
- R2 Score: 0.8773
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 5.0617
- Mean Absolute Error: 4.0536
- R2 Score: 0.8709


K-Neighbours Regressor
Model performance for Training set
- Root Mean Squared Error: 5.9325
- Mean Absolute Error: 4.7012
- R2 Score: 0.8518
----------------------

  return fit_method(estimator, *args, **kwargs)


Random_Forest_Regressor
Model performance for Training set
- Root Mean Squared Error: 2.3635
- Mean Absolute Error: 1.8913
- R2 Score: 0.9765
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 5.7745
- Mean Absolute Error: 4.6279
- R2 Score: 0.8320


XGBRegressor
Model performance for Training set
- Root Mean Squared Error: 1.0203
- Mean Absolute Error: 0.6601
- R2 Score: 0.9956
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 6.3556
- Mean Absolute Error: 4.9843
- R2 Score: 0.7965


Learning rate set to 0.039525
0:	learn: 14.9946453	total: 68.5ms	remaining: 1m 8s
1:	learn: 14.6000761	total: 69ms	remaining: 34.5s
2:	learn: 14.2228937	total: 69.6ms	remaining: 23.1s
3:	learn: 13.8503211	total: 70.2ms	remaining: 17.5s
4:	learn: 13.5159598	total: 70.8ms	remaining: 14.1s
5:	learn: 13.2180250	total: 71.3ms	remaining: 11.8s
6:	learn: 12.9077780	total: 71.8ms	remaining: 10.2s
7:	learn: 12.6345186	total: 72.2m

  y = column_or_1d(y, warn=True)


In [21]:
model_list

['Linear Regression',
 'Lasso',
 'Ridge',
 'K-Neighbours Regressor',
 'Decision Tree',
 'Random_Forest_Regressor',
 'XGBRegressor',
 'CatBoosting Regressor',
 'AdaBoost Regressor']

In [22]:
r2_list

[0.8673906622176597,
 0.8243854143897048,
 0.8709324150407662,
 0.772570939342956,
 0.6825772250796233,
 0.8320180306815987,
 0.7965084910392761,
 0.8397003155824867,
 0.8194513964596447]

In [23]:
pd.DataFrame(list(zip(model_list, r2_list)), columns=['Model Name', 'R2_Score']).sort_values(by=["R2_Score"],ascending=False)

Unnamed: 0,Model Name,R2_Score
2,Ridge,0.870932
0,Linear Regression,0.867391
7,CatBoosting Regressor,0.8397
5,Random_Forest_Regressor,0.832018
1,Lasso,0.824385
8,AdaBoost Regressor,0.819451
6,XGBRegressor,0.796508
3,K-Neighbours Regressor,0.772571
4,Decision Tree,0.682577
