In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.model_selection import RandomizedSearchCV
from catboost import CatBoostRegressor
from xgboost import  XGBRegressor
import warnings
warnings.filterwarnings('ignore')


In [5]:
data = pd.read_csv("data/stud.csv")
data.head()

Unnamed: 0,gender,race_ethnicity,parental_level_of_education,lunch,test_preparation_course,math_score,reading_score,writing_score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75


In [6]:
data['total_score'] = data.math_score + data.reading_score + data.writing_score

In [7]:
#Independent Features
X = data.drop(columns='total_score',axis=1)
X.head()

Unnamed: 0,gender,race_ethnicity,parental_level_of_education,lunch,test_preparation_course,math_score,reading_score,writing_score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75


In [8]:
y = data['total_score']

In [9]:
num_features = X.select_dtypes(exclude="object").columns
cat_features = X.select_dtypes(include="object").columns

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

num_transform = StandardScaler()
encoder = OneHotEncoder()

preprocess = ColumnTransformer(
    [
        ('OneHotEncoder',encoder,cat_features),
        ('StandardScaler',num_transform,num_features)
    ]
)



In [11]:
X = preprocess.fit_transform(X)

In [12]:
X.shape

(1000, 20)

In [13]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,random_state=42)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((800, 20), (200, 20), (800,), (200,))

In [14]:
def evaluate_model(true, prediction):
    mae = mean_absolute_error(true, prediction)
    mse = mean_squared_error(true, prediction)
    rmse = np.sqrt(mean_squared_error(true, prediction))
    r2 = r2_score(true, prediction)
    return mae, mse, rmse, r2


In [22]:
models = {
    "Linear Regression": LinearRegression(),
    "Lasso Regression": Lasso(),
    "Ridge Regression": Ridge(),
    "KNN Regression": KNeighborsRegressor(),
    "Decision Tree": DecisionTreeRegressor(),
    "Random Forest": RandomForestRegressor(),
    "XGB Regression": XGBRegressor(),
    "CatBoost": CatBoostRegressor(verbose=False),
    "Adaboost": AdaBoostRegressor()
}


In [29]:
model_list = []
r2_list = []

for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(X_train,y_train)
    
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)
    
    #Evaluate Train and Test
    train_mae, train_mse, train_rmse, train_r2 = evaluate_model(y_train,y_train_pred)
    
    test_mae, test_mse, test_rmse, test_r2 = evaluate_model(y_test,y_test_pred)
    
    print(list(models.keys())[i])
    model_list.append(list(models.keys())[i])
    
    print("Model Performance on Train: ")
    print("MAE : {:.4f}".format(train_mae))
    print("MSE : {:.4f}".format(train_mse))
    print("RMSE : {:.4f}".format(train_rmse))
    print("R2 : {:.4f}".format(train_r2))
    print("-"*10)
    print("Model Performance on Test: ")
    print("MAE : {:.4f}".format(test_mae))
    print("MSE : {:.4f}".format(test_mse))
    print("RMSE : {:.4f}".format(test_rmse))
    print("R2 : {:.4f}".format(test_r2))
    r2_list.append([list(models.keys())[i],test_mae,test_mse,test_rmse,test_r2])
    print("*"*35)

Linear Regression
Model Performance on Train: 
MAE : 0.0000
MSE : 0.0000
RMSE : 0.0000
R2 : 1.0000
----------
Model Performance on Test: 
MAE : 0.0000
MSE : 0.0000
RMSE : 0.0000
R2 : 1.0000
***********************************
Lasso Regression
Model Performance on Train: 
MAE : 0.8505
MSE : 1.1330
RMSE : 1.0644
R2 : 0.9994
----------
Model Performance on Test: 
MAE : 0.8764
MSE : 1.2409
RMSE : 1.1139
R2 : 0.9994
***********************************
Ridge Regression
Model Performance on Train: 
MAE : 0.0195
MSE : 0.0006
RMSE : 0.0241
R2 : 1.0000
----------
Model Performance on Test: 
MAE : 0.0201
MSE : 0.0007
RMSE : 0.0264
R2 : 1.0000
***********************************
KNN Regression
Model Performance on Train: 
MAE : 5.3040
MSE : 45.8584
RMSE : 6.7719
R2 : 0.9745
----------
Model Performance on Test: 
MAE : 6.8500
MSE : 84.8588
RMSE : 9.2119
R2 : 0.9560
***********************************
Decision Tree
Model Performance on Train: 
MAE : 0.0000
MSE : 0.0000
RMSE : 0.0000
R2 : 1.0000
----

In [30]:
model_performance = pd.DataFrame(r2_list)

In [31]:
model_performance

Unnamed: 0,0,1,2,3,4
0,Linear Regression,3.858247e-14,2.62028e-27,5.118867e-14,1.0
1,Lasso Regression,0.8764329,1.24088,1.113948,0.999357
2,Ridge Regression,0.0200899,0.0006988866,0.02643646,1.0
3,KNN Regression,6.85,84.8588,9.211884,0.956016
4,Decision Tree,3.39,30.93,5.561475,0.983968
5,Random Forest,1.58365,12.33548,3.51219,0.993606
6,XGB Regression,1.65689,10.11101,3.179781,0.994759
7,CatBoost,0.9921136,12.62252,3.552819,0.993457
8,Adaboost,4.521159,41.6123,6.45076,0.978431
