In [29]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import mean_squared_error,r2_score,mean_absolute_error
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression,Ridge,Lasso
from sklearn.model_selection import RandomizedSearchCV
from xgboost import XGBRegressor
import warnings
warnings.filterwarnings("ignore")


In [30]:
df = pd.read_csv("data/stud.csv")

In [31]:
df.head()

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75


In [32]:
X = df.drop(columns=["math score"],axis=1)
y = df["math score"]

In [33]:
num_features = X.select_dtypes(exclude="object").columns
cat_features = X.select_dtypes(include="object").columns

from sklearn.preprocessing import OneHotEncoder,StandardScaler
from sklearn.compose import ColumnTransformer

num_transformer = StandardScaler()
ohe = OneHotEncoder()

preprocessor = ColumnTransformer(
    [
        ("OneHotEncode",ohe,cat_features),
        ("StandardScaler", num_transformer,num_features)
    ]
)

In [34]:
X = preprocessor.fit_transform(X)

In [35]:
X.shape

(1000, 19)

In [36]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=42,shuffle=True)
X_train.shape,X_test.shape

((800, 19), (200, 19))

In [37]:
def eval_model(y,y_pred):
    mae = mean_absolute_error(y,y_pred)
    mse = mean_squared_error(y,y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y,y_pred)
    return mae,rmse,r2

In [38]:
models = {
    "Linear Regression": LinearRegression(),
    "Lasso": Lasso(),
    "Ridge": Ridge(),
    "Knn Regressor": KNeighborsRegressor(),
    "Random Forest Regressor": RandomForestRegressor(),
    "Decision Tree Regressor": DecisionTreeRegressor(),
    "XGB Regressor": XGBRegressor(),
    "AdaBoost Regressor": AdaBoostRegressor()
}

model_list = []
r2_list = []

for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(X_train,y_train)

    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    train_mae,train_rmse,train_r2 = eval_model(y_train,y_train_pred)
    test_mae,test_rmse,test_r2 = eval_model(y_test,y_test_pred)

    print(list(models.keys())[i])
    model_list.append(list(models.keys())[i])

    print("Model Performance for Training Set")
    print(f"Root mean squared error: {train_rmse:.3f}")
    print(f"Mean Absolute Error error: {train_mae:.3f}")
    print(f"R^2: {train_r2:.3f}")

    print("------------------------------")

    print("Model Performance for Test Set")
    print(f"Root mean squared error: {test_rmse:.3f}")
    print(f"Mean Absolute Error error: {test_mae:.3f}")
    print(f"R^2: {test_r2:.3f}")
    r2_list.append(test_r2)



Linear Regression
Model Performance for Training Set
Root mean squared error: 5.327
Mean Absolute Error error: 4.268
R^2: 0.874
------------------------------
Model Performance for Test Set
Root mean squared error: 5.420
Mean Absolute Error error: 4.226
R^2: 0.879
Lasso
Model Performance for Training Set
Root mean squared error: 6.594
Mean Absolute Error error: 5.206
R^2: 0.807
------------------------------
Model Performance for Test Set
Root mean squared error: 6.520
Mean Absolute Error error: 5.158
R^2: 0.825
Ridge
Model Performance for Training Set
Root mean squared error: 5.323
Mean Absolute Error error: 4.265
R^2: 0.874
------------------------------
Model Performance for Test Set
Root mean squared error: 5.390
Mean Absolute Error error: 4.211
R^2: 0.881
Knn Regressor
Model Performance for Training Set
Root mean squared error: 5.708
Mean Absolute Error error: 4.517
R^2: 0.855
------------------------------
Model Performance for Test Set
Root mean squared error: 7.253
Mean Absolut

Random Forest Regressor
Model Performance for Training Set
Root mean squared error: 2.324
Mean Absolute Error error: 1.848
R^2: 0.976
------------------------------
Model Performance for Test Set
Root mean squared error: 5.955
Mean Absolute Error error: 4.568
R^2: 0.854
Decision Tree Regressor
Model Performance for Training Set
Root mean squared error: 0.280
Mean Absolute Error error: 0.019
R^2: 1.000
------------------------------
Model Performance for Test Set
Root mean squared error: 7.799
Mean Absolute Error error: 6.160
R^2: 0.750
XGB Regressor
Model Performance for Training Set
Root mean squared error: 1.007
Mean Absolute Error error: 0.687
R^2: 0.995
------------------------------
Model Performance for Test Set
Root mean squared error: 6.473
Mean Absolute Error error: 5.058
R^2: 0.828
AdaBoost Regressor
Model Performance for Training Set
Root mean squared error: 5.770
Mean Absolute Error error: 4.691
R^2: 0.852
------------------------------
Model Performance for Test Set
Root m

In [39]:
pd.DataFrame(list(zip(model_list,r2_list)),columns=["Model Name","R^2 Score"]).sort_values(by=["R^2 Score"],ascending=False)

Unnamed: 0,Model Name,R^2 Score
2,Ridge,0.880593
0,Linear Regression,0.87926
4,Random Forest Regressor,0.854291
7,AdaBoost Regressor,0.848553
6,XGB Regressor,0.827797
1,Lasso,0.82532
3,Knn Regressor,0.783813
5,Decision Tree Regressor,0.75006


In [40]:
lin_model = LinearRegression()
lin_model.fit(X,y)
y_pred = lin_model.predict(X_test)
score = r2_score(y_test,y_pred)
print(f"R^2 value is {score:.4f}")

R^2 value is 0.8867
