In [20]:
from sklearn.model_selection import train_test_split,cross_val_score
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error,r2_score,f1_score,accuracy_score,mean_absolute_error,root_mean_squared_error
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from xgboost import XGBRegressor
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')



In [21]:
df = pd.read_csv('data/stud.csv')
df.head()

Unnamed: 0,gender,race_ethnicity,parental_level_of_education,lunch,test_preparation_course,math_score,reading_score,writing_score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75


In [22]:
y = df["math_score"]
X = df.drop(columns=["math_score"])


In [23]:
numeric_features = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_features = X.select_dtypes(include=['object']).columns.tolist()

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(), categorical_features)
    ]
)

preprocessor.fit_transform(X)


array([[ 0.19399858,  0.39149181,  1.        , ...,  1.        ,
         0.        ,  1.        ],
       [ 1.42747598,  1.31326868,  1.        , ...,  1.        ,
         1.        ,  0.        ],
       [ 1.77010859,  1.64247471,  1.        , ...,  1.        ,
         0.        ,  1.        ],
       ...,
       [ 0.12547206, -0.20107904,  1.        , ...,  0.        ,
         1.        ,  0.        ],
       [ 0.60515772,  0.58901542,  1.        , ...,  1.        ,
         1.        ,  0.        ],
       [ 1.15336989,  1.18158627,  1.        , ...,  0.        ,
         0.        ,  1.        ]])

In [24]:
# Define models
models = {
    "Linear Regression": Pipeline([
        ('preprocessor', preprocessor),
        ('model', LinearRegression())
    ]),
    "Random Forest": Pipeline([
        ('preprocessor', preprocessor),
        ('model', RandomForestRegressor(random_state=42))
    ]),
    "XGBoost": Pipeline([
        ('preprocessor', preprocessor),
        ('model', XGBRegressor(random_state=42))
    ])
}

In [25]:
import time
# Dictionary to store results
results = {
    "Model": [],
    "Training Time (s)": [],
    "Mean CV R² Score": [],
    "Std CV R² Score": [],

    "Root Mean Squared Error": []
}

# Perform cross-validation and evaluate
for model_name, pipeline in models.items():
    print(f"Evaluating {model_name}...")

    # Measure training time (fit on training set)
    start_time = time.time()
    pipeline.fit(X, y)
    training_time = time.time() - start_time

    # Cross-validation on the full dataset
    cv_scores = cross_val_score(pipeline, X, y, cv=5, scoring='neg_mean_squared_error')
    cv_scores = np.sqrt(-cv_scores)  # Convert to RMSE
    rmse = np.mean(cv_scores)
    mean_cv_score = cv_scores.mean()
    std_cv_score = cv_scores.std()

   

    # Store results
    results["Model"].append(model_name)
    results["Training Time (s)"].append(training_time)
    results["Mean CV R² Score"].append(mean_cv_score)
    results["Std CV R² Score"].append(std_cv_score)

    results["Root Mean Squared Error"].append(rmse)

Evaluating Linear Regression...
Evaluating Random Forest...
Evaluating XGBoost...


In [27]:
results_df = pd.DataFrame(results)
results_df

Unnamed: 0,Model,Training Time (s),Mean CV R² Score,Std CV R² Score,Root Mean Squared Error
0,Linear Regression,0.007518,5.395268,0.188314,5.395268
1,Random Forest,0.25355,6.122427,0.274652,6.122427
2,XGBoost,0.057991,6.569533,0.234258,6.569533
