In [19]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import MinMaxScaler
import pandas as pd
import numpy as np
import warnings

# Ignore warnings
warnings.filterwarnings("ignore")

from sklearn.model_selection import KFold
from sklearn.linear_model import Ridge
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder as ore

In [20]:
df = pd.read_csv("cleaned_df.csv")

In [21]:

X = df.drop(columns=["G3"])
y = df['G3']


# Model Building

## First we will define our features and our target

In [22]:
df = pd.read_csv("cleaned_df.csv")

In [23]:
pipe=Pipeline([('ore',ore()),('Normalizer',Normalizer()),('Ridge',Ridge())])


In [24]:
pipe.fit(X,y)
y_pred_norm=pipe.predict(X)
y_pred=denormalize(y_pred_norm)
mse = mean_squared_error(y, y_pred)
print("MSE Score: ",mse)
rs=r2_score(y,y_pred)
print("R^2 Score: ",rs)


NameError: name 'denormalize' is not defined

Here we tried Ridge regression with cross 

In [25]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold, cross_val_score
import numpy as np
alphas = [0,0.1, 1.0, 10.0]
mean_cv_scores = []
for alpha in alphas:
    # Define the pipeline with Ridge Regression
    pipeline = Pipeline([
        ('ore',ore()),
        ('scaler', StandardScaler()),
        ('ridge', Ridge(alpha=alpha))
    ])
        # Define K-Fold Cross-Validation
    kfold = KFold(n_splits=5, shuffle=True, random_state=42)

    # Perform K-Fold Cross-Validation
    cv_scores = cross_val_score(pipeline, X, y, cv=kfold)
    mean_cv_score = np.mean(cv_scores)
    mean_cv_scores.append(mean_cv_score)
best_alpha = alphas[np.argmax(mean_cv_scores)]
pipe=Pipeline([('ore',ore()),('Normalizer',Normalizer()),('Ridge',Ridge(alpha=best_alpha))])
pipe.fit(X,y)
y_pred_norm=pipe.predict(X)
y_pred=denormalize(y_pred_norm)
mse = mean_squared_error(y, y_pred)
print("MSE Score: ",mse)
rs=r2_score(y,y_pred)
print("R^2 Score: ",rs)


NameError: name 'denormalize' is not defined

Here we tried only linear regression with the feature we made from previous grades

In [55]:
X=df[['avg_Grade']]
y=df['G3']
x_train, x_test, y_train, y_test = train_test_split(X, y ,random_state = 104, train_size=0.8, shuffle=True) 

model=LinearRegression()
model.fit(x_train,y_train)
y_pred=model.predict(x_test)
# Calculate mean and standard deviation of MSE and R2
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print("Mean MSE: {:.2f}".format(mse))
print("Mean R^2: {:.2f}".format(r2))


y_pred_train = model.predict(x_train)
train_r2 = r2_score(y_train, y_pred_train)
print("Train R^2 Score:", train_r2)

# Evaluate the best model on the test set
y_pred_test = model.predict(x_test)
test_r2 = r2_score(y_test, y_pred_test)
print("Test R^2 Score:", test_r2)

Mean MSE: 0.93
Mean R^2: 0.88
Train R^2 Score: 0.8775947730876035
Test R^2 Score: 0.8791414363302177


Trying Linear Regression with encoding categorical variables

In [None]:
l = []
cat = []

for i in df:
    if (type(df[i][0]) != str):
        l.append(i)
    else:
        cat.append(i)
        
l.remove("G3")
x = df[l].copy()
cat = df[cat].copy() # cat is df

for i in cat:
    unique = cat[i].unique()
    uniqueDict = dict()
      
    for c in range(len(unique)):
        uniqueDict[unique[c]] = c
        
    cat[i] = cat[i].apply(lambda j: uniqueDict[j])
        
    # for j in cat[i].keys():
    #     cat.loc[j, i] = uniqueDict[cat[i][j]]
        
    cat[i] = cat[i].astype("int64")
    x[i] = cat[i].copy()

for i in x:
    u = np.mean(x[i])
    sigma = np.std(x[i])
    x[i] = (x[i] - u) / sigma

y = df["G3"].copy()

for i in y.keys():
    u = np.mean(y)
    sigma = np.std(y)
    y[i] = (y[i] - u) / sigma

from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y ,random_state = 104, train_size=0.8, shuffle=True) 

regr = LinearRegression(fit_intercept = True)
regr.fit(x_train, y_train)
y_pred = regr.predict(x_test)

print("MSE Score: ",mean_squared_error(y_test, y_pred))

y_pred_train = regr.predict(x_train)
train_r2 = r2_score(y_train, y_pred_train)
print("Train R^2 Score:", train_r2)

# Evaluate the best model on the test set
y_pred_test = regr.predict(x_test)
test_r2 = r2_score(y_test, y_pred_test)
print("Test R^2 Score:", test_r2)


MSE Score:  0.47910931388044553
Train R^2 Score: 0.7088781113732967
Test R^2 Score: 0.7548654187190122


Here we selected the columns with the highest correlation with the target and the rejected column from our hypothesis testing

In [None]:
from sklearn.preprocessing import LabelEncoder
df_copy = df.copy()
binary_encoder = LabelEncoder()
# These are the columns affecting the target from our hypothesis testing
columns = ['school', 'sex', 'address', 'schoolsup', 'higher', 'internet']
for col in columns:
    df_copy[f'{col}2'] = binary_encoder.fit_transform(df[f'{col}'])
df_copy.head()

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,health,absences,G3,avg_Grade,school2,sex2,address2,schoolsup2,higher2,internet2
0,GP,F,18,U,GT3,A,4,4,at_home,teacher,...,3,4,11,5.5,0,0,1,1,1,0
1,GP,F,17,U,GT3,T,1,1,at_home,other,...,3,2,11,10.0,0,0,1,0,1,1
2,GP,F,15,U,LE3,T,1,1,at_home,other,...,3,6,12,12.5,0,0,1,1,1,1
3,GP,F,15,U,GT3,T,4,2,health,services,...,5,0,14,14.0,0,0,1,0,1,1
4,GP,F,16,U,GT3,T,3,3,other,other,...,5,0,13,12.0,0,0,1,0,1,0


In [None]:
columns2=[f"{c}2"for c in columns]
# These features have strong correlation with our targeet variable
columns2.extend(['avg_Grade',"Dalc","failures","Medu","studytime"])
columns2

['school2',
 'sex2',
 'address2',
 'schoolsup2',
 'higher2',
 'internet2',
 'avg_Grade',
 'Dalc',
 'goout']

Applying Grid search to test different parameters for our ridge model

In [None]:
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error

X = df_copy[columns2]
y = df_copy['G3']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = Ridge()

param_grid = {
    'alpha': [0.1, 1.0, 10.0],
    'solver': ['auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga']
}

grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(X_train, y_train)

print("Best Parameters:", grid_search.best_params_)
print("Best MSE Score:", -grid_search.best_score_)

best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
test_mse = mean_squared_error(y_test, y_pred)
print("Test MSE:", test_mse)

y_pred_train = best_model.predict(X_train)
train_r2 = r2_score(y_train, y_pred_train)
print("Train R^2 Score:", train_r2)

# Evaluate the best model on the test set
y_pred_test = best_model.predict(X_test)
test_r2 = r2_score(y_test, y_pred_test)
print("Test R^2 Score:", test_r2)



Best Parameters: {'alpha': 10.0, 'solver': 'auto'}
Best MSE Score: 0.7923526342395653
Test MSE: 0.9845253001586696
Train R^2 Score: 0.8847534825386126
Test R^2 Score: 0.8691424987541134


Trying KNeighborsRegressor with Grid search to find the best parameters

In [None]:
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error,r2_score

# Assuming df_copy contains your DataFrame and columns2 are the features
X = df_copy[columns2]
y = df_copy['G3']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create KNN Regressor model
model = KNeighborsRegressor()

# Define hyperparameters to search
param_grid = {
    'n_neighbors': [3, 5, 7, 9, 11],
    'weights': ['uniform', 'distance'],
    'p': [1, 2]  # 1 for Manhattan distance, 2 for Euclidean distance
}

# Perform GridSearchCV
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(X_train, y_train)

# Print the best parameters and best MSE score
print("Best Parameters:", grid_search.best_params_)
print("Best MSE Score:", -grid_search.best_score_)

# Evaluate the best model on the test set
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
test_mse = mean_squared_error(y_test, y_pred)
print("Test MSE:", test_mse)


Best Parameters: {'n_neighbors': 11, 'p': 2, 'weights': 'uniform'}
Best MSE Score: 0.8918353063807609
Test MSE: 0.9658757664622767


In [None]:
y_pred_train = best_model.predict(X_train)
train_r2 = r2_score(y_train, y_pred_train)
print("Train R^2 Score:", train_r2)

# Evaluate the best model on the test set
y_pred_test = best_model.predict(X_test)
test_r2 = r2_score(y_test, y_pred_test)
print("Test R^2 Score:", test_r2)


Train R^2 Score: 0.8868521619706006
Test R^2 Score: 0.8716212886628314


In [26]:
pip install xgboost




In [27]:
numerical_columns = df.select_dtypes(include=['int64','Float64'])


In [28]:
X = numerical_columns.drop(columns=["G3"])
y = numerical_columns["G3"]

In [35]:
X = df_copy[columns2]
y = df_copy['G3']

# Split the data into training and testing sets
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


Trying Ensemblle Learning boosting techniqe called Xgboost with hyperparameter tuning and Kfold cross validation

In [36]:
import numpy as np
import xgboost as xgb
from sklearn.model_selection import RandomizedSearchCV, KFold
from sklearn.metrics import mean_squared_error


# Define the XGBoost regressor
model = xgb.XGBRegressor()

# Define hyperparameter grid
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 4, 5],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.6, 0.7, 0.8, 0.9],
    'colsample_bytree': [0.6, 0.7, 0.8, 0.9],
    'reg_alpha': [0, 0.1, 0.5, 1],
    'reg_lambda': [0, 0.1, 0.5, 1]
}

# Perform cross-validation with hyperparameter tuning
kfold = KFold(n_splits=5, shuffle=True, random_state=42)
random_search = RandomizedSearchCV(model, param_distributions=param_grid, n_iter=20, scoring='neg_mean_squared_error', cv=kfold, verbose=1, random_state=42)

# Fit the model
random_search.fit(X, y)

# Print the best hyperparameters found
print("Best hyperparameters:", random_search.best_params_)

# Print the mean squared error of the best estimator
print("Best MSE:", -random_search.best_score_)

# You can also access the best estimator directly
best_model = random_search.best_estimator_


Fitting 5 folds for each of 20 candidates, totalling 100 fits
Best hyperparameters: {'subsample': 0.9, 'reg_lambda': 0.1, 'reg_alpha': 0.1, 'n_estimators': 200, 'max_depth': 3, 'learning_rate': 0.1, 'colsample_bytree': 0.7}
Best MSE: 0.8835028667248265


Using the parameters from our best model

In [41]:
import numpy as np
import xgboost as xgb
from sklearn.metrics import mean_squared_error, r2_score

# Initialize XGBoost Regressor
model = xgb.XGBRegressor(
   n_estimators=100,  
   max_depth=5,       
   learning_rate=0.1, 
   subsample=0.8,     
   colsample_bytree=0.8, 
   random_state=42    
)

# Use a list to store scores
mse_scores = []
r2_scores = []

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Calculate mean and standard deviation of MSE and R2
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print("Mean MSE: {:.2f}".format(mse))
print("Mean R^2: {:.2f}".format(r2))


y_pred_train = model.predict(X_train)
train_r2 = r2_score(y_train, y_pred_train)
print("Train R^2 Score:", train_r2)

# Evaluate the best model on the test set
y_pred_test = model.predict(X_test)
test_r2 = r2_score(y_test, y_pred_test)
print("Test R^2 Score:", test_r2)


Mean MSE: 0.87
Mean R^2: 0.88
Train R^2 Score: 0.9519134836436799
Test R^2 Score: 0.8848766217407855


The final model appears to be overfitting the training data

In [52]:
import numpy as np
import xgboost as xgb
from sklearn.metrics import mean_squared_error, r2_score

# Initialize XGBoost Regressor
model = xgb.XGBRegressor(
   n_estimators=100,  
   max_depth=3,       
   learning_rate=0.1, 
   subsample=0.9,    
   reg_alpha=5, 
   reg_lambda=5,
   colsample_bytree=0.7, 
   random_state=42    
)

# Use a list to store scores
mse_scores = []
r2_scores = []

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Calculate mean and standard deviation of MSE and R2
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print("Mean MSE: {:.2f}".format(mse))
print("Mean R^2: {:.2f}".format(r2))


y_pred_train = model.predict(X_train)
train_r2 = r2_score(y_train, y_pred_train)
print("Train R^2 Score:", train_r2)

# Evaluate the best model on the test set
y_pred_test = model.predict(X_test)
test_r2 = r2_score(y_test, y_pred_test)
print("Test R^2 Score:", test_r2)


Mean MSE: 0.86
Mean R^2: 0.89
Train R^2 Score: 0.8956720369667234
Test R^2 Score: 0.8852107687449512


After  adding reguralization the overfitting is much less