In [None]:
# Import necessary libraries

# General Data Science libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

# Preprocessing and Linear Regression libraries
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.metrics import mean_squared_error, mean_absolute_error, median_absolute_error, r2_score
from sklearn.preprocessing import StandardScaler, LabelEncoder, OrdinalEncoder


# Libraries for the neural network
import torch
import torch.nn as nn
from  torch.optim import SGD
from torch.optim.lr_scheduler import LinearLR

# Notebook-wide pandas configuration
# Show every column when a DataFrame is rendered (no "..." truncation)
pd.options.display.max_columns = None
# Opt in to copy-on-write semantics for all DataFrame operations below
pd.set_option("mode.copy_on_write", True)

In [None]:
# Clean and preprocess the data by removing missing values and encoding categorical variables
def preprocess_data():
    """Download the student-performance dataset and return it with numeric categoricals.

    Steps performed:
      1. Fetch the Kaggle dataset ``lainguyn123/student-performance-factors``
         through ``kagglehub`` and read ``StudentPerformanceFactors.csv``.
      2. Drop every row that has a missing value in ``Teacher_Quality``,
         ``Parental_Education_Level`` or ``Distance_from_Home``.
      3. Ordinal-encode the ordered categorical columns using the explicit
         category orders listed below (first category -> 0, last -> 2).
      4. Label-encode the two-valued categorical columns with a fresh
         ``LabelEncoder`` per column.

    Returns:
        pandas.DataFrame: the cleaned frame with the encoded columns
        overwritten in place of their string versions; all other columns
        are passed through untouched.
    """
    # Kept function-local so the notebook can be imported without kagglehub
    # unless the data is actually requested.
    import os
    import kagglehub

    # Resolve the CSV inside the downloaded dataset directory; os.path.join
    # keeps the path valid regardless of the platform's separator.
    dataset_dir = kagglehub.dataset_download("lainguyn123/student-performance-factors")
    csv_path = os.path.join(dataset_dir, "StudentPerformanceFactors.csv")
    student_performance_factors_original = pd.read_csv(csv_path)

    # Drop rows with missing values in the categorical columns that have gaps
    miss_cat_cols = ["Teacher_Quality", "Parental_Education_Level", "Distance_from_Home"]
    student_performance_factors_original = student_performance_factors_original.dropna(subset=miss_cat_cols)

    # Ordered categoricals: each column is paired with its own category order
    # so the column list and the category list can never fall out of step.
    low_medium_high = ["Low", "Medium", "High"]
    ordinal_categories = {
        "Parental_Involvement": low_medium_high,
        "Access_to_Resources": low_medium_high,
        "Motivation_Level": low_medium_high,
        "Family_Income": low_medium_high,
        "Teacher_Quality": low_medium_high,
        "Distance_from_Home": ["Near", "Moderate", "Far"],
        "Peer_Influence": ["Negative", "Neutral", "Positive"],
        "Parental_Education_Level": ["High School", "College", "Postgraduate"],
    }
    ordinal_cols = list(ordinal_categories)
    ord_enc = OrdinalEncoder(categories=[ordinal_categories[col] for col in ordinal_cols])
    student_performance_factors_original[ordinal_cols] = ord_enc.fit_transform(
        student_performance_factors_original[ordinal_cols]
    )

    # Binary categoricals: one independent LabelEncoder per column
    binary_cols = ["Extracurricular_Activities", "Internet_Access", "Learning_Disabilities", "Gender", "School_Type"]
    for col in binary_cols:
        le = LabelEncoder()
        student_performance_factors_original[col] = le.fit_transform(student_performance_factors_original[col])

    return student_performance_factors_original


In [None]:

def split_data(X, y, test_size=0.2, val_size=0.2, random_state=42):
    """Split features and targets into train, validation and test partitions.

    The split is done in two stages: ``test_size`` is carved out of the full
    data first, then ``val_size`` is carved out of what remains. ``val_size``
    is therefore a fraction of the train+validation portion, not of the whole
    data set (the defaults give roughly 64% / 16% / 20%).

    Args:
        X: feature table accepted by ``train_test_split``.
        y: targets aligned with ``X``.
        test_size: share of all rows held out for testing.
        val_size: share of the non-test rows held out for validation.
        random_state: seed passed to both splits; the default of 42 keeps
            the partitions identical to earlier runs of this notebook.

    Returns:
        tuple: ``(X_train, X_val, X_test, y_train, y_val, y_test)``.
    """
    # Stage 1: hold out the test set
    X_train_val, X_test, y_train_val, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )
    # Stage 2: split the remainder into training and validation sets
    X_train, X_val, y_train, y_val = train_test_split(
        X_train_val, y_train_val, test_size=val_size, random_state=random_state
    )
    return X_train, X_val, X_test, y_train, y_val, y_test


In [None]:

def train_and_evaluate_linear_regression(X_train, y_train, X_val, y_val, X_test, y_test):
    """Fit a LinearRegression on the training split and score it on the other two.

    Returns:
        tuple: ``(metrics, coefficients, intercept)``. ``metrics`` maps
        ``"validation"`` and ``"test"`` to a dict with the keys ``rMSE``,
        ``MAE``, ``MAD`` (median absolute error), ``Corr`` (Pearson
        correlation between targets and predictions) and ``R2``.
        ``coefficients`` and ``intercept`` are the fitted model's ``coef_``
        and ``intercept_``.
    """
    def score(y_true, y_pred):
        # Same five measures for every split, in the order they are reported
        return {
            "rMSE": np.sqrt(mean_squared_error(y_true, y_pred)),
            "MAE": mean_absolute_error(y_true, y_pred),
            "MAD": median_absolute_error(y_true, y_pred),
            "Corr": np.corrcoef(y_true, y_pred)[0, 1],
            "R2": r2_score(y_true, y_pred),
        }

    # fit() returns the estimator itself, so construction and fitting chain
    regressor = LinearRegression().fit(X_train, y_train)

    evaluation_splits = (
        ("validation", X_val, y_val),
        ("test", X_test, y_test),
    )
    metrics = {
        split_name: score(y_true, regressor.predict(X_split))
        for split_name, X_split, y_true in evaluation_splits
    }

    return (metrics, regressor.coef_, regressor.intercept_)


In [None]:
# Linear regression workflow pipeline
def linear_regression_workflow(features):
  """Run the full linear-regression pipeline and return its results.

  Loads and encodes the data, splits it, standardizes the features and
  fits/evaluates a linear regression predicting ``Exam_Score``.

  Args:
    features: list-like of column names to use as predictors, or the string
      ``"all"`` to use every column except ``Exam_Score`` (the same
      convention ``neural_network_workflow`` accepts).

  Returns:
    tuple: ``(metrics, coefficients, intercept)`` exactly as produced by
    ``train_and_evaluate_linear_regression``. The target is left unscaled,
    so the metrics are in exam-score units.
  """
  student_performance_factors_original = preprocess_data()
  # isinstance guard: comparing a list-like of names to "all" directly can
  # produce an element-wise result instead of a single bool.
  if isinstance(features, str) and features == "all":
    features = student_performance_factors_original.columns.drop("Exam_Score")
  X_train, X_val, X_test, y_train, y_val, y_test = split_data(
      student_performance_factors_original[features],
      student_performance_factors_original["Exam_Score"])
  # The scaler is fitted on the training split only and then reused, so no
  # statistics from validation/test rows reach the model.
  scaler_x = StandardScaler()
  X_train = scaler_x.fit_transform(X_train)
  X_val = scaler_x.transform(X_val)
  X_test = scaler_x.transform(X_test)
  lr_output = train_and_evaluate_linear_regression(X_train, y_train, X_val, y_val, X_test, y_test)
  return lr_output

In [None]:
# Neural network workflow pipeline
def neural_network_workflow(features, learning_rate, num_epochs):
  """Run the full neural-network pipeline and return its test metrics.

  Loads and encodes the data, splits it, standardizes both the features and
  the ``Exam_Score`` target, then trains and evaluates the network.

  Args:
    features: list-like of column names to use as predictors, or the string
      ``"all"`` to use every column except ``Exam_Score``.
    learning_rate: step size forwarded to ``train_evaluate_neural_network``.
    num_epochs: number of training epochs forwarded to the same function.

  Returns:
    dict: the metrics dictionary returned by
    ``train_evaluate_neural_network``.
  """
  student_performance_factors_original = preprocess_data()
  # Only a plain string can be the "all" sentinel. Without the isinstance
  # check, passing a pandas Index or NumPy array of names would make the
  # comparison element-wise and the `if` would raise on its ambiguous truth
  # value.
  if isinstance(features, str) and features == "all":
    features = student_performance_factors_original.columns.drop("Exam_Score")
  X = student_performance_factors_original[features]
  # Column vector so the target can go through StandardScaler
  y = student_performance_factors_original["Exam_Score"].values.reshape(-1, 1)
  X_train, X_val, X_test, y_train, y_val, y_test = split_data(X, y)
  # Standardize the data for the neural network; both scalers are fitted on
  # the training split only and then applied to validation and test.
  scaler_x = StandardScaler()
  scaler_y = StandardScaler()
  X_train = scaler_x.fit_transform(X_train)
  X_val = scaler_x.transform(X_val)
  X_test = scaler_x.transform(X_test)
  y_train = scaler_y.fit_transform(y_train).flatten()
  y_val = scaler_y.transform(y_val).flatten()
  y_test = scaler_y.transform(y_test).flatten()
  # Toggle whether to find the learning rate or run the actual neural network algorithm by commenting out one of the below lines
  # find_learning_rate(X_train, y_train)
  nn_metrics = train_evaluate_neural_network(X_train, y_train, X_val, y_val, X_test, y_test, scaler_y, learning_rate, num_epochs)
  return nn_metrics

In [None]:
# Predictors kept by the stepwise variable selection carried out earlier
features = [
    'Hours_Studied', 'Attendance', 'Parental_Involvement',
    'Access_to_Resources', 'Previous_Scores', 'Tutoring_Sessions',
    'Family_Income', 'Peer_Influence', 'Parental_Education_Level',
]

# Fit the linear regression on those predictors, then report its test
# metrics, one coefficient per feature, and the intercept
print("LR Metrics:")
lr_data = linear_regression_workflow(features)
lr_metrics, coeffs, intercept = lr_data
print(lr_metrics["test"])
for i, feature_name in enumerate(features):
    print(f"{feature_name}: {coeffs[i]}")
print(f"Intercept: {intercept}")

# Train the neural network on every available feature and report its test
# metrics. (Restricting it to the 9 features above was tried and performed
# worse, so all features are used.)
# NOTE(review): neural_network_workflow calls train_evaluate_neural_network,
# which is defined in a later cell of this notebook; on a fresh kernel that
# cell has to be executed before this one.
print("NN Metrics:")
nn_metrics = neural_network_workflow("all", learning_rate=0.13804, num_epochs=100)
print(nn_metrics["test"])
# neural_network_workflow("all", learning_rate=0.00055, num_epochs=100)

LR Metrics:
{'rMSE': 2.1611559733001457, 'MAE': 0.8422462227654068, 'MAD': 0.5769952035716202, 'Corr': 0.8363517293644817, 'R2': 0.6994338463965144}
Hours_Studied: 1.7701579861122858
Attendance: 2.279441638910965
Parental_Involvement: 0.6970962855836211
Access_to_Resources: 0.7007387730024464
Previous_Scores: 0.705267771248755
Tutoring_Sessions: 0.6159218904227751
Family_Income: 0.4138116742517593
Peer_Influence: 0.3865435634220288
Parental_Education_Level: 0.3928291806312596
Intercept: 67.22788532222495
NN Metrics:
{'rMSE': 2.145385760251829, 'MAE': 0.6705396945199996, 'MAD': 0.3925056457519531, 'Corr': 0.840460670230925, 'R2': 0.7038043760492817}


In [None]:

def train_evaluate_neural_network(x_train, y_train, X_val, y_val, X_test, y_test, scaler_y, learning_rate, num_epochs, seed=None):
  """Train a small fully connected regressor with mini-batch SGD and score it on the test split.

  The network is ``Linear(input, 64) -> ReLU -> Linear(64, 32) -> ReLU ->
  Linear(32, 1)`` trained with mean-squared-error loss in batches of 32.
  Test predictions and targets are mapped back through
  ``scaler_y.inverse_transform`` before the metrics are computed, so the
  numbers are on the original target scale.

  Args:
    x_train: 2-D array-like of training features.
    y_train: training targets, either 1-D or a single column; expected to be
      on the scale produced by ``scaler_y``.
    X_val: accepted for signature compatibility; not used by this function.
    y_val: accepted for signature compatibility; not used by this function.
    X_test: 2-D array-like of test features.
    y_test: test targets on the same scale as ``y_train``.
    scaler_y: fitted object exposing ``inverse_transform`` for the target.
    learning_rate: SGD step size.
    num_epochs: number of passes over the training data.
    seed: optional integer. When given, ``torch.manual_seed(seed)`` is called
      (this reseeds PyTorch's global generator) and batch shuffling uses a
      dedicated NumPy generator, making weight initialisation and batch
      order repeatable. When ``None`` (default) the global RNG state is
      used, as before.

  Returns:
    dict: ``{"test": {"rMSE", "MAE", "MAD", "Corr", "R2"}}``.
  """
  if seed is not None:
    torch.manual_seed(seed)
    shuffler = np.random.default_rng(seed)
  else:
    # Fall back to NumPy's global RNG; the module exposes permutation() too
    shuffler = np.random

  # Convert once up front instead of rebuilding tensors for every batch.
  # reshape(-1, 1) gives a (n, 1) target for both 1-D and column inputs, so
  # the loss never silently broadcasts (n, 1) against (n,).
  features_tensor = torch.as_tensor(np.asarray(x_train, dtype=np.float32))
  targets_tensor = torch.as_tensor(np.asarray(y_train, dtype=np.float32).reshape(-1, 1))
  n_samples = features_tensor.shape[0]

  input_size = features_tensor.shape[1]
  output_size = 1

  model = nn.Sequential(
      nn.Linear(input_size, 64),
      nn.ReLU(),
      nn.Linear(64, 32),
      nn.ReLU(),
      nn.Linear(32, output_size)
  )
  # Variants explored while tuning and left out of the active code path:
  # BatchNorm1d after each hidden ReLU, Dropout(0.5) after each hidden ReLU,
  # a deeper 128-64-32 hidden stack, SGD with weight_decay=1e-4, and Adam.

  optimizer = SGD(model.parameters(), lr=learning_rate)
  loss_fn = nn.MSELoss()
  b_size = 32

  for _ in range(num_epochs):
      # Fresh random visiting order each epoch
      order = torch.as_tensor(shuffler.permutation(n_samples), dtype=torch.long)
      # Stepping by b_size up to n_samples includes the final, possibly
      # shorter, batch: no rows are skipped and a training set smaller than
      # one batch still gets updates.
      for start_idx in range(0, n_samples, b_size):
          batch_idx = order[start_idx:start_idx + b_size]

          pred = model(features_tensor[batch_idx])
          loss = loss_fn(pred, targets_tensor[batch_idx])

          optimizer.zero_grad()
          loss.backward()
          optimizer.step()

  # Evaluation on the test set
  model.eval() # Set model to evaluation mode
  x_test_tensor = torch.as_tensor(np.asarray(X_test, dtype=np.float32))
  with torch.no_grad():
      y_test_pred = model(x_test_tensor).numpy()

  # Unscale predictions and true values so the metrics are on the same
  # scale as the linear regression's and the two models can be compared
  y_test_pred_original = scaler_y.inverse_transform(y_test_pred).flatten()
  y_test_original = scaler_y.inverse_transform(np.asarray(y_test).reshape(-1, 1)).flatten()

  # Calculate metrics on the test set
  rmse_test = np.sqrt(mean_squared_error(y_test_original, y_test_pred_original))
  mae_test = mean_absolute_error(y_test_original, y_test_pred_original)
  mad_test = median_absolute_error(y_test_original, y_test_pred_original)
  r2_test = r2_score(y_test_original, y_test_pred_original)
  corr_test = np.corrcoef(y_test_original, y_test_pred_original)[0, 1]  # Pearson correlation

  # Metrics
  metrics = {
      "test": {
          "rMSE": rmse_test,
          "MAE": mae_test,
          "MAD": mad_test,
          "Corr": corr_test,
          "R2": r2_test,
      }
  }
  return metrics

In [None]:
def find_learning_rate(x_train, y_train, initial_lr=1e-5, final_lr=10.0, num_iterations=100, batch_size=32):
  """Run a learning-rate range test and plot loss against learning rate.

  A fresh ``64 -> 32 -> 1`` ReLU network is trained with SGD for
  ``num_iterations`` mini-batches. The learning rate starts at
  ``initial_lr`` and is multiplied by a constant factor after every step;
  the loss and the rate used for each step are recorded.

  The factor is ``(final_lr / initial_lr) ** (1 / num_iterations)``. Because
  the rate is increased *after* each recorded step, the last rate actually
  evaluated is ``final_lr / factor`` rather than ``final_lr`` itself.

  Args:
    x_train: 2-D array of training features.
    y_train: training targets, 1-D or a single column.
    initial_lr: learning rate of the first step; must be positive.
    final_lr: upper end of the sweep; must be positive.
    num_iterations: number of mini-batch steps to run; must be at least 1.
    batch_size: mini-batch size used by the data loader.

  Returns:
    tuple: ``(lrs, losses)``, two lists of length ``num_iterations``.

  Raises:
    ValueError: if a learning rate is not positive, ``num_iterations`` is
      below 1, or the training data is empty.

  Side effects:
    Prints the multiplier and a progress line every 10 iterations, and shows
    a Plotly figure with a log-scaled x axis and the y axis limited to
    [0, 10].
  """
  from torch.utils.data import TensorDataset, DataLoader

  if initial_lr <= 0 or final_lr <= 0:
    raise ValueError("initial_lr and final_lr must be positive")
  if num_iterations < 1:
    raise ValueError("num_iterations must be at least 1")

  input_size = x_train.shape[1]
  output_size = 1

  # Learning-rate search as covered in lecture: start from a small value and
  # multiply it by a fixed factor after every step.
  model = nn.Sequential(
      nn.Linear(input_size, 64),
      nn.ReLU(),
      nn.Linear(64, 32),
      nn.ReLU(),
      nn.Linear(32, output_size)
  )
  # Variants explored while tuning and left out of the active code path:
  # BatchNorm1d after each hidden ReLU, Dropout(0.5) after each hidden ReLU,
  # a deeper 128-64-32 hidden stack, SGD with weight_decay=1e-4, and Adam.

  optimizer = SGD(model.parameters(), lr=initial_lr)
  loss_fn = nn.MSELoss()

  lr_multiplier = (final_lr / initial_lr) ** (1 / num_iterations)
  print(f"Learning Rate Multiplier per Iteration: {lr_multiplier:.5f}")

  # reshape(-1, 1) yields an (n, 1) target whether y_train is flat or a column
  train_dataset = TensorDataset(
      torch.tensor(x_train, dtype=torch.float32),
      torch.tensor(y_train, dtype=torch.float32).reshape(-1, 1),
  )
  if len(train_dataset) == 0:
    # Otherwise the loop below would fail with a bare StopIteration
    raise ValueError("x_train must contain at least one sample")
  train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

  lrs = []
  losses = []

  train_iter = iter(train_loader)

  for iteration in range(1, num_iterations + 1):
      try:
          x_batch, y_batch = next(train_iter)
      except StopIteration:
          # Loader exhausted before the sweep finished: start a new pass
          train_iter = iter(train_loader)
          x_batch, y_batch = next(train_iter)

      pred = model(x_batch)
      loss = loss_fn(pred, y_batch)

      optimizer.zero_grad()
      loss.backward()
      optimizer.step()

      # Record the rate that was used for the step just taken
      current_lr = optimizer.param_groups[0]['lr']
      lrs.append(current_lr)
      losses.append(loss.item())

      if iteration % 10 == 0:
          print(f"Iteration {iteration}, Loss: {loss.item():.4f}, Learning Rate: {current_lr:.5f}")

      # Raise the rate geometrically for the next step
      new_lr = current_lr * lr_multiplier
      for param_group in optimizer.param_groups:
          param_group['lr'] = new_lr

  lr_loss_df = pd.DataFrame({"Learning Rate": lrs, "Loss": losses})

  # Plot learning rate vs. loss
  fig = px.line(lr_loss_df, x="Learning Rate", y="Loss", title="Learning Rate Finder")
  fig.update_xaxes(type="log", title="Learning Rate (log scale)")
  fig.update_yaxes(title="Loss", range=[0, 10])  # Restrict y-axis to [0, 10]
  fig.show()

  return lrs, losses
