## 1. Environment Setup & Imports

In [None]:
import os
import random
from collections import defaultdict, Counter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import r2_score, precision_score, recall_score, f1_score
from surprise import NMF
from surprise import Dataset, Reader
from surprise import accuracy
from surprise.model_selection import GridSearchCV
from surprise.model_selection import train_test_split

# Location of the wide-format student grade CSV.
# The original absolute path stays the default so existing runs are unchanged;
# set the STUDENT_GRADE_CSV environment variable to point the notebook at a
# copy of the dataset on another machine without editing this cell.
data_path = os.environ.get(
    'STUDENT_GRADE_CSV',
    '/home/sysadmin/ResearchProject-Experiments/datasets/student_grade.csv',
)



In [None]:
# Quick look at the raw, wide-format table (low_memory=False makes pandas
# read the file in one pass instead of inferring dtypes chunk by chunk).
df = pd.read_csv(data_path, low_memory=False)

# Preview only the first rows rather than rendering the whole frame.
df.head()

## 2. Data Loading & Preprocessing

In [None]:
# Fail fast when the CSV is missing. Merely printing a message would leave
# df_long_filtered undefined (or stale from an earlier kernel run), so the
# cells that build the Surprise dataset would break later with a less
# helpful error.
if not os.path.exists(data_path):
    raise FileNotFoundError(f"Error: The file '{data_path}' was not found.")

print("File found. Loading data...")

# === 2.1 Load Data ===
df = pd.read_csv(data_path, low_memory=False)

# === 2.2 Transform Data (Wide to Long) ===
# Every column other than student_id is unpivoted into a (course, grade) row.
id_vars = ['student_id']
df_long = pd.melt(df, id_vars=id_vars, var_name='course', value_name='grade')

# === 2.3 Clean Data ===
# Coerce grades to numbers; anything unparsable becomes NaN.
df_long['grade'] = pd.to_numeric(df_long['grade'], errors='coerce')
# Keep strictly positive grades. NaN > 0.0 evaluates to False, so missing and
# unparsable grades are dropped by the same comparison.
df_long_cleaned = df_long[df_long['grade'] > 0.0].copy()

# === 2.4 Filter for 'INT' Courses Only ===
# Restrict to courses whose column name starts with 'INT' so the model only
# learns from INT courses.
is_int_course = df_long_cleaned['course'].astype(str).str.startswith('INT')
df_long_filtered = df_long_cleaned[is_int_course].copy()

print("--- Data Preparation Complete ---")
print(f"Total records after cleaning: {len(df_long_cleaned)}")
print(f"Filtered to INT courses only: {len(df_long_filtered)}")
# The record counts above give the size; show only a preview of the rows.
display(df_long_filtered.head())


## 3. Load Data and Adjust Rating Scale

In [None]:
# Surprise needs the rating bounds declared up front. Grades are assumed to
# span 1.0 to 4.0 -- TODO confirm against the cleaned data.
reader = Reader(rating_scale=(1, 4))

# Columns are handed over in (user, item, rating) order.
data = Dataset.load_from_df(
    df=df_long_filtered.loc[:, ['student_id', 'course', 'grade']],
    reader=reader,
)

## 4. Parameter Tuning

In [15]:
# Seed the global RNGs before the search so the random fold split and the
# random NMF factor initialisation are repeatable on Restart & Run All.
# (Seeding both `random` and `numpy` is the approach the Surprise FAQ
# recommends; it relies on the search running in a single process, which is
# the case here because n_jobs is left at its default.)
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

# Hyper-parameter grid for NMF: 4 x 3 x 3 x 3 = 108 combinations.
# An earlier, smaller grid that was explored:
#   n_factors [50, 70, 90, 110], n_epochs [80, 90, 100]
param_grid = {
    'n_factors': [90, 110, 120, 130],
    'n_epochs':  [70, 90, 110],
    'reg_pu':    [0.04, 0.06, 0.08],  # Regularization for user latent factors
    'reg_qi':    [0.04, 0.06, 0.08],  # Regularization for item latent factors
}

gs = GridSearchCV(
    NMF,
    param_grid,
    measures=['rmse', 'mae'],
    cv=5,              # 5-fold cross-validation -> 108 * 5 = 540 fits
    joblib_verbose=3,  # periodic progress lines from joblib
)

print("üöÄ Start GridSearchCV (NMF)...")
gs.fit(data)
print("‚úÖ GridSearchCV Finished")

üöÄ Start GridSearchCV (NMF)...


[Parallel(n_jobs=1)]: Done  31 tasks      | elapsed:   15.5s
[Parallel(n_jobs=1)]: Done 127 tasks      | elapsed:  1.3min
[Parallel(n_jobs=1)]: Done 287 tasks      | elapsed:  3.3min
[Parallel(n_jobs=1)]: Done 511 tasks      | elapsed:  6.3min


‚úÖ GridSearchCV Finished


[Parallel(n_jobs=1)]: Done 540 out of 540 | elapsed:  6.8min finished


In [16]:
# Report the winning score and hyper-parameters for each tracked measure.
# The MAE heading carries a leading newline to separate the two sections.
for score_heading, params_heading, measure in (
    ("Best RMSE score:", "Best params for RMSE:", 'rmse'),
    ("\nBest MAE score:", "Best params for MAE:", 'mae'),
):
    print(score_heading, gs.best_score[measure])
    print(params_heading)
    print(gs.best_params[measure])

Best RMSE score: 0.5542039872177911
Best params for RMSE:
{'n_factors': 120, 'n_epochs': 110, 'reg_pu': 0.04, 'reg_qi': 0.04}

Best MAE score: 0.43490108271126005
Best params for MAE:
{'n_factors': 110, 'n_epochs': 110, 'reg_pu': 0.04, 'reg_qi': 0.04}


### RMSE DataFrame

In [None]:
# 1. Turn the full grid-search results into a DataFrame so they are easy to plot.
results_df = pd.DataFrame.from_dict(gs.cv_results)

# -------------------------------------------------------
# Graph 1: Effect of Epochs (Learning Curve)
# Shows how the number of training epochs affects the error.
#   x-axis : number of epochs
#   y-axis : error (RMSE)
#   hue    : line colour, one per number of factors (model complexity)
# -------------------------------------------------------
fig, ax = plt.subplots(figsize=(10, 6))

sns.lineplot(
    data=results_df,
    x='param_n_epochs',
    y='mean_test_rmse',
    hue='param_n_factors',
    marker='o',
    palette='viridis',
    ax=ax,
)

ax.set_title('Effect of Epochs on RMSE (Learning Curve)')
ax.set_xlabel('Number of Epochs')
ax.set_ylabel('RMSE (Lower is Better)')
ax.grid(True, alpha=0.3)
plt.show()

### RMSE Heatmap

In [None]:
# -------------------------------------------------------
# Graph 2: Heatmap (reg_pu vs reg_qi)
# Locates the regularization pair with the lowest RMSE. Each cell is the
# mean of mean_test_rmse over all rows sharing that (reg_pu, reg_qi) pair.
# -------------------------------------------------------
pivot_table = results_df.pivot_table(
    index='param_reg_pu',     # vertical axis: reg_pu
    columns='param_reg_qi',   # horizontal axis: reg_qi
    values='mean_test_rmse',
    aggfunc='mean',           # pandas' default, spelled out for the reader
)

fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    pivot_table,
    annot=True,
    fmt='.4f',
    cmap='Blues_r',  # darker = lower RMSE (better)
    ax=ax,
)

ax.set_title('RMSE Heatmap (NMF): reg_pu vs reg_qi')
ax.set_xlabel('reg_qi (item regularization)')
ax.set_ylabel('reg_pu (user regularization)')
plt.show()