## 1. Environment Setup & Imports

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

from surprise import NMF
from surprise import Dataset, Reader
from surprise.model_selection import train_test_split
from surprise import accuracy
from surprise.model_selection import GridSearchCV
from sklearn.metrics import r2_score, precision_score, recall_score, f1_score
import seaborn as sns
from sklearn.metrics import confusion_matrix, classification_report

# Relative path of the student grade CSV that the loading cells read
data_path = "../datasets/student_grade.csv"

In [None]:
# Read the raw CSV and leave the frame as the last expression so the
# notebook renders it as the cell output.
df = pd.read_csv(filepath_or_buffer=data_path, low_memory=False)
df

## 2. Data Loading & Preprocessing

In [None]:
# === 2.1 Load Data ===
# Read the grade table from data_path
df = pd.read_csv(filepath_or_buffer=data_path, low_memory=False)

# === 2.2 Transform Data (Wide to Long) ===
# Every column other than student_id is unpivoted into a (course, grade) row
id_vars = ['student_id']
df_long = df.melt(id_vars=id_vars, var_name='course', value_name='grade')

# === 2.3 Clean Data ===
# Entries that cannot be parsed as numbers become NaN; only rows whose grade
# is present and strictly greater than zero are kept.
df_long['grade'] = pd.to_numeric(df_long['grade'], errors='coerce')
df_long_cleaned = df_long.loc[
    df_long['grade'].notna() & df_long['grade'].gt(0.0)
].copy()

# === 2.4 Filter for 'INT' Courses Only ===
# Keep only rows whose course name starts with 'INT', so the model is
# trained on INT courses alone.
is_int_course = df_long_cleaned['course'].astype(str).str.startswith('INT')
df_long_filtered = df_long_cleaned.loc[is_int_course].copy()

print("--- Data Preparation Complete ---")
print(f"Total records after cleaning: {df_long_cleaned.shape[0]}")
print(f"Filtered to INT courses only: {df_long_filtered.shape[0]}")
display(df_long_filtered)


## 3. Split Data into Train and Test Sets


In [None]:
# === 3.1 Load Data into Surprise Dataset ===
# The rating scale is declared as 1 to 4
# (assumes every remaining grade lies in that range — TODO confirm).
reader = Reader(rating_scale=(1, 4))
rating_columns = ['student_id', 'course', 'grade']
data = Dataset.load_from_df(df_long_filtered[rating_columns], reader)

# === 3.2 Split Data ===
# Hold out 30% of the ratings for testing; the fixed random_state keeps the
# split the same between runs.
trainset, testset = train_test_split(data, test_size=0.30, random_state=42)

## 4. Model Training (NMF)

In [None]:
# Fit a Surprise NMF (non-negative matrix factorization) model on the
# training split. The banner names NMF because that is the algorithm
# constructed and fitted here.
print("--- Training NMF Model ---")
model = NMF(
    n_factors=130,
    n_epochs=90,
    reg_pu=0.04,   # regularization term for the user factors
    reg_qi=0.08,   # regularization term for the item (course) factors
    random_state=42,
)
# Alternative configuration kept for reference:
# n_epochs=110, reg_qi=0.04 (all other settings as above).
model.fit(trainset)
print("Training complete.")



## 5. Test and Evaluate the Model

In [None]:
# === 5.1 Evaluate Performance ===
print("\n--- Model Evaluation ---")
# Score every held-out (student, course, grade) triple with the fitted model
predictions = model.test(testset)
# With verbose on, each helper prints its metric as well as returning it
rmse = accuracy.rmse(predictions, verbose=True)
mae = accuracy.mae(predictions, verbose=True)

## 6. Confusion Matrix and Classification Report (8 Grades)

In [None]:
# Mapping: 8 Grades (grade point -> letter grade)
score_to_letter_8 = dict(zip(
    (0.0, 1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 4.0),
    ('F', 'D', 'D+', 'C', 'C+', 'B', 'B+', 'A'),
))
# Grade points in ascending order, and their letters in the same order
valid_scores_8 = sorted(score_to_letter_8)
valid_labels_8 = list(map(score_to_letter_8.get, valid_scores_8))


def get_nearest_grade_8(score):
    """Return the grade point in ``valid_scores_8`` closest to ``score``.

    When two grade points are equally close, the lower one is returned,
    because ``min`` keeps the first minimal element of the ascending list.
    """
    def gap(grade_point):
        return abs(grade_point - score)

    return min(valid_scores_8, key=gap)

# Snap each actual (r_ui) and estimated (est) rating to its nearest grade
# point and translate that point to a letter.
y_true_8 = [
    score_to_letter_8[get_nearest_grade_8(pred.r_ui)] for pred in predictions
]
y_pred_8 = [
    score_to_letter_8[get_nearest_grade_8(pred.est)] for pred in predictions
]

# Plot Heatmap 8 Grades
# Passing labels fixes the row/column order to valid_labels_8
plt.figure(figsize=(10, 8))
cm_8 = confusion_matrix(y_true_8, y_pred_8, labels=valid_labels_8)
sns.heatmap(
    cm_8,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=valid_labels_8,
    yticklabels=valid_labels_8,
)
plt.title('Confusion Matrix: 8 Grades')
plt.xlabel('Predicted Grade')
plt.ylabel('Actual Grade')

print("--- 8-Grade Classification Report ---")
report_8 = classification_report(
    y_true_8,
    y_pred_8,
    labels=valid_labels_8,
    target_names=valid_labels_8,
    zero_division=0,
)
print(report_8)

## 6.1 Confusion Matrix and Classification Report (5 Grades)

In [None]:
# Mapping: 5 Grades (each "plus" grade point shares the letter of the
# plain grade below it)
score_to_letter_5 = {
    0.0: 'F',
    **dict.fromkeys((1.0, 1.5), 'D'),
    **dict.fromkeys((2.0, 2.5), 'C'),
    **dict.fromkeys((3.0, 3.5), 'B'),
    4.0: 'A',
}
valid_labels_5 = ['F', 'D', 'C', 'B', 'A']
# Full list of possible grade points. The lookups below snap ratings via
# get_nearest_grade_8, so this list is not referenced in this cell.
valid_scores_all = sorted(score_to_letter_5)

# Snap each rating to the nearest of the 8 grade points, then collapse it
# to one of the 5 letters.
y_true_5 = [
    score_to_letter_5[get_nearest_grade_8(pred.r_ui)] for pred in predictions
]
y_pred_5 = [
    score_to_letter_5[get_nearest_grade_8(pred.est)] for pred in predictions
]

# Plot Heatmap 5 Grades
plt.figure(figsize=(10, 8))
cm_5 = confusion_matrix(y_true_5, y_pred_5, labels=valid_labels_5)
sns.heatmap(
    cm_5,
    annot=True,
    fmt='d',
    cmap='Greens',
    xticklabels=valid_labels_5,
    yticklabels=valid_labels_5,
)
plt.title('Confusion Matrix: 5 Grades')
plt.xlabel('Predicted Grade')
plt.ylabel('Actual Grade')

print("\n--- 5-Grade Classification Report ---")
report_5 = classification_report(
    y_true_5,
    y_pred_5,
    labels=valid_labels_5,
    target_names=valid_labels_5,
    zero_division=0,
)
print(report_5)

## 6.2 Regression Metrics and Thresholded (High/Low) Classification Metrics

In [None]:
# === 6.2 Evaluate Performance ===
print("\n--- Model Evaluation ---")
predictions = model.test(testset)

# ----- Regression metrics -----
# verbose=False: the values are formatted and printed below instead
rmse = accuracy.rmse(predictions, verbose=False)
mae = accuracy.mae(predictions, verbose=False)

# Pull the actual and estimated grades out of the predictions
y_true = [pred.r_ui for pred in predictions]   # actual grade
y_pred = [pred.est for pred in predictions]    # predicted grade

r2 = r2_score(y_true, y_pred)

print(f"RMSE : {rmse:.4f}")
print(f"MAE  : {mae:.4f}")
print(f"R²   : {r2:.4f}")

# ----- Classification-style metrics -----
# Binarize the grades: grade >= threshold is class 1 (high), otherwise
# class 0 (low). The cut-off used here is 3.0; a lower value such as 2.5
# would be a more lenient alternative.
threshold = 3.0

y_true_cls = [int(grade >= threshold) for grade in y_true]
y_pred_cls = [int(grade >= threshold) for grade in y_pred]

# Macro = every class carries the same weight
precision_macro, recall_macro, f1_macro = (
    scorer(y_true_cls, y_pred_cls, average='macro')
    for scorer in (precision_score, recall_score, f1_score)
)

# Weighted = weighted by class proportion (useful under class imbalance)
precision_weighted, recall_weighted, f1_weighted = (
    scorer(y_true_cls, y_pred_cls, average='weighted')
    for scorer in (precision_score, recall_score, f1_score)
)

print("\n--- Classification Metrics ---")
print(f"Precision (macro)   : {precision_macro:.4f}")
print(f"Recall    (macro)   : {recall_macro:.4f}")
print(f"F1-score  (macro)   : {f1_macro:.4f}")

print(f"\nPrecision (weighted): {precision_weighted:.4f}")
print(f"Recall    (weighted): {recall_weighted:.4f}")
print(f"F1-score  (weighted): {f1_weighted:.4f}")


## R² Score (Raw vs. Rounded Predictions)

In [None]:
# 1. Collect the actual and estimated grades from the predictions
y_true_raw = [p.r_ui for p in predictions]
y_pred_raw = [p.est for p in predictions]

# Estimates snapped to the nearest multiple of 0.5
y_pred_rounded = [np.round(2 * estimate) / 2 for estimate in y_pred_raw]

# 2. Compute the R2 score for the raw and for the rounded estimates
r2 = r2_score(y_true_raw, y_pred_raw)
r2Rounded = r2_score(y_true_raw, y_pred_rounded)

print(f"R2 Score: {r2:.4f}")
print(f"R2 Score Rounded: {r2Rounded:.4f}")