In [3]:
import pandas as pd
import numpy as np
import json
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.metrics import r2_score

# === Load data ===
# Submission log and the score table holding the "TH" column.
df = pd.read_csv('/kaggle/input/data-diem/annonimized.csv')
df_score = pd.read_csv('/kaggle/input/data-diem/th-public.csv')

# Clean "TH" as text first (remove non-breaking spaces, trim whitespace),
# then parse it as a number; values that still fail to parse become NaN.
df_score["TH"] = pd.to_numeric(
    df_score["TH"].astype(str).str.replace('\xa0', '').str.strip(),
    errors='coerce',
)

# Overwrite the submission-log column labels positionally.
# NOTE(review): if the CSV has no header row, read_csv has already consumed
# the first record as the header -- confirm against the raw file.
df.columns = [
    'assignment_id', 'problem_id', 'username',
    'is_final', 'status', 'pre_score',
    'coefficient', 'language_id',
    'created_at', 'updated_at',
    'judgement',
]

def _empty_judgement_features() -> pd.Series:
    """Return the all-zero feature row used when a judgement cannot be parsed."""
    return pd.Series({
        "total_tests": 0,
        "correct_tests": 0,
        "correct_rate": 0,
        "avg_time": 0,
        "avg_mem": 0,
        "wrong_tests": 0
    })


def extract_judgement_features(judgement_str) -> pd.Series:
    """Turn one judgement JSON string into a row of numeric test statistics.

    The JSON object is read for three keys: ``"times"`` and ``"mems"``
    (sequences of per-test measurements) and ``"verdicts"`` (a mapping from
    verdict name to count). The number of tests is ``len(times)``; only the
    ``"WRONG"`` verdict count is inspected, and every other test is counted
    as correct.

    Args:
        judgement_str: JSON text of a judgement. Missing values (``None``,
            NaN) and malformed or unexpectedly shaped JSON are tolerated.

    Returns:
        A Series with keys ``total_tests``, ``correct_tests``,
        ``correct_rate``, ``avg_time``, ``avg_mem`` and ``wrong_tests``.
        All six values are 0 when the input cannot be parsed or used.
    """
    try:
        j = json.loads(judgement_str)
        times = j.get("times", [])
        mems = j.get("mems", [])
        verdicts = j.get("verdicts", {})
        total = len(times)
        wrong = verdicts.get("WRONG", 0)
        correct = total - wrong
        return pd.Series({
            "total_tests": total,
            "correct_tests": correct,
            # Guard against division by zero for judgements with no tests.
            "correct_rate": correct / total if total > 0 else 0,
            "avg_time": np.mean(times) if times else 0,
            "avg_mem": np.mean(mems) if mems else 0,
            "wrong_tests": wrong
        })
    except (TypeError, ValueError, AttributeError):
        # TypeError: non-string input (None/NaN) or non-numeric fields.
        # ValueError: invalid JSON (json.JSONDecodeError is a subclass).
        # AttributeError: top-level JSON value (or "verdicts") is not an object.
        # Anything else (e.g. KeyboardInterrupt) is deliberately not swallowed.
        return _empty_judgement_features()

# Expand every judgement string into numeric columns and append them to df.
judgement_features = df["judgement"].apply(extract_judgement_features)
df = pd.concat([df, judgement_features], axis=1)

# "2025-" is prefixed before parsing (presumably the raw timestamps carry no
# year -- confirm); rows that do not match the format become NaT.
df["created_at"] = pd.to_datetime(
    "2025-" + df["created_at"],
    format="%Y-%m-%d %H:%M:%S",
    errors='coerce',
)

# 0/1 indicator columns, summed per user later on.
df["is_correct"] = df["pre_score"].eq(10000).astype(int)
df["is_late"] = df["coefficient"].lt(100).astype(int)
df["is_scored"] = df["status"].eq("SCORE").astype(int)

# Calendar date of the submission, used to count distinct active days.
df["day"] = df["created_at"].dt.date

# Aggregate the submission log into one feature row per user.
# Named aggregation ties every output column to its (source column, function)
# pair, so the labels cannot drift out of sync with the aggregations the way
# a separate positional rename list could.
feature_df = df.groupby("username").agg(
    num_assignments=("assignment_id", "nunique"),
    num_problems=("problem_id", "nunique"),
    # NOTE(review): "count" skips rows whose pre_score is missing, so this is
    # the number of submissions with a pre_score, not the raw row count.
    num_submissions=("pre_score", "count"),
    avg_score=("pre_score", "mean"),
    max_score=("pre_score", "max"),
    avg_penalty=("coefficient", "mean"),
    avg_wrong_tests=("wrong_tests", "mean"),
    num_correct=("is_correct", "sum"),
    num_late=("is_late", "sum"),
    num_score_status=("is_scored", "sum"),
    active_days=("day", "nunique"),
    mean_correct_rate=("correct_rate", "mean"),
    mean_time_per_test=("avg_time", "mean"),
    mean_mem_per_test=("avg_mem", "mean"),
)

# Per-user shares of scored / fully-correct / late submissions.
# NOTE(review): a user with num_submissions == 0 yields NaN or inf here.
feature_df["score_ratio"] = feature_df["num_score_status"] / feature_df["num_submissions"]
feature_df["correct_ratio"] = feature_df["num_correct"] / feature_df["num_submissions"]
feature_df["late_ratio"] = feature_df["num_late"] / feature_df["num_submissions"]

# Turn the "username" group key back into an ordinary column for merging.
feature_df.reset_index(inplace=True)

# The score table keys users by "hash"; rename it so both tables share the
# "username" key, then attach the scores to every user's feature row.
df_score = df_score.rename(columns={"hash": "username"})
merged = feature_df.merge(df_score, on="username", how="left")

# Rows with a TH value are used for training; rows without one go to
# test_data (presumably the users whose score is to be predicted).
train_data = merged[merged["TH"].notna()].copy()
test_data = merged[merged["TH"].isna()].copy()
train_data["TH"] = train_data["TH"].astype(float)

# Model input columns. The order is significant: it fixes the feature order
# seen by every estimator.
X_cols = [
    "num_assignments",
    "num_problems",
    "num_submissions",
    "avg_score",
    "max_score",
    "avg_penalty",
    "avg_wrong_tests",
    "num_correct",
    "num_late",
    "num_score_status",
    "active_days",
    "score_ratio",
    "correct_ratio",
    "late_ratio",
    "mean_correct_rate",
    "mean_time_per_test",
    "mean_mem_per_test",
]

X_train = train_data[X_cols]
y_train = train_data["TH"]

# === Random Forest tuned with GridSearchCV ===
param_grid_rf = dict(
    n_estimators=[100, 200, 300],
    max_depth=[8, 10, 12],
    min_samples_split=[2, 5, 10],
)
grid_search_rf = GridSearchCV(
    estimator=RandomForestRegressor(random_state=42),
    param_grid=param_grid_rf,
    cv=3,
    scoring='r2',
    n_jobs=-1,
)
grid_search_rf.fit(X_train, y_train)

# === Linear Regression ===
# Nothing to tune, so it is scored directly with the same 3-fold R^2 CV.
lr = LinearRegression()
lr_score = cross_val_score(lr, X_train, y_train, cv=3, scoring='r2').mean()

# === SVR tuned with GridSearchCV ===
# NOTE(review): the features are passed to SVR unscaled; consider whether a
# scaling step is needed before trusting this comparison.
param_grid_svr = dict(
    kernel=['linear', 'poly', 'rbf', 'sigmoid'],
    C=[0.1, 1.0, 10],
    epsilon=[0.1, 0.2, 0.5],
)
grid_search_svr = GridSearchCV(
    estimator=SVR(),
    param_grid=param_grid_svr,
    cv=3,
    scoring='r2',
    n_jobs=-1,
)
grid_search_svr.fit(X_train, y_train)

# === Print results ===
print("=== So sánh các mô hình (R^2 CV score) ===")
print(f"Random Forest (best): {grid_search_rf.best_score_:.4f} với params {grid_search_rf.best_params_}")
print(f"Linear Regression: {lr_score:.4f}")
print(f"SVR (best): {grid_search_svr.best_score_:.4f} với params {grid_search_svr.best_params_}")

# Keep the tuned Random Forest as best_model (chosen unconditionally, not by
# comparing the scores printed above).
best_model = grid_search_rf.best_estimator_


=== So sánh các mô hình (R^2 CV score) ===
Random Forest (best): 0.3543 với params {'max_depth': 12, 'min_samples_split': 10, 'n_estimators': 200}
Linear Regression: 0.2677
SVR (best): 0.1926 với params {'C': 10, 'epsilon': 0.5, 'kernel': 'rbf'}
