In [1]:
# Mount Google Drive at /content/drive so the CSV files stored on Drive can be
# read by the cells below.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
import pandas as pd

# Folder on the mounted Drive that holds both input files. Keeping it in one
# place means the notebook can be repointed by editing a single line.
DATA_DIR = '/content/drive/MyDrive/Máy học/Ex1'

# df: the raw table that later cells rename and extract features from.
df = pd.read_csv(f'{DATA_DIR}/annonimized.csv')
# df_score: the table carrying the "TH" column that is used as the target.
df_score = pd.read_csv(f'{DATA_DIR}/th-public.csv')

In [5]:
import pandas as pd
import numpy as np
import json
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import r2_score

In [6]:
# Normalise the TH column: cast to text, remove non-breaking spaces (\xa0) and
# surrounding whitespace, then parse as numbers. Anything that still cannot be
# parsed becomes NaN (errors='coerce').
th_text = df_score["TH"].astype(str)
th_text = th_text.str.replace('\xa0', '').str.strip()
df_score["TH"] = pd.to_numeric(th_text, errors='coerce')

In [7]:
# Rename the columns so they are easier to work with.
# The names are assigned by position, so df must have exactly these 11 columns
# in this order when the cell runs.
SUBMISSION_COLUMNS = [
    'assignment_id', 'problem_id', 'username',
    'is_final', 'status', 'pre_score', 'coefficient',
    'language_id', 'created_at', 'updated_at',
    'judgement',
]
df.columns = SUBMISSION_COLUMNS

In [8]:
# === STEP 2: EXTRACT FEATURES FROM judgement ===
def extract_judgement_features(judgement_str):
    """Turn one ``judgement`` JSON string into a row of test-level features.

    The JSON object is read for three keys: ``"times"`` and ``"mems"`` (lists
    of numbers) and ``"verdicts"`` (a mapping whose ``"WRONG"`` entry is a
    count). Missing keys default to empty.

    Parameters
    ----------
    judgement_str : str
        JSON text of a single judgement. Non-string values (e.g. NaN for a
        missing cell) and malformed payloads are tolerated.

    Returns
    -------
    pandas.Series
        Indexed by ``total_tests``, ``correct_tests``, ``correct_rate``,
        ``avg_time``, ``avg_mem`` and ``wrong_tests``. Every value is 0 when
        the input cannot be parsed or does not have the expected structure.
    """
    # Fallback row; also fixes the key order of the returned Series.
    features = {
        "total_tests": 0,
        "correct_tests": 0,
        "correct_rate": 0,
        "avg_time": 0,
        "avg_mem": 0,
        "wrong_tests": 0,
    }
    try:
        j = json.loads(judgement_str)
        times = j.get("times", [])
        mems = j.get("mems", [])
        verdicts = j.get("verdicts", {})
        total = len(times)
        wrong = verdicts.get("WRONG", 0)
        # NOTE(review): every test not counted under "WRONG" is treated as
        # correct -- confirm that no other failing verdict keys exist.
        correct = total - wrong
        features.update({
            "total_tests": total,
            "correct_tests": correct,
            "correct_rate": correct / total if total > 0 else 0,
            "avg_time": np.mean(times) if times else 0,
            "avg_mem": np.mean(mems) if mems else 0,
            "wrong_tests": wrong,
        })
    except (TypeError, ValueError, AttributeError):
        # Only data problems are swallowed: invalid JSON (ValueError), a
        # non-string cell such as NaN (TypeError), or JSON that is not an
        # object of the expected shape (AttributeError / TypeError).
        # KeyboardInterrupt and genuine programming errors now propagate
        # instead of silently producing an all-zero row.
        pass
    return pd.Series(features)

# Build one feature row per row of df from its "judgement" text.
judgement_features = df["judgement"].apply(extract_judgement_features)

# Drop feature columns left over from an earlier run of this cell before
# attaching the fresh ones. Without this, re-running the cell appends duplicate
# columns and later column lookups such as df["wrong_tests"] stop returning a
# single Series. On a first run nothing is dropped.
df = pd.concat(
    [df.drop(columns=judgement_features.columns, errors="ignore"), judgement_features],
    axis=1,
)

In [9]:
# === STEP 3: PROCESS THE BASIC COLUMNS ===
# A fixed year "2025-" is prepended to the raw created_at text before parsing;
# values that do not match the format become NaT (errors='coerce').
created_with_year = "2025-" + df["created_at"]
df["created_at"] = pd.to_datetime(
    created_with_year, errors='coerce', format="%Y-%m-%d %H:%M:%S"
)

# 0/1 indicator columns derived from the raw fields.
df["is_correct"] = df["pre_score"].eq(10000).astype(int)  # presumably 10000 is full marks -- TODO confirm
df["is_late"] = df["coefficient"].lt(100).astype(int)     # presumably a coefficient below 100 is a late penalty -- TODO confirm
df["is_scored"] = df["status"].eq("SCORE").astype(int)

# Calendar date of each row, used later to count distinct days per user.
df["day"] = df["created_at"].dt.date

In [10]:
# === STEP 4: AGGREGATE FEATURES PER STUDENT ===
# One row per username. Named aggregation ties each output column directly to
# its source column and reducer, so no separate flattening step is needed.
feature_df = (
    df.groupby("username")
    .agg(
        num_assignments=("assignment_id", "nunique"),
        num_problems=("problem_id", "nunique"),
        # "count" counts non-null pre_score values, not raw rows.
        num_submissions=("pre_score", "count"),
        avg_score=("pre_score", "mean"),
        max_score=("pre_score", "max"),
        avg_penalty=("coefficient", "mean"),
        avg_wrong_tests=("wrong_tests", "mean"),
        num_correct=("is_correct", "sum"),
        num_late=("is_late", "sum"),
        num_score_status=("is_scored", "sum"),
        active_days=("day", "nunique"),
        mean_correct_rate=("correct_rate", "mean"),
        mean_time_per_test=("avg_time", "mean"),
        mean_mem_per_test=("avg_mem", "mean"),
    )
    # Ratio features: each count divided by the number of submissions.
    .assign(
        score_ratio=lambda f: f["num_score_status"] / f["num_submissions"],
        correct_ratio=lambda f: f["num_correct"] / f["num_submissions"],
        late_ratio=lambda f: f["num_late"] / f["num_submissions"],
    )
    # Turn the username index back into an ordinary column.
    .reset_index()
)

In [11]:
# === STEP 5: JOIN THE ACTUAL SCORES ===
# The score table's "hash" column is renamed to "username" so it can be used
# as the join key. The left join keeps every user in feature_df; users with no
# matching score row get NaN in the score columns.
df_score = df_score.rename(columns={"hash": "username"})
merged = pd.merge(feature_df, df_score, how="left", on="username")

In [12]:

# === STEP 6: SPLIT INTO TRAIN / TEST SETS ===
# Rows with a TH value form the training set; rows where TH is missing are the
# ones to predict.
has_score = merged["TH"].notna()
train_data = merged.loc[has_score].copy()
test_data = merged.loc[~has_score].copy()

train_data["TH"] = train_data["TH"].astype(float)

# Feature columns fed to the model, in a fixed order.
X_cols = [
    "num_assignments",
    "num_problems",
    "num_submissions",
    "avg_score",
    "max_score",
    "avg_penalty",
    "avg_wrong_tests",
    "num_correct",
    "num_late",
    "num_score_status",
    "active_days",
    "score_ratio",
    "correct_ratio",
    "late_ratio",
    "mean_correct_rate",
    "mean_time_per_test",
    "mean_mem_per_test",
]

X_train = train_data.loc[:, X_cols]
y_train = train_data.loc[:, "TH"]
X_test = test_data.loc[:, X_cols]

# === STEP 7: RANDOM FOREST + GRIDSEARCHCV ===
# Exhaustive search over 3 x 3 x 3 = 27 hyper-parameter combinations, each
# scored by 3-fold cross-validated R^2.
param_grid = dict(
    n_estimators=[100, 200, 300],
    max_depth=[8, 10, 12],
    min_samples_split=[2, 5, 10],
)

# Fixed random_state so repeated runs build the same forests.
base_forest = RandomForestRegressor(random_state=42)

grid_search = GridSearchCV(
    estimator=base_forest,
    param_grid=param_grid,
    scoring='r2',
    cv=3,
    n_jobs=-1,  # use all available cores
)
grid_search.fit(X_train, y_train)

print("Best params:", grid_search.best_params_)
print("Best R^2 score (CV):", grid_search.best_score_)

# The best configuration, refitted on the full training set by GridSearchCV.
best_model = grid_search.best_estimator_



Best params: {'max_depth': 12, 'min_samples_split': 10, 'n_estimators': 200}
Best R^2 score (CV): 0.35425448090327144


In [14]:
# === STEP 8: PREDICT & EXPORT FILE ===
# Score the users without a TH value and write "username,prediction" rows
# with no header line and no index column.
test_data["predicted_TH"] = best_model.predict(X_test)
submission = test_data.loc[:, ["username", "predicted_TH"]]
submission.to_csv("SubEx1 - 2nd.csv", header=False, index=False)