In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from catboost import CatBoostClassifier, Pool
import matplotlib.pyplot as plt
import seaborn as sns
from imblearn.over_sampling import SMOTE

# Đọc dữ liệu
df = pd.read_csv(r'D:\MINI_Project\DoAn3\perdict_train_XGBoost\students_grading_dataset_clean.csv')

# Loại bỏ các cột không cần thiết
df.drop(['Student_ID','First_Name','Last_Name','Email','Midterm_Score',
         'Final_Score','Quizzes_Avg','Projects_Score','Total_Score'],
        axis=1, inplace=True)

# Encode nhãn Grade
df['Grade'] = df['Grade'].map({'A': 0, 'B': 1, 'C': 2, 'D': 3})

# Chia X, y
X = df.drop('Grade', axis=1)
y = df['Grade']

# Cột categorical
cat_features = ['Gender', 'Department', 'Extracurricular_Activities',
                'Internet_Access_at_Home', 'Parent_Education_Level', 'Family_Income_Level']

# One-hot encode categorical để dùng với SMOTE
X_encoded = pd.get_dummies(X, drop_first=True)

# 👉 Áp dụng SMOTE để cân bằng nhãn
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_encoded, y)

# Chia train/test từ dữ liệu đã cân bằng
X_train_enc, X_test_enc, y_train, y_test = train_test_split(X_resampled, y_resampled,
                                                            test_size=0.2, stratify=y_resampled, random_state=42)

# Khôi phục lại bản gốc X_train để dùng CatBoost (vì CatBoost hỗ trợ categorical trực tiếp)
X_resampled_full = pd.concat([X]*y.value_counts().max(), ignore_index=True)
X_train = X_resampled_full.iloc[X_train_enc.index]
X_test = X_resampled_full.iloc[X_test_enc.index]

# Train Pool (CatBoost hỗ trợ categorical)
train_pool = Pool(X_train, y_train, cat_features=cat_features)
test_pool = Pool(X_test, y_test, cat_features=cat_features)

# Khởi tạo và huấn luyện CatBoost
model = CatBoostClassifier(
    iterations=500,
    learning_rate=0.05,
    depth=6,
    loss_function='MultiClass',
    eval_metric='Accuracy',
    random_seed=42,
    verbose=100
)

model.fit(train_pool, eval_set=test_pool)

# Dự đoán
y_pred = model.predict(X_test)
from sklearn.metrics import classification_report, accuracy_score, f1_score
# 🎯 Evaluation
#print("\nClassification Report:\n")
#print(classification_report(y_test, y_pred))
f1 = f1_score(y_test, y_pred, average='weighted')
print(f1)



0:	learn: 0.3041736	test: 0.2923899	best: 0.2923899 (0)	total: 117ms	remaining: 58.3s
100:	learn: 0.3499165	test: 0.2759235	best: 0.2946150 (4)	total: 7.16s	remaining: 28.3s
200:	learn: 0.3796327	test: 0.2701380	best: 0.2946150 (4)	total: 13.7s	remaining: 20.4s
300:	learn: 0.4272677	test: 0.2478861	best: 0.2946150 (4)	total: 20.9s	remaining: 13.8s
400:	learn: 0.4706733	test: 0.2291945	best: 0.2946150 (4)	total: 28.3s	remaining: 6.98s
499:	learn: 0.5008347	test: 0.2265243	best: 0.2946150 (4)	total: 36.1s	remaining: 0us

bestTest = 0.2946150423
bestIteration = 4

Shrink model to first 5 iterations.
0.2154712543614128
