In [11]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.svm import SVC, SVR
from sklearn.metrics import accuracy_score, classification_report, mean_squared_error, r2_score
import joblib
import datetime

In [9]:
# ==========================
# 1. Import Libraries
# ==========================
import datetime

import joblib
import numpy as np
import pandas as pd

from sklearn.base import clone
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer

# Models
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.svm import SVC, SVR
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier, MLPRegressor

# Metrics
from sklearn.metrics import (
    accuracy_score, classification_report,
    mean_squared_error, r2_score
)

# ==========================
# 2. Load Dataset
# ==========================
data = pd.read_csv("Student_Performance.csv")  # change the input file here

# The last column is the target. Only rows with a missing target are dropped;
# missing feature values are handled later by the imputers in the preprocessor.
target_col = data.columns[-1]
data = data.dropna(subset=[target_col])
X = data.drop(columns=[target_col])
y = data[target_col]

# Convert the target when it holds numbers that were stored as object/strings.
# `errors='ignore'` is deprecated in pandas, so attempt a strict conversion and
# keep the original values when any entry is not numeric.
if y.dtype == 'object':
    try:
        y = pd.to_numeric(y)
    except (ValueError, TypeError):
        pass

# Fixed seed so the 80/20 split is reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# ==========================
# 3. Preprocessing
# ==========================
# Select columns by dtype family instead of exact width: with the previous
# ['int64', 'float64'] / ['object'] lists, columns such as int32, float32 or
# pandas 'category' matched neither group and were silently dropped by the
# ColumnTransformer (whose default for unlisted columns is to drop them).
numeric_features = X_train.select_dtypes(include='number').columns
categorical_features = X_train.select_dtypes(include=['object', 'category']).columns

# Numeric columns: fill gaps with the median, then scale to unit variance
# (with_mean=False skips centering).
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler(with_mean=False))
])

# Categorical columns: fill gaps with the most frequent value, then one-hot
# encode. Categories unseen during fit are ignored at transform time, and
# max_categories caps the number of output columns per feature.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=True, max_categories=50))
])

# Combined output is stacked as a sparse matrix only when its overall density
# is below sparse_threshold; otherwise it is dense.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ],
    sparse_threshold=0.3
)

# ==========================
# 4. Train Model Functions
# ==========================

# --- Classification Models ---
def train_random_forest_classifier(X_train, y_train, preprocessor):
    """Fit a preprocessing + random-forest classification pipeline.

    Args:
        X_train: Training features passed to ``Pipeline.fit``.
        y_train: Training labels.
        preprocessor: Transformer used as the first pipeline step. It is
            cloned, so the caller's object is neither refit nor shared
            between the pipelines returned by different trainers.

    Returns:
        The fitted ``Pipeline`` with steps ``'preprocessor'`` and ``'classifier'``.
    """
    pipeline = Pipeline([
        # Pipeline.fit fits its steps in place; a clone gives this model its own transformer.
        ('preprocessor', clone(preprocessor)),
        ('classifier', RandomForestClassifier(
            n_estimators=50, max_depth=10,
            max_features="sqrt", random_state=42, n_jobs=-1)),
    ])
    return pipeline.fit(X_train, y_train)

def train_svm_classifier(X_train, y_train, preprocessor):
    """Fit a preprocessing + support-vector classification pipeline.

    Args:
        X_train: Training features passed to ``Pipeline.fit``.
        y_train: Training labels.
        preprocessor: Transformer used as the first pipeline step. It is
            cloned, so the caller's object is neither refit nor shared.

    Returns:
        The fitted ``Pipeline`` with steps ``'preprocessor'`` and ``'classifier'``.
    """
    pipeline = Pipeline([
        # Pipeline.fit fits its steps in place; a clone gives this model its own transformer.
        ('preprocessor', clone(preprocessor)),
        ('classifier', SVC()),
    ])
    return pipeline.fit(X_train, y_train)

def train_logistic_regression(X_train, y_train, preprocessor):
    """Fit a preprocessing + logistic-regression pipeline (up to 500 solver iterations).

    Args:
        X_train: Training features passed to ``Pipeline.fit``.
        y_train: Training labels.
        preprocessor: Transformer used as the first pipeline step. It is
            cloned, so the caller's object is neither refit nor shared.

    Returns:
        The fitted ``Pipeline`` with steps ``'preprocessor'`` and ``'classifier'``.
    """
    pipeline = Pipeline([
        # Pipeline.fit fits its steps in place; a clone gives this model its own transformer.
        ('preprocessor', clone(preprocessor)),
        ('classifier', LogisticRegression(max_iter=500)),
    ])
    return pipeline.fit(X_train, y_train)

def train_knn_classifier(X_train, y_train, preprocessor, n_neighbors=5):
    """Fit a preprocessing + k-nearest-neighbours classification pipeline.

    Args:
        X_train: Training features passed to ``Pipeline.fit``.
        y_train: Training labels.
        preprocessor: Transformer used as the first pipeline step. It is
            cloned, so the caller's object is neither refit nor shared.
        n_neighbors: Number of neighbours forwarded to ``KNeighborsClassifier``.

    Returns:
        The fitted ``Pipeline`` with steps ``'preprocessor'`` and ``'classifier'``.
    """
    pipeline = Pipeline([
        # Pipeline.fit fits its steps in place; a clone gives this model its own transformer.
        ('preprocessor', clone(preprocessor)),
        ('classifier', KNeighborsClassifier(n_neighbors=n_neighbors)),
    ])
    return pipeline.fit(X_train, y_train)

def train_decision_tree_classifier(X_train, y_train, preprocessor):
    """Fit a preprocessing + decision-tree classification pipeline (depth capped at 10).

    Args:
        X_train: Training features passed to ``Pipeline.fit``.
        y_train: Training labels.
        preprocessor: Transformer used as the first pipeline step. It is
            cloned, so the caller's object is neither refit nor shared.

    Returns:
        The fitted ``Pipeline`` with steps ``'preprocessor'`` and ``'classifier'``.
    """
    pipeline = Pipeline([
        # Pipeline.fit fits its steps in place; a clone gives this model its own transformer.
        ('preprocessor', clone(preprocessor)),
        ('classifier', DecisionTreeClassifier(max_depth=10, random_state=42)),
    ])
    return pipeline.fit(X_train, y_train)

def train_naive_bayes_classifier(X_train, y_train, preprocessor):
    """Fit a preprocessing + Gaussian naive-Bayes classification pipeline.

    Args:
        X_train: Training features passed to ``Pipeline.fit``.
        y_train: Training labels.
        preprocessor: Transformer used as the first pipeline step. It is
            cloned, so the caller's object is neither refit nor modified.

    Returns:
        The fitted ``Pipeline`` with steps ``'preprocessor'`` and ``'classifier'``.
    """
    dense_preprocessor = clone(preprocessor)
    # GaussianNB requires dense input. A ColumnTransformer always returns a
    # dense array when sparse_threshold is 0, so force that on the private copy;
    # other transformer types are left as they are.
    if hasattr(dense_preprocessor, "sparse_threshold"):
        dense_preprocessor.set_params(sparse_threshold=0)

    pipeline = Pipeline([
        ('preprocessor', dense_preprocessor),
        ('classifier', GaussianNB()),
    ])
    return pipeline.fit(X_train, y_train)

def train_mlp_classifier(X_train, y_train, preprocessor):
    """Fit a preprocessing + multi-layer-perceptron classification pipeline.

    The network has a single hidden layer of 100 units and trains for at most
    300 iterations with a fixed random seed.

    Args:
        X_train: Training features passed to ``Pipeline.fit``.
        y_train: Training labels.
        preprocessor: Transformer used as the first pipeline step. It is
            cloned, so the caller's object is neither refit nor shared.

    Returns:
        The fitted ``Pipeline`` with steps ``'preprocessor'`` and ``'classifier'``.
    """
    pipeline = Pipeline([
        # Pipeline.fit fits its steps in place; a clone gives this model its own transformer.
        ('preprocessor', clone(preprocessor)),
        ('classifier', MLPClassifier(hidden_layer_sizes=(100,), max_iter=300, random_state=42)),
    ])
    return pipeline.fit(X_train, y_train)


# --- Regression Models ---
def train_random_forest_regressor(X_train, y_train, preprocessor):
    """Fit a preprocessing + random-forest regression pipeline.

    Args:
        X_train: Training features passed to ``Pipeline.fit``.
        y_train: Training target values.
        preprocessor: Transformer used as the first pipeline step. It is
            cloned, so the caller's object is neither refit nor shared
            between the pipelines returned by different trainers.

    Returns:
        The fitted ``Pipeline`` with steps ``'preprocessor'`` and ``'regressor'``.
    """
    pipeline = Pipeline([
        # Pipeline.fit fits its steps in place; a clone gives this model its own transformer.
        ('preprocessor', clone(preprocessor)),
        ('regressor', RandomForestRegressor(
            n_estimators=50, max_depth=10,
            max_features="sqrt", random_state=42, n_jobs=-1)),
    ])
    return pipeline.fit(X_train, y_train)

def train_svm_regressor(X_train, y_train, preprocessor):
    """Fit a preprocessing + support-vector regression pipeline.

    Args:
        X_train: Training features passed to ``Pipeline.fit``.
        y_train: Training target values.
        preprocessor: Transformer used as the first pipeline step. It is
            cloned, so the caller's object is neither refit nor shared.

    Returns:
        The fitted ``Pipeline`` with steps ``'preprocessor'`` and ``'regressor'``.
    """
    pipeline = Pipeline([
        # Pipeline.fit fits its steps in place; a clone gives this model its own transformer.
        ('preprocessor', clone(preprocessor)),
        ('regressor', SVR()),
    ])
    return pipeline.fit(X_train, y_train)

def train_linear_regression(X_train, y_train, preprocessor):
    """Fit a preprocessing + ordinary linear-regression pipeline.

    Args:
        X_train: Training features passed to ``Pipeline.fit``.
        y_train: Training target values.
        preprocessor: Transformer used as the first pipeline step. It is
            cloned, so the caller's object is neither refit nor shared.

    Returns:
        The fitted ``Pipeline`` with steps ``'preprocessor'`` and ``'regressor'``.
    """
    pipeline = Pipeline([
        # Pipeline.fit fits its steps in place; a clone gives this model its own transformer.
        ('preprocessor', clone(preprocessor)),
        ('regressor', LinearRegression()),
    ])
    return pipeline.fit(X_train, y_train)

def train_knn_regressor(X_train, y_train, preprocessor, n_neighbors=5):
    """Fit a preprocessing + k-nearest-neighbours regression pipeline.

    Args:
        X_train: Training features passed to ``Pipeline.fit``.
        y_train: Training target values.
        preprocessor: Transformer used as the first pipeline step. It is
            cloned, so the caller's object is neither refit nor shared.
        n_neighbors: Number of neighbours forwarded to ``KNeighborsRegressor``.

    Returns:
        The fitted ``Pipeline`` with steps ``'preprocessor'`` and ``'regressor'``.
    """
    pipeline = Pipeline([
        # Pipeline.fit fits its steps in place; a clone gives this model its own transformer.
        ('preprocessor', clone(preprocessor)),
        ('regressor', KNeighborsRegressor(n_neighbors=n_neighbors)),
    ])
    return pipeline.fit(X_train, y_train)

def train_decision_tree_regressor(X_train, y_train, preprocessor):
    """Fit a preprocessing + decision-tree regression pipeline (depth capped at 10).

    Args:
        X_train: Training features passed to ``Pipeline.fit``.
        y_train: Training target values.
        preprocessor: Transformer used as the first pipeline step. It is
            cloned, so the caller's object is neither refit nor shared.

    Returns:
        The fitted ``Pipeline`` with steps ``'preprocessor'`` and ``'regressor'``.
    """
    pipeline = Pipeline([
        # Pipeline.fit fits its steps in place; a clone gives this model its own transformer.
        ('preprocessor', clone(preprocessor)),
        ('regressor', DecisionTreeRegressor(max_depth=10, random_state=42)),
    ])
    return pipeline.fit(X_train, y_train)

def train_mlp_regressor(X_train, y_train, preprocessor):
    """Fit a preprocessing + multi-layer-perceptron regression pipeline.

    The network has a single hidden layer of 100 units and trains for at most
    300 iterations with a fixed random seed.

    Args:
        X_train: Training features passed to ``Pipeline.fit``.
        y_train: Training target values.
        preprocessor: Transformer used as the first pipeline step. It is
            cloned, so the caller's object is neither refit nor shared.

    Returns:
        The fitted ``Pipeline`` with steps ``'preprocessor'`` and ``'regressor'``.
    """
    pipeline = Pipeline([
        # Pipeline.fit fits its steps in place; a clone gives this model its own transformer.
        ('preprocessor', clone(preprocessor)),
        ('regressor', MLPRegressor(hidden_layer_sizes=(100,), max_iter=300, random_state=42)),
    ])
    return pipeline.fit(X_train, y_train)


# ==========================
# 5. Evaluate Functions
# ==========================
def evaluate_classification(model, X_test, y_test):
    """Print the accuracy and the classification report of ``model`` on the test split.

    Args:
        model: Fitted estimator exposing ``predict``.
        X_test: Held-out features.
        y_test: True labels for ``X_test``.
    """
    predictions = model.predict(X_test)

    accuracy = accuracy_score(y_test, predictions)
    print("Accuracy:", accuracy)

    report = classification_report(y_test, predictions)
    print("Classification Report:\n", report)

def evaluate_regression(model, X_test, y_test):
    """Print the RMSE and R² score of ``model`` on the test split.

    Args:
        model: Fitted estimator exposing ``predict``.
        X_test: Held-out features.
        y_test: True target values for ``X_test``.
    """
    predictions = model.predict(X_test)

    # RMSE is the square root of the mean squared error.
    mse = mean_squared_error(y_test, predictions)
    rmse = np.sqrt(mse)
    print("RMSE:", rmse)

    r2 = r2_score(y_test, predictions)
    print("R2 Score:", r2)

# ==========================
# 6. Evaluate All Models
# ==========================
def evaluate_all_models(X_train, y_train, X_test, y_test, preprocessor, task="classification"):
    """Train every model registered for ``task`` and print its test metrics.

    Args:
        X_train: Training features.
        y_train: Training target.
        X_test: Held-out features.
        y_test: Held-out target.
        preprocessor: Transformer handed to each trainer as the first pipeline step.
        task: ``"classification"`` selects the classifier set; any other value
            selects the regressor set.

    A failure while training or evaluating one model is caught and printed so
    that the remaining models still run.
    """
    if task == "classification":
        trainers = {
            "RandomForest": train_random_forest_classifier,
            "SVM": train_svm_classifier,
            "LogisticRegression": train_logistic_regression,
            "KNN": train_knn_classifier,
            "DecisionTree": train_decision_tree_classifier,
            "NaiveBayes": train_naive_bayes_classifier,
            "MLP": train_mlp_classifier
        }
        evaluate = evaluate_classification
    else:
        trainers = {
            "RandomForest": train_random_forest_regressor,
            "SVM": train_svm_regressor,
            "LinearRegression": train_linear_regression,
            "KNN": train_knn_regressor,
            "DecisionTree": train_decision_tree_regressor,
            "MLP": train_mlp_regressor
        }
        evaluate = evaluate_regression

    separator = "=" * 40
    for label, fit_model in trainers.items():
        print(separator)
        print(f"🔹 {label}")
        try:
            fitted = fit_model(X_train, y_train, preprocessor)
            evaluate(fitted, X_test, y_test)
        except Exception as err:
            print(f"Error in {label}: {err}")

# ==========================
# 7. Main Logic
# ==========================
# Choose the task from the target dtype. Only a genuinely numeric target is
# treated as regression; strings, categories and booleans are classification.
# (Comparing against 'object' alone sent boolean and categorical targets to
# the regressors.) Integer-coded class labels are still treated as regression,
# because they cannot be told apart from counts by dtype alone.
is_regression_target = (
    pd.api.types.is_numeric_dtype(y) and not pd.api.types.is_bool_dtype(y)
)

if not is_regression_target:
    print("Target là dạng phân loại (classification).")
    evaluate_all_models(X_train, y_train, X_test, y_test, preprocessor, task="classification")
else:
    print("Target là dạng số (regression).")
    evaluate_all_models(X_train, y_train, X_test, y_test, preprocessor, task="regression")


Target là dạng số (regression).
🔹 RandomForest
RMSE: 2.6685331127821983
R2 Score: 0.980784255312577
🔹 SVM
RMSE: 2.3207751929970395
R2 Score: 0.9854662336292253
🔹 LinearRegression
RMSE: 2.020551508505045
R2 Score: 0.9889832909573141
🔹 KNN
RMSE: 2.913866846648968
R2 Score: 0.9770886103261632
🔹 DecisionTree
RMSE: 2.528574526666764
R2 Score: 0.9827470428226587
🔹 MLP
RMSE: 2.030836644763437
R2 Score: 0.9888708496384566


In [10]:
# Fit a single k-nearest-neighbours regression pipeline and report its hold-out metrics.
clf = train_knn_regressor(
    X_train=X_train,
    y_train=y_train,
    preprocessor=preprocessor,
)
evaluate_regression(model=clf, X_test=X_test, y_test=y_test)

RMSE: 2.913866846648968
R2 Score: 0.9770886103261632


In [None]:
# Student / group information
Lop   = "ML2025"
Nhom  = "11"
MSSV  = "23714291"
HoTen = "Nguyen Van A"
SoMay = "01"

# Metadata stored next to the model; "TaoLuc" records the creation timestamp.
metadata = {
    "Lop": Lop,
    "Nhom": Nhom,
    "MSSV": MSSV,
    "HoTen": HoTen,
    "SoMay": SoMay,
    "TaoLuc": datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
}

# `model` only exists as a local variable inside evaluate_all_models, so
# referencing it here raised NameError on a fresh kernel. The fitted pipeline
# available at notebook level is `clf`, trained in the preceding cell.
save_obj = {
    "model": clf,
    "metadata": metadata
}

joblib.dump(save_obj, "model.pkl")
print(" Model and metadata saved to model.pkl")
