In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.svm import SVC, SVR
from sklearn.metrics import accuracy_score, classification_report, mean_squared_error, r2_score
import joblib
import datetime

In [5]:
# Load dataset
data = pd.read_csv("dataset_train.csv")  # đổi thành file của bạn

# ==========================
# 2. Split Features and Target
# ==========================
data = data.dropna(subset=[data.columns[-1]])   # chỉ drop NaN trong target
target_col = data.columns[-1]   # lấy cột cuối cùng làm nhãn
X = data.drop(columns=[target_col])
y = data[target_col]

# Nếu là số nhưng bị lưu dạng string thì convert
if y.dtype == 'object':
    try:
        y = pd.to_numeric(y, errors='ignore')
    except:
        pass

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# ==========================
# 3. Preprocessing
# ==========================
numeric_features = X_train.select_dtypes(include=['int64', 'float64']).columns
categorical_features = X_train.select_dtypes(include=['object']).columns

numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler(with_mean=False))
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=True, max_categories=50))
])


preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ],
    sparse_threshold=0.3
)

# ==========================
# 4. Train Model Functions
# ==========================
def train_random_forest_classifier(X_train, y_train, preprocessor):
    clf = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', RandomForestClassifier(
            n_estimators=50, 
            max_depth=10, 
            max_features="sqrt", 
            random_state=42, 
            n_jobs=-1
        ))
    ])
    clf.fit(X_train, y_train)
    return clf

def train_svm_classifier(X_train, y_train, preprocessor):
    clf = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', SVC())
    ])
    clf.fit(X_train, y_train)
    return clf

def train_linear_regression(X_train, y_train, preprocessor):
    reg = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('regressor', LinearRegression())
    ])
    reg.fit(X_train, y_train)
    return reg

def train_random_forest_regressor(X_train, y_train, preprocessor):
    reg = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('regressor', RandomForestRegressor(
            n_estimators=50, 
            max_depth=10, 
            max_features="sqrt", 
            random_state=42, 
            n_jobs=-1
        ))
    ])
    reg.fit(X_train, y_train)
    return reg

def train_svm_regressor(X_train, y_train, preprocessor):
    reg = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('regressor', SVR())
    ])
    reg.fit(X_train, y_train)
    return reg

# ==========================
# 5. Evaluate Models
# ==========================
def evaluate_classification(model, X_test, y_test):
    y_pred = model.predict(X_test)
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("Classification Report:\n", classification_report(y_test, y_pred))

def evaluate_regression(model, X_test, y_test):
    y_pred = model.predict(X_test)
    print("RMSE:", np.sqrt(mean_squared_error(y_test, y_pred)))
    print("R2 Score:", r2_score(y_test, y_pred))

# ==========================
# 6. Main Logic
# ==========================
if y.dtype == 'object':  # Classification
    print("Target là dạng phân loại (classification).")
    clf = train_random_forest_classifier(X_train, y_train, preprocessor)
    evaluate_classification(clf, X_test, y_test)
else:  # Regression
    print("Target là dạng số (regression).")
    reg = train_random_forest_regressor(X_train, y_train, preprocessor)
    evaluate_regression(reg, X_test, y_test)


  y = pd.to_numeric(y, errors='ignore')


Target là dạng phân loại (classification).
Accuracy: 0.08642327438676554
Classification Report:
               precision    recall  f1-score   support

           A       0.00      0.00      0.00        30
       A+...       0.00      0.00      0.00         2
        A...       0.00      0.00      0.00         1
      A/Fe-K       0.00      0.00      0.00         1
          A0       0.29      0.01      0.03       708
      A0+...       0.00      0.00      0.00         2
       A0...       0.00      0.00      0.00         6
     A0/A1IV       0.00      0.00      0.00         1
   A0/A1IV/V       0.00      0.00      0.00         2
 A0/A1IVp...       0.00      0.00      0.00         1
      A0/A1V       0.00      0.00      0.00        18
     A0/A1V:       0.00      0.00      0.00         1
        A0II       0.00      0.00      0.00         2
    A0II/III       0.00      0.00      0.00         1
       A0III       0.00      0.00      0.00         8
    A0III SB       0.00      0.00     

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [None]:
clf = train_svm_classifier(X_train, y_train, preprocessor)
evaluate_classification(clf, X_test, y_test)

In [None]:
# Thông tin sinh viên / nhóm
Lop   = "ML2025"
Nhom  = "11"
MSSV  = "23714291"
HoTen = "Nguyen Van A"
SoMay = "01"

metadata = {
    "Lop": Lop,
    "Nhom": Nhom,
    "MSSV": MSSV,
    "HoTen": HoTen,
    "SoMay": SoMay,
    "TaoLuc": datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
}

save_obj = {
    "model": model,
    "metadata": metadata
}

joblib.dump(save_obj, "model.pkl")
print(" Model and metadata saved to model.pkl")
