In [1]:
import os
import numpy as np
import pandas as pd
import joblib
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, f1_score
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
_HAS_XGB, _HAS_CAT = False, False
try:
    from xgboost import XGBClassifier
    _HAS_XGB = True
except Exception:
    pass
try:
    from catboost import CatBoostClassifier
    _HAS_CAT = True
except Exception:
    pass

In [2]:
def make_models():
    models = {
        "rf": RandomForestClassifier(n_estimators=400, min_samples_leaf=2, random_state=42, n_jobs=-1),
        "extratrees": ExtraTreesClassifier(n_estimators=600, min_samples_leaf=2, random_state=42, n_jobs=-1),
        "gbt": GradientBoostingClassifier(random_state=42),
        "svm": SVC(kernel="rbf", probability=True, random_state=42),
        "logreg": LogisticRegression(max_iter=5000, solver="lbfgs"),
    }
    if _HAS_XGB:
        models["xgb"] = XGBClassifier(
            n_estimators=800, max_depth=6, learning_rate=0.05,
            subsample=0.9, colsample_bytree=0.9, objective="multi:softprob",
            tree_method="hist", random_state=42, n_jobs=-1
        )
    if _HAS_CAT:
        models["catboost"] = CatBoostClassifier(
            iterations=800, depth=8, learning_rate=0.05,
            loss_function="MultiClass", random_seed=42, verbose=100
        )
    return models

In [3]:
def add_volatility_and_numeric_label(df: pd.DataFrame, vol_window: int = 21) -> pd.DataFrame:
    df = df.copy()
    if "return" not in df.columns or df["return"].isna().all():
        df["return"] = df.groupby("ticker")["close"].pct_change()
    df["volatility"] = df.groupby("ticker")["return"].transform(
        lambda s: s.rolling(vol_window).std() * np.sqrt(252)
    )
    sub = df.dropna(subset=["volatility"])
    if sub.empty:
        raise ValueError("Không có dữ liệu đủ để tính volatility.")
    q = sub["volatility"].quantile([0.33, 0.66])
    q33, q66 = q.loc[0.33], q.loc[0.66]
    df["risk_label"] = np.select(
        [
            (df["volatility"] > q66),
            (df["volatility"] <= q66) & (df["volatility"] > q33),
            (df["volatility"] <= q33),
        ],
        [2, 1, 0],
        default=1
    ).astype(int)
    return df

In [4]:
FEATURES = [
    "open","high","low","close","volume",
    "ema_50","ema_200","ema_gap",
    "macd","macd_signal","macd_diff",
    "rsi","mfi",
    "bollinger_hband","bollinger_lband","bollinger_pct",
    "return"
]
EXCLUDE_IF_PRESENT = ["volatility","bollinger_bw","golden_cross","death_cross","macd_cross"]

In [5]:
def load_excel_raw(file_path: str) -> pd.DataFrame:
    if not os.path.exists(file_path):
        raise FileNotFoundError(file_path)
    df = pd.read_excel(file_path)

    must_cols = ["ticker","timestamp","open","high","low","close","volume"]
    missing = [c for c in must_cols if c not in df.columns]
    if missing:
        raise ValueError(f"Thiếu cột trong Excel: {missing}")

    df = df.copy()
    df["timestamp"] = pd.to_datetime(df["timestamp"], errors="coerce")
    df = df.dropna(subset=["timestamp"])
    for col in ["open","high","low","close","volume"]:
        df[col] = pd.to_numeric(df[col], errors="coerce")
    df = df[(df["close"] > 0) & (df["volume"] >= 0)].copy()
    df = df.sort_values(["ticker","timestamp"]).reset_index(drop=True)
    return df
def finalize_features(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()

    if "ema_50" in df.columns and "ema_200" in df.columns and "ema_gap" not in df.columns:
        denom = df["ema_200"].replace(0, np.nan)
        df["ema_gap"] = (df["ema_50"] - df["ema_200"]) / denom

    for col in ["mfi","bollinger_pct","macd","macd_signal","macd_diff","rsi",
                "ema_50","ema_200","bollinger_hband","bollinger_lband","return"]:
        if col in df.columns:
            df[col] = pd.to_numeric(df[col], errors="coerce")

    df = df.replace([np.inf, -np.inf], np.nan)
    return df

In [6]:
DEFAULT_TEST_SIZE = 0.2
DEFAULT_RANDOM_STATE = 42
def run_all_models_and_compare(file_path: str, model_dir: str, test_size: float = DEFAULT_TEST_SIZE,
                               random_state: int = DEFAULT_RANDOM_STATE) -> pd.DataFrame:
    os.makedirs(model_dir, exist_ok=True)
    df = load_excel_raw(file_path)
    df = finalize_features(df)
    df = add_volatility_and_numeric_label(df)
    feats = [f for f in FEATURES if f in df.columns and f not in EXCLUDE_IF_PRESENT]
    if not feats:
        raise ValueError("Không còn feature nào hợp lệ sau khi loại proxy volatility/cross.")
    df_xy = df.dropna(subset=feats + ["risk_label"]).copy()
    X = df_xy[feats]
    y = df_xy["risk_label"].astype(int)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, stratify=y, random_state=random_state
    )
    results = []
    models = make_models()
    print(f"Using features ({len(feats)}): {feats}")
    for name, model in models.items():
        print("\n" + "="*80)
        print(f"MODEL: {name}")
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        print(classification_report(y_test, y_pred, digits=4))
        bundle = {"model": model, "features": feats, "model_name": name, "label_mode": "numeric_0_1_2"}
        save_path = os.path.join(model_dir, f"risk_model_{name}.pkl")
        joblib.dump(bundle, save_path)
        print(f"[Saved] {save_path}")
        results.append({
            "model": name,
            "test_acc": accuracy_score(y_test, y_pred),
            "test_macro_f1": f1_score(y_test, y_pred, average="macro"),
            "test_size": len(y_test)
        })
    res_df = pd.DataFrame(results).sort_values("test_macro_f1", ascending=False).reset_index(drop=True)
    print("\n" + "="*80)
    print("BẢNG SO SÁNH (sắp theo Macro-F1 giảm dần)")
    print("="*80)
    print(res_df.to_string(index=False))
    return res_df

In [8]:
if __name__ == "__main__":
    file_path = r'D:\Downloads\DSTC vòng 2\cleaned data\UPCOM_cleaned_last.xlsx'
    model_dir = 'model_output'
    results = run_all_models_and_compare(file_path, model_dir)

Using features (17): ['open', 'high', 'low', 'close', 'volume', 'ema_50', 'ema_200', 'ema_gap', 'macd', 'macd_signal', 'macd_diff', 'rsi', 'mfi', 'bollinger_hband', 'bollinger_lband', 'bollinger_pct', 'return']

MODEL: rf
              precision    recall  f1-score   support

           0     0.9381    0.9273    0.9327      1927
           1     0.8785    0.8937    0.8860      2127
           2     0.9381    0.9310    0.9345      1986

    accuracy                         0.9167      6040
   macro avg     0.9182    0.9174    0.9178      6040
weighted avg     0.9171    0.9167    0.9169      6040

[Saved] model_output\risk_model_rf.pkl

MODEL: extratrees
              precision    recall  f1-score   support

           0     0.9450    0.9455    0.9453      1927
           1     0.9016    0.9088    0.9052      2127
           2     0.9517    0.9431    0.9474      1986

    accuracy                         0.9318      6040
   macro avg     0.9328    0.9325    0.9326      6040
weighted avg 

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=5000).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
