In [1]:
# traffic_sign_random_forest.py
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier

# ------------------------------------------------------------------
# 1.  Load the training metadata and the pre-computed feature tables
# ------------------------------------------------------------------

# Directory layout, relative to the notebook's working directory.
BASE_PATH = '../2025_A2/train'
FEATURE_PATH = BASE_PATH + '/Features'
METADATA_PATH = BASE_PATH + '/train_metadata.csv'

# Later cells join on "image_path" and read "ClassId", so both columns are
# expected in this file.
# NOTE(review): the original note listed the columns as "image_id, ClassId" —
# confirm the real header of train_metadata.csv.
train_meta = pd.read_csv(METADATA_PATH)
# test_meta  = pd.read_csv("test_metadata.csv")    # columns: image_id

# Echo one resolved path as a quick sanity check of the directory layout.
print(FEATURE_PATH + "/additional_features.csv")

# Feature tables (column notes carried over from the original author).
add_feats  = pd.read_csv(FEATURE_PATH + "/additional_features.csv")  # edge_density, mean_b/g/r …
color_hist = pd.read_csv(FEATURE_PATH + "/color_histogram.csv")      # 95 colour-histogram bins
hog_pca    = pd.read_csv(FEATURE_PATH + "/hog_pca.csv")              # 19 PCA-compressed HOG dims

../2025_A2/train/Features/additional_features.csv


In [2]:
# ------------------------------------------------------------------
# 2.  Merge everything on the common key (image_path)
# ------------------------------------------------------------------
def merge(meta, feature_tables=None, key="image_path"):
    """Inner-join a metadata frame with a sequence of feature tables.

    Parameters
    ----------
    meta : pandas.DataFrame
        Frame that contains at least the ``key`` column.
    feature_tables : iterable of pandas.DataFrame, optional
        Tables joined onto ``meta`` one after another, in the given order.
        When omitted, the notebook-level ``add_feats``, ``color_hist`` and
        ``hog_pca`` frames are used, which is the original behaviour.
        Pass the test-set feature tables explicitly when merging test
        metadata; the defaults are the *training* tables.
    key : str, default "image_path"
        Name of the column shared by ``meta`` and every feature table.

    Returns
    -------
    pandas.DataFrame
        ``meta`` joined with all tables. The join is pandas' default inner
        join, so rows whose key is missing from any table are dropped.
        If ``feature_tables`` is empty, ``meta`` itself is returned.
    """
    if feature_tables is None:
        # Resolved at call time so the tables loaded by the notebook are used.
        feature_tables = (add_feats, color_hist, hog_pca)

    merged = meta
    for table in feature_tables:
        merged = merged.merge(table, on=key)
    return merged

# Join the training metadata with every feature table.
# NOTE(review): merge() relies on pandas' default inner join, so images that
# are missing from any feature table are dropped here without a warning.
train_df = merge(train_meta)
# test_df  = merge(test_meta)

# Target vector first, then the predictor matrix: everything except the label
# and the join key.
# NOTE(review): any other identifier column present in the metadata would stay
# in X as a feature — confirm the metadata schema.
y = train_df["ClassId"]
X = train_df.drop(["ClassId", "image_path"], axis="columns")

In [3]:
# ------------------------------------------------------------------
# 3.  Model: StandardScaler → RandomForest, tuned by 5-fold grid search
#     (the forest does not need scaled inputs; the scaler is kept because
#      it does no harm on the mixed feature set)
# ------------------------------------------------------------------
pipe = Pipeline(steps=[
    ("scaler", StandardScaler()),
    ("rf", RandomForestClassifier(n_estimators=500,
                                  max_depth=None,
                                  n_jobs=-1,
                                  random_state=42)),
])

# Search space: 3 x 3 x 2 = 18 candidates, i.e. 90 fits with cv=5.
# The values given to the forest above are only starting defaults; the
# search overrides n_estimators and max_depth for every candidate.
param_grid = {
    "rf__n_estimators": [300, 500, 800],
    "rf__max_depth"   : [None, 20, 40],
    "rf__max_features": ["sqrt", "log2"]
}

# NOTE(review): both the forest and the search request n_jobs=-1; this may
# oversubscribe the CPU — worth checking if the search runs slowly.
grid = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    scoring="accuracy",
    cv=5,
    n_jobs=-1,
    verbose=1,
).fit(X, y)

print("Best CV accuracy :", grid.best_score_)
print("Best parameters  :", grid.best_params_)

Fitting 5 folds for each of 18 candidates, totalling 90 fits
Best CV accuracy : 0.8079445017293395
Best parameters  : {'rf__max_depth': None, 'rf__max_features': 'sqrt', 'rf__n_estimators': 800}


In [4]:
# ------------------------------------------------------------------
# 4.  Take the best model (already fitted on ALL labelled data)
#     and, once test data is wired in, predict the test set
# ------------------------------------------------------------------
# GridSearchCV was created with its default refit=True, so after grid.fit()
# best_estimator_ has already been re-trained on the full (X, y) with the
# winning parameters. Calling .fit(X, y) again would only repeat that
# training (same data, same random_state) and produce the same model.
best_model = grid.best_estimator_

# TODO: test-set prediction is disabled because the test metadata/features
# are not loaded in this notebook. Before enabling it, build test_df from the
# TEST feature tables and make the dropped id column match the test schema.
# test_features = test_df.drop(columns=["image_id"])
# test_preds = best_model.predict(test_features)

# pd.DataFrame({
#     # "image_id": test_df["image_id"],
#     "ClassId" : test_preds
# }).to_csv("submission.csv", index=False)

# Report what actually happened: nothing is written to disk in this cell.
print("Best model ready - test prediction is disabled, so no submission.csv was written")

✅  submission.csv written – upload this on Kaggle


In [8]:
import joblib
from pathlib import Path

# Persist the whole pipeline (StandardScaler + RandomForest) so it can be
# reloaded later with joblib.load().
MODEL_PATH = '../models/random_forest.pkl'

# joblib.dump does not create missing directories; make sure the target
# folder exists so the cell also works on a fresh checkout.
Path(MODEL_PATH).parent.mkdir(parents=True, exist_ok=True)

joblib.dump(best_model, MODEL_PATH)

['../models/random_forest.pkl']