In [1]:
import sys
import os

# Add the parent directory to sys.path (presumably where the project-local
# modules imported below live).
BASE_DIR = os.path.abspath("..")  # parent of the current working directory
# Guard the append so re-running this cell does not pile up duplicate entries.
if BASE_DIR not in sys.path:
    sys.path.append(BASE_DIR)

from save import load_model

In [3]:
# Model family whose per-fold artifacts are loaded from "best_models".
model_name = "random_forest"
# One saved model per fold, for folds 1 through 5, kept in fold order.
models = [
    load_model(model_name, fold_id, directory="best_models")
    for fold_id in range(1, 6)
]
    


Loading model from: best_models\random_forest\fold_1\model.pkl
Loading model from: best_models\random_forest\fold_2\model.pkl
Loading model from: best_models\random_forest\fold_3\model.pkl
Loading model from: best_models\random_forest\fold_4\model.pkl
Loading model from: best_models\random_forest\fold_5\model.pkl


In [4]:
# Echo the list of loaded models so their hyperparameters can be inspected.
models

[RandomForestRegressor(max_depth=10, max_features='sqrt', min_samples_leaf=3,
                       n_estimators=200, random_state=42),
 RandomForestRegressor(max_depth=10, max_features='sqrt', min_samples_leaf=3,
                       n_estimators=300, random_state=42),
 RandomForestRegressor(max_features='sqrt', min_samples_leaf=3, n_estimators=200,
                       random_state=42),
 RandomForestRegressor(max_depth=10, max_features='sqrt', min_samples_leaf=3,
                       n_estimators=200, random_state=42),
 RandomForestRegressor(max_features='sqrt', min_samples_leaf=3, n_estimators=200,
                       random_state=42)]

In [4]:
import pandas as pd

# Path to the test CSV, relative to the notebook's working directory.
test_path="dataset/test.csv"
test_df = pd.read_csv(test_path)
# Print column names, dtypes and non-null counts as a quick sanity check.
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 12 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   genres           200 non-null    object 
 1   rating           200 non-null    float64
 2   no_of_votes      200 non-null    int64  
 3   meta_score       200 non-null    float64
 4   release_date     200 non-null    float64
 5   gross            200 non-null    float64
 6   budget           200 non-null    float64
 7   countries        200 non-null    object 
 8   log_budget       200 non-null    float64
 9   log_no_of_votes  200 non-null    float64
 10  log_gross        200 non-null    float64
 11  log_gross_bin    200 non-null    int64  
dtypes: float64(8), int64(2), object(2)
memory usage: 18.9+ KB


In [None]:
# Preview the first rows of the test frame.
test_df.head()

In [None]:
from settings import TARGET,FEATURES
from data import load_data_test
import numpy as np
# Per-model prediction arrays, one entry per fold model.
results = []

model_dir = "best_models/random_forest"
# `models` was filled in fold order (1..5), so enumerate from 1 to pair every
# model with the preprocessing artifacts of its own fold (presumably the
# fold-specific encodings and scaler) instead of always reusing fold 1's.
for fold, model in enumerate(models, start=1):
    X_test, y_test = load_data_test(
        df=test_df,
        folder_path=model_dir,
        fold=fold,
        target=TARGET,
        features=FEATURES,
    )
    # Predict with this fold's model.
    predictions = model.predict(X_test)
    results.append(predictions)

# Average the predictions of all models.
results = np.array(results)  # list -> NumPy array, shape: [n_models, num_samples]
average_predictions = np.mean(results, axis=0)  # mean over axis 0 (the models)
# NOTE(review): other cells in this notebook apply np.expm1 to model.predict()
# output before scoring, so these averages are presumably still on the log
# scale -- confirm before comparing them with raw gross values.
average_predictions

In [7]:
from data import _split_column
from data import _apply_target_encoding
import joblib
import numpy as np
import json

def process_test_data(df_test: pd.DataFrame, features, encoding_dir: str):
    """Build the scaled feature matrix and the target vector for the test set.

    The frame is enriched with genre/country statistic features using the
    encodings stored in ``encoding_dir``, log-transformed, restricted to
    ``features`` and finally passed through the scaler saved in
    ``encoding_dir``.

    Args:
        df_test: Test data. Expected to contain the ``genres``, ``countries``,
            ``no_of_votes``, ``budget`` and ``gross`` columns. The caller's
            frame is not modified; all work happens on a copy.
        features: Names of the columns handed to the scaler, in order.
        encoding_dir: Directory containing ``genre_encoded.json``,
            ``country_encoded.json`` and ``scaler.pkl``.

    Returns:
        Tuple ``(X_test, y_test)``: the result of ``scaler.transform`` on the
        selected feature columns, and the raw values of the ``gross`` column.

    Raises:
        KeyError: If a required input column or any requested feature is
            missing from the processed frame.
        OSError: If an encoding file or the scaler file cannot be opened.
    """
    # Work on a copy so the caller's frame is not polluted with helper columns
    # and the function gives the same result when the cell is re-run.
    df_test = df_test.copy()
    # Normalise to a list so any iterable of column names selects columns.
    features = list(features)

    # Split the genres and countries columns.
    df_test['genres_list'] = _split_column(df_test, 'genres')
    df_test['countries_list'] = _split_column(df_test, 'countries')

    # Load the encodings from their JSON files.
    with open(os.path.join(encoding_dir, "genre_encoded.json"), encoding='utf-8') as f:
        genre_encoding = json.load(f)

    with open(os.path.join(encoding_dir, "country_encoded.json"), encoding='utf-8') as f:
        country_encoding = json.load(f)

    # Apply the encodings to create the statistic features (the helper is
    # relied upon to add the named column to df_test in place).
    _apply_target_encoding(df_test, df_test['genres_list'], genre_encoding, 'genre_stat_feature')
    _apply_target_encoding(df_test, df_test['countries_list'], country_encoding, 'country_stat_feature')

    # Log-transform the statistic features.
    for col in ['country_stat_feature', 'genre_stat_feature']:
        df_test[f'log_{col}'] = np.log1p(df_test[col])

    # Log-transform the other numeric columns.
    df_test['log_no_of_votes'] = np.log1p(df_test['no_of_votes'])
    df_test['log_budget'] = np.log1p(df_test['budget'])

    # Capture the target before narrowing the frame down to the features.
    y_test = df_test['gross'].values

    # Validate BEFORE selecting: selecting first would raise pandas' own
    # KeyError and make this check unreachable.
    missing_cols = [col for col in features if col not in df_test.columns]
    if missing_cols:
        raise KeyError(f"Missing columns in DataFrame: {missing_cols}")

    # Keep only the requested feature columns, in the requested order.
    feature_frame = df_test[features]

    # Load the scaler. joblib.load unpickles the file, so only point
    # encoding_dir at artifacts from a trusted source.
    scaler_path = os.path.join(encoding_dir, "scaler.pkl")
    scaler = joblib.load(scaler_path)

    # Scale the test features.
    X_test = scaler.transform(feature_frame.values)

    return X_test, y_test


In [8]:
from sklearn.metrics import r2_score, mean_absolute_error, mean_absolute_percentage_error
# One (fold, r2, mae, mape) tuple per fold whose model file exists.
fold_scores = []

# Root directory of the per-fold artifacts; identical for every iteration.
fold_path = "best_models/random_forest"

for fold in range(1, 6):
    print(f"\nTesting with model from Fold {fold}")

    model_path = os.path.join(f"{fold_path}/fold_{fold}", "model.pkl")

    # Skip folds whose model file is absent.
    if not os.path.exists(model_path):
        print(f" Missing model for fold {fold}")
        continue

    model = joblib.load(model_path)

    # Prepare the test data for this fold.
    X_test, y_test = load_data_test(
        test_df,
        features=FEATURES,
        folder_path=fold_path,
        fold=fold,
        target="gross",
    )

    # Predict, then map the predictions back to the original scale.
    y_pred = np.expm1(model.predict(X_test))

    # Score on the original scale. Note that sklearn reports MAPE as a
    # fraction (1.0 == 100%), not as a percentage.
    r2, mae, mape = (
        metric(y_test, y_pred)
        for metric in (r2_score, mean_absolute_error, mean_absolute_percentage_error)
    )

    print(f"Fold {fold} - R2: {r2:.4f} | MAE: {mae:.4f} | MAPE: {mape:.4f}")
    fold_scores.append((fold, r2, mae, mape))


Testing with model from Fold 1
Fold 1 - R2: 0.5418 | MAE: 67535090.2526 | MAPE: 25.0793

Testing with model from Fold 2
Fold 2 - R2: 0.5161 | MAE: 70531052.5848 | MAPE: 16.6051

Testing with model from Fold 3
Fold 3 - R2: 0.5027 | MAE: 70329495.3870 | MAPE: 16.6384

Testing with model from Fold 4
Fold 4 - R2: 0.5199 | MAE: 68833532.5746 | MAPE: 10.7379

Testing with model from Fold 5
Fold 5 - R2: 0.5597 | MAE: 67458207.8109 | MAPE: 18.1433


In [9]:
# Average each metric over the evaluated folds. Every entry of fold_scores is
# (fold, r2, mae, mape), so positions 1..3 hold the metrics.
r2_avg, mae_avg, mape_avg = (
    np.mean([score[position] for score in fold_scores])
    for position in (1, 2, 3)
)

print(f"\n📈 AVERAGE over folds - R2: {r2_avg:.4f} | MAE: {mae_avg:.4f} | MAPE: {mape_avg:.4f}")


📈 AVERAGE over folds - R2: 0.5280 | MAE: 68937475.7220 | MAPE: 17.4408


In [12]:
# Original-scale predictions, one array per fold model that could be loaded.
all_preds = []
y_test = None  # taken once, from the first fold that loads (expected to be the same for every fold)

# Root directory of the per-fold artifacts; identical for every iteration.
fold_path = "best_models/random_forest"

for fold in range(1, 6):
    print(f"\n🔍 Loading model and encoding from Fold {fold}")

    fold_dir = f"best_models/random_forest/fold_{fold}"
    model_path = os.path.join(fold_dir, "model.pkl")

    # Skip folds whose model file is absent.
    if not os.path.exists(model_path):
        print(f"⚠️ Missing model for fold {fold}")
        continue

    model = joblib.load(model_path)

    # Prepare the test data with this fold's own preprocessing artifacts.
    X_test_fold, y_test_fold = load_data_test(
        test_df,
        features=FEATURES,
        folder_path=fold_path,
        fold=fold,
        target="gross",
    )

    # The target only needs to be stored once.
    if y_test is None:
        y_test = y_test_fold

    # Predict on this fold's features and map back to the original scale.
    log_pred = model.predict(X_test_fold)
    y_pred_fold = np.expm1(log_pred)

    all_preds.append(y_pred_fold)

# Fail with a clear message instead of an obscure error from the metric
# functions when every fold was skipped.
if not all_preds:
    raise RuntimeError(
        f"No fold models were found under {fold_path}; nothing to ensemble."
    )

# Average the predictions of all loaded models.
y_pred_avg = np.mean(all_preds, axis=0)

# Evaluate the ensemble.
r2 = r2_score(y_test, y_pred_avg)
mae = mean_absolute_error(y_test, y_pred_avg)
mape = mean_absolute_percentage_error(y_test, y_pred_avg)

# Report the number of models actually averaged, which can be fewer than five
# when folds were skipped above.
print(f"\n📊 Ensemble Results (Average of {len(all_preds)} folds):")
print(f"🔹 R2: {r2:.4f} | MAE: {mae:.4f} | MAPE: {mape:.4f}")



🔍 Loading model and encoding from Fold 1

🔍 Loading model and encoding from Fold 2

🔍 Loading model and encoding from Fold 3

🔍 Loading model and encoding from Fold 4

🔍 Loading model and encoding from Fold 5

📊 Ensemble Results (Average of 5 folds):
🔹 R2: 0.5332 | MAE: 68472310.5415 | MAPE: 17.4295
