In [41]:
import os.path

from sklearn.metrics import r2_score, mean_absolute_error, mean_absolute_percentage_error
from torch.utils.data import TensorDataset
from settings import FEATURES, BASE_DIR
from data import load_data_test

In [40]:
import pandas as pd
# Read dataset/test.csv (located under BASE_DIR) into `test_df` and print
# its column / dtype / non-null summary.
test_path = os.path.join(BASE_DIR, "dataset", "test.csv")
test_df = pd.read_csv(test_path)
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 12 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   genres           200 non-null    object 
 1   rating           200 non-null    float64
 2   no_of_votes      200 non-null    int64  
 3   meta_score       200 non-null    float64
 4   release_date     200 non-null    float64
 5   gross            200 non-null    float64
 6   budget           200 non-null    float64
 7   countries        200 non-null    object 
 8   log_budget       200 non-null    float64
 9   log_no_of_votes  200 non-null    float64
 10  log_gross        200 non-null    float64
 11  log_gross_bin    200 non-null    int64  
dtypes: float64(8), int64(2), object(2)
memory usage: 18.9+ KB


In [31]:
from data import _split_column
from data import _apply_target_encoding
import joblib
import numpy as np
import json

def process_test_data(df_test: pd.DataFrame, features, encoding_dir: str):
    """Turn a raw test frame into a scaled feature matrix plus raw targets.

    Steps, in order:
      1. split the ``genres`` / ``countries`` columns into list columns,
      2. apply the target encodings stored as JSON in ``encoding_dir``,
      3. ``log1p``-transform the two encoded features, ``no_of_votes`` and
         ``budget``,
      4. select ``features`` and transform them with the scaler stored in
         ``encoding_dir``.

    Args:
        df_test: Frame containing at least ``genres``, ``countries``,
            ``no_of_votes``, ``budget`` and ``gross``. It is not modified;
            all derived columns are added to an internal copy.
        features: Column names (raw or derived) to feed to the scaler, in
            the order the scaler expects.
        encoding_dir: Directory holding ``genre_encoded.json``,
            ``country_encoded.json`` and ``scaler.pkl``.

    Returns:
        ``(X_test, y_test)`` where ``X_test`` is the result of
        ``scaler.transform`` on the selected columns and ``y_test`` holds the
        untransformed ``gross`` values.

    Raises:
        KeyError: If any name in ``features`` is not a column after feature
            engineering.
    """
    # Work on a copy so the caller's frame is not altered and re-running the
    # cell always starts from the same input.
    df_test = df_test.copy()

    # Split the genres and countries columns into list columns.
    df_test['genres_list'] = _split_column(df_test, 'genres')
    df_test['countries_list'] = _split_column(df_test, 'countries')

    # Load the encodings from their JSON files.
    with open(os.path.join(encoding_dir, "genre_encoded.json"), encoding='utf-8') as f:
        genre_encoding = json.load(f)

    with open(os.path.join(encoding_dir, "country_encoded.json"), encoding='utf-8') as f:
        country_encoding = json.load(f)

    # Apply the encodings to create the statistical features.
    # NOTE(review): the return value is ignored, so the helper is assumed to
    # add the named column to the frame it is given -- confirm in data.py.
    _apply_target_encoding(df_test, df_test['genres_list'], genre_encoding, 'genre_stat_feature')
    _apply_target_encoding(df_test, df_test['countries_list'], country_encoding, 'country_stat_feature')

    # Log-transform the encoded features.
    for col in ['country_stat_feature', 'genre_stat_feature']:
        df_test[f'log_{col}'] = np.log1p(df_test[col])

    # Log-transform the remaining numeric inputs.
    df_test['log_no_of_votes'] = np.log1p(df_test['no_of_votes'])
    df_test['log_budget'] = np.log1p(df_test['budget'])

    # Keep the raw target before the frame is narrowed to the feature columns.
    y_test = df_test['gross'].values

    # Validate BEFORE selecting: once the frame has been indexed with
    # `features`, pandas has already raised its own error and this check
    # could never fire.
    missing_cols = [col for col in features if col not in df_test.columns]
    if missing_cols:
        raise KeyError(f"Missing columns in DataFrame: {missing_cols}")

    # Keep only the requested feature columns.
    df_test = df_test[features]

    # Load the scaler.
    # NOTE: joblib.load unpickles the file; only point this at trusted
    # artefacts.
    scaler_path = os.path.join(encoding_dir, "scaler.pkl")
    scaler = joblib.load(scaler_path)

    # Scale the test features.
    X_test = scaler.transform(df_test.values)

    return X_test, y_test


In [47]:
# Evaluate every saved fold model on the test frame; each evaluated fold
# contributes one (fold, r2, mae, mape) tuple to `fold_scores`.
fold_scores = []

for fold in range(1, 6):
    print(f"\nüîç Testing with model from Fold {fold}")

    fold_dir = f"best_models/GB_model/fold_{fold}"
    model_path = os.path.join(fold_dir, "model.pkl")

    # A fold without a saved model is reported and skipped rather than
    # aborting the whole run.
    if not os.path.exists(model_path):
        print(f"‚ö†Ô∏è Missing model for fold {fold}")
        continue

    # Load the model for this fold.
    model = joblib.load(model_path)

    # Prepare the test data. load_data_test is given the model root and the
    # fold number (presumably to pick up that fold's preprocessing
    # artefacts -- confirm in data.py).
    fold_path = "best_models/GB_model"
    X_test, y_test = load_data_test(
        test_df,
        features=FEATURES,
        folder_path=fold_path,
        fold=fold,
        target="gross",
    )

    # Predict, then map the predictions back to the original scale with
    # expm1 before comparing them with y_test.
    log_pred = model.predict(X_test)
    y_pred = np.expm1(log_pred)

    # Same three metrics, computed in the same order.
    # Note: sklearn's MAPE is a ratio (1.0 == 100%), not a percentage.
    r2, mae, mape = (
        metric(y_test, y_pred)
        for metric in (r2_score, mean_absolute_error, mean_absolute_percentage_error)
    )

    print(f"üìä Fold {fold} - R2: {r2:.4f} | MAE: {mae:.4f} | MAPE: {mape:.4f}")
    fold_scores.append((fold, r2, mae, mape))


üîç Testing with model from Fold 1
üìä Fold 1 - R2: 0.5889 | MAE: 62763633.3991 | MAPE: 44.9413

üîç Testing with model from Fold 2
üìä Fold 2 - R2: 0.5705 | MAE: 65795168.8043 | MAPE: 40.3588

üîç Testing with model from Fold 3
üìä Fold 3 - R2: 0.5775 | MAE: 64040804.1345 | MAPE: 20.8085

üîç Testing with model from Fold 4
üìä Fold 4 - R2: 0.5737 | MAE: 62959194.6230 | MAPE: 32.8189

üîç Testing with model from Fold 5
üìä Fold 5 - R2: 0.5668 | MAE: 66462841.9493 | MAPE: 59.3389


In [49]:
# Average each metric over the evaluated folds.
# Every entry of fold_scores is (fold, r2, mae, mape), so positions 1..3
# hold the metrics.
r2_avg, mae_avg, mape_avg = (
    np.mean([scores[position] for scores in fold_scores])
    for position in (1, 2, 3)
)

print(f"\nüìà AVERAGE over folds - R2: {r2_avg:.4f} | MAE: {mae_avg:.4f} | MAPE: {mape_avg:.4f}")


üìà AVERAGE over folds - R2: 0.5755 | MAE: 64404328.5820 | MAPE: 39.6533


In [50]:
# Ensemble evaluation: average the per-fold predictions (on the original
# scale) and score the averaged prediction once.
all_preds = []
y_test = None  # Taken from the first fold that loads and reused for scoring.

# Root directory of the saved fold models; identical for every fold.
fold_path = "best_models/GB_model"

for fold in range(1, 6):
    print(f"\nüîç Loading model and encoding from Fold {fold}")

    fold_dir = f"{fold_path}/fold_{fold}"
    model_path = os.path.join(fold_dir, "model.pkl")

    # A fold without a saved model is reported and left out of the ensemble.
    if not os.path.exists(model_path):
        print(f"‚ö†Ô∏è Missing model for fold {fold}")
        continue

    model = joblib.load(model_path)

    # Prepare the test data for this fold (per the original note, using the
    # fold's own encoding and scaler).
    X_test_fold, y_test_fold = load_data_test(test_df, features=FEATURES, folder_path=fold_path, fold=fold, target="gross")

    # Store y_test only once.
    # NOTE(review): this assumes load_data_test returns the same targets for
    # every fold -- confirm in data.py.
    if y_test is None:
        y_test = y_test_fold

    # Predict with this fold's features and map the predictions back to the
    # original scale with expm1.
    log_pred = model.predict(X_test_fold)
    y_pred_fold = np.expm1(log_pred)

    all_preds.append(y_pred_fold)

# With no models there is nothing to average: np.mean([]) would give NaN and
# the metric calls would then fail with an unrelated error, so stop here
# with a clear message.
if not all_preds:
    raise FileNotFoundError(
        f"No fold models found under {fold_path!r}; cannot build the ensemble."
    )

# Element-wise mean of the predictions from all loaded models.
y_pred_avg = np.mean(all_preds, axis=0)

# Evaluate the averaged prediction.
# Note: sklearn's MAPE is a ratio (1.0 == 100%), not a percentage.
r2 = r2_score(y_test, y_pred_avg)
mae = mean_absolute_error(y_test, y_pred_avg)
mape = mean_absolute_percentage_error(y_test, y_pred_avg)

# Report how many folds were actually averaged; some may have been skipped.
print(f"\nüìä Ensemble Results (Average of {len(all_preds)} folds):")
print(f"üîπ R2: {r2:.4f} | MAE: {mae:.4f} | MAPE: {mape:.4f}")



üîç Loading model and encoding from Fold 1

üîç Loading model and encoding from Fold 2

üîç Loading model and encoding from Fold 3

üîç Loading model and encoding from Fold 4

üîç Loading model and encoding from Fold 5

üìä Ensemble Results (Average of 5 folds):
üîπ R2: 0.5878 | MAE: 63106128.2808 | MAPE: 39.6263
