In [2]:
import pandas as pd
from sklearn.model_selection import KFold
import numpy as np

# Load the preprocessed movie data (or substitute an already available frame)
# and drop the columns that are not needed, in a single chained expression.
df = pd.read_csv("movies_data_processed.csv").drop(columns=["name", "url"])

# Split multi-label columns
def expand_column(df, col):
    """Split a comma-separated string column into a list of labels per row.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that holds the column.
    col : str
        Name of a column whose cells look like ``"Action, Drama"``.

    Returns
    -------
    pandas.Series
        Same index as ``df``. Each element is a list of whitespace-stripped,
        non-empty labels. Cells that are not strings (e.g. NaN / None for
        missing values) yield an empty list instead of raising.
    """
    def _to_labels(cell):
        # Missing cells arrive as float NaN / None, which cannot be split or
        # iterated; treat them as "no labels".
        if not isinstance(cell, str):
            return []
        parts = (part.strip() for part in cell.split(', '))
        # Drop empty fragments (e.g. from "" or "A, , B") so they never become
        # a spurious '' label downstream.
        return [part for part in parts if part]

    return df[col].apply(_to_labels)

# Attach the parsed label lists for both multi-label columns
df = df.assign(
    genres_list=expand_column(df, 'genres'),
    countries_list=expand_column(df, 'countries'),
)

# Cross-fitting target encoding
def crossfit_target_encode(df, list_col, target_col, n_splits=5, smoothing=10):
    """Out-of-fold, smoothed target encoding for a list-valued (multi-label) column.

    For every KFold split, per-label target means and counts are computed from
    the training rows only. Each held-out row is then encoded as the average,
    over its labels, of the smoothed label mean::

        (count * label_mean + smoothing * global_mean) / (count + smoothing)

    where ``global_mean`` is the target mean of the training rows of that fold.
    Labels unseen in the training rows have ``count == 0`` and therefore
    shrink fully to ``global_mean``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; rows are addressed by position, not by index label.
    list_col : str
        Column whose cells are iterables of labels.
    target_col : str
        Numeric target column.
    n_splits : int, default 5
        Number of KFold splits (shuffled with ``random_state=42``).
    smoothing : float, default 10
        Weight of the global mean in the smoothed estimate.

    Returns
    -------
    numpy.ndarray
        Float array of length ``len(df)``; element ``i`` is the encoding of
        the row at position ``i``. Rows with no labels receive the global
        mean of their fold's training rows.
    """
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    encoded = np.zeros(len(df))

    # Extract both columns once as positional sequences. Indexing these is
    # far cheaper than building a full row Series via df.iloc[i] per sample.
    target_series = df[target_col]
    label_lists = df[list_col].tolist()
    targets = target_series.tolist()

    for train_idx, val_idx in kf.split(df):
        global_mean = target_series.iloc[train_idx].mean()

        # Collect the training targets observed for each label
        label_stats = {}
        for pos in train_idx:
            for label in label_lists[pos]:
                label_stats.setdefault(label, []).append(targets[pos])
        label_mean = {k: np.mean(v) for k, v in label_stats.items()}
        label_count = {k: len(v) for k, v in label_stats.items()}

        # Encode every held-out sample from the training-fold statistics
        for i in val_idx:
            vals = []
            for label in label_lists[i]:
                mean = label_mean.get(label, global_mean)
                count = label_count.get(label, 0)
                denom = count + smoothing
                if denom == 0:
                    # Unseen label with smoothing == 0: nothing to average,
                    # so use the global mean instead of dividing by zero.
                    vals.append(global_mean)
                else:
                    vals.append((count * mean + smoothing * global_mean) / denom)
            # A row without labels has nothing to average (np.mean([]) is NaN
            # with a RuntimeWarning); fall back to the global mean.
            encoded[i] = np.mean(vals) if vals else global_mean

    return encoded

# Target-encode genres and countries against the raw gross, and add the
# log1p-transformed gross alongside them.
df = df.assign(
    genres_encoded=crossfit_target_encode(df, 'genres_list', 'gross'),
    countries_encoded=crossfit_target_encode(df, 'countries_list', 'gross'),
    gross_log=np.log1p(df['gross']),
)

# Write the result to disk, leaving out the raw and list-valued label columns
df.drop(
    columns=['genres', 'countries', 'genres_list', 'countries_list'],
).to_csv("encoded_output.csv", index=False)


In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.utils import shuffle

In [3]:
# Load the preprocessed movie data and drop the "name" and "url" columns
df = pd.read_csv("movies_data_processed.csv").drop(columns=["name", "url"])

# Split multi-label columns
def expand_column(df, col):
    """Split a comma-separated string column into a list of labels per row.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that holds the column.
    col : str
        Name of a column whose cells look like ``"Action, Drama"``.

    Returns
    -------
    pandas.Series
        Same index as ``df``. Each element is a list of whitespace-stripped,
        non-empty labels. Cells that are not strings (e.g. NaN / None for
        missing values) yield an empty list instead of raising.
    """
    def _to_labels(cell):
        # Missing cells arrive as float NaN / None, which cannot be split or
        # iterated; treat them as "no labels".
        if not isinstance(cell, str):
            return []
        parts = (part.strip() for part in cell.split(', '))
        # Drop empty fragments (e.g. from "" or "A, , B") so they never become
        # a spurious '' label downstream.
        return [part for part in parts if part]

    return df[col].apply(_to_labels)

# Build all derived columns in one chained step:
#   * parsed label lists for the two multi-label columns,
#   * log1p(gross), to give the target a more reasonable distribution,
#   * a 5-quantile bin of the log target, used for stratification.
df = df.assign(
    genres_list=expand_column(df, 'genres'),
    countries_list=expand_column(df, 'countries'),
    gross_log=np.log1p(df['gross']),
    # Callable form: evaluated on the frame that already contains gross_log
    gross_bin=lambda frame: pd.qcut(frame['gross_log'], q=5, labels=False),
)

df.head()

Unnamed: 0,genres,rating,no_of_votes,gross,budget,countries,genres_list,countries_list,gross_log,gross_bin
0,"Martial Arts, Action, Comedy",5.3,6800,17235040.0,18000000.0,United States,"[Martial Arts, Action, Comedy]",[United States],16.662455,1
1,"Action, Drama, Romance, Thriller, Western",6.5,107000,18636537.0,32000000.0,"United States, Japan","[Action, Drama, Romance, Thriller, Western]","[United States, Japan]",16.740635,1
2,"Animal Adventure, Computer Animation, Fairy Ta...",7.8,200000,481757663.0,90000000.0,"United States, Japan","[Animal Adventure, Computer Animation, Fairy T...","[United States, Japan]",19.992952,4
3,"Sci-Fi Epic, Space Sci-Fi, Action, Adventure, ...",7.6,524000,263920180.0,25000000.0,"France, United Kingdom","[Sci-Fi Epic, Space Sci-Fi, Action, Adventure,...","[France, United Kingdom]",19.391157,4
4,"Sci-Fi Epic, Superhero, Urban Adventure, Actio...",7.4,195000,300478449.0,55000000.0,"United States, United Kingdom, Switzerland, Pa...","[Sci-Fi Epic, Superhero, Urban Adventure, Acti...","[United States, United Kingdom, Switzerland, P...",19.520887,4


In [4]:
# Train/test split, stratified on gross_bin so the target bins are spread
# evenly across both parts.
# NOTE(review): `df` is rebound to its shuffled version here, so re-running
# this cell shuffles the already-shuffled frame and may yield a different
# split — confirm that is acceptable.
df = shuffle(df, random_state=42)
train_df, test_df = train_test_split(
    df,
    stratify=df['gross_bin'],
    test_size=0.2,
    random_state=42,
)