# In [2]:
import pandas as pd
from sklearn.model_selection import KFold
import numpy as np

# Load the preprocessed movie data (or substitute an already-available dataset)
# and drop the columns that are not needed.
df = pd.read_csv("movies_data_processed.csv").drop(columns=["name", "url"])

# Split multi-label columns
def expand_column(df, col):
    """Split a ``', '``-separated string column into per-row lists of labels.

    Args:
        df: DataFrame holding the column.
        col: Name of a column whose cells are strings such as
            ``"Action, Drama"``.

    Returns:
        A Series aligned with ``df[col]`` whose cells are lists of labels
        with surrounding whitespace stripped. Cells that are not strings
        (NaN, None, ...) yield an empty list.
    """
    def _split(cell):
        # Missing cells are not strings and cannot be split or iterated;
        # represent them as "no labels" instead of raising TypeError.
        if not isinstance(cell, str):
            return []
        return [label.strip() for label in cell.split(', ')]

    return df[col].apply(_split)

# Parse each comma-separated text column into a column of per-row label lists.
for source_col, list_col in (
    ('genres', 'genres_list'),
    ('countries', 'countries_list'),
):
    df[list_col] = expand_column(df, source_col)

# Cross-fitting target encoding
def crossfit_target_encode(df, list_col, target_col, n_splits=5, smoothing=10):
    """Compute out-of-fold, smoothed target encodings for a multi-label column.

    The rows are split into ``n_splits`` shuffled folds (fixed seed 42). For
    each fold, per-label target means are estimated on the remaining folds
    only and shrunk toward that training portion's global mean:

        (count * label_mean + smoothing * global_mean) / (count + smoothing)

    A row's encoding is the average of the smoothed values of its labels.
    Labels not seen in the training portion, and rows with no labels at all,
    are encoded as the training portion's global mean.

    Args:
        df: DataFrame containing ``list_col`` and ``target_col``.
        list_col: Column whose cells are iterables of labels.
        target_col: Numeric column to encode against.
        n_splits: Number of cross-fitting folds.
        smoothing: Pseudo-count given to the global mean; larger values pull
            rarely seen labels closer to the global mean.

    Returns:
        A float NumPy array of length ``len(df)``, ordered by row position
        (not by index label).
    """
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    encoded = np.zeros(len(df))

    # Extract both columns once; looking rows up with ``df.iloc[i][col]``
    # inside the loops would build a throwaway row Series per sample.
    row_labels = df[list_col].tolist()
    targets = df[target_col]

    for train_idx, val_idx in kf.split(df):
        train_targets = targets.iloc[train_idx]
        global_mean = train_targets.mean()

        # Collect the training targets observed for each label.
        label_targets = {}
        for pos, target in zip(train_idx, train_targets):
            for label in row_labels[pos]:
                label_targets.setdefault(label, []).append(target)

        # Smoothed mean per label, computed once per fold rather than once
        # per label occurrence in the validation rows.
        smoothed = {
            label: (len(vals) * np.mean(vals) + smoothing * global_mean)
            / (len(vals) + smoothing)
            for label, vals in label_targets.items()
        }

        # Encode each held-out row from statistics it did not contribute to.
        for pos in val_idx:
            vals = [smoothed.get(label, global_mean) for label in row_labels[pos]]
            # A row without labels has nothing to average; np.mean([]) would
            # give NaN plus a RuntimeWarning, so use the global mean instead.
            encoded[pos] = np.mean(vals) if vals else global_mean

    return encoded

# Target-encode the genre and country label lists against gross
for labels_col, encoded_col in (
    ('genres_list', 'genres_encoded'),
    ('countries_list', 'countries_encoded'),
):
    df[encoded_col] = crossfit_target_encode(df, labels_col, 'gross')

# Log transform of the target: log(1 + gross)
df['gross_log'] = np.log1p(df['gross'])

# Write the result to file, leaving out the raw text and intermediate list columns
helper_columns = ['genres', 'countries', 'genres_list', 'countries_list']
output = df.drop(columns=helper_columns)
output.to_csv("encoded_output.csv", index=False)
