In [60]:
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import root_mean_squared_error
from itertools import combinations

In [61]:
# Data preparation: load the preprocessed training table and the genre lookup,
# then assemble the candidate feature lists and the train/validation split.
DATA_PATH = "dataset/"
TMP_PATH = "tmp/"

genre_columns = ['genre', 'genre_id']

train_df = pd.read_csv(TMP_PATH + 'train_preprocessed.csv')
genre_df = pd.read_csv(
    DATA_PATH + 'u.genre',
    sep='|',
    names=genre_columns,
    encoding='latin-1',
)

# Cast the categorical columns (only those actually present) to the pandas
# 'category' dtype in a single astype call.
cat_cols = ['gender', 'occupation']
train_df = train_df.astype(
    {name: 'category' for name in cat_cols if name in train_df.columns}
)

# genre_id -> genre name; the names double as the genre feature column names.
genre_map = {
    genre_id: genre_name
    for genre_id, genre_name in zip(genre_df['genre_id'], genre_df['genre'])
}
genre_features = [*genre_map.values()]

# Candidate features, keeping only the ones that exist in train_df.
individual_features = [
    name
    for name in (
        # statistical features
        'user_avg_rating', 'movie_avg_rating',
        # interaction features
        'occ_genre_score', 'gender_genre_score',
        # numeric features
        'age', 'release_year',
        # categorical features
        'gender', 'occupation',
    )
    if name in train_df.columns
]

# The genre columns are appended as ONE nested list, so downstream code can
# treat the whole genre group as a single candidate.
candidate_features = [*individual_features, genre_features]

# Target variable
target = 'rating'

# Hold out 20% of the rows for validation (80:20 split).
all_possible_cols = [*individual_features, *genre_features]
X = train_df[all_possible_cols]
y = train_df[target]

X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Exhaustive search over every non-empty subset of the candidate features.
#
# Each entry of `candidate_features` is either a single column name or a list
# of column names (the genre group); a list entry is added or dropped as a unit.
# For every subset an XGBoost regressor is fitted on the training split and
# scored by RMSE on the validation split. Results are accumulated in:
#   history          -- list of (rmse, feature_list) for every subset tried
#   best_rmse        -- lowest validation RMSE seen so far
#   best_combination -- feature list that produced best_rmse
#
# NOTE(review): the validation split is used both for early stopping and for
# scoring each subset, so the reported RMSE is optimistically biased as a
# model-selection criterion. Consider a separate hold-out for the final number.
best_rmse = float('inf')
best_combination = []
history = []

print(f"Testing all combinations for {len(candidate_features)} features...")
print("-" * 60)

# A set of n items has 2**n - 1 non-empty subsets; use the closed form instead
# of enumerating every combination once just to count them.
total_combinations = 2 ** len(candidate_features) - 1
count = 0

# Hyperparameters are identical for every subset, so build them once.
xgb_params = dict(
    objective='reg:squarederror',
    n_estimators=1000,
    learning_rate=0.05,
    max_depth=6,
    enable_categorical=True,
    tree_method='hist',
    n_jobs=-1,
    random_state=42,
    early_stopping_rounds=50,
)

for r in range(1, len(candidate_features) + 1):
    for subset in combinations(candidate_features, r):
        count += 1

        # Flatten the subset: expand the genre group, keep plain names as-is.
        current_features = []
        for item in subset:
            if isinstance(item, list):
                current_features.extend(item)
            else:
                current_features.append(item)

        # Slice each split once and reuse it for fitting and prediction.
        X_train_subset = X_train[current_features]
        X_val_subset = X_val[current_features]

        model = xgb.XGBRegressor(**xgb_params)
        model.fit(
            X_train_subset, y_train,
            eval_set=[(X_val_subset, y_val)],
            verbose=False,
        )

        # Predict and evaluate
        preds = model.predict(X_val_subset)
        rmse = root_mean_squared_error(y_val, preds)

        # Record every result
        history.append((rmse, current_features))

        # Report whenever a new best score is reached
        if rmse < best_rmse:
            best_rmse = rmse
            # Store a copy so later edits to best_combination cannot silently
            # alter the list kept inside `history` (and vice versa).
            best_combination = list(current_features)
            print(f"[{count}/{total_combinations}] New Best! RMSE: {best_rmse:.4f} | Features: {len(current_features)}개")
            print(f"   -> {current_features}")

Testing all combinations for 9 features...
------------------------------------------------------------
[1/511] New Best! RMSE: 1.0251 | Features: 1개
   -> ['user_avg_rating']
[2/511] New Best! RMSE: 0.9997 | Features: 1개
   -> ['movie_avg_rating']
[10/511] New Best! RMSE: 0.9184 | Features: 2개
   -> ['user_avg_rating', 'movie_avg_rating']
[46/511] New Best! RMSE: 0.9168 | Features: 3개
   -> ['user_avg_rating', 'movie_avg_rating', 'occ_genre_score']
[48/511] New Best! RMSE: 0.9139 | Features: 3개
   -> ['user_avg_rating', 'movie_avg_rating', 'age']
[131/511] New Best! RMSE: 0.9132 | Features: 4개
   -> ['user_avg_rating', 'movie_avg_rating', 'occ_genre_score', 'age']
[141/511] New Best! RMSE: 0.9131 | Features: 4개
   -> ['user_avg_rating', 'movie_avg_rating', 'age', 'release_year']
[144/511] New Best! RMSE: 0.9090 | Features: 22개
   -> ['user_avg_rating', 'movie_avg_rating', 'age', 'unknown', 'Action', 'Adventure', 'Animation', "Children's", 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fa

In [63]:
# Final results report: separator line, best RMSE, then the winning feature
# list, emitted with a single print call (one item per line).
print(
    "-" * 60,
    f"Best RMSE: {best_rmse:.4f}",
    f"Best Feature Combination ({len(best_combination)} features):",
    best_combination,
    sep="\n",
)

------------------------------------------------------------
Best RMSE: 0.9066
Best Feature Combination (25 features):
['user_avg_rating', 'movie_avg_rating', 'age', 'release_year', 'gender', 'occupation', 'unknown', 'Action', 'Adventure', 'Animation', "Children's", 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']
