In [64]:
import os
import sys

import numpy as np
import pandas as pd

# Directory locations: raw input files and generated output
DATA_PATH = "dataset/"
RESULT_PATH = "tmp/"

# Column names supplied (via `names=`) when the raw files are read
columns = [
    'user_id',
    'item_id',
    'rating',
    'time_stamp',
]
user_columns = [
    'user_id',
    'age',
    'gender',
    'occupation',
    'zip_code',
]
# Leading columns of the movie file; the genre columns are appended at load time
movie_columns_base = [
    'item_id',
    'title',
    'release_date',
    'video_release_date',
    'IMDb_URL',
]
genre_columns = ['genre', 'genre_id']
occupation_columns = ['occupation']

In [65]:
def _read_table(filename, sep, names):
    """Read one latin-1 encoded file under DATA_PATH, using `names` as its column names."""
    return pd.read_csv(DATA_PATH + filename, sep=sep, names=names, encoding='latin-1')


try:
    # Training ratings
    train_df = _read_table('u2.base', '\t', columns)

    # Side tables: users, genre lookup, occupation list
    user_df = _read_table('u.user', '|', user_columns)
    genre_df = _read_table('u.genre', '|', genre_columns)
    occupation_df = _read_table('u.occupation', '|', occupation_columns)

    # genre_id -> genre name; the genre names become the trailing columns of the movie table
    genre_map = dict(zip(genre_df['genre_id'], genre_df['genre']))
    all_movie_cols = movie_columns_base + list(genre_map.values())
    movie_df = _read_table('u.item', '|', all_movie_cols)
except Exception as e:
    # Print the short message, then re-raise so the notebook shows the full
    # traceback and stops here, instead of raising SystemExit inside the kernel.
    print(f"Error loading files: {e}")
    raise

print(f"Data Loaded. Train: {train_df.shape}")

Data Loaded. Train: (80000, 4)


In [66]:
# Index the side tables once by their key so every feature is a plain lookup
users_by_id = user_df.set_index('user_id')
movies_by_id = movie_df.set_index('item_id')

# Lookup Series keyed by user_id / item_id
user_age_map = users_by_id['age']
user_gender_map = users_by_id['gender']
user_occupation_map = users_by_id['occupation']
movie_release_map = movies_by_id['release_date']
movie_genre_maps = {genre: movies_by_id[genre] for genre in genre_map.values()}

# Attach user features, then the release date, then one flag column per genre
user_lookups = [
    ('age', user_age_map),
    ('gender', user_gender_map),
    ('occupation', user_occupation_map),
]
for feature_name, lookup in user_lookups:
    train_df[feature_name] = train_df['user_id'].map(lookup)

train_df['release_date'] = train_df['item_id'].map(movie_release_map)

for genre, flag_by_item in movie_genre_maps.items():
    train_df[genre] = train_df['item_id'].map(flag_by_item)

In [67]:
genre_cols = list(genre_map.values())

# item_id -> list of genre names whose flag column equals 1 for that movie
genre_flags = movie_df[genre_cols].eq(1).to_numpy()
item_genre_dict = {
    item_id: [name for name, is_set in zip(genre_cols, flags) if is_set]
    for item_id, flags in zip(movie_df['item_id'], genre_flags)
}

# item_id -> release year, taken as the last four characters of release_date
# and kept as a string. Movies whose release_date text is not longer than four
# characters (e.g. a missing value) get no entry, so they map to NaN below.
release_text = movie_df['release_date'].astype(str)
has_year = release_text.str.len() > 4
item_year_dict = dict(
    zip(movie_df.loc[has_year, 'item_id'], release_text[has_year].str[-4:])
)

# Add the release-year feature
train_df['release_year'] = train_df['item_id'].map(item_year_dict)

In [None]:
# Occupation x genre preference: mean training rating per (occupation, genre)
genre_item_ids = {
    genre_name: movie_df.loc[movie_df[genre_name] == 1, 'item_id']
    for genre_name in genre_cols
}

occupation_genre_rating = {}
for occupation in occupation_df['occupation']:
    occ_rows = train_df[train_df['occupation'] == occupation]
    if occ_rows.empty:
        continue  # occupations without training ratings get no entry

    genre_means = {}
    for genre_name, item_ids in genre_item_ids.items():
        genre_ratings = occ_rows.loc[occ_rows['item_id'].isin(item_ids), 'rating']
        if len(genre_ratings) > 0:
            genre_means[genre_name] = genre_ratings.mean()
    occupation_genre_rating[occupation] = genre_means

# Min-max normalise each occupation's genre means; when every mean is equal
# (max == min) each genre is set to 0.5 instead
for genre_means in occupation_genre_rating.values():
    if not genre_means:
        continue
    lowest = min(genre_means.values())
    highest = max(genre_means.values())
    for genre_name in genre_means:
        if highest > lowest:
            genre_means[genre_name] = (genre_means[genre_name] - lowest) / (highest - lowest)
        else:
            genre_means[genre_name] = 0.5

# Scoring function
def get_occ_genre_score(occupation, item_id):
    """Return the mean normalised genre score of `occupation` over the genres of `item_id`.

    Reads the module-level `occupation_genre_rating` (occupation -> {genre: score})
    and `item_genre_dict` (item_id -> list of genre names).

    Returns the neutral value 0.5 when the occupation has no entry or the item
    has no genres. A genre that has no score for this occupation also
    contributes 0.5, so an unseen (occupation, genre) pair is treated as
    neutral rather than as the lowest normalised score (0).
    """
    genre_scores = occupation_genre_rating.get(occupation)
    if genre_scores is None:
        return 0.5

    genres = item_genre_dict.get(item_id, [])
    if not genres:
        return 0.5

    return np.mean([genre_scores.get(g, 0.5) for g in genres])

# Score every training row from its (occupation, item_id) pair
train_df['occ_genre_score'] = [
    get_occ_genre_score(occ, item)
    for occ, item in zip(train_df['occupation'], train_df['item_id'])
]

In [None]:
# Gender x genre preference: mean training rating per (gender, genre)
items_in_genre = {
    genre_name: movie_df.loc[movie_df[genre_name] == 1, 'item_id']
    for genre_name in genre_cols
}

gender_genre_rating = {}
for gender in train_df['gender'].unique():
    gender_rows = train_df[train_df['gender'] == gender]
    if gender_rows.empty:
        continue  # values that match no rows get no entry

    per_genre_mean = {}
    for genre_name, item_ids in items_in_genre.items():
        matching = gender_rows.loc[gender_rows['item_id'].isin(item_ids), 'rating']
        if len(matching) > 0:
            per_genre_mean[genre_name] = matching.mean()
    gender_genre_rating[gender] = per_genre_mean

# Min-max normalise each gender's genre means; when every mean is equal
# (max == min) each genre is set to 0.5 instead
for per_genre_mean in gender_genre_rating.values():
    if not per_genre_mean:
        continue
    floor = min(per_genre_mean.values())
    ceiling = max(per_genre_mean.values())
    for genre_name in per_genre_mean:
        if ceiling > floor:
            per_genre_mean[genre_name] = (per_genre_mean[genre_name] - floor) / (ceiling - floor)
        else:
            per_genre_mean[genre_name] = 0.5

# Scoring function
def get_gender_genre_score(gender, item_id):
    """Return the mean normalised genre score of `gender` over the genres of `item_id`.

    Reads the module-level `gender_genre_rating` (gender -> {genre: score})
    and `item_genre_dict` (item_id -> list of genre names).

    Returns the neutral value 0.5 when the gender has no entry or the item has
    no genres. A genre that has no score for this gender also contributes 0.5,
    so an unseen (gender, genre) pair is treated as neutral rather than as the
    lowest normalised score (0).
    """
    genre_scores = gender_genre_rating.get(gender)
    if genre_scores is None:
        return 0.5

    genres = item_genre_dict.get(item_id, [])
    if not genres:
        return 0.5

    return np.mean([genre_scores.get(g, 0.5) for g in genres])

# Score every training row from its (gender, item_id) pair
train_df['gender_genre_score'] = [
    get_gender_genre_score(gender_value, item)
    for gender_value, item in zip(train_df['gender'], train_df['item_id'])
]

In [70]:
# Overall mean rating (intended as the cold-start fill value for test data)
global_mean = train_df['rating'].mean()

# Mean rating per user, looked up for every row
ratings_by_user = train_df.groupby('user_id')['rating']
user_avg_rating = ratings_by_user.mean()
train_df['user_avg_rating'] = train_df['user_id'].map(user_avg_rating)

# Mean rating per movie, looked up for every row
ratings_by_movie = train_df.groupby('item_id')['rating']
movie_avg_rating = ratings_by_movie.mean()
train_df['movie_avg_rating'] = train_df['item_id'].map(movie_avg_rating)

In [None]:
# Remove columns that are no longer needed.
# Reassigning with errors='ignore' (instead of inplace=True) keeps this cell
# safe to re-run: a second run finds the columns already gone and does nothing,
# where the in-place version raised KeyError.
drop_cols = ['user_id', 'item_id', 'time_stamp', 'release_date']

train_df = train_df.drop(columns=drop_cols, errors='ignore')

In [72]:
# Print a short summary; the trailing expression renders the first rows
print(
    "\n------ Final Data Info ------",
    f"Train Shape: {train_df.shape}",
    "\n[Train Sample Columns]",
    sep="\n",
)
train_df.head()


------ Final Data Info ------
Train Shape: (80000, 28)

[Train Sample Columns]


Unnamed: 0,rating,age,gender,occupation,unknown,Action,Adventure,Animation,Children's,Comedy,...,Romance,Sci-Fi,Thriller,War,Western,release_year,occ_genre_score,gender_genre_score,user_avg_rating,movie_avg_rating
0,4,24,M,technician,0,0,0,0,0,0,...,0,0,1,0,0,1995,0.427051,0.449058,3.633028,3.015385
1,3,24,M,technician,0,1,0,0,0,1,...,0,0,0,0,0,1995,0.43369,0.455282,3.633028,3.534884
2,3,24,M,technician,0,0,0,0,0,0,...,0,0,1,0,0,1995,0.49597,0.574085,3.633028,3.272727
3,5,24,M,technician,0,0,0,0,0,0,...,0,0,0,0,0,1995,0.600326,0.658881,3.633028,3.590909
4,4,24,M,technician,0,0,0,0,0,0,...,0,1,0,0,0,1995,0.570254,0.589824,3.633028,3.811321


In [73]:
# Write the preprocessed training frame. The output directory is created first
# because to_csv fails when the target directory does not exist.
output_file = RESULT_PATH + 'train_preprocessed.csv'
output_dir = os.path.dirname(output_file)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)
train_df.to_csv(output_file, index=False)