In [14]:
# 패키지 로드
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
from sklearn.model_selection import train_test_split
from collections import defaultdict
import os, random

from scipy import sparse
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset

In [5]:
# Hyperparameters
class config:
    """Plain namespace of notebook-wide settings, read as ``config.<name>``."""

    # Index of the CUDA device to use when a GPU is present.
    gpu_idx = 0

    # Resolve the compute device once, at class-definition time.
    if torch.cuda.is_available():
        device = torch.device("cuda:{}".format(gpu_idx))
    else:
        device = torch.device("cpu")

    top_k = 25
    seed = 42          # passed to seed_everything and to train_test_split
    neg_ratio = 100
    test_size = 0.2    # passed to train_test_split as the validation share

In [2]:
# Fix random seeds
def seed_everything(random_seed):
    """Seed Python's ``random``, NumPy and PyTorch (CPU and CUDA) with one value.

    Also puts cuDNN in deterministic mode and disables its benchmark mode.

    Parameters
    ----------
    random_seed
        Seed value handed unchanged to every generator.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,
    )
    # The generators are independent of each other, so the call order is irrelevant.
    for apply_seed in seeders:
        apply_seed(random_seed)

    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True
    
# Apply the configured seed to every random generator.
seed_everything(config.seed)

# data 불러오기

In [3]:
# Load the three raw CSV tables; paths are relative to the working directory.
# history: log rows (profile_id, log_time and album_id are used below)
# meta:    per-album attributes keyed by album_id
# profile: per-viewer attributes keyed by profile_id
history = pd.read_csv('../data/history_data.csv')
meta = pd.read_csv('../data/meta_data.csv')
profile = pd.read_csv('../data/profile_data.csv')

In [32]:
meta.head()

Unnamed: 0,album_id,title,sub_title,genre_large,genre_mid,genre_small,country,run_time,onair_date,cast_1,cast_2,cast_3,cast_4,cast_5,cast_6,cast_7
0,749,어둠이 무서워요,꼬마버스 타요1,키즈,TV만화,,한국,660,,타요,로기,라니,가니,시투,,
1,750,우리는 친구,꼬마버스 타요1,키즈,TV만화,,한국,660,,타요,로기,라니,가니,시투,,
2,2131,타요의 첫 운행,꼬마버스 타요1,키즈,TV만화,,한국,660,,타요,로기,라니,가니,시투,,
3,2625,길 잃은 타요,꼬마버스 타요1,키즈,TV만화,,한국,660,,타요,로기,라니,가니,시투,,
4,2594,새내기 꼬마 버스의 하루,꼬마버스 타요1,키즈,TV만화,,한국,660,,타요,로기,라니,가니,시투,,


- `genre_large`, `genre_mid` 컬럼 사용

In [52]:
meta.shape

(42602, 16)

In [53]:
meta[['album_id','genre_large','genre_mid']].drop_duplicates(['album_id','genre_large','genre_mid']).shape

(39875, 3)

In [54]:
meta[['album_id','genre_large','genre_mid']].drop_duplicates(['album_id']).shape

(39875, 3)

In [55]:
# Keep one row per album_id together with its two genre columns. The two shape
# checks above give the same row count whether duplicates are dropped on all
# three columns or on album_id alone, so no album has conflicting genre values.
meta_use = meta[['album_id','genre_large','genre_mid']].drop_duplicates(['album_id'])

In [37]:
profile.head()

Unnamed: 0,profile_id,sex,age,pr_interest_keyword_cd_1,pr_interest_keyword_cd_2,pr_interest_keyword_cd_3,ch_interest_keyword_cd_1,ch_interest_keyword_cd_2,ch_interest_keyword_cd_3
0,3,F,5,P02,P04,P07,K01,K03,K04
1,5,M,5,P07,P08,P06,K05,K08,K09
2,7,F,9,P05,P03,,K06,K04,
3,12,M,6,P03,P06,P02,K09,K07,K03
4,16,F,12,P03,P06,P01,K01,K06,K04


- `sex`, `age`, `pr_interest_keyword_cd_1`, `ch_interest_keyword_cd_1` 컬럼 사용

In [64]:
profile.shape

(8311, 9)

In [62]:
# Keep the join key plus the four profile features used as model fields
# (only the first parent/child interest keyword is kept).
profile_use = profile[['profile_id','sex','age','pr_interest_keyword_cd_1','ch_interest_keyword_cd_1']]

In [63]:
profile_use.drop_duplicates().shape

(8311, 5)

# data merge, 전처리

In [71]:
# Preprocessing: drop duplicated log rows, order each profile's rows by time,
# then discard the timestamp.
# NOTE(review): rows are de-duplicated on (profile_id, album_id, log_time), so
# after log_time is dropped the same (profile_id, album_id) pair can still
# appear more than once -- confirm this is intended.
history_use = (
    history[['profile_id', 'log_time', 'album_id']]
    .drop_duplicates()  # all three selected columns form the duplicate key
    .sort_values(by=['profile_id', 'log_time'])
    .reset_index(drop=True)
    .drop(columns='log_time')
)

In [73]:
print(history_use.shape)
history_use.head()

(899252, 2)


Unnamed: 0,profile_id,album_id
0,3,15
1,3,16
2,3,17
3,3,18
4,3,19


In [88]:
# Attach album metadata and then profile attributes to every history row.
# Left joins keep every history row even when a key has no match.
history_meta = history_use.merge(meta_use, how='left', on='album_id')
history_meta_profile = history_meta.merge(profile_use, how='left', on='profile_id')

In [89]:
print(history_meta_profile.shape)
history_meta_profile.head()

(899252, 8)


Unnamed: 0,profile_id,album_id,genre_large,genre_mid,sex,age,pr_interest_keyword_cd_1,ch_interest_keyword_cd_1
0,3,15,키즈,노래율동,F,5,P02,K01
1,3,16,키즈,노래율동,F,5,P02,K01
2,3,17,키즈,노래율동,F,5,P02,K01
3,3,18,키즈,노래율동,F,5,P02,K01
4,3,19,키즈,노래율동,F,5,P02,K01


In [90]:
# Add the watched flag: every row comes from the history log, so all rows get 1.
# The column is appended last, which is why later cells read the target from
# the final column.
history_meta_profile['ratings'] = 1

In [91]:
def _index_by_first_appearance(values):
    """Return ``{value: position}``, numbering distinct values from 0 in order of first appearance."""
    return {original: idx for idx, original in enumerate(values.unique())}


# Build every lookup table from the still-unencoded frame, before any column is
# overwritten. The tables keep their original module-level names.
user_to_index = _index_by_first_appearance(history_meta_profile["profile_id"])
item_to_index = _index_by_first_appearance(history_meta_profile["album_id"])

genre_large_to_index = _index_by_first_appearance(history_meta_profile["genre_large"])
genre_mid_to_index = _index_by_first_appearance(history_meta_profile["genre_mid"])

sex_to_index = _index_by_first_appearance(history_meta_profile["sex"])
age_to_index = _index_by_first_appearance(history_meta_profile["age"])
pr_interest_keyword_cd_1_to_index = _index_by_first_appearance(
    history_meta_profile["pr_interest_keyword_cd_1"]
)
ch_interest_keyword_cd_1_to_index = _index_by_first_appearance(
    history_meta_profile["ch_interest_keyword_cd_1"]
)

# Column -> lookup table, listed in the frame's column order.
_field_encoders = {
    "profile_id": user_to_index,
    "album_id": item_to_index,
    "genre_large": genre_large_to_index,
    "genre_mid": genre_mid_to_index,
    "sex": sex_to_index,
    "age": age_to_index,
    "pr_interest_keyword_cd_1": pr_interest_keyword_cd_1_to_index,
    "ch_interest_keyword_cd_1": ch_interest_keyword_cd_1_to_index,
}

# Replace raw values by their integer codes. Series.map performs the dictionary
# lookup directly instead of calling a Python lambda for every row.
for _field, _lookup in _field_encoders.items():
    history_meta_profile[_field] = history_meta_profile[_field].map(_lookup)

# NOTE: from here on `history_meta_profile` is a NumPy array, not a DataFrame,
# so this cell cannot be re-run without first re-running the cells that build
# the frame.
history_meta_profile = history_meta_profile.to_numpy()

# Every column except the last is an encoded field; the last column is `ratings`.
items = history_meta_profile[:, :-1].astype(np.intc)
targets = history_meta_profile[:, -1].astype(np.float32)
# Codes start at 0, so each field's cardinality is its largest code plus one.
field_dims = np.max(items, axis=0) + 1

In [94]:
# 각 field 종류별 갯수
field_dims

array([ 8311, 20695,     6,    28,     2,    13,     8,     9],
      dtype=int32)

In [93]:
# Split into training and validation sets.
# NOTE(review): rows are shuffled and split individually, so one profile's
# interactions (and any repeated profile/album pair) can land on both sides --
# confirm this is acceptable for validation.
_split_kwargs = {"test_size": config.test_size, "random_state": config.seed}
train, valid = train_test_split(history_meta_profile, **_split_kwargs)

print('학습 데이터 크기:', train.shape)
print('검증 데이터 크기:', valid.shape)

학습 데이터 크기: (719401, 9)
검증 데이터 크기: (179851, 9)


# negative sampling 구현

In [None]:
class DeepFMDataset(Dataset):
    """Map-style dataset of (field indices, target) pairs for a DeepFM model.

    Attributes
    ----------
    items : np.ndarray
        ``np.intc`` array with one row per sample and one column per field.
    targets : np.ndarray
        ``np.float32`` array with one target value per sample.
    field_dims : np.ndarray
        Largest index plus one for every column of ``items``.
    """

    def __init__(self, data: pd.DataFrame):
        """Encode the id columns and split the frame into fields and target.

        Parameters
        ----------
        data : pd.DataFrame
            Must contain ``profile_id`` and ``album_id``. The last column is
            taken as the target and every other column as a field; columns
            other than the two ids must already be castable to integers.
            The frame is not modified.
        """
        super().__init__()

        # Work on a copy so the caller's frame keeps its original ids.
        encoded = data.copy()
        for id_column in ("profile_id", "album_id"):
            # factorize numbers distinct values 0..n-1 in order of first
            # appearance, the same numbering as enumerate(unique()).
            codes, _ = pd.factorize(encoded[id_column])
            encoded[id_column] = codes
        values = encoded.to_numpy()

        # Last column is the target, matching the notebook-level
        # `[:, :-1]` / `[:, -1]` split; for a three-column frame this is
        # identical to the previous `[:, :2]` / `[:, 2]`.
        self.items = values[:, :-1].astype(np.intc)
        self.targets = values[:, -1].astype(np.float32)
        self.field_dims = np.max(self.items, axis=0) + 1

    def __len__(self):
        """Return the number of samples."""
        return self.targets.shape[0]

    def __getitem__(self, idx):
        """Return ``(field indices, target)`` for position ``idx``."""
        return self.items[idx], self.targets[idx]