In [1]:
# 패키지 로드
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
from sklearn.model_selection import train_test_split
from collections import defaultdict
import os, random

from scipy import sparse
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset

In [53]:
# 하이퍼파라미터 
class config: 
    gpu_idx = 0
    device = torch.device("cuda:{}".format(gpu_idx) if torch.cuda.is_available() else "cpu")
    top_k = 25
    seed = 42
    neg_ratio = 1
    test_size = 0.2

In [3]:
# 시드 고정 
def seed_everything(random_seed):
    torch.manual_seed(random_seed)
    torch.cuda.manual_seed(random_seed)
    torch.cuda.manual_seed_all(random_seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    np.random.seed(random_seed)
    random.seed(random_seed)
    
seed_everything(config.seed)

# data 불러오기

In [4]:
history = pd.read_csv('../data/history_data.csv')
meta = pd.read_csv('../data/meta_data.csv')
profile = pd.read_csv('../data/profile_data.csv')

In [5]:
meta.head()

Unnamed: 0,album_id,title,sub_title,genre_large,genre_mid,genre_small,country,run_time,onair_date,cast_1,cast_2,cast_3,cast_4,cast_5,cast_6,cast_7
0,749,어둠이 무서워요,꼬마버스 타요1,키즈,TV만화,,한국,660,,타요,로기,라니,가니,시투,,
1,750,우리는 친구,꼬마버스 타요1,키즈,TV만화,,한국,660,,타요,로기,라니,가니,시투,,
2,2131,타요의 첫 운행,꼬마버스 타요1,키즈,TV만화,,한국,660,,타요,로기,라니,가니,시투,,
3,2625,길 잃은 타요,꼬마버스 타요1,키즈,TV만화,,한국,660,,타요,로기,라니,가니,시투,,
4,2594,새내기 꼬마 버스의 하루,꼬마버스 타요1,키즈,TV만화,,한국,660,,타요,로기,라니,가니,시투,,


- `genre_large`, `genre_mid` 컬럼 사용

In [6]:
meta.shape

(42602, 16)

In [7]:
meta[['album_id','genre_large','genre_mid']].drop_duplicates(['album_id','genre_large','genre_mid']).shape

(39875, 3)

In [8]:
meta[['album_id','genre_large','genre_mid']].drop_duplicates(['album_id']).shape

(39875, 3)

In [9]:
meta_use = meta[['album_id','genre_large','genre_mid']].drop_duplicates(['album_id'])

In [10]:
profile.head()

Unnamed: 0,profile_id,sex,age,pr_interest_keyword_cd_1,pr_interest_keyword_cd_2,pr_interest_keyword_cd_3,ch_interest_keyword_cd_1,ch_interest_keyword_cd_2,ch_interest_keyword_cd_3
0,3,F,5,P02,P04,P07,K01,K03,K04
1,5,M,5,P07,P08,P06,K05,K08,K09
2,7,F,9,P05,P03,,K06,K04,
3,12,M,6,P03,P06,P02,K09,K07,K03
4,16,F,12,P03,P06,P01,K01,K06,K04


- `sex`, `age`, `pr_interest_keyword_cd_1`, `ch_interest_keyword_cd_1` 컬럼 사용

In [11]:
profile.shape

(8311, 9)

In [12]:
profile_use = profile[['profile_id','sex','age','pr_interest_keyword_cd_1','ch_interest_keyword_cd_1']]

In [13]:
profile_use.drop_duplicates().shape

(8311, 5)

# data merge, 전처리

In [14]:
# 데이터 전처리 (중복제거) 
history_use = history[['profile_id', 'log_time', 'album_id']]\
    .drop_duplicates(subset=['profile_id', 'album_id', 'log_time'])\
        .sort_values(by = ['profile_id', 'log_time']).reset_index(drop = True)\
            .drop('log_time', axis=1)

In [15]:
print(history_use.shape)
history_use.head()

(899252, 2)


Unnamed: 0,profile_id,album_id
0,3,15
1,3,16
2,3,17
3,3,18
4,3,19


In [45]:
# meta, profile 데이터 추가
history_meta = pd.merge(history_use, meta_use, on='album_id', how='left')
history_meta_profile = pd.merge(history_meta, profile_use, on='profile_id', how='left')

In [46]:
print(history_meta_profile.shape)
history_meta_profile.head()

(899252, 8)


Unnamed: 0,profile_id,album_id,genre_large,genre_mid,sex,age,pr_interest_keyword_cd_1,ch_interest_keyword_cd_1
0,3,15,키즈,노래율동,F,5,P02,K01
1,3,16,키즈,노래율동,F,5,P02,K01
2,3,17,키즈,노래율동,F,5,P02,K01
3,3,18,키즈,노래율동,F,5,P02,K01
4,3,19,키즈,노래율동,F,5,P02,K01


In [47]:
# 시청 여부 추가
history_meta_profile['ratings'] = 1

In [48]:
user_to_index = {
original: idx for idx, original in enumerate(history_meta_profile["profile_id"].unique())
}

item_to_index = {
original: idx for idx, original in enumerate(history_meta_profile["album_id"].unique())
}

genre_large_to_index = {
original: idx for idx, original in enumerate(history_meta_profile["genre_large"].unique())
}
genre_mid_to_index = {
original: idx for idx, original in enumerate(history_meta_profile["genre_mid"].unique())
}

sex_to_index = {
original: idx for idx, original in enumerate(history_meta_profile["sex"].unique())
}
age_to_index = {
original: idx for idx, original in enumerate(history_meta_profile["age"].unique())
}
pr_interest_keyword_cd_1_to_index = {
original: idx for idx, original in enumerate(history_meta_profile["pr_interest_keyword_cd_1"].unique())
}
ch_interest_keyword_cd_1_to_index = {
original: idx for idx, original in enumerate(history_meta_profile["ch_interest_keyword_cd_1"].unique())
}

In [63]:
user_ids = history_meta_profile['profile_id'].unique()
item_ids = history_meta_profile['album_id'].unique()

In [50]:
# 학습 및 검증 데이터 분리
train, valid = train_test_split(
    history_meta_profile, test_size=config.test_size, random_state=config.seed,
)
print('학습 데이터 크기:', train.shape)
print('검증 데이터 크기:', valid.shape)

학습 데이터 크기: (719401, 9)
검증 데이터 크기: (179851, 9)


In [51]:
n_only_valid_user = len(set(valid.iloc[:, 0]) - set(train.iloc[:, 0]))
n_only_valid_item = len(set(valid.iloc[:, 1]) - set(train.iloc[:, 1]))

print(f'valid에만 있는 유저의 수: {n_only_valid_user}')
print(f'valid에만 있는 아이템의 수: {n_only_valid_item}')

# 그래도 negative sampling하면 이 수는 줄어들 것이다

valid에만 있는 유저의 수: 102
valid에만 있는 아이템의 수: 775


In [52]:
train.head()

Unnamed: 0,profile_id,album_id,genre_large,genre_mid,sex,age,pr_interest_keyword_cd_1,ch_interest_keyword_cd_1,ratings
80305,2367,3508,키즈,노래율동,F,3,P06,K01,1
566567,16852,1992,키즈,TV만화,F,5,P02,K08,1
553068,16213,2032,키즈,책,M,8,P01,K08,1
53822,1812,18,키즈,노래율동,M,3,P06,K01,1
52051,1729,6340,키즈,책,F,7,P02,K01,1


# negative sampling 구현
- 방법
    - 각 유저별로 negative sample 추출
    - 유저별 postive sample수에 비례하여서 negative sample 추출
    - train data에만 negative sampling, valid에는 rating=1만 있음
- sampling 방법
    - random
    - not random (popular에게 약간의 가중치)

In [98]:
n_total_neg_samples = len(train) * config.neg_ratio

In [99]:
neg_samples = np.zeros((n_total_neg_samples, train.shape[1]))
neg_samples = pd.DataFrame(neg_samples)
neg_samples.columns = train.columns

In [100]:
idx = 0
for id, other_features in train.groupby('profile_id'):
    pos_samples = other_features['album_id'].values
    n_neg_samples = len(pos_samples) * config.neg_ratio
    neg_sample_candidates = list(set(item_ids) - set(pos_samples))
    neg_item_ids = np.random.choice(neg_sample_candidates, min(n_neg_samples, len(neg_sample_candidates)),replace=False)
    # 결과 넣기
    neg_samples.iloc[idx:idx+n_neg_samples, 0] = id
    neg_samples.iloc[idx:idx+n_neg_samples, 1:4] = meta_use.loc[meta_use['album_id'].isin(neg_item_ids), :].values
    neg_samples.iloc[idx:idx+n_neg_samples, 4:-1] =  profile_use.loc[profile_use['profile_id'] == id, 'sex':].values
    neg_samples.iloc[idx:idx+n_neg_samples, -1] = 0
    # idx 수정
    idx += n_neg_samples

In [101]:
neg_samples.head()

Unnamed: 0,profile_id,album_id,genre_large,genre_mid,sex,age,pr_interest_keyword_cd_1,ch_interest_keyword_cd_1,ratings
0,3.0,20917.0,키즈,놀이교실,F,5.0,P02,K01,0.0
1,3.0,15846.0,키즈,TV만화,F,5.0,P02,K01,0.0
2,3.0,22646.0,키즈,애니,F,5.0,P02,K01,0.0
3,3.0,5934.0,키즈,독서동화,F,5.0,P02,K01,0.0
4,3.0,10101.0,키즈,놀이교실,F,5.0,P02,K01,0.0


In [102]:
neg_samples.tail()

Unnamed: 0,profile_id,album_id,genre_large,genre_mid,sex,age,pr_interest_keyword_cd_1,ch_interest_keyword_cd_1,ratings
719396,33032.0,8103.0,키즈,외국어,F,5.0,P03,K01,0.0
719397,33032.0,8124.0,키즈,외국어,F,5.0,P03,K01,0.0
719398,33032.0,10625.0,키즈,노래율동,F,5.0,P03,K01,0.0
719399,33032.0,4540.0,키즈,TV만화,F,5.0,P03,K01,0.0
719400,33032.0,14756.0,키즈,노래율동,F,5.0,P03,K01,0.0


In [103]:
final_train = pd.concat([train, neg_samples])

In [105]:
final_train["profile_id"] = final_train["profile_id"].apply(lambda x: user_to_index[x])
final_train["album_id"] = final_train["album_id"].apply(lambda x: item_to_index[x])

final_train["genre_large"] = final_train["genre_large"].apply(lambda x: genre_large_to_index[x])
final_train["genre_mid"] = final_train["genre_mid"].apply(lambda x: genre_mid_to_index[x])

final_train["sex"] = final_train["sex"].apply(lambda x: sex_to_index[x])
final_train["age"] = final_train["age"].apply(lambda x: age_to_index[x])
final_train["pr_interest_keyword_cd_1"] = final_train["pr_interest_keyword_cd_1"].apply(lambda x: pr_interest_keyword_cd_1_to_index[x])
final_train["ch_interest_keyword_cd_1"] = final_train["ch_interest_keyword_cd_1"].apply(lambda x: ch_interest_keyword_cd_1_to_index[x])

final_train_np = final_train.to_numpy()
final_train_np = final_train_np.astype(np.intc)

items = final_train_np[:, :-1]
field_dims = np.max(items, axis=0) + 1

In [111]:
# field 종류별 dim 수
field_dims

array([ 8311, 20695,     6,    28,     2,    13,     8,     9],
      dtype=int32)