# submission
- 모든 유저들에 대한 추천리스트 만드는 과정

In [24]:
# 패키지 로드
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
from sklearn.model_selection import train_test_split
import random

import torch

In [25]:
# 하이퍼파라미터 
class config:
    """Notebook-wide settings, read as plain class attributes."""

    # Index of the CUDA device to select when a GPU is present.
    gpu_idx = 0
    # Use the selected GPU when CUDA is available, otherwise fall back to CPU.
    device = torch.device(
        f"cuda:{gpu_idx}" if torch.cuda.is_available() else "cpu"
    )
    # Presumably the length of each user's recommendation list - TODO confirm.
    top_k = 25
    # Seed handed to seed_everything().
    seed = 42
    # Presumably negatives sampled per positive interaction - TODO confirm.
    neg_ratio = 1
    # Presumably the held-out fraction for train_test_split - TODO confirm.
    test_size = 0.2

In [26]:
# 시드 고정 
def seed_everything(random_seed):
    """Seed the Python, NumPy and PyTorch RNGs and pin the cuDNN flags.

    Parameters
    ----------
    random_seed : int
        Value passed to every seeding call.
    """
    # Python's and NumPy's global generators.
    random.seed(random_seed)
    np.random.seed(random_seed)

    # PyTorch generators: CPU, the current CUDA device, then all CUDA devices.
    torch.manual_seed(random_seed)
    torch.cuda.manual_seed(random_seed)
    torch.cuda.manual_seed_all(random_seed)

    # Request deterministic cuDNN kernels and switch off its benchmark mode.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True
    
# Seed every random number generator with the configured seed.
seed_everything(config.seed)

# data 불러오기

In [27]:
# Load the three raw tables (watch history, item metadata, user profiles),
# in that order, from the shared data directory.
history, meta, profile = map(
    pd.read_csv,
    (
        '../data/history_data.csv',
        '../data/meta_data.csv',
        '../data/profile_data.csv',
    ),
)

In [28]:
meta.head()

Unnamed: 0,album_id,title,sub_title,genre_large,genre_mid,genre_small,country,run_time,onair_date,cast_1,cast_2,cast_3,cast_4,cast_5,cast_6,cast_7
0,749,어둠이 무서워요,꼬마버스 타요1,키즈,TV만화,,한국,660,,타요,로기,라니,가니,시투,,
1,750,우리는 친구,꼬마버스 타요1,키즈,TV만화,,한국,660,,타요,로기,라니,가니,시투,,
2,2131,타요의 첫 운행,꼬마버스 타요1,키즈,TV만화,,한국,660,,타요,로기,라니,가니,시투,,
3,2625,길 잃은 타요,꼬마버스 타요1,키즈,TV만화,,한국,660,,타요,로기,라니,가니,시투,,
4,2594,새내기 꼬마 버스의 하루,꼬마버스 타요1,키즈,TV만화,,한국,660,,타요,로기,라니,가니,시투,,


- `genre_large`, `genre_mid` 컬럼 사용

In [29]:
meta.shape

(42602, 16)

In [30]:
meta[['album_id','genre_large','genre_mid']].drop_duplicates(['album_id','genre_large','genre_mid']).shape

(39875, 3)

In [31]:
meta[['album_id','genre_large','genre_mid']].drop_duplicates(['album_id']).shape

(39875, 3)

In [32]:
# Keep only the genre columns and reduce to one row per album_id
# (the first occurrence wins).
meta_use = meta.loc[:, ['album_id', 'genre_large', 'genre_mid']].drop_duplicates(
    subset=['album_id']
)

In [33]:
profile.head()

Unnamed: 0,profile_id,sex,age,pr_interest_keyword_cd_1,pr_interest_keyword_cd_2,pr_interest_keyword_cd_3,ch_interest_keyword_cd_1,ch_interest_keyword_cd_2,ch_interest_keyword_cd_3
0,3,F,5,P02,P04,P07,K01,K03,K04
1,5,M,5,P07,P08,P06,K05,K08,K09
2,7,F,9,P05,P03,,K06,K04,
3,12,M,6,P03,P06,P02,K09,K07,K03
4,16,F,12,P03,P06,P01,K01,K06,K04


- `sex`, `age`, `pr_interest_keyword_cd_1`, `ch_interest_keyword_cd_1` 컬럼 사용

In [34]:
profile.shape

(8311, 9)

In [35]:
# Keep the profile key plus the four profile columns used as features.
profile_use = profile.loc[
    :,
    [
        'profile_id',
        'sex',
        'age',
        'pr_interest_keyword_cd_1',
        'ch_interest_keyword_cd_1',
    ],
]

In [36]:
profile_use.drop_duplicates().shape

(8311, 5)

# data merge, 전처리

In [37]:
# Preprocessing (de-duplication): keep one row per
# (profile_id, album_id, log_time), order rows by profile_id then log_time,
# and finally discard log_time since it is only needed for the ordering.
history_use = (
    history[['profile_id', 'log_time', 'album_id']]
    .drop_duplicates(subset=['profile_id', 'album_id', 'log_time'])
    .sort_values(by=['profile_id', 'log_time'])
    .reset_index(drop=True)
    .drop(columns='log_time')
)

In [38]:
print(history_use.shape)
history_use.head()

(899252, 2)


Unnamed: 0,profile_id,album_id
0,3,15
1,3,16
2,3,17
3,3,18
4,3,19


In [39]:
# Attach item metadata and then profile attributes to every history row.
# Left joins keep every history row even when no meta/profile row matches.
history_meta = history_use.merge(meta_use, how='left', on='album_id')
history_meta_profile = history_meta.merge(profile_use, how='left', on='profile_id')

In [40]:
print(history_meta_profile.shape)
history_meta_profile.head()

(899252, 8)


Unnamed: 0,profile_id,album_id,genre_large,genre_mid,sex,age,pr_interest_keyword_cd_1,ch_interest_keyword_cd_1
0,3,15,키즈,노래율동,F,5,P02,K01
1,3,16,키즈,노래율동,F,5,P02,K01
2,3,17,키즈,노래율동,F,5,P02,K01
3,3,18,키즈,노래율동,F,5,P02,K01
4,3,19,키즈,노래율동,F,5,P02,K01


In [41]:
# Add the "watched" flag: every row in the history is an observed view, so
# the label is the constant 1.
history_meta_profile['ratings'] = 1

In [42]:
def _first_seen_index(column_name):
    """Map each distinct value of a history_meta_profile column to its rank.

    The rank is the position of the value in ``Series.unique()``, i.e. the
    order in which the value first appears in the column.
    """
    distinct_values = history_meta_profile[column_name].unique()
    return {value: position for position, value in enumerate(distinct_values)}


# User and item identifiers.
user_to_index = _first_seen_index("profile_id")
item_to_index = _first_seen_index("album_id")

# Item-side categorical features.
genre_large_to_index = _first_seen_index("genre_large")
genre_mid_to_index = _first_seen_index("genre_mid")

# User-side categorical features.
sex_to_index = _first_seen_index("sex")
age_to_index = _first_seen_index("age")
pr_interest_keyword_cd_1_to_index = _first_seen_index("pr_interest_keyword_cd_1")
ch_interest_keyword_cd_1_to_index = _first_seen_index("ch_interest_keyword_cd_1")

In [43]:
# history_meta_profile["profile_id"] = history_meta_profile["profile_id"].apply(lambda x: user_to_index[x])
# history_meta_profile["album_id"] = history_meta_profile["album_id"].apply(lambda x: item_to_index[x])

# history_meta_profile["genre_large"] = history_meta_profile["genre_large"].apply(lambda x: genre_large_to_index[x])
# history_meta_profile["genre_mid"] = history_meta_profile["genre_mid"].apply(lambda x: genre_mid_to_index[x])

# history_meta_profile["sex"] = history_meta_profile["sex"].apply(lambda x: sex_to_index[x])
# history_meta_profile["age"] = history_meta_profile["age"].apply(lambda x: age_to_index[x])
# history_meta_profile["pr_interest_keyword_cd_1"] = history_meta_profile["pr_interest_keyword_cd_1"].apply(lambda x: pr_interest_keyword_cd_1_to_index[x])
# history_meta_profile["ch_interest_keyword_cd_1"] = history_meta_profile["ch_interest_keyword_cd_1"].apply(lambda x: ch_interest_keyword_cd_1_to_index[x])

# history_meta_profile_np = history_meta_profile.to_numpy()
# history_meta_profile_np = history_meta_profile_np.astype(np.intc)

# items = history_meta_profile_np[:, :-1]
# field_dims = np.max(items, axis=0) + 1

## user마다 모든 item에 대한 input data를 만들어서 예측하고 평가

In [44]:
from typing import List

import numpy as np
import torch
import torch.nn as nn
from torch.nn.init import normal_

class FMLinear(nn.Module):
    """First-order (linear) term of the FM component.

    All fields share one embedding table with ``output_dim`` columns; each
    field owns a contiguous range of rows in it. The forward pass sums the
    rows selected by the input across fields and adds a learnable bias.
    """

    def __init__(self, field_dims: List[int], output_dim: int = 1):
        """Build the shared weight table, the bias and the per-field offsets.

        Parameters
        ----------
        field_dims : List[int]
            dimension (number of distinct values) of each field
        output_dim : int, optional
            always 1 because it is for linear term, by default 1
        """
        super().__init__()

        self.fc = nn.Embedding(sum(field_dims), output_dim)
        # Must be the class ``nn.Parameter``: ``nn.parameter`` is the
        # ``torch.nn.parameter`` module and calling it raises TypeError.
        self.bias = nn.Parameter(torch.zeros((output_dim,)))
        # Row index in ``fc`` at which each field's range starts.
        self.offsets = np.array(
            (0, *np.cumsum(field_dims)[:-1]), dtype=np.int_
        )

        self.apply(self._init_weights)

    def _init_weights(self, module):
        """Initialise Embedding/Linear weights from N(0, 0.01); zero Linear biases."""
        if isinstance(module, nn.Embedding):
            normal_(module.weight.data, mean=0.0, std=0.01)
        elif isinstance(module, nn.Linear):
            normal_(module.weight.data, 0, 0.01)
            if module.bias is not None:
                module.bias.data.fill_(0.0)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Compute the linear term for a batch of field indices.

        Parameters
        ----------
        x : torch.Tensor
            input data, |x| = (batch_size, num_fields); each entry is the
            index of the active value within its own field

        Returns
        -------
        torch.Tensor
            value of linear term, |out| = (batch_size, output_dim)
        """
        # Entries of x are local to their field, so the field offsets are
        # added to address the right rows of the shared embedding table.
        x = x + x.new_tensor(self.offsets).unsqueeze(0)
        return torch.sum(self.fc(x), dim=1) + self.bias


class FeatureEmbedding(nn.Module):
    """Dense embeddings shared by the FM interaction and the deep component.

    A single table holds the vectors of every field; each field owns a
    contiguous range of rows in it.
    """

    def __init__(self, field_dims: List[int], embed_dim: int):
        """Create the shared table and the row offset of each field.

        Parameters
        ----------
        field_dims : List[int]
            dimension (number of distinct values) of each field
        embed_dim : int
            embedding dimensions
        """
        super().__init__()
        total_rows = sum(field_dims)
        self.embedding = nn.Embedding(total_rows, embed_dim)
        # First table row of each field: 0, d0, d0 + d1, ...
        field_starts = [0, *np.cumsum(field_dims)[:-1]]
        self.offsets = np.array(field_starts, dtype=np.int_)

        self.apply(self._init_weights)

    def _init_weights(self, module):
        """Initialise Embedding/Linear weights from N(0, 0.01); zero Linear biases."""
        if isinstance(module, nn.Embedding):
            normal_(module.weight.data, mean=0.0, std=0.01)
            return
        if not isinstance(module, nn.Linear):
            return
        normal_(module.weight.data, 0, 0.01)
        if module.bias is not None:
            module.bias.data.fill_(0.0)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Look up the embedding vector of every field entry.

        Parameters
        ----------
        x : torch.Tensor
            input data, |x| = (batch_size, num_fields)

        Returns
        -------
        torch.Tensor
            x's embedding vectors
        """
        # Shift each per-field index into that field's range of the table.
        table_rows = x + x.new_tensor(self.offsets).unsqueeze(0)
        return self.embedding(table_rows)


class FMInteraction(nn.Module):
    """Second-order (pairwise interaction) term of FM; has no parameters."""

    def __init__(self):
        """interaction term in FM"""
        super().__init__()

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Compute 0.5 * sum_d [(sum_f x)^2 - sum_f x^2] per sample.

        Parameters
        ----------
        x : torch.Tensor
            embedding vectors, |x| = (batch_size, num_fields, embed_dim)

        Returns
        -------
        torch.Tensor
            interaction term, |out| = (batch_size, 1)
        """
        # Reduce over the field axis first, then over the embedding axis.
        field_sum = x.sum(dim=1)
        per_dim = field_sum.pow(2) - x.pow(2).sum(dim=1)
        pooled = per_dim.sum(dim=1, keepdim=True)
        return 0.5 * pooled


class MLP(nn.Module):
    """Feed-forward network used as the deep component."""

    def __init__(
        self,
        input_dim: int,
        embed_dims: List[int],
        dropout: float,
        output_layer: bool = True,
    ):
        """Stack one Linear -> BatchNorm1d -> ReLU -> Dropout group per width.

        Parameters
        ----------
        input_dim : int
            number of input features
        embed_dims : List[int]
            output width of each hidden Linear layer, in order
        dropout : float
            drop probability used by every Dropout layer
        output_layer : bool, optional
            whether to append a final Linear layer with one output unit,
            by default True
        """
        super().__init__()
        widths = [input_dim, *embed_dims]
        stack = []
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            stack.extend(
                [
                    torch.nn.Linear(fan_in, fan_out),
                    torch.nn.BatchNorm1d(fan_out),
                    torch.nn.ReLU(),
                    torch.nn.Dropout(p=dropout),
                ]
            )
        if output_layer:
            # Fed by the last hidden width, or input_dim if there is none.
            stack.append(torch.nn.Linear(widths[-1], 1))
        self.mlp = torch.nn.Sequential(*stack)

        self.apply(self._init_weights)

    def _init_weights(self, module):
        """Initialise Embedding/Linear weights from N(0, 0.01); zero Linear biases."""
        if isinstance(module, nn.Embedding):
            normal_(module.weight.data, mean=0.0, std=0.01)
            return
        if not isinstance(module, nn.Linear):
            return
        normal_(module.weight.data, 0, 0.01)
        if module.bias is not None:
            module.bias.data.fill_(0.0)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Run the input through the layer stack.

        Parameters
        ----------
        x : torch.Tensor
            input data (embedding vectors), |x| = (batch_size, input_dim)

        Returns
        -------
        torch.Tensor
            output of the last layer in the stack
        """
        return self.mlp(x)


class DeepFM(nn.Module):
    """DeepFM: FM linear term + FM interaction term + MLP over shared embeddings."""

    def __init__(
        self, field_dims: List[int], embed_dim: int, mlp_dims: List[int], dropout: float
    ):
        """Create the three components and the embedding table they share.

        Parameters
        ----------
        field_dims : List[int]
            dimension of each field
        embed_dim : int
            embedding dimensions
        mlp_dims : List[int]
            hidden layer widths passed to the MLP
        dropout : float
            dropout probability passed to the MLP
        """
        super().__init__()
        self.fm_linear = FMLinear(field_dims)
        self.fm_interaction = FMInteraction()
        self.embedding = FeatureEmbedding(field_dims, embed_dim)
        # The MLP consumes all field embeddings flattened into one vector.
        self.embed_output_dim = embed_dim * len(field_dims)
        self.mlp = MLP(self.embed_output_dim, mlp_dims, dropout)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Score a batch of field-index rows.

        Parameters
        ----------
        x : torch.Tensor
            input data, |x| = (batch_size, num_fields)

        Returns
        -------
        torch.Tensor
            predicted ratings after the sigmoid, |out| = (batch_size, )
        """
        field_vectors = self.embedding(x)
        linear_term = self.fm_linear(x)
        interaction_term = self.fm_interaction(field_vectors)
        # Flatten (num_fields, embed_dim) per sample before the MLP.
        deep_term = self.mlp(field_vectors.view(-1, self.embed_output_dim))
        logits = linear_term + interaction_term + deep_term
        return torch.sigmoid(logits.squeeze(1))


In [45]:
# Distinct users and items, in order of first appearance in the history.
user_ids, item_ids = (
    history_meta_profile[column].unique() for column in ('profile_id', 'album_id')
)

In [46]:
# 만들고자 하는 함수
## input: history_meta_profile 같은 DataFrame, trainer.model
### trainer.model로 pure하게 접근해야 할 듯
### 각 유저별로 for문으로 돌리고 batch_size가 unique item의 수가 될 것
## output: submission.csv 형태의 DataFrame