## ベースラインモデル概要

- Forumに挙がっているコードを、PyTorchで出来る限り再現したコード
    - https://mp.weixin.qq.com/s/KBtxmYlvV1U7wimCyv9v_g?
    - https://github.com/Ai-Light/KDD2020Multimodalities/blob/master/code/%5Bimage-concat-query%5D-wwm_uncased_L12-768_v3_quart.ipynb
    

- 訓練データの画像とクエリのペアを正例、ペアをランダムに入れ替えたデータを負例として、2値分類モデルを学習するモデル
    - クエリ側：BERTとword2vecでエンコードしたものをconcat
    - 画像側：面積順にして、画像特徴量と位置特徴量とクラスラベルをconcatして、BiLSTMでエンコード
    - 最後に、両方をconcatして、全結合層でつなげて、出力
    
    
    - 負例に関しては、ミニバッチごとに、バッチサイズ×5の負例（ランダムに入れ替え）を用意



- 現在評価中だが、1epochが約5時間(GPU2枚)で、3epoch時点のvalid_score(nDCG@5)は0.5程度
- 多分、まだ上がるはず
    - Epoch 1, train_loss: 0.21015196827693125, train_score: 0.9234113280874952, valid_score: 0.45962303517930975, time: 19286.304292201996
    - Epoch 2, train_loss: 0.18031329249707034, train_score: 0.9534390202407629, valid_score: 0.4747285933335644, time: 19173.962506055832
    - Epoch 3, train_loss: 0.17171741712500027, train_score: 0.9565376569179256, valid_score: 0.5024780766046713, time: 19172.145148277283

In [None]:
# !pip install transformers
# !pip install swifter

In [None]:
import pandas as pd
import numpy as np
import datetime
import random
import os
import time
import gc
from glob import glob
from tqdm.notebook import tqdm
import pickle
import logging
import json
import math
from collections import OrderedDict

import base64
import swifter
import gensim

from sklearn.metrics import mean_absolute_error
import lightgbm as lgb
from sklearn.utils import shuffle
from sklearn.model_selection import KFold, StratifiedKFold, GroupKFold
from sklearn.metrics import mean_squared_error, roc_auc_score
import matplotlib.pyplot as plt
#import japanize_matplotlib
import seaborn as sns

import torch
import torch.nn.functional as F
from torch import nn
from torch.utils.data import DataLoader, Dataset

pd.set_option('display.max_columns', 500)

In [None]:
import torch
import torch.nn.functional as F
from torch import nn
from torch.utils import data
from torch.utils.data import DataLoader, Dataset

import keras
from keras.preprocessing.sequence import pad_sequences

import networkx as nx
import dgl
import dgl.function as fn
from dgl.nn.pytorch import GraphConv

from pytorch_metric_learning import losses

import transformers
from transformers import (
    BertTokenizer, BertModel, BertForSequenceClassification, BertConfig,
    WEIGHTS_NAME, CONFIG_NAME, AdamW, get_linear_schedule_with_warmup, 
    get_cosine_schedule_with_warmup,DistilBertTokenizer,DistilBertModel
)

In [None]:
# Input data locations (every split currently points at the same directory).
DATA_DIR = '/home/m_fujitsuka/mnt/data/'
TRAIN_DIR = '/home/m_fujitsuka/mnt/data/'
VALID_DIR = '/home/m_fujitsuka/mnt/data/'
# The testA loading cell reads TESTA_DIR; the constant used to be spelled
# TESTADIR, which made that cell fail with a NameError.
TESTA_DIR = '/home/m_fujitsuka/mnt/data/'
TESTADIR = TESTA_DIR  # old misspelled name, kept as an alias for compatibility
TESTB_DIR = '/home/m_fujitsuka/mnt/data/'

# Output / artifact locations.
FEATURE_DIR = '/home/m_fujitsuka/mnt/fujitsuka/features/'
LOG_DIR = '/home/m_fujitsuka/mnt/fujitsuka/log/'
W2V_DIR = '/home/m_fujitsuka/mnt/fujitsuka/w2v/'
MODEL_DIR = '/home/m_fujitsuka/mnt/fujitsuka/model/'
SUBMIT_DIR = '/home/m_fujitsuka/mnt/fujitsuka/prediction_result/'

### Read Data

In [None]:
# Full training set (needs `import csv` for QUOTE_NONE):
# train = pd.read_csv(TRAIN_DIR+'train.tsv', sep='\t', quoting=csv.QUOTE_NONE)
# Load the tab-separated `train.sample.tsv` file and preview it.
train = pd.read_csv(TRAIN_DIR+'train.sample.tsv', sep='\t')
print(train.shape)
train.head()

In [None]:
# Load the tab-separated validation file and preview it.
valid = pd.read_csv(VALID_DIR+'valid.tsv', sep='\t')
print(valid.shape)
valid.head()

In [None]:
# Load the tab-separated testA file and preview it.
testA = pd.read_csv(TESTA_DIR+'testA.tsv', sep='\t')
print(testA.shape)
testA.head()

### Decode Data

In [None]:
def transform_boxes(df_row):
    """Decode the base64 ``boxes`` field of a row into a float32 array.

    The decoded buffer is reshaped to ``(df_row.num_boxes, 4)``.
    """
    raw_bytes = base64.b64decode(df_row.boxes)
    flat_values = np.frombuffer(raw_bytes, dtype=np.float32)
    return flat_values.reshape(df_row.num_boxes, 4)

def transform_features(df_row):
    """Decode the base64 ``features`` field of a row into a float32 array.

    The decoded buffer is reshaped to ``(df_row.num_boxes, 2048)``.
    """
    raw_bytes = base64.b64decode(df_row.features)
    flat_values = np.frombuffer(raw_bytes, dtype=np.float32)
    return flat_values.reshape(df_row.num_boxes, 2048)

def transform_class_labels(df_row):
    """Decode the base64 ``class_labels`` field of a row into an int64 array.

    The decoded buffer is reshaped to ``(df_row.num_boxes,)``.
    """
    raw_bytes = base64.b64decode(df_row.class_labels)
    label_ids = np.frombuffer(raw_bytes, dtype=np.int64)
    return label_ids.reshape(df_row.num_boxes)

def transform_df(df):
    """Replace the base64 columns of ``df`` with their decoded numpy arrays.

    ``boxes``, ``features`` and ``class_labels`` are decoded row by row (via
    swifter) and written back to the same columns; ``df`` is modified in place
    and also returned.
    """
    column_decoders = (
        ('boxes', transform_boxes),
        ('features', transform_features),
        ('class_labels', transform_class_labels),
    )
    for column, decode_row in column_decoders:
        df[column] = df.swifter.apply(decode_row, axis=1)
    return df

In [None]:
# Decode the base64-encoded box / feature / class-label columns of every split.
train = transform_df(train)
valid = transform_df(valid)
testA = transform_df(testA)

In [None]:
# Preview the training rows after decoding.
train.head()

In [None]:
# # すでにデコードしたデータも作成済み
# # こちらだと、メモリ消費量102GBくらいで、読み込みも速い

# train = pd.read_pickle(FEATURE_DIR+'train_decode.pkl')
# valid = pd.read_pickle(FEATURE_DIR+'valid_decode.pkl')
# testA = pd.read_pickle(FEATURE_DIR+'testA_decode.pkl')

### Feature Engineering

- 正規化したボックスの座標

In [None]:
def _normalize_row_boxes(row):
    """Scale one row's boxes by the image size.

    Coordinates 0 and 2 of each box are divided by ``image_h`` and
    coordinates 1 and 3 by ``image_w``.
    """
    height, width = row.image_h, row.image_w
    return np.array([
        [box[0]/height, box[1]/width, box[2]/height, box[3]/width]
        for box in row.boxes
    ])


# Add the normalized boxes to every split.
for _frame in (train, valid, testA):
    _frame['box_normalized'] = _frame.swifter.apply(_normalize_row_boxes, axis=1)

- 正規化されたボックスの面積

In [None]:
def _normalized_box_areas(boxes):
    """Return one area per box: (c3 - c1) * (c2 - c0) on the normalized coordinates."""
    return np.array([(box[3]-box[1]) * (box[2]-box[0]) for box in boxes])


# Add the per-box normalized areas to every split.
for _frame in (train, valid, testA):
    _frame['box_area_normalized'] = _frame['box_normalized'].apply(_normalized_box_areas)

- ボックスのクラスラベルをテキスト化

In [None]:
# Maps each integer box class label (0-32) to its English description.
# The cells below use it to turn the decoded `class_labels` ids into text.
class_label_dict = {
    0:'top clothes (coat, jacket, shirt, etc.)',
    1:'skirt & dress',
    2:'bottom clothes (trousers, pants, etc.)',
    3:'luggage, leather goods',
    4:'shoes',
    5:'accessories (jewelry, clothing accessories, belts, hats, scarves, etc.)',
    6:'snacks, nuts, liquor and tea',
    7:'makeup, perfume, beauty tools and essential oils',
    8:'bottle drink',
    9:'furniture',
    10:'stationery',
    11:'household electrical appliances',
    12:'home decoration',
    13:'household fabric',
    14:'kitchenware',
    15:'home / personal cleaning tools',
    16:'storage supplies',
    17:'motorcycle, motorcycle accessories, vehicles, bicycle and riding equipment',
    18:'outdoor product',
    19:'lighting',
    20:'toys',
    21:'underwear',
    22:'digital supplies',
    23:'bed linens',
    24:'baby products',
    25:'personal care',
    26:'sporting goods',
    27:'clothes (accessories, baby clothing, etc.)',
    28:'others',
    29:'human face',
    30:'arm',
    31:'hair',
    32:'hand',
}

def _label_ids_to_sentence(label_ids):
    """Join the descriptions of all label ids of one image into a single string."""
    return ' '.join([class_label_dict[label_id] for label_id in label_ids])


def _label_ids_to_names(label_ids):
    """Return the descriptions of all label ids of one image as an array of strings."""
    return np.array([class_label_dict[label_id] for label_id in label_ids])


# The sentence column must be built first: the second loop overwrites the
# integer `class_labels` column with text, after which the ids are gone.
for _frame in (train, valid, testA):
    _frame['class_labels_words'] = _frame['class_labels'].swifter.apply(_label_ids_to_sentence)

for _frame in (train, valid, testA):
    _frame['class_labels'] = _frame['class_labels'].swifter.apply(_label_ids_to_names)

- 今回のテキストデータに対するtokenizer
- 学習済み単語埋め込み（GloVe + fastText crawl）の重み行列

In [None]:
def get_coefs(word, *arr):
    """Pair a token with its remaining fields converted to a float32 vector."""
    vector = np.asarray(arr, dtype='float32')
    return word, vector

def load_embeddings(path):
    """Read a text embedding file into a ``{word: float32 vector}`` dict.

    Each line is split on single spaces; the first field is the word and the
    rest are the vector components.

    fastText ``.vec`` files begin with a ``"<vocab_size> <dim>"`` header line.
    It is skipped here: otherwise it ends up in the dict as a "word" with a
    1-element vector, which ``build_matrix`` would silently broadcast across a
    whole embedding row if that token occurs in the vocabulary.
    """
    embeddings = {}
    with open(path, encoding='utf-8') as f:
        for line_no, line in enumerate(tqdm(f)):
            fields = line.strip().split(' ')
            is_header = (
                line_no == 0
                and len(fields) == 2
                and all(field.isdigit() for field in fields)
            )
            if is_header:
                continue
            word, vector = get_coefs(*fields)
            embeddings[word] = vector
    return embeddings

def build_matrix(word_index, path):
    """Build a 300-column embedding matrix for the given vocabulary.

    Args:
        word_index: mapping from word to its integer row index.
        path: embedding text file, read with ``load_embeddings``.

    Returns:
        ``(embedding_matrix, unknown_words)``: a zero-initialised matrix with
        ``len(word_index) + 1`` rows in which row ``i`` holds the vector of
        the word indexed ``i``, and the list of words that have no vector in
        the file (their rows stay zero).
    """
    pretrained = load_embeddings(path)
    matrix = np.zeros((len(word_index) + 1, 300))
    missing = []

    for token, row_idx in word_index.items():
        vector = pretrained.get(token)
        if vector is None:
            missing.append(token)
        else:
            matrix[row_idx] = vector
    return matrix, missing


def load_word_embed(word_embed_glove=W2V_DIR+"glove.840B.300d.txt",
                    word_embed_crawl=W2V_DIR+"crawl-300d-2M.vec",
                    save_filename=W2V_DIR+"word_embedding_matrix_testA",
                    word_index=None):
    """Fit the word-level tokenizer and load (or build) its embedding matrix.

    The tokenizer is fitted on the queries and the ``class_labels_words``
    column of the module-level ``train``, ``valid`` and ``testA`` frames.
    If ``save_filename + ".npy"`` exists it is loaded; otherwise the GloVe and
    crawl matrices are built, summed element-wise and saved to that file.

    Args:
        word_embed_glove: path of the GloVe text file.
        word_embed_crawl: path of the crawl ``.vec`` text file.
        save_filename: cache path without the ``.npy`` extension.
        word_index: optional word -> index mapping; defaults to the fitted
            tokenizer's ``word_index``.

    Returns:
        ``(embedding_matrix, tokenizer)``. The matrix is float32 both when it
        is freshly built and when it comes from the cache (a fresh build used
        to return float64).
    """
    import warnings

    # Fit the tokenizer (queries of all splits first, then the label sentences,
    # in the same order as before so that index assignment is unchanged).
    tokenizer = keras.preprocessing.text.Tokenizer(filters='', lower=False)
    frames = (train, valid, testA)
    corpus = [
        text
        for column in ('query', 'class_labels_words')
        for frame in frames
        for text in frame[column]
    ]
    tokenizer.fit_on_texts(corpus)

    if word_index is None:
        word_index = tokenizer.word_index

    cache_path = save_filename + ".npy"
    if os.path.exists(cache_path):
        embedding_matrix = np.load(cache_path).astype("float32")
        # build_matrix always creates len(word_index) + 1 rows, so a different
        # row count means the cache was built for another vocabulary and the
        # tokenizer indices would point at the wrong (or missing) rows.
        if embedding_matrix.shape[0] != len(word_index) + 1:
            warnings.warn(
                f"Cached embedding matrix {cache_path} has "
                f"{embedding_matrix.shape[0]} rows but the vocabulary needs "
                f"{len(word_index) + 1}; delete the cache to rebuild it."
            )
    else:
        glove_matrix, unknown_words_glove = build_matrix(word_index, word_embed_glove)
        print('n unknown words (glove): ', len(unknown_words_glove))

        crawl_matrix, unknown_words_crawl = build_matrix(word_index, word_embed_crawl)
        print('n unknown words (crawl): ', len(unknown_words_crawl))

        embedding_matrix = (crawl_matrix + glove_matrix).astype("float32")
        np.save(save_filename, embedding_matrix)

        del crawl_matrix
        del glove_matrix
        gc.collect()

    return embedding_matrix, tokenizer

In [None]:
# Takes about 5 minutes.
embedding_matrix, tokenizer_w2v = load_word_embed()

## 上記の処理が終わっている前提で

In [None]:
# # 上記の処理済みデータは作成済み
# # メモリ消費量102GBくらいで、読み込みも速い

# train = pd.read_pickle(FEATURE_DIR+'train.pkl')
# valid = pd.read_pickle(FEATURE_DIR+'valid.pkl')
# testA = pd.read_pickle(FEATURE_DIR+'testA.pkl')

# embedding_matrix = np.load(W2V_DIR+"word_embedding_matrix_testA.npy").astype("float32")

# with open(W2V_DIR+'tokenizer_w2v.pkl', 'rb') as f:
#     tokenizer_w2v = pickle.load(f)

## モデル構築

In [None]:
# Set up logging to both the notebook output and a log file.
logger = logging.getLogger('main')
logger.setLevel(logging.DEBUG)
# logging.getLogger returns the same object for the same name, so re-running
# this cell used to stack another pair of handlers and every message was then
# written several times. Drop the old handlers before attaching new ones.
for _old_handler in list(logger.handlers):
    logger.removeHandler(_old_handler)
    _old_handler.close()
sc = logging.StreamHandler()
logger.addHandler(sc)
fh = logging.FileHandler(LOG_DIR+'20200515.log')
logger.addHandler(fh)

In [None]:
# Fix every random seed for reproducibility.
def seed_everything(seed: int):
    """Seed Python, NumPy and PyTorch RNGs and make cuDNN deterministic.

    Args:
        seed: value used for every random number generator.
    """
    random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    # Seed every visible GPU explicitly, not only the current one.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # The cuDNN autotuner may pick different kernels from run to run.
    torch.backends.cudnn.benchmark = False

seed_everything(1234)

In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
#device = 'cpu'
print(device)

### DatasetとDataLoaderの設定

In [None]:
MAX_LEN = 14  # maximum number of query tokens
MAX_LEN_IMG = 10  # maximum number of detected boxes kept per image
MAX_LABEL_IMG = 8  # maximum number of words kept per box class label


class KDDDataset(Dataset):
    """Dataset that turns one (query, image) row into fixed-size model inputs.

    Each item is a dict of tensors:
        query                 (MAX_LEN,) word-level token ids, left-padded with 0
        ids_query             (MAX_LEN,) BERT input ids, left-padded with 0
        token_type_ids_query  (MAX_LEN,) BERT token type ids, left-padded
        mask_query            (MAX_LEN,) BERT attention mask, left-padded
        features              (MAX_LEN_IMG, 2048) box features
        box                   (MAX_LEN_IMG, 4) normalized box coordinates
        area                  (MAX_LEN_IMG, 1) normalized box areas
        class_labels          (MAX_LEN_IMG, MAX_LABEL_IMG) label word ids
        img_mask              (MAX_LEN_IMG, 1) 1.0 for real boxes, 0.0 for padding

    The MAX_LEN_IMG largest boxes are kept, ordered by ascending area so the
    largest box is the last element, and left-padded with zeros.

    Fixes compared to the previous implementation:
      * The image arrays were padded with ``pad_sequences``, whose default
        dtype is int32: feature values were truncated to integers and the
        normalized coordinates/areas (in [0, 1]) became 0. Padding is now done
        in float32.
      * ``features`` and ``box`` were reversed along their columns
        (``[:, ::-1]``) whereas ``area`` and ``class_labels`` were reversed
        along the box axis, so row i of the different inputs described
        different boxes. All four now share the same box order.
      * Rows without any box produce correctly shaped all-zero inputs.
      * The tokenizer stored on the instance is used instead of the global.
    """

    def __init__(self, df, train_mode=True, transform=None):
        self.df = df
        self.bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        self.w2v_tokenizer = tokenizer_w2v
        # `transform` is accepted for interface compatibility but is not used.
        self.train_mode = train_mode

    def __len__(self):
        return len(self.df)

    @staticmethod
    def _pad_front(values, length, dtype):
        """Left-pad ``values`` with zeros along axis 0 up to ``length`` rows.

        Longer inputs keep their last ``length`` rows, which matches the
        default truncation of ``pad_sequences`` used before.
        """
        values = np.asarray(values, dtype=dtype)[-length:]
        padded = np.zeros((length,) + values.shape[1:], dtype=dtype)
        if len(values):
            padded[length - len(values):] = values
        return padded

    def __getitem__(self, idx):
        row = self.df.iloc[idx]

        ################################################################
        # query side
        ################################################################
        query = row['query']

        # BERT inputs
        inputs_query = self.bert_tokenizer.encode_plus(
            query,
            add_special_tokens=True,
            max_length=MAX_LEN,
        )

        ids_query = inputs_query["input_ids"]
        token_type_ids_query = inputs_query["token_type_ids"]
        mask_query = inputs_query["attention_mask"]

        padding_len = MAX_LEN - len(ids_query)
        ids_query = ([0]*padding_len) + ids_query
        token_type_ids_query = ([0]*padding_len) + token_type_ids_query
        mask_query = ([0]*padding_len) + mask_query

        # word-level token ids
        query_tokens = self.w2v_tokenizer.texts_to_sequences([query])[0]
        query_ids = self._pad_front(query_tokens, MAX_LEN, np.int64)

        ################################################################
        # image side
        ################################################################
        areas = np.asarray(row['box_area_normalized'])
        # argsort is ascending, so its tail holds the MAX_LEN_IMG largest
        # boxes with the largest one last. This is the order the previous
        # code produced for `area` and `class_labels`.
        order = np.argsort(areas)[-MAX_LEN_IMG:]

        features_ordered = np.asarray(row['features'])[order].reshape(-1, 2048)
        box_ordered = np.asarray(row['box_normalized'])[order].reshape(-1, 4)
        area_ordered = areas[order].reshape(-1, 1)
        label_names = np.asarray(row['class_labels'])[order]

        label_tokens = []
        for name in label_names:
            tokens = self.w2v_tokenizer.texts_to_sequences([name])[0][:MAX_LABEL_IMG]
            label_tokens.append([0]*(MAX_LABEL_IMG-len(tokens)) + tokens)
        label_tokens = np.asarray(label_tokens, dtype=np.int64).reshape(-1, MAX_LABEL_IMG)

        n_boxes = features_ordered.shape[0]
        img_mask = np.zeros((MAX_LEN_IMG, 1), dtype=np.float32)
        img_mask[MAX_LEN_IMG - n_boxes:] = 1.0

        features = self._pad_front(features_ordered, MAX_LEN_IMG, np.float32)
        box = self._pad_front(box_ordered, MAX_LEN_IMG, np.float32)
        area = self._pad_front(area_ordered, MAX_LEN_IMG, np.float32)
        class_labels = self._pad_front(label_tokens, MAX_LEN_IMG, np.int64)

        # Train and inference items are currently identical; `train_mode` is
        # kept on the instance for future use.
        return {
            'query': torch.tensor(query_ids, dtype=torch.long),
            'ids_query': torch.tensor(ids_query, dtype=torch.long),
            'token_type_ids_query': torch.tensor(token_type_ids_query, dtype=torch.long),
            'mask_query': torch.tensor(mask_query, dtype=torch.long),
            'features': torch.tensor(features, dtype=torch.float),
            'box': torch.tensor(box, dtype=torch.float),
            'area': torch.tensor(area, dtype=torch.float),
            'class_labels': torch.tensor(class_labels, dtype=torch.long).view(MAX_LEN_IMG, MAX_LABEL_IMG),
            'img_mask': torch.tensor(img_mask, dtype=torch.float).view(MAX_LEN_IMG, 1),
        }

In [None]:
# # dataloaderの挙動確認

# dataset_train = KDDDataset(train, train_mode=True)
# train_loader = DataLoader(dataset_train, batch_size=128, shuffle=True, num_workers=0, drop_last=True)

# for i, batch in enumerate(train_loader):
#     if i<1:
#         print(batch['query'].shape)
#         print(batch['ids_query'].shape)
#         print(batch['features'].shape)
#         print(batch['box'].shape)
#         print(batch['area'].shape)
#         print(batch['class_labels'].shape)
#         print(batch['img_mask'].shape)

#     else:
#         break

## Model

In [None]:
class KDDModel(nn.Module):
    """Binary query/image matching model.

    Query side: the last-position hidden state of BERT is concatenated with
    the last output of a BiLSTM over word embeddings. Image side: per-box
    label words are encoded with a Conv1d + max over words, concatenated with
    the box features, combined with the position inputs and run through a
    BiLSTM. Both sides are concatenated and passed to a 4-layer MLP that
    returns two logits per pair.

    Reads the module-level ``embedding_matrix`` to initialise the embedding.

    Fixes compared to the previous implementation:
      * ``required_grad`` typo (the attribute PyTorch reads is ``requires_grad``).
      * Bare ``.squeeze()`` calls dropped the batch dimension for batch size 1.
      * The label encoder wrote in place into
        ``torch.zeros(..., requires_grad=True).to(device)``; on CPU ``.to`` is
        a no-op, so this was an in-place write into a leaf tensor that
        requires grad (a RuntimeError), and it tied the model to the global
        ``device``. It is now a single batched convolution on the input's
        own device.
      * The BERT output is indexed instead of tuple-unpacked, which works
        whether the library returns a tuple or an output object.
      * The final logits are no longer passed through ReLU: clamping logits at
        zero before ``CrossEntropyLoss`` discards information. Parameter names
        are unchanged, but checkpoints trained with the old head should be
        retrained.
    """

    def __init__(self):
        super().__init__()
        self.model_name = 'KDDModel'

        ######################################
        # query
        ######################################
        self.bert_model = BertModel.from_pretrained('bert-base-uncased')

        self.max_feature = embedding_matrix.shape[0]
        self.embedding_size = embedding_matrix.shape[1]
        self.embedding = nn.Embedding(self.max_feature, self.embedding_size)
        self.embedding.weight = nn.Parameter(torch.tensor(embedding_matrix, dtype=torch.float32))
        self.embedding.weight.requires_grad = True  # keep the word embeddings trainable

        self.lstm_q = nn.LSTM(self.embedding_size, hidden_size=64, num_layers=1, bidirectional=True, batch_first=True)

        ######################################
        # image
        ######################################
        self.conv1d = nn.Conv1d(in_channels=self.embedding_size, out_channels=128, kernel_size=3)

        self.fc_i1 = nn.Linear(2048+128, 512)
        self.fc_i2 = nn.Linear(5, 512)

        self.lstm_i = nn.LSTM(512+512, hidden_size=512, num_layers=1, bidirectional=True, batch_first=True)

        self.fc1 = nn.Linear(768+64*2+512*2, 2048)
        self.fc2 = nn.Linear(2048, 512)
        self.fc3 = nn.Linear(512, 128)
        self.fc4 = nn.Linear(128, 2)

    def forward(self, query, ids, token_type_ids, mask, features, box, area, class_labels, img_mask):
        """Return the two-class logits for every (query, image) pair of the batch."""

        ######################################
        # query
        ######################################
        bert_outputs = self.bert_model(input_ids=ids, token_type_ids=token_type_ids, attention_mask=mask)
        # Inputs are left-padded, so the last position is a real token.
        layers = bert_outputs[0][:, -1, :]

        out_query = self.embedding(query)
        out_query, _ = self.lstm_q(out_query)
        out_query = out_query[:, -1, :]  # bidirectional, so hidden_size*2

        out_query = torch.cat([layers, out_query], dim=-1)

        ######################################
        # image
        ######################################
        label_emb = self.embedding(class_labels)  # (batch, boxes, words, emb)
        n_batch, n_boxes, n_words, emb_dim = label_emb.shape
        # Fold the boxes into the batch axis so one convolution handles them all.
        label_emb = label_emb.reshape(n_batch * n_boxes, n_words, emb_dim).permute(0, 2, 1)
        conv_out = self.conv1d(label_emb)
        # Max over all word positions (what max_pool1d with a kernel spanning
        # the whole conv output computed before).
        out_class_labels = conv_out.max(dim=-1)[0].reshape(n_batch, n_boxes, -1)

        out_image = torch.cat([out_class_labels, features], dim=-1)
        out_image = F.relu(self.fc_i1(out_image))
        out_image = out_image * img_mask

        out_pos = torch.cat([box, area], dim=-1)
        out_pos = out_pos * img_mask
        out_pos = F.relu(self.fc_i2(out_pos))

        out_image = torch.cat([out_image, out_pos], dim=-1)
        out_image, _ = self.lstm_i(out_image)  # bidirectional, so the output dim is doubled
        out_image = out_image * img_mask
        out_image = out_image[:, -1, :]

        ######################################
        # fusion
        ######################################
        out = torch.cat([out_query, out_image], dim=-1)
        out = F.relu(self.fc1(out))
        out = F.relu(self.fc2(out))
        out = F.relu(self.fc3(out))
        out = self.fc4(out)  # raw logits, no activation

        return out

### エポックごとバッチごとの学習・評価処理

In [None]:
def sample_negative_indices(batch_size):
    """Return a random index mapping without fixed points.

    Position ``i`` of the result never holds ``i`` (for ``batch_size >= 2``),
    so pairing query ``i`` with image ``result[i]`` never recreates the true
    pair. A plain random permutation can leave an index in place, which
    labels a genuine pair as negative. For ``batch_size < 2`` no such mapping
    exists and the identity is returned.
    """
    if batch_size < 2:
        return list(range(batch_size))
    cycle = random.sample(range(batch_size), k=batch_size)
    mapping = [0] * batch_size
    # Send every element to its successor on one random cycle.
    for pos, src in enumerate(cycle):
        mapping[src] = cycle[(pos + 1) % batch_size]
    return mapping


def train_model(train_loader, model, optimizer, criterion): #, scheduler):
    """Train ``model`` for one epoch with in-batch negative sampling.

    Every batch of B true (query, image) pairs is extended with 5*B negatives
    made by pairing each query with another image of the same batch.

    Args:
        train_loader: DataLoader yielding dict batches from ``KDDDataset``.
        model: network called with the nine input tensors.
        optimizer: optimizer stepped once per batch.
        criterion: loss taking ``(logits, labels)``.

    Returns:
        ``(avg_loss, avg_score)``: mean loss and mean ROC-AUC over the batches.
        The AUC is computed on the softmax probability of the positive class
        (it was previously computed on a single raw logit).
    """
    model.train()
    avg_loss = 0.
    avg_score = 0.
    n_batches = len(train_loader)
    k = 5  # negatives per positive
    query_keys = ('query', 'ids_query', 'token_type_ids_query', 'mask_query')
    image_keys = ('features', 'box', 'area', 'class_labels', 'img_mask')

    for batch in tqdm(train_loader):
        batch_size = batch['query'].shape[0]

        ####################################################################
        # Negative samples: the queries are repeated k+1 times while the
        # image side is the original order followed by k reshuffled copies.
        ####################################################################
        positive_index = list(range(batch_size))
        image_index = list(positive_index)
        for _ in range(k):
            image_index.extend(sample_negative_indices(batch_size))
        query_index = torch.tensor(positive_index * (k + 1), dtype=torch.long)
        image_index = torch.tensor(image_index, dtype=torch.long)

        # Gather on the CPU and move each input to the device once.
        inputs = [batch[key][query_index].to(device) for key in query_keys]
        inputs += [batch[key][image_index].to(device) for key in image_keys]

        labels = torch.cat([
            torch.ones(batch_size, dtype=torch.long),
            torch.zeros(batch_size * k, dtype=torch.long),
        ], dim=0).to(device)

        out = model(*inputs)
        loss = criterion(out, labels)

        probs = F.softmax(out.detach(), dim=-1)[:, 1]
        score = roc_auc_score(labels.cpu().numpy(), probs.cpu().numpy())

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        avg_loss += loss.item() / n_batches
        avg_score += score / n_batches

        del out, loss

    torch.cuda.empty_cache()
    gc.collect()
    return avg_loss, avg_score



def test_model(test_loader, model):
    """Run ``model`` over ``test_loader`` without gradients.

    Returns:
        A list with one tensor per batch holding the softmax probability of
        class 1 for every row of that batch.
    """
    model.eval()
    input_keys = (
        'query', 'ids_query', 'token_type_ids_query', 'mask_query',
        'features', 'box', 'area', 'class_labels', 'img_mask',
    )

    out_list = []
    with torch.no_grad():
        for batch in tqdm(test_loader):
            inputs = [batch[key].to(device) for key in input_keys]
            logits = model(*inputs)
            out_list.append(F.softmax(logits, dim=-1)[:, 1])

    torch.cuda.empty_cache()
    gc.collect()
    return out_list

### n-DCG@5の評価コード
- Forumの公式のコードを流用

In [None]:
# compute dcg@k for a single sample
def dcg_at_k(r, k):
    """Return DCG@k of the relevance list ``r`` (0.0 for an empty list).

    The first item counts fully and the item at 0-based position ``i >= 1``
    is discounted by ``log2(i + 2)``.

    ``np.asfarray`` was removed in NumPy 2.0, so the conversion now uses
    ``np.asarray`` with a float dtype, which gives the same float64 array.
    """
    r = np.asarray(r, dtype=float)[:k]
    if r.size:
        return r[0] + np.sum(r[1:] / np.log2(np.arange(3, r.size + 2)))
    return 0.


# compute ndcg@k (dcg@k / idcg@k) for a single sample
def get_ndcg(r, ref, k):
    """Return DCG@k of ``r`` divided by DCG@k of ``ref``, or 0.0 when the latter is zero."""
    ideal = dcg_at_k(ref, k)
    return dcg_at_k(r, k) / ideal if ideal else 0.


def make_score(df_test, out_list):
    """Compute the mean nDCG@5 of the predictions against ``valid_answer.json``.

    Args:
        df_test: frame with ``query_id`` and ``product_id`` columns, in the
            same row order as the predictions. A ``score`` column is written
            to it (the caller's frame is modified, as before).
        out_list: per-batch score tensors, e.g. from ``test_model``.

    Returns:
        nDCG@5 averaged over all queries of the reference file. A reference
        query without any prediction contributes 0 instead of raising
        ``KeyError``.
    """
    # reshape(-1) keeps a one-row batch one-dimensional; squeeze() turned it
    # into a 0-d tensor that torch.cat cannot concatenate.
    out_array = torch.cat([out.reshape(-1) for out in out_list], dim=0)
    df_test['score'] = out_array.cpu().detach().numpy()

    # Keep the five best-scored products of each query.
    ranked = df_test.sort_values(by="score", ascending=False)
    top5 = ranked.groupby("query_id").head(5)

    # One pass over the groups instead of a boolean-mask scan per query.
    predictions = {
        f'{q_id}': group['product_id'].values.astype(str)
        for q_id, group in top5.groupby('query_id', sort=False)
    }

    # read ground-truth
    with open(VALID_DIR+'valid_answer.json') as f:
        reference = json.load(f)

    # compute score for each query
    k = 5
    score_sum = 0.
    for qid, answer_ids in reference.items():
        ground_truth_ids = {str(pid) for pid in answer_ids}
        ref_vec = [1.0] * len(ground_truth_ids)
        pred_vec = [1.0 if pid in ground_truth_ids else 0.0 for pid in predictions.get(qid, [])]
        score_sum += get_ndcg(pred_vec, ref_vec, k)
    # the higher score, the better
    score = score_sum / len(reference)

    return score

### 実際に学習

In [None]:
model = KDDModel().to(device)
# model = nn.DataParallel(model)  # enables data-parallel training on several GPUs


optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
criterion = nn.CrossEntropyLoss()

# The datasets and loaders are the same for every epoch, so they are built once
# instead of re-creating them (and reloading the BERT tokenizer) per epoch.
# With shuffle=True the training loader still draws a new order each epoch.
dataset_train = KDDDataset(train, train_mode=True)
dataset_valid = KDDDataset(valid, train_mode=False)
train_loader = DataLoader(dataset_train, batch_size=128, shuffle=True, num_workers=0, drop_last=True)
valid_loader = DataLoader(dataset_valid, batch_size=128, shuffle=False, num_workers=0, drop_last=False)

loss_list_epoch_train = []
score_list_epoch_valid = []
score_best = 0
patience = 0
EPOCHS = 20
#scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda = lambda epoch: 0.95 ** epoch)
#scheduler = torch.optim.lr_scheduler.OneCycleLR(optimizer, max_lr=0.01, steps_per_epoch=len(train_loader), epochs=EPOCHS)
for epoch in range(EPOCHS):

    torch.cuda.empty_cache()
    start_time = time.time()

    loss_train, score_train = train_model(train_loader, model, optimizer, criterion) #, scheduler)
    #scheduler.step()
    out_valid = test_model(valid_loader, model)
    score_valid = make_score(valid, out_valid)

    loss_list_epoch_train.append(loss_train)
    score_list_epoch_valid.append(score_valid)

    logger.info(f'Epoch {(epoch+1)}, train_loss: {loss_train}, train_score: {score_train}, valid_score: {score_valid}, time: {(time.time()-start_time)}')

    # Early stopping: save the weights whenever the validation score improves
    # and stop after 3 consecutive epochs without improvement.
    if score_valid > score_best:
        score_best = score_valid
        best_param = model.state_dict()
        torch.save(best_param, MODEL_DIR+'best_param.pt')
        patience = 0
    else:
        patience += 1

    # Release the per-epoch prediction tensors before the next epoch.
    del out_valid
    torch.cuda.empty_cache()
    gc.collect()

    if patience >= 3:
        break

## 学習モデルを用いて提出ファイル作成

In [None]:
# Rebuild the network and restore the best checkpoint saved during training.
model = KDDModel().to(device)
# map_location remaps the stored tensors to the current device, so a
# checkpoint saved from a GPU run also loads on a CPU-only machine.
model.load_state_dict(torch.load(MODEL_DIR+'best_param.pt', map_location=device))

In [None]:
# # Multiple GPU使った場合

# def fix_model_state_dict(state_dict):
#     new_state_dict = OrderedDict()
#     for k, v in state_dict.items():
#         name = k
#         if name.startswith('module.'):
#             name = name[7:]  # remove 'module.' of dataparallel
#         new_state_dict[name] = v
#     return new_state_dict

# model = KDDModel().to(device)
# model.load_state_dict(fix_model_state_dict(torch.load(MODEL_DIR+f'best_param.pt')))

In [None]:
def make_submit_df(df_test, model):
    """Score ``df_test`` with ``model`` and build the submission table.

    Args:
        df_test: frame with ``query_id`` and ``product_id`` columns plus the
            columns ``KDDDataset`` reads. A ``score`` column is written to it
            (the caller's frame is modified, as before).
        model: trained model passed to ``test_model``.

    Returns:
        An integer DataFrame with columns ``query-id``, ``product1`` ..
        ``product5`` (the five best-scored products per query, best first),
        sorted by ``query-id``. A query with fewer than five candidates is
        padded by repeating its last-ranked product; previously a single
        candidate was broadcast the same way and 2-4 candidates raised a
        shape error.
    """
    dataset_test = KDDDataset(df_test, train_mode=False)
    test_loader = DataLoader(dataset_test, batch_size=128, shuffle=False, num_workers=0, drop_last=False)
    out_list = test_model(test_loader, model)

    # reshape(-1) keeps a one-row batch one-dimensional; squeeze() turned it
    # into a 0-d tensor that torch.cat cannot concatenate.
    out_array = torch.cat([out.reshape(-1) for out in out_list], dim=0)
    df_test['score'] = out_array.cpu().detach().numpy()

    # Keep the five best-scored products of each query.
    ranked = df_test.sort_values(by="score", ascending=False)
    top5 = ranked.groupby("query_id").head(5)

    columns = ['query-id','product1','product2','product3','product4','product5']
    n_products = len(columns) - 1
    # Collect the rows first and build the frame once, instead of growing it
    # row by row with a boolean-mask scan per query.
    rows = []
    for q_id, group in top5.groupby('query_id', sort=False):
        products = group['product_id'].tolist()
        products += [products[-1]] * (n_products - len(products))
        rows.append([q_id] + products)

    submit_df = pd.DataFrame(rows, columns=columns).astype(int)
    submit_df = submit_df.sort_values(by='query-id')
    submit_df = submit_df.reset_index(drop=True)

    return submit_df

In [None]:
# Score testA with the restored model and write the submission CSV.
submit_df = make_submit_df(testA, model)
submit_df.to_csv(SUBMIT_DIR+'submission.csv', index=False)