In [0]:
from google.colab import drive
drive.mount('/content/drive')

## やったこと

- テキストのクリーニング処理 -> 改善
- host, categoryのカテゴリカル変数をエンベディングして入力 -> 改善

- epochs=20で、early-stoppingはあまり良くならなかった -> とりあえず速く数を回したいので、epochs=4でやっている


- 今は、各ラベルをBinaryCrossEntropyでフィッティング(MSEは良くなかった)



## 試してみること

- 2倍にデータを水増し


## だめだったこと：

- batch_size=8以上にすると、メモリ不足になる
- MSELossを使用 -> 悪化
- titleは分けて、別のエンベディングとして入力 -> 悪化



- BERTを2つ使う -> gpu不足
- クラス分類問題にする（30*num_class） -> 学習が安定しない（nan）
- 30個の目的変数それぞれ独立に予測するモデル -> 約30時間必要、あまり精度が出ないように見える -> 関連する目的変数だけをグルーピングしてモデルを分ける必要？

## 試してみること
- 文章に関わる統計的特徴量を追加してみる！

In [0]:
import pandas as pd
import numpy as np
import math
import matplotlib.pyplot as plt
import os, sys, gc, random, multiprocessing, glob, time
pd.set_option('display.max_columns', 200)

DATA_DIR = '/content/drive/My Drive/Colab Notebooks/GoogleQuest/input/google-quest-challenge'
# DATA_DIR = '../input/google-quest-challenge'
# DATA_DIR = 'D:/project/ICF_AutoCapsule_disabled/kaggle/google-quest-challenge'
# BERT_DIR = 'D:/project/ICF_AutoCapsule_disabled/BERT'

In [0]:
# !pip install ../input/sacremoses/sacremoses-master/
# !pip install ../input/transformers/transformers-master/

In [0]:
!pip install transformers
!pip install nlpaug
!pip install flashtext

In [0]:
import torch
import torch.nn.functional as F
from torch import nn
from torch.utils import data
from torch.utils.data import DataLoader, Dataset

#from ml_stratifiers import MultilabelStratifiedShuffleSplit, MultilabelStratifiedKFold
from sklearn.model_selection import KFold, StratifiedKFold, GroupKFold
from sklearn.utils import shuffle
from sklearn.preprocessing import LabelEncoder

from scipy.stats import spearmanr

import transformers
from transformers import (
    BertTokenizer, BertModel, BertForSequenceClassification, BertConfig,
    WEIGHTS_NAME, CONFIG_NAME, AdamW, get_linear_schedule_with_warmup, 
    get_cosine_schedule_with_warmup,
)

from tqdm import tqdm
print(transformers.__version__)

In [0]:
import nlpaug.augmenter.char as nac
import nlpaug.augmenter.word as naw
import nlpaug.augmenter.sentence as nas
import nlpaug.flow as naf

from nlpaug.util import Action

In [0]:
## Make results reproducible; otherwise no one will believe you.
import random

def seed_everything(seed: int) -> None:
    """Seed every random number generator this notebook draws from.

    Covers Python's ``random``, NumPy, and PyTorch on CPU and on every CUDA
    device, and switches cuDNN to deterministic kernel selection.

    Args:
        seed: Value used for all generators.
    """
    random.seed(seed)
    # Recorded for any subprocess started later; the hash seed of the running
    # interpreter was fixed at start-up and is not changed by this assignment.
    os.environ["PYTHONHASHSEED"] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seeds all visible GPUs, not just the current one; silently ignored when
    # CUDA is unavailable.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # The autotuner may pick a different convolution algorithm on each run,
    # which defeats the deterministic flag above.
    torch.backends.cudnn.benchmark = False

In [0]:
class PipeLineConfig:
    """Plain holder for the hyper-parameters of one training run.

    Every constructor argument is stored unchanged under an attribute of the
    same name.
    """

    def __init__(self, lr, warmup, epochs, patience, batch_size, seed, name, question_weight,answer_weight,fold,train):
        # Optimisation schedule.
        self.lr, self.warmup, self.epochs, self.patience = lr, warmup, epochs, patience
        # Batching, reproducibility and the run label.
        self.batch_size, self.seed, self.name = batch_size, seed, name
        # Relative weights of the question-side and answer-side loss terms.
        self.question_weight, self.answer_weight = question_weight, answer_weight
        # Cross-validation fold count and the train/inference switch.
        self.fold, self.train = fold, train

In [0]:
# Hyper-parameters for this run. Line breaks inside the parentheses need no
# continuation backslashes.
config = PipeLineConfig(
    lr=1e-5,
    warmup=0.01,
    epochs=4,
    patience=3,
    batch_size=8,
    seed=42,
    name='reModel_question',
    question_weight=0.5,
    answer_weight=0.5,
    fold=5,
    train=True,
)

In [0]:
seed_everything(config.seed)

In [0]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
#device = 'cpu'
print(device)

In [0]:
sub = pd.read_csv(f'{DATA_DIR}/sample_submission.csv')
sub.head()

In [0]:
target_columns = sub.columns.values[1:].tolist()
target_columns

In [0]:
train = pd.read_csv(f'{DATA_DIR}/train.csv')
train.head()

In [0]:
test = pd.read_csv(f'{DATA_DIR}/test.csv')
test.head()

## Statistical Features

In [0]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

In [0]:
# tfidf = TfidfVectorizer(ngram_range=(1, 3))
# tsvd = TruncatedSVD(n_components=32, n_iter=5)

# tfquestion_title = tfidf.fit_transform(train["question_title"].values)
# tfquestion_title_test = tfidf.transform(test["question_title"].values)
# tfquestion_title = tsvd.fit_transform(tfquestion_title)
# tfquestion_title_test = tsvd.transform(tfquestion_title_test)

# tfquestion_body = tfidf.fit_transform(train["question_body"].values)
# tfquestion_body_test = tfidf.transform(test["question_body"].values)
# tfquestion_body = tsvd.fit_transform(tfquestion_body)
# tfquestion_body_test = tsvd.transform(tfquestion_body_test)

# tfanswer = tfidf.fit_transform(train["answer"].values)
# tfanswer_test = tfidf.transform(test["answer"].values)
# tfanswer = tsvd.fit_transform(tfanswer)
# tfanswer_test = tsvd.transform(tfanswer_test)

In [0]:
# train['tfidf_title'] = list(tfquestion_title)
# train['tfidf_title'][0]

In [0]:
# 質問, 回答を書いているユーザの投稿数
num_question_users = train.question_user_page.value_counts().to_dict()
num_answer_users = train.answer_user_page.value_counts().to_dict()

In [0]:
train[test.columns].head()

In [0]:
train.answer_user_page.value_counts()

In [0]:
# Words that typically open an interrogative title.
question_heads = ['What', 'When', 'Where', 'Who', 'Whose', 'Which', 'Why', 'Can', 'How', 'Do', 'Does', 'Could', 'Would']
# True where the first whitespace-separated word of the title is one of the
# words above. The .str accessors yield NaN (hence False) for an empty or
# whitespace-only title, where indexing split()[0] would raise IndexError.
tf_title_head = train.question_title.str.split().str[0].isin(question_heads)
tf_title_head

## Preprocessing

In [0]:
import re
from flashtext import KeywordProcessor

In [0]:
PUNCTS = {
            '》', '〞', '¢', '‹', '╦', '║', '♪', 'Ø', '╩', '\\', '★', '＋', 'ï', '<', '?', '％', '+', '„', 'α', '*', '〰', '｟', '¹', '●', '〗', ']', '▾', '■', '〙', '↓', '´', '【', 'ᴵ',
            '"', '）', '｀', '│', '¤', '²', '‡', '¿', '–', '」', '╔', '〾', '%', '¾', '←', '〔', '＿', '’', '-', ':', '‧', '｛', 'β', '（', '─', 'à', 'â', '､', '•', '；', '☆', '／', 'π',
            'é', '╗', '＾', '▪', ',', '►', '/', '〚', '¶', '♦', '™', '}', '″', '＂', '『', '▬', '±', '«', '“', '÷', '×', '^', '!', '╣', '▲', '・', '░', '′', '〝', '‛', '√', ';', '】', '▼',
            '.', '~', '`', '。', 'ə', '］', '，', '{', '～', '！', '†', '‘', '﹏', '═', '｣', '〕', '〜', '＼', '▒', '＄', '♥', '〛', '≤', '∞', '_', '[', '＆', '→', '»', '－', '＝', '§', '⋅', 
            '▓', '&', 'Â', '＞', '〃', '|', '¦', '—', '╚', '〖', '―', '¸', '³', '®', '｠', '¨', '‟', '＊', '£', '#', 'Ã', "'", '▀', '·', '？', '、', '█', '”', '＃', '⊕', '=', '〟', '½', '』',
            '［', '$', ')', 'θ', '@', '›', '＠', '｝', '¬', '…', '¼', '：', '¥', '❤', '€', '−', '＜', '(', '〘', '▄', '＇', '>', '₤', '₹', '∅', 'è', '〿', '「', '©', '｢', '∙', '°', '｜', '¡', 
            '↑', 'º', '¯', '♫', '#'
          }


# Lower-case contractions / common misspellings -> expanded form.
# Changes from the previous literal:
#   * "we'll" now expands to "we will" (it used to expand to " will", which
#     dropped the subject).
#   * the shadowed duplicate keys were removed; a dict literal keeps only the
#     last value, so "i'd" keeps its effective value "I had" and "didn't"
#     keeps "did not".
mispell_dict = {"aren't" : "are not", "can't" : "cannot", "couldn't" : "could not",
"couldnt" : "could not", "didn't" : "did not", "doesn't" : "does not",
"doesnt" : "does not", "don't" : "do not", "hadn't" : "had not", "hasn't" : "has not",
"haven't" : "have not", "havent" : "have not", "he'd" : "he would", "he'll" : "he will", "he's" : "he is",
"i'd" : "I had", "i'll" : "I will", "i'm" : "I am", "isn't" : "is not", "it's" : "it is",
"it'll":"it will", "i've" : "I have", "let's" : "let us", "mightn't" : "might not", "mustn't" : "must not",
"shan't" : "shall not", "she'd" : "she would", "she'll" : "she will", "she's" : "she is", "shouldn't" : "should not", "shouldnt" : "should not",
"that's" : "that is", "thats" : "that is", "there's" : "there is", "theres" : "there is", "they'd" : "they would", "they'll" : "they will",
"they're" : "they are", "theyre":  "they are", "they've" : "they have", "we'd" : "we would", "we're" : "we are", "weren't" : "were not",
"we've" : "we have", "what'll" : "what will", "what're" : "what are", "what's" : "what is", "what've" : "what have", "where's" : "where is",
"who'd" : "who would", "who'll" : "who will", "who're" : "who are", "who's" : "who is", "who've" : "who have", "won't" : "will not", "wouldn't" : "would not", "you'd" : "you would",
"you'll" : "you will", "you're" : "you are", "you've" : "you have", "'re": " are", "wasn't": "was not", "we'll": "we will", "tryin'":"trying"}


kp = KeywordProcessor(case_sensitive=True)
for k, v in mispell_dict.items():
    kp.add_keyword(k, v)

def clean_punct(text):
    """Return ``str(text)`` with each symbol in ``PUNCTS`` padded by one space on both sides."""
    padded = str(text)
    for symbol in PUNCTS:
        padded = padded.replace(symbol, f' {symbol} ')
    return padded


def preprocessing(text):
    """Normalise one raw text field before tokenisation.

    Steps, in order: lower-case, drop HTML-escaped angle brackets, replace
    URLs with the word ``url``, expand contractions through ``kp``, pad
    punctuation with spaces via ``clean_punct``, then collapse line breaks and
    repeated whitespace into single spaces.

    Args:
        text: Raw string.

    Returns:
        The cleaned string.
    """
    text = text.lower()
    # Remove the whole entity, including its optional trailing ';', which the
    # previous pattern left behind as a stray token.
    text = re.sub(r'&lt;?|&gt;?', ' ', text)

    text = re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', ' url ', text)
    text = kp.replace_keywords(text)
    text = clean_punct(text)
    # Match any run of CR/LF characters; the previous pattern r'\n\r' only
    # matched that exact two-character sequence.
    text = re.sub(r'[\r\n]+', ' ', text)
    text = re.sub(r'\s{2,}', ' ', text)

    return text

In [0]:
category_list = ['CULTURE', 'LIFE_ARTS', 'SCIENCE', 'STACKOVERFLOW', 'TECHNOLOGY']

host_list = ['academia.stackexchange.com', 'android.stackexchange.com',
       'anime.stackexchange.com', 'apple.stackexchange.com',
       'askubuntu.com', 'bicycles.stackexchange.com',
       'biology.stackexchange.com', 'blender.stackexchange.com',
       'boardgames.stackexchange.com', 'chemistry.stackexchange.com',
       'christianity.stackexchange.com', 'codereview.stackexchange.com',
       'cooking.stackexchange.com', 'crypto.stackexchange.com',
       'cs.stackexchange.com', 'dba.stackexchange.com',
       'diy.stackexchange.com', 'drupal.stackexchange.com',
       'dsp.stackexchange.com', 'electronics.stackexchange.com',
       'ell.stackexchange.com', 'english.stackexchange.com',
       'expressionengine.stackexchange.com', 'gamedev.stackexchange.com',
       'gaming.stackexchange.com', 'gis.stackexchange.com',
       'graphicdesign.stackexchange.com', 'judaism.stackexchange.com',
       'magento.stackexchange.com', 'math.stackexchange.com',
       'mathematica.stackexchange.com', 'mathoverflow.net',
       'mechanics.stackexchange.com', 'meta.askubuntu.com',
       'meta.christianity.stackexchange.com',
       'meta.codereview.stackexchange.com', 'meta.math.stackexchange.com',
       'meta.stackexchange.com', 'meta.superuser.com',
       'money.stackexchange.com', 'movies.stackexchange.com',
       'music.stackexchange.com', 'photo.stackexchange.com',
       'physics.stackexchange.com', 'programmers.stackexchange.com',
       'raspberrypi.stackexchange.com', 'robotics.stackexchange.com',
       'rpg.stackexchange.com', 'salesforce.stackexchange.com',
       'scifi.stackexchange.com', 'security.stackexchange.com',
       'serverfault.com', 'sharepoint.stackexchange.com',
       'softwarerecs.stackexchange.com', 'stackoverflow.com',
       'stats.stackexchange.com', 'superuser.com',
       'tex.stackexchange.com', 'travel.stackexchange.com',
       'unix.stackexchange.com', 'ux.stackexchange.com',
       'webapps.stackexchange.com', 'webmasters.stackexchange.com',
       'wordpress.stackexchange.com']

## 前処理関数

In [0]:
def _encode_known_labels(values, known_labels):
    """Label-encode a Series against a fixed vocabulary.

    Values outside ``known_labels`` (including NaN) are replaced by the most
    frequent known value in ``values``, or by ``known_labels[0]`` when none of
    the values is known, before encoding.
    """
    is_known = values.isin(known_labels)
    if not is_known.all():
        seen = values[is_known]
        fallback = seen.mode()[0] if len(seen) else known_labels[0]
        values = values.where(is_known, fallback)
    encoder = LabelEncoder()
    encoder.fit(known_labels)
    return encoder.transform(values)


def preprocessing_df(df, train=True, vectolizer=[]):
    """Clean the text columns and label-encode ``category`` and ``host``.

    The frame is modified in place and also returned, so calling this twice on
    the same frame is not supported (the second call would see the already
    encoded integer columns).

    Args:
        df: Frame with ``question_title``, ``question_body``, ``answer``,
            ``category`` and ``host`` columns.
        train: Unused; kept for call compatibility. It used to shadow the
            global training frame, so the unknown-label fallback evaluated
            ``True.category`` and raised AttributeError. The fallback is now
            derived from ``df`` itself.
        vectolizer: Unused; kept for call compatibility and never mutated.

    Returns:
        ``df`` with cleaned text and integer-encoded ``category`` / ``host``.
    """
    # Text cleaning.
    for column in ('question_title', 'question_body', 'answer'):
        df[column] = df[column].apply(preprocessing)

    # Label encoding against the fixed vocabularies, so that codes are
    # identical for every frame passed through this function.
    df['category'] = _encode_known_labels(df['category'], category_list)
    df['host'] = _encode_known_labels(df['host'], host_list)

    return df

In [0]:
# train = preprocessing_df(train)
# test = preprocessing_df(test)

In [0]:
# tfidf_title = TfidfVectorizer(ngram_range=(1, 3))
# tsvd_title = TruncatedSVD(n_components=10, n_iter=5)

# tfquestion_title = tfidf_title.fit_transform(train["question_title"].values)
# tfquestion_title_test = tfidf_title.transform(test["question_title"].values)
# train["tfidf_question_title"] = list(tsvd_title.fit_transform(tfquestion_title))
# test["tfidf_question_title"] = list(tsvd_title.transform(tfquestion_title_test))

# tfidf_body = TfidfVectorizer(ngram_range=(1, 3))
# tsvd_body = TruncatedSVD(n_components=32, n_iter=5)

# tfquestion_body = tfidf_body.fit_transform(train["question_body"].values)
# tfquestion_body_test = tfidf_body.transform(test["question_body"].values)
# train["tfidf_question_body"] = list(tsvd_body.fit_transform(tfquestion_body))
# test["tfidf_question_body"] = list(tsvd_body.transform(tfquestion_body_test))

# tfidf_answer = TfidfVectorizer(ngram_range=(1, 3))
# tsvd_answer = TruncatedSVD(n_components=32, n_iter=5)

# tfanswer = tfidf_answer.fit_transform(train["answer"].values)
# tfanswer_test = tfidf_answer.transform(test["answer"].values)
# train["tfidf_answer"] = list(tsvd_answer.fit_transform(tfanswer))
# test["tfidf_answer"] = list(tsvd_answer.transform(tfanswer_test))

In [0]:
# # 質問, 回答を書いているユーザの投稿数
# 結局、テストデータでは、ほとんど新規ユーザーの質問と投稿になっているっぽい

# num_question_users = train.question_user_page.value_counts().to_dict()
# num_answer_users = train.answer_user_page.value_counts().to_dict()

# train['num_question_users'] = train.question_user_page.replace(num_question_users)
# train['num_answer_users'] = train.answer_user_page.replace(num_answer_users)

# def num_writing_questions(x):
#   if x in list(num_question_users.keys()):
#     return num_question_users[x]
#   else:
#     return 0

# def num_writing_answers(x):
#   if x in list(num_answer_users.keys()):
#     return num_answer_users[x]
#   else:
#     return 0

# test['num_question_users'] = test.question_user_page.apply(num_writing_questions)
# test['num_answer_users'] = test.answer_user_page.apply(num_writing_answers)

## DataAugmentation

In [0]:
# BERT-based contextual word augmenter used for training-time augmentation.
aug_bert = naw.ContextualWordEmbsAug(
    #model_path=BERT_DIR+'/bert-base-uncased',
    model_path='bert-base-uncased',
    device=device,  # follow the notebook-wide device instead of hard-coding 'cuda', which fails on CPU-only runtimes
    action='insert',  # alternative: "substitute"
    aug_p=0.3,  # fraction of the words in the text that get augmented
    temperature=1,  # sampling temperature handed to the augmenter — NOTE(review): confirm semantics for the installed nlpaug version
    top_k=10)  # number of top candidate tokens considered per position

# text = 'The quick brown fox jumps over the lazy dog .'
# print("original: \n", text)
# for _ in range(10):
#     augmented_text = aug_bert.augment(text)
#     print(augmented_text)

In [0]:
#train.question_title.map(aug_bert.augment)

## Dataset

In [0]:
512-(29+400+4)

In [0]:
# Token budget of one model input and its default split between the title,
# question and answer parts.
MAX_LEN = 512
MAX_T_LEN = 29
MAX_Q_LEN = 400
MAX_A_LEN = 79
# The remaining 4 positions hold the special tokens: one [CLS] and three [SEP].
assert MAX_T_LEN+MAX_Q_LEN+MAX_A_LEN+4==MAX_LEN
SEP_TOKEN_ID = 102 # id of the '[SEP]' token in the bert-base-uncased vocabulary

class QuestDataset(torch.utils.data.Dataset):
    """Serve (title, question body, answer) rows as fixed-length BERT inputs.

    Each item is ``[CLS] title [SEP] question [SEP] answer [SEP]`` converted to
    token ids and zero-padded to ``MAX_LEN``, together with segment ids, the
    ``category`` and ``host`` values of the row and, when ``labeled`` is true,
    the target vector read from ``target_columns``.

    Args:
        df: Frame holding ``question_title``, ``question_body``, ``answer``,
            ``category`` and ``host`` (plus ``target_columns`` when labeled).
        train_mode: When true, about one sample in five is passed through the
            module-level ``aug_bert`` augmenter before tokenisation.
        labeled: Whether items and collated batches carry labels.
    """

    def __init__(self, df, train_mode=True, labeled=True):
        self.df = df
        self.train_mode = train_mode
        self.labeled = labeled
        #self.tokenizer = BertTokenizer.from_pretrained(BERT_DIR+'/bert-base-uncased')
        #self.tokenizer = BertTokenizer.from_pretrained('../input/bert-base-uncased/')
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    def __getitem__(self, index):
        """Return ``(token_ids, seg_ids, category, host[, labels])`` for one row."""
        row = self.df.iloc[index]
        token_ids, seg_ids = self.get_token_ids(row)
        category = torch.tensor(row.category)
        host = torch.tensor(row.host)
        if self.labeled:
            return token_ids, seg_ids, category, host, self.get_label(row)
        return token_ids, seg_ids, category, host

    def __len__(self):
        return len(self.df)

    def trim_input(self, title, question, answer, max_sequence_length=MAX_LEN, 
                t_max_len=MAX_T_LEN, q_max_len=MAX_Q_LEN, a_max_len=MAX_A_LEN):
        """Tokenise the three texts and cut them to fit ``max_sequence_length``.

        Nothing is cut when the three parts plus the 4 special tokens already
        fit. Otherwise each part is limited to its budget, and budget left
        unused by a short part is handed over to the others.

        Returns:
            ``(title_tokens, question_tokens, answer_tokens)``.

        Raises:
            ValueError: if the computed lengths do not add up to
                ``max_sequence_length``.
        """
        t = self.tokenizer.tokenize(title)
        q = self.tokenizer.tokenize(question)
        a = self.tokenizer.tokenize(answer)

        t_len = len(t)
        q_len = len(q)
        a_len = len(a)

        if (t_len+q_len+a_len+4) > max_sequence_length:

            if t_max_len > t_len:
                # Short title: split its unused budget between the answer
                # (rounded down) and the question (rounded up).
                t_new_len = t_len
                a_max_len = a_max_len + math.floor((t_max_len - t_len)/2)
                q_max_len = q_max_len + math.ceil((t_max_len - t_len)/2)
            else:
                # Long title: cut at its budget.
                t_new_len = t_max_len

            if a_max_len > a_len:
                # Answer still short: give its unused budget to the question.
                a_new_len = a_len 
                q_new_len = q_max_len + (a_max_len - a_len)
            elif q_max_len > q_len:
                # Question short: give its unused budget to the answer.
                a_new_len = a_max_len + (q_max_len - q_len)
                q_new_len = q_len
            else:
                a_new_len = a_max_len
                q_new_len = q_max_len

            if t_new_len+a_new_len+q_new_len+4 != max_sequence_length:
                raise ValueError("New sequence length should be %d, but is %d" 
                                 % (max_sequence_length, (t_new_len+a_new_len+q_new_len+4)))

            t = t[:t_new_len]
            q = q[:q_new_len]
            a = a[:a_new_len]

        return t, q, a

    def _augment(self, text):
        """Return an augmented variant of ``text``, or ``text`` itself on failure."""
        try:
            augmented = aug_bert.augment(text)
        except Exception:
            # Augmentation is best effort: fall back to the original text.
            # Catching Exception (not a bare except) keeps Ctrl-C working.
            return text
        # Newer nlpaug releases return a list of strings rather than a string.
        if isinstance(augmented, (list, tuple)):
            return augmented[0] if augmented else text
        return augmented

    def get_token_ids(self, row):
        """Build the padded token-id tensor and matching segment ids for ``row``."""
        question_title = row.question_title
        question_body = row.question_body
        answer = row.answer

        # In training mode, augment when the draw is <= 1/5.
        if self.train_mode and not (random.random() > 1/5):
            question_title = self._augment(question_title)
            question_body = self._augment(question_body)
            answer = self._augment(answer)

        t_tokens, q_tokens, a_tokens = self.trim_input(question_title, question_body, answer)

        # Join the three parts into a single BERT input sequence.
        tokens = ['[CLS]'] + t_tokens + ['[SEP]'] + q_tokens + ['[SEP]'] + a_tokens + ['[SEP]']
        token_ids = self.tokenizer.convert_tokens_to_ids(tokens)
        if len(token_ids) < MAX_LEN:
            # Right-pad with 0 up to MAX_LEN.
            token_ids += [0] * (MAX_LEN - len(token_ids))
        ids = torch.tensor(token_ids)
        seg_ids = self.get_seg_ids(ids)
        return ids, seg_ids

    def get_seg_ids(self, ids):
        """Assign a segment id to every position of ``ids``.

        Positions up to and including the second ``[SEP]`` get 0, later
        positions get 1, and padding positions (id 0) are reset to 0.
        """
        is_sep = (ids == SEP_TOKEN_ID).to(ids.dtype)
        # Number of [SEP] tokens strictly before each position.
        seps_before = torch.cumsum(is_sep, dim=0) - is_sep
        seg_ids = (seps_before >= 2).to(ids.dtype)
        seg_ids[ids == 0] = 0
        return seg_ids

    def get_label(self, row):
        """Return the ``target_columns`` values of ``row`` as a float32 tensor."""
        return torch.tensor(row[target_columns].values.astype(np.float32))

    def collate_fn(self, batch):
        """Stack per-item tensors into batch tensors.

        Labeled and unlabeled datasets yield items of different length, so the
        labels (last element of each item) are stacked only when ``labeled``.
        """
        token_ids = torch.stack([x[0] for x in batch])
        seg_ids = torch.stack([x[1] for x in batch])
        category = torch.stack([x[2] for x in batch])
        host = torch.stack([x[3] for x in batch])

        if self.labeled:
            labels = torch.stack([x[-1] for x in batch])
            return token_ids, seg_ids, category, host, labels
        else:
            return token_ids, seg_ids, category, host

In [0]:
def get_train_val_loaders(batch_size=4, val_batch_size=4, ifold=0):
    """Build the train and validation loaders for one cross-validation fold.

    ``train.csv`` is read, preprocessed, shuffled with a fixed seed and split
    into 5 group folds keyed on the question body, so rows sharing a question
    body never straddle the train/validation boundary.

    Args:
        batch_size: Training batch size (incomplete last batch is dropped).
        val_batch_size: Validation batch size.
        ifold: Zero-based index of the fold used for validation.

    Returns:
        ``(train_loader, val_loader, n_val_rows, val_rows)`` where ``val_rows``
        holds the row positions of the validation rows in ``train.csv``. It
        used to hold positions in the shuffled frame, which misaligned any
        out-of-fold array indexed with it relative to the original row order.

    Raises:
        ValueError: if ``ifold`` is not a valid fold index.
    """
    n_splits = 5
    if not 0 <= ifold < n_splits:
        raise ValueError('ifold must be in [0, %d), got %r' % (n_splits, ifold))

    df = pd.read_csv(f'{DATA_DIR}/train.csv')

    df = preprocessing_df(df)

    df = shuffle(df, random_state=1234)
    gkf = GroupKFold(n_splits=n_splits).split(X=df.question_body, groups=df.question_body)
    for fold, (train_idx, valid_idx) in enumerate(gkf):
        if fold == ifold:
            break
    df_train = df.iloc[train_idx]
    df_val = df.iloc[valid_idx]

    print('train', df_train.shape)
    print('val', df_val.shape)

    ds_train = QuestDataset(df_train, train_mode=True)
    train_loader = torch.utils.data.DataLoader(ds_train, batch_size=batch_size, shuffle=True, num_workers=0, collate_fn=ds_train.collate_fn, drop_last=True)
    train_loader.num = len(df_train)

    ds_val = QuestDataset(df_val, train_mode=False)
    val_loader = torch.utils.data.DataLoader(ds_val, batch_size=val_batch_size, shuffle=False, num_workers=0, collate_fn=ds_val.collate_fn, drop_last=False)
    val_loader.num = len(df_val)
    val_loader.df = df_val

    # shuffle() keeps the index produced by read_csv, so the index labels of
    # the validation frame are the rows' positions in the CSV.
    val_rows = df_val.index.to_numpy()

    return train_loader, val_loader, df_val.shape[0], val_rows


def get_test_loader(batch_size=4):
    """Return an unshuffled, unlabeled loader over the preprocessed ``test.csv``.

    The loader carries the number of test rows in its ``num`` attribute.
    """
    test_df = preprocessing_df(pd.read_csv(f'{DATA_DIR}/test.csv'))
    test_ds = QuestDataset(test_df, train_mode=False, labeled=False)

    test_loader = torch.utils.data.DataLoader(
        test_ds,
        batch_size=batch_size,
        shuffle=False,
        num_workers=0,
        collate_fn=test_ds.collate_fn,
        drop_last=False,
    )
    test_loader.num = len(test_df)
    return test_loader

In [0]:
class QuestModel(nn.Module):
    """BERT encoder plus category/host embeddings feeding one linear head.

    The BERT token outputs are max-pooled over the sequence axis, concatenated
    with a 5-d category embedding and a 10-d host embedding, and projected to
    ``n_classes`` raw logits (no sigmoid is applied here).

    Args:
        n_classes: Number of output logits.
    """

    def __init__(self, n_classes=30):
        super(QuestModel, self).__init__()
        self.model_name = 'QuestModel'
        #self.bert_model = BertModel.from_pretrained(BERT_DIR+'/bert-base-uncased/')
        #self.bert_model = BertModel.from_pretrained('../input/bert-base-uncased/')
        self.bert_model = BertModel.from_pretrained('bert-base-uncased')

        self.emb_category = nn.Embedding(len(category_list), 5)
        self.emb_host = nn.Embedding(len(host_list), 10)

        # 768 BERT features + 5 category + 10 host embedding features.
        self.fc_out = nn.Linear(768+15, n_classes)

    def forward(self, ids, seg_ids, category, host):
        """Return logits of shape ``(batch, n_classes)``.

        Args:
            ids: Token ids, 0 marks padding.
            seg_ids: Segment (token type) ids aligned with ``ids``.
            category: Encoded category index per sample.
            host: Encoded host index per sample.
        """
        # False only at padding positions (id 0), so attention ignores them.
        attention_mask = (ids > 0)
        outputs = self.bert_model(input_ids=ids, token_type_ids=seg_ids, attention_mask=attention_mask)
        # Index instead of tuple-unpacking: works whether the model returns a
        # plain tuple or an output object. Element 0 is (batch, seq_len, 768).
        layers = outputs[0]

        # Max over the sequence axis. Only the pooled axis is squeezed; a bare
        # squeeze() also removed the batch axis for a batch of one, which made
        # the concatenation below fail.
        out = F.max_pool1d(layers.transpose(1,2), kernel_size=layers.size()[1]).squeeze(-1)
        out = F.dropout(out, p=0.2, training=self.training)

        emb_category = self.emb_category(category)
        emb_host = self.emb_host(host)
        out_other = torch.cat([emb_category, emb_host], dim=-1)
        out_other = F.dropout(out_other, p=0.2, training=self.training)

        out = torch.cat([out, out_other], dim=-1)

        logit = self.fc_out(out)
        return logit
    

In [0]:
def train_model(train_loader, optimizer, criterion, scheduler):
    """Run one training epoch of the global ``model`` and return its mean loss.

    The batch loss is the weighted sum of ``criterion`` over output columns
    0-20 (weight ``config.question_weight``) and columns 21-29 (weight
    ``config.answer_weight``). The optimizer and the scheduler are stepped
    after every batch.
    """
    model.train()
    n_batches = len(train_loader)
    epoch_loss = 0.
    for batch in tqdm(train_loader):
        ids, seg_ids, category, host, labels = (part.to(device) for part in batch[:5])

        logits = model(ids, seg_ids, category, host)

        question_loss = criterion(logits[:,0:21], labels[:,0:21])
        answer_loss = criterion(logits[:,21:30], labels[:,21:30])
        loss = config.question_weight*question_loss + config.answer_weight*answer_loss

        loss.backward()

        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

        epoch_loss += loss.item() / n_batches
        del ids, seg_ids, category, host, labels

    # Release cached GPU memory before validation starts.
    torch.cuda.empty_cache()
    gc.collect()
    return epoch_loss

def val_model(val_loader, val_length, batch_size=8):
    """Evaluate the global ``model`` on ``val_loader``.

    Args:
        val_loader: Labeled, unshuffled loader.
        val_length: Number of rows served by ``val_loader``.
        batch_size: Batch size of ``val_loader``; used to place each batch in
            the output arrays.

    Returns:
        ``(avg_val_loss, score, preds, original)``: the mean of ``criterion``
        over batches, the mean column-wise Spearman correlation (NaN counted
        as 0), the sigmoid predictions and the labels, both of shape
        ``(val_length, len(target_columns))``.
    """
    model.eval()
    avg_val_loss = 0.
    n_targets = len(target_columns)

    valid_preds = np.zeros((val_length, n_targets))
    original = np.zeros((val_length, n_targets))

    with torch.no_grad():
        for idx, batch in enumerate(tqdm(val_loader)):
            ids, seg_ids, category, host, labels = (part.to(device) for part in batch[:5])

            logits = model(ids, seg_ids, category, host)

            # The loss is fed raw logits, exactly as in training. It used to
            # receive sigmoid outputs, so a logit-based criterion squashed
            # them a second time and reported a meaningless value.
            avg_val_loss += criterion(logits, labels).item() / len(val_loader)

            rows = slice(idx*batch_size, (idx+1)*batch_size)
            valid_preds[rows] = torch.sigmoid(logits).cpu().numpy()
            original[rows] = labels.cpu().numpy()

            del ids, seg_ids, category, host, labels

    preds = valid_preds

    # One Spearman computation per target column, reused for all reports.
    per_target = [spearmanr(original[:, i], preds[:, i]) for i in range(n_targets)]

    rho_val = np.mean([result.correlation for result in per_target])
    print('\r val_spearman-rho: %s' % (str(round(rho_val, 5))), end = 100*' '+'\n')

    score = 0
    for i, result in enumerate(per_target):
        print(i, result)
        # A constant column yields NaN; count it as 0 rather than poisoning the mean.
        score += np.nan_to_num(result.correlation)

    return avg_val_loss, score/n_targets, preds, original

In [0]:
def calc_spearman(pred, target):
    """Return the mean column-wise Spearman correlation of ``pred`` vs ``target``.

    Args:
        pred: 2-D array-like of predictions, one column per target.
        target: 2-D array-like of ground truth with the same shape.

    Returns:
        The average over columns of the Spearman correlation, with NaN
        correlations (e.g. a constant column) counted as 0.
    """
    pred = np.asarray(pred)
    target = np.asarray(target)
    # Iterate over columns. len(target) is the number of rows, which ran the
    # column index out of bounds.
    n_columns = target.shape[1]
    score = 0
    for i in range(n_columns):
        score += np.nan_to_num(spearmanr(target[:, i], pred[:, i]).correlation)
    return score/n_columns

In [0]:
ACCUM_STEPS = 1

In [0]:
# Network with 30 outputs, moved to the selected device.
model = QuestModel(n_classes=30).to(device)
# NOTE(review): the learning rate is hard-coded to 3e-5 here, while config.lr
# (1e-5) is not read by this cell — confirm which value is intended.
optimizer = torch.optim.AdamW(model.parameters(), lr=3e-5, eps=4e-5)
# Binary cross-entropy that applies the sigmoid itself, so it expects raw logits.
criterion = nn.BCEWithLogitsLoss()

In [0]:
# Out-of-fold predictions and the matching labels, filled fold by fold.
oof = np.zeros((train.shape[0], len(target_columns)))
targets = np.zeros_like(oof)  # was used below without ever being defined

for fold in range(config.fold):
    print('---%d-Fold---'%(fold+1))

    checkpoint_name = 'best_param_score_{}_{}.pt'.format(config.name ,fold+1)
    checkpoint_path = '/content/drive/My Drive/Colab Notebooks/GoogleQuest/input/' + checkpoint_name

    # Start every fold from fresh pretrained weights and a fresh optimizer.
    # Reusing the model of the previous fold means it has already been trained
    # on this fold's validation rows, which inflates the out-of-fold score.
    # The old objects are released first so two networks never share the GPU.
    model = optimizer = None
    gc.collect()
    torch.cuda.empty_cache()
    model = QuestModel(n_classes=30).to(device)
    optimizer = torch.optim.AdamW(model.parameters(), lr=3e-5, eps=4e-5)

    # The split is identical for every epoch of a fold, so build the loaders
    # once instead of re-reading and re-cleaning the CSV each epoch.
    train_loader, val_loader, val_length, val_idx = get_train_val_loaders(batch_size=config.batch_size, val_batch_size=config.batch_size, ifold=fold)

    # One schedule spanning all epochs of the fold. Recreating it every epoch
    # restarted the cosine decay each time. config.warmup is a ratio, while
    # the scheduler expects a step count.
    total_steps = config.epochs*len(train_loader)//ACCUM_STEPS
    scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=int(config.warmup*total_steps), num_training_steps=total_steps)

    patience = 0
    best_score = -1.
    best_preds = None
    labels_org = None
    best_param_score = None

    for epoch in range(config.epochs):

        torch.cuda.empty_cache()
        start_time   = time.time()

        loss_train = train_model(train_loader, optimizer, criterion, scheduler)
        loss_val, score_val, preds, original = val_model(val_loader, val_length, batch_size=config.batch_size)
        print(f'Epoch {(epoch+1)}, train_loss: {loss_train}, val_loss: {loss_val}, score_val: {score_val}, time: {(time.time()-start_time)}')

        if score_val > best_score:
            best_score = score_val
            # Snapshot the weights on the CPU. state_dict() returns references
            # to the live parameters, so without a copy the "best" weights
            # would keep changing during later epochs.
            best_param_score = {key: value.detach().cpu().clone() for key, value in model.state_dict().items()}
            best_preds = preds.copy()
            labels_org = original.copy()
            print(checkpoint_name)
            torch.save(best_param_score, checkpoint_path)
        else:
            patience += 1
            if patience >= config.patience:
                break

        del loss_train, loss_val, score_val, preds, original
        torch.cuda.empty_cache()
        gc.collect()

    if best_param_score is None:
        raise RuntimeError('fold %d finished without a valid validation score' % (fold+1))

    # Restore the best epoch. Its checkpoint was already written when the
    # score improved, so it is not saved a second time here.
    model.load_state_dict(best_param_score)
    oof[val_idx] = best_preds
    targets[val_idx] = labels_org

    del train_loader, val_loader, scheduler
    torch.cuda.empty_cache()
    gc.collect()

cv_score = calc_spearman(oof, targets)
print(cv_score)