In [0]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
import pandas as pd
import numpy as np
import math
import matplotlib.pyplot as plt
import os, sys, gc, random, multiprocessing, glob, time

DATA_DIR = '/content/drive/My Drive/Colab Notebooks/GoogleQuest/input/google-quest-challenge'
# DATA_DIR = '../input/google-quest-challenge'
# DATA_DIR = 'D:/project/ICF_AutoCapsule_disabled/kaggle/google-quest-challenge'
# BERT_DIR = 'D:/project/ICF_AutoCapsule_disabled/BERT'

In [0]:
# !pip install ../input/sacremoses/sacremoses-master/
# !pip install ../input/transformers/transformers-master/

In [0]:
!pip install transformers
!pip install flashtext



In [0]:
import torch
import torch.nn.functional as F
from torch import nn
from torch.utils import data
from torch.utils.data import DataLoader, Dataset

#from ml_stratifiers import MultilabelStratifiedShuffleSplit, MultilabelStratifiedKFold
from sklearn.model_selection import KFold, StratifiedKFold, GroupKFold
from sklearn.utils import shuffle
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold

from scipy.stats import spearmanr

import transformers
from transformers import (
    BertTokenizer, BertModel, BertForSequenceClassification, BertConfig,
    WEIGHTS_NAME, CONFIG_NAME, AdamW, get_linear_schedule_with_warmup, 
    get_cosine_schedule_with_warmup,
)

from tqdm import tqdm
print(transformers.__version__)

2.4.0


In [0]:
## Make results reproducible .Else noone will believe you .
import random

def seed_everything(seed: int):
    random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True

In [0]:
class PipeLineConfig:
    def __init__(self, lr, warmup, epochs, patience, batch_size, seed, name, question_weight,answer_weight,fold,train):
        self.lr = lr
        self.warmup = warmup
        self.epochs = epochs
        self.patience = patience
        self.batch_size = batch_size
        self.seed = seed
        self.name = name
        self.question_weight = question_weight
        self.answer_weight =answer_weight
        self.fold = fold
        self.train = train

In [0]:
config = PipeLineConfig(lr=1e-5, \
                        warmup=0.01, \
                        epochs=20, \
                        patience=3, \
                        batch_size=12, \
                        seed=42, \
                        name='Outlier_not_really', \
                        question_weight=0.5, \
                        answer_weight=0.5, \
                        fold=5, \
                        train=True
                       )

In [0]:
seed_everything(config.seed)

In [0]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
#device = 'cpu'
print(device)

cuda


In [0]:
sub = pd.read_csv(f'{DATA_DIR}/sample_submission.csv')
sub.head()

Unnamed: 0,qa_id,question_asker_intent_understanding,question_body_critical,question_conversational,question_expect_short_answer,question_fact_seeking,question_has_commonly_accepted_answer,question_interestingness_others,question_interestingness_self,question_multi_intent,question_not_really_a_question,question_opinion_seeking,question_type_choice,question_type_compare,question_type_consequence,question_type_definition,question_type_entity,question_type_instructions,question_type_procedure,question_type_reason_explanation,question_type_spelling,question_well_written,answer_helpful,answer_level_of_information,answer_plausible,answer_relevance,answer_satisfaction,answer_type_instructions,answer_type_procedure,answer_type_reason_explanation,answer_well_written
0,39,0.00308,0.00308,0.00308,0.00308,0.00308,0.00308,0.00308,0.00308,0.00308,0.00308,0.00308,0.00308,0.00308,0.00308,0.00308,0.00308,0.00308,0.00308,0.00308,0.00308,0.00308,0.00308,0.00308,0.00308,0.00308,0.00308,0.00308,0.00308,0.00308,0.00308
1,46,0.00448,0.00448,0.00448,0.00448,0.00448,0.00448,0.00448,0.00448,0.00448,0.00448,0.00448,0.00448,0.00448,0.00448,0.00448,0.00448,0.00448,0.00448,0.00448,0.00448,0.00448,0.00448,0.00448,0.00448,0.00448,0.00448,0.00448,0.00448,0.00448,0.00448
2,70,0.00673,0.00673,0.00673,0.00673,0.00673,0.00673,0.00673,0.00673,0.00673,0.00673,0.00673,0.00673,0.00673,0.00673,0.00673,0.00673,0.00673,0.00673,0.00673,0.00673,0.00673,0.00673,0.00673,0.00673,0.00673,0.00673,0.00673,0.00673,0.00673,0.00673
3,132,0.01401,0.01401,0.01401,0.01401,0.01401,0.01401,0.01401,0.01401,0.01401,0.01401,0.01401,0.01401,0.01401,0.01401,0.01401,0.01401,0.01401,0.01401,0.01401,0.01401,0.01401,0.01401,0.01401,0.01401,0.01401,0.01401,0.01401,0.01401,0.01401,0.01401
4,200,0.02074,0.02074,0.02074,0.02074,0.02074,0.02074,0.02074,0.02074,0.02074,0.02074,0.02074,0.02074,0.02074,0.02074,0.02074,0.02074,0.02074,0.02074,0.02074,0.02074,0.02074,0.02074,0.02074,0.02074,0.02074,0.02074,0.02074,0.02074,0.02074,0.02074


In [0]:
target_columns = sub.columns.values[1:].tolist()
target_columns

['question_asker_intent_understanding',
 'question_body_critical',
 'question_conversational',
 'question_expect_short_answer',
 'question_fact_seeking',
 'question_has_commonly_accepted_answer',
 'question_interestingness_others',
 'question_interestingness_self',
 'question_multi_intent',
 'question_not_really_a_question',
 'question_opinion_seeking',
 'question_type_choice',
 'question_type_compare',
 'question_type_consequence',
 'question_type_definition',
 'question_type_entity',
 'question_type_instructions',
 'question_type_procedure',
 'question_type_reason_explanation',
 'question_type_spelling',
 'question_well_written',
 'answer_helpful',
 'answer_level_of_information',
 'answer_plausible',
 'answer_relevance',
 'answer_satisfaction',
 'answer_type_instructions',
 'answer_type_procedure',
 'answer_type_reason_explanation',
 'answer_well_written']

In [0]:
train = pd.read_csv(f'{DATA_DIR}/train.csv')
train.head()

Unnamed: 0,qa_id,question_title,question_body,question_user_name,question_user_page,answer,answer_user_name,answer_user_page,url,category,host,question_asker_intent_understanding,question_body_critical,question_conversational,question_expect_short_answer,question_fact_seeking,question_has_commonly_accepted_answer,question_interestingness_others,question_interestingness_self,question_multi_intent,question_not_really_a_question,question_opinion_seeking,question_type_choice,question_type_compare,question_type_consequence,question_type_definition,question_type_entity,question_type_instructions,question_type_procedure,question_type_reason_explanation,question_type_spelling,question_well_written,answer_helpful,answer_level_of_information,answer_plausible,answer_relevance,answer_satisfaction,answer_type_instructions,answer_type_procedure,answer_type_reason_explanation,answer_well_written
0,0,What am I losing when using extension tubes in...,After playing around with macro photography on...,ysap,https://photo.stackexchange.com/users/1024,"I just got extension tubes, so here's the skin...",rfusca,https://photo.stackexchange.com/users/1917,http://photo.stackexchange.com/questions/9169/...,LIFE_ARTS,photo.stackexchange.com,1.0,0.333333,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.666667,1.0,1.0,0.8,1.0,0.0,0.0,1.0
1,1,What is the distinction between a city and a s...,I am trying to understand what kinds of places...,russellpierce,https://rpg.stackexchange.com/users/8774,It might be helpful to look into the definitio...,Erik Schmidt,https://rpg.stackexchange.com/users/1871,http://rpg.stackexchange.com/questions/47820/w...,CULTURE,rpg.stackexchange.com,1.0,1.0,0.0,0.5,1.0,1.0,0.444444,0.444444,0.666667,0.0,0.0,0.666667,0.666667,0.0,0.333333,0.0,0.0,0.0,0.333333,0.0,0.888889,0.888889,0.555556,0.888889,0.888889,0.666667,0.0,0.0,0.666667,0.888889
2,2,Maximum protusion length for through-hole comp...,I'm working on a PCB that has through-hole com...,Joe Baker,https://electronics.stackexchange.com/users/10157,Do you even need grooves? We make several pro...,Dwayne Reid,https://electronics.stackexchange.com/users/64754,http://electronics.stackexchange.com/questions...,SCIENCE,electronics.stackexchange.com,0.888889,0.666667,0.0,1.0,1.0,1.0,0.666667,0.444444,0.333333,0.0,0.333333,0.0,0.0,0.0,0.0,0.0,1.0,0.333333,0.333333,0.0,0.777778,0.777778,0.555556,1.0,1.0,0.666667,0.0,0.333333,1.0,0.888889
3,3,Can an affidavit be used in Beit Din?,"An affidavit, from what i understand, is basic...",Scimonster,https://judaism.stackexchange.com/users/5151,"Sending an ""affidavit"" it is a dispute between...",Y e z,https://judaism.stackexchange.com/users/4794,http://judaism.stackexchange.com/questions/551...,CULTURE,judaism.stackexchange.com,0.888889,0.666667,0.666667,1.0,1.0,1.0,0.444444,0.444444,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.888889,0.833333,0.333333,0.833333,1.0,0.8,0.0,0.0,1.0,1.0
4,5,How do you make a binary image in Photoshop?,I am trying to make a binary image. I want mor...,leigero,https://graphicdesign.stackexchange.com/users/...,Check out Image Trace in Adobe Illustrator. \n...,q2ra,https://graphicdesign.stackexchange.com/users/...,http://graphicdesign.stackexchange.com/questio...,LIFE_ARTS,graphicdesign.stackexchange.com,1.0,0.666667,0.0,1.0,1.0,1.0,0.666667,0.666667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.666667,1.0,1.0,0.8,1.0,0.0,1.0,1.0


In [0]:
test = pd.read_csv(f'{DATA_DIR}/test.csv')
test.head()

Unnamed: 0,qa_id,question_title,question_body,question_user_name,question_user_page,answer,answer_user_name,answer_user_page,url,category,host
0,39,Will leaving corpses lying around upset my pri...,I see questions/information online about how t...,Dylan,https://gaming.stackexchange.com/users/64471,There is no consequence for leaving corpses an...,Nelson868,https://gaming.stackexchange.com/users/97324,http://gaming.stackexchange.com/questions/1979...,CULTURE,gaming.stackexchange.com
1,46,Url link to feature image in the portfolio,I am new to Wordpress. i have issue with Featu...,Anu,https://wordpress.stackexchange.com/users/72927,I think it is possible with custom fields.\n\n...,Irina,https://wordpress.stackexchange.com/users/27233,http://wordpress.stackexchange.com/questions/1...,TECHNOLOGY,wordpress.stackexchange.com
2,70,"Is accuracy, recoil or bullet spread affected ...","To experiment I started a bot game, toggled in...",Konsta,https://gaming.stackexchange.com/users/37545,You do not have armour in the screenshots. Thi...,Damon Smithies,https://gaming.stackexchange.com/users/70641,http://gaming.stackexchange.com/questions/2154...,CULTURE,gaming.stackexchange.com
3,132,Suddenly got an I/O error from my external HDD,I have used my Raspberry Pi as a torrent-serve...,robbannn,https://raspberrypi.stackexchange.com/users/17341,Your Western Digital hard drive is disappearin...,HeatfanJohn,https://raspberrypi.stackexchange.com/users/1311,http://raspberrypi.stackexchange.com/questions...,TECHNOLOGY,raspberrypi.stackexchange.com
4,200,Passenger Name - Flight Booking Passenger only...,I have bought Delhi-London return flights for ...,Amit,https://travel.stackexchange.com/users/29089,I called two persons who work for Saudia (tick...,Nean Der Thal,https://travel.stackexchange.com/users/10051,http://travel.stackexchange.com/questions/4704...,CULTURE,travel.stackexchange.com


## Outlier

In [0]:
train.question_not_really_a_question.value_counts()

0.000000    6013
0.333333      48
0.500000      11
0.666667       4
1.000000       3
Name: question_not_really_a_question, dtype: int64

In [0]:
train.question_type_spelling.value_counts()

0.000000    6068
0.333333       7
0.666667       4
Name: question_type_spelling, dtype: int64

In [0]:
# dict_not_really = {
#     0.000000: 0, 
#     0.333333: 1,
#     0.500000: 1,
#     0.666667: 1,
#     1.000000: 1,
# }

# dict_spelling = {
#     0.000000: 0, 
#     0.333333: 1, 
#     0.666667: 1,
# }

In [0]:
train['label_not_really'] = train.question_not_really_a_question.apply(lambda x: 1 if x>0 else 0)
train['label_spelling'] = train.question_type_spelling.apply(lambda x: 1 if x>0 else 0)

print(train.label_not_really.value_counts())
train.label_spelling.value_counts()

0    6013
1      66
Name: label_not_really, dtype: int64


0    6068
1      11
Name: label_spelling, dtype: int64

In [0]:
target_columns = ['question_not_really_a_question']

## Preprocessing

In [0]:
import re
from flashtext import KeywordProcessor

In [0]:
PUNCTS = {
            '》', '〞', '¢', '‹', '╦', '║', '♪', 'Ø', '╩', '\\', '★', '＋', 'ï', '<', '?', '％', '+', '„', 'α', '*', '〰', '｟', '¹', '●', '〗', ']', '▾', '■', '〙', '↓', '´', '【', 'ᴵ',
            '"', '）', '｀', '│', '¤', '²', '‡', '¿', '–', '」', '╔', '〾', '%', '¾', '←', '〔', '＿', '’', '-', ':', '‧', '｛', 'β', '（', '─', 'à', 'â', '､', '•', '；', '☆', '／', 'π',
            'é', '╗', '＾', '▪', ',', '►', '/', '〚', '¶', '♦', '™', '}', '″', '＂', '『', '▬', '±', '«', '“', '÷', '×', '^', '!', '╣', '▲', '・', '░', '′', '〝', '‛', '√', ';', '】', '▼',
            '.', '~', '`', '。', 'ə', '］', '，', '{', '～', '！', '†', '‘', '﹏', '═', '｣', '〕', '〜', '＼', '▒', '＄', '♥', '〛', '≤', '∞', '_', '[', '＆', '→', '»', '－', '＝', '§', '⋅', 
            '▓', '&', 'Â', '＞', '〃', '|', '¦', '—', '╚', '〖', '―', '¸', '³', '®', '｠', '¨', '‟', '＊', '£', '#', 'Ã', "'", '▀', '·', '？', '、', '█', '”', '＃', '⊕', '=', '〟', '½', '』',
            '［', '$', ')', 'θ', '@', '›', '＠', '｝', '¬', '…', '¼', '：', '¥', '❤', '€', '−', '＜', '(', '〘', '▄', '＇', '>', '₤', '₹', '∅', 'è', '〿', '「', '©', '｢', '∙', '°', '｜', '¡', 
            '↑', 'º', '¯', '♫', '#'
          }


mispell_dict = {"aren't" : "are not", "can't" : "cannot", "couldn't" : "could not",
"couldnt" : "could not", "didn't" : "did not", "doesn't" : "does not",
"doesnt" : "does not", "don't" : "do not", "hadn't" : "had not", "hasn't" : "has not",
"haven't" : "have not", "havent" : "have not", "he'd" : "he would", "he'll" : "he will", "he's" : "he is", "i'd" : "I would",
"i'd" : "I had", "i'll" : "I will", "i'm" : "I am", "isn't" : "is not", "it's" : "it is",
"it'll":"it will", "i've" : "I have", "let's" : "let us", "mightn't" : "might not", "mustn't" : "must not", 
"shan't" : "shall not", "she'd" : "she would", "she'll" : "she will", "she's" : "she is", "shouldn't" : "should not", "shouldnt" : "should not",
"that's" : "that is", "thats" : "that is", "there's" : "there is", "theres" : "there is", "they'd" : "they would", "they'll" : "they will",
"they're" : "they are", "theyre":  "they are", "they've" : "they have", "we'd" : "we would", "we're" : "we are", "weren't" : "were not",
"we've" : "we have", "what'll" : "what will", "what're" : "what are", "what's" : "what is", "what've" : "what have", "where's" : "where is",
"who'd" : "who would", "who'll" : "who will", "who're" : "who are", "who's" : "who is", "who've" : "who have", "won't" : "will not", "wouldn't" : "would not", "you'd" : "you would",
"you'll" : "you will", "you're" : "you are", "you've" : "you have", "'re": " are", "wasn't": "was not", "we'll":" will", "didn't": "did not", "tryin'":"trying"}


kp = KeywordProcessor(case_sensitive=True)
for k, v in mispell_dict.items():
    kp.add_keyword(k, v)

def clean_punct(text):
    text = str(text)
    for punct in PUNCTS:
        text = text.replace(punct, ' {} '.format(punct))
    return text


def preprocessing(text):
    text = text.lower()
    text = re.sub(r'(\&lt)|(\&gt)', ' ', text)
    
    text = re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', ' url ', text)
    text = kp.replace_keywords(text)
    text = clean_punct(text)
    text = re.sub(r'\n\r', ' ', text)
    text = re.sub(r'\s{2,}', ' ', text)
    
    return text

In [0]:
category_list = ['CULTURE', 'LIFE_ARTS', 'SCIENCE', 'STACKOVERFLOW', 'TECHNOLOGY']

host_list = ['academia.stackexchange.com', 'android.stackexchange.com',
       'anime.stackexchange.com', 'apple.stackexchange.com',
       'askubuntu.com', 'bicycles.stackexchange.com',
       'biology.stackexchange.com', 'blender.stackexchange.com',
       'boardgames.stackexchange.com', 'chemistry.stackexchange.com',
       'christianity.stackexchange.com', 'codereview.stackexchange.com',
       'cooking.stackexchange.com', 'crypto.stackexchange.com',
       'cs.stackexchange.com', 'dba.stackexchange.com',
       'diy.stackexchange.com', 'drupal.stackexchange.com',
       'dsp.stackexchange.com', 'electronics.stackexchange.com',
       'ell.stackexchange.com', 'english.stackexchange.com',
       'expressionengine.stackexchange.com', 'gamedev.stackexchange.com',
       'gaming.stackexchange.com', 'gis.stackexchange.com',
       'graphicdesign.stackexchange.com', 'judaism.stackexchange.com',
       'magento.stackexchange.com', 'math.stackexchange.com',
       'mathematica.stackexchange.com', 'mathoverflow.net',
       'mechanics.stackexchange.com', 'meta.askubuntu.com',
       'meta.christianity.stackexchange.com',
       'meta.codereview.stackexchange.com', 'meta.math.stackexchange.com',
       'meta.stackexchange.com', 'meta.superuser.com',
       'money.stackexchange.com', 'movies.stackexchange.com',
       'music.stackexchange.com', 'photo.stackexchange.com',
       'physics.stackexchange.com', 'programmers.stackexchange.com',
       'raspberrypi.stackexchange.com', 'robotics.stackexchange.com',
       'rpg.stackexchange.com', 'salesforce.stackexchange.com',
       'scifi.stackexchange.com', 'security.stackexchange.com',
       'serverfault.com', 'sharepoint.stackexchange.com',
       'softwarerecs.stackexchange.com', 'stackoverflow.com',
       'stats.stackexchange.com', 'superuser.com',
       'tex.stackexchange.com', 'travel.stackexchange.com',
       'unix.stackexchange.com', 'ux.stackexchange.com',
       'webapps.stackexchange.com', 'webmasters.stackexchange.com',
       'wordpress.stackexchange.com']

## Dataset

In [0]:
MAX_LEN = 512
#MAX_Q_LEN = 250
#MAX_A_LEN = 259
SEP_TOKEN_ID = 102 # bert-base-uncasedにおけるvocabの'[SEP']が、102番目という意味

class QuestDataset(torch.utils.data.Dataset):
    def __init__(self, df, train_mode=True, labeled=True):
        self.df = df
        self.train_mode = train_mode
        self.labeled = labeled
        #self.tokenizer = BertTokenizer.from_pretrained(BERT_DIR+'/bert-base-uncased')
        #self.tokenizer = BertTokenizer.from_pretrained('../input/bert-base-uncased/')
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    def __getitem__(self, index):
        """
        token_id列
        segment_id列
        label列
        """
        row = self.df.iloc[index]
        token_ids, seg_ids = self.get_token_ids(row)
        if self.labeled:
            labels = self.get_label(row)
            return token_ids, seg_ids, torch.tensor(row.category), torch.tensor(row.host), labels
        else:
            return token_ids, seg_ids, torch.tensor(row.category), torch.tensor(row.host)

    def __len__(self):
        return len(self.df)


#     def select_tokens(self, tokens, max_num):
#         if len(tokens) <= max_num:
#             return tokens
#         if self.train_mode:
#             num_remove = len(tokens) - max_num
#             remove_start = random.randint(0, len(tokens)-num_remove-1)
#             return tokens[:remove_start] + tokens[remove_start + num_remove:]
#         else:
#             return tokens[:max_num//2] + tokens[-(max_num - max_num//2):]

    def trim_input(self, title, question, answer, max_sequence_length=MAX_LEN, 
                t_max_len=30, q_max_len=239, a_max_len=239):
        """
        title. question, answerそれぞれのセンテンスを、tokenizeする
        max_lengthに足りない分は、
        """
        t = self.tokenizer.tokenize(title)
        q = self.tokenizer.tokenize(question)
        a = self.tokenizer.tokenize(answer)

        t_len = len(t)
        q_len = len(q)
        a_len = len(a)

        if (t_len+q_len+a_len+4) > max_sequence_length:

            if t_max_len > t_len:
                """
                titleが短い場合、
                最大長に足りない長さを、半分ずつqとaに加える
                """
                t_new_len = t_len
                a_max_len = a_max_len + math.floor((t_max_len - t_len)/2) # 切り捨て
                q_max_len = q_max_len + math.ceil((t_max_len - t_len)/2) # 切り上げ
            else:
                """
                titleが長い場合、最大長で切る
                """
                t_new_len = t_max_len

            if a_max_len > a_len:
                """
                answerに加えても短い場合、
                最大長に足りない長さを、qに加える
                """
                a_new_len = a_len 
                q_new_len = q_max_len + (a_max_len - a_len)
            elif q_max_len > q_len:
                a_new_len = a_max_len + (q_max_len - q_len)
                q_new_len = q_len
            else:
                a_new_len = a_max_len
                q_new_len = q_max_len


            if t_new_len+a_new_len+q_new_len+4 != max_sequence_length:
                raise ValueError("New sequence length should be %d, but is %d" 
                                 % (max_sequence_length, (t_new_len+a_new_len+q_new_len+4)))

            t = t[:t_new_len]
            q = q[:q_new_len]
            a = a[:a_new_len]

        return t, q, a
        
    def get_token_ids(self, row):
        t_tokens, q_tokens, a_tokens = self.trim_input(row.question_title, row.question_body, row.answer)
        
        # BERTの入力タイプに変換([CLS]と[SEP]をつないで、１つのsetentenceに)
        tokens = ['[CLS]'] + t_tokens + ['[SEP]'] + q_tokens + ['[SEP]'] + a_tokens + ['[SEP]']
        token_ids = self.tokenizer.convert_tokens_to_ids(tokens)
        if len(token_ids) < MAX_LEN:
            """0で後ろからpadding"""
            token_ids += [0] * (MAX_LEN - len(token_ids))
        ids = torch.tensor(token_ids)
        seg_ids = self.get_seg_ids(ids)  # segment_embを区別するindex
        return ids, seg_ids
    
    def get_seg_ids(self, ids):
        """
        いくつめの文かを区別するsegment_idを、各文字に振る
        """
        seg_ids = torch.zeros_like(ids) # [max_len]のtorch_tensor
        seg_idx = 0
        first_sep = True
        for i, e in enumerate(ids):
            seg_ids[i] = seg_idx
            if e == SEP_TOKEN_ID: # [SEP]の場合
                if first_sep:
                    first_sep = False
                else:
                    seg_idx = 1
        pad_idx = torch.nonzero(ids == 0)  # bert-base_uncasedのvocabで、[PAD]は0番目であるので、PADの部分のindexだけ抽出
        seg_ids[pad_idx] = 0

        return seg_ids

    def get_label(self, row):
        #print(row[target_columns].values)
        return torch.tensor(row[target_columns].values.astype(np.float32))

    def collate_fn(self, batch):
        """
        labelデータを持つモードと、ない完全な推論モードでは、batchのshapeが異なるので(labelが2番目の要素にあるなし)
        """
        token_ids = torch.stack([x[0] for x in batch])
        seg_ids = torch.stack([x[1] for x in batch])
        category = torch.stack([x[2] for x in batch])
        host = torch.stack([x[3] for x in batch])
    
        if self.labeled:
            labels = torch.stack([x[-1] for x in batch])
            return token_ids, seg_ids, category, host, labels
        else:
            return token_ids, seg_ids, category, host

In [0]:
def get_train_val_loaders(batch_size=4, val_batch_size=4, ifold=0):
    #df = pd.read_csv(f'{DATA_DIR}/train.csv')
    df = train.copy()

    # cleaning
    df['question_title'] = df['question_title'].apply(lambda x : preprocessing(x))
    df['question_body'] = df['question_body'].apply(lambda x : preprocessing(x))
    df['answer'] = df['answer'].apply(lambda x : preprocessing(x))
    
    # label encode
    le_category = LabelEncoder()
    le_category.fit(category_list)
    for c in set(df.category):
        if c not in category_list:
            df.category = df.category.replace(c, np.nan)
            df.category = df.category.fillna(train.category.mode()[0])
    df.category = le_category.transform(df.category)

    
    le_host = LabelEncoder()
    le_host.fit(host_list)
    for c in set(df.host):
        if c not in host_list:
            df.host = df.host.replace(c, np.nan)
            df.host = df.host.fillna(train.host.mode()[0])
    df.host = le_host.transform(df.host)


    df = shuffle(df, random_state=1234)
    #gkf = GroupKFold(n_splits=5).split(X=df.question_body, groups=df.question_body)
    gkf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42).split(X=df.question_body, y=df['label_not_really'].values)
    
    for fold, (train_idx, valid_idx) in enumerate(gkf):
        if fold == ifold:
            df_train = df.iloc[train_idx]
            df_val = df.iloc[valid_idx]
            break

    print('train', df_train.shape)
    print('val', df_val.shape)

    ds_train = QuestDataset(df_train, train_mode=True)
    train_loader = torch.utils.data.DataLoader(ds_train, batch_size=batch_size, shuffle=True, num_workers=0, collate_fn=ds_train.collate_fn, drop_last=True)
    train_loader.num = len(df_train)

    ds_val = QuestDataset(df_val, train_mode=False)
    val_loader = torch.utils.data.DataLoader(ds_val, batch_size=val_batch_size, shuffle=False, num_workers=0, collate_fn=ds_val.collate_fn, drop_last=False)
    val_loader.num = len(df_val)
    val_loader.df = df_val

    return train_loader, val_loader, df_val.shape[0]


def get_test_loader(batch_size=4):
    #df = pd.read_csv(f'{DATA_DIR}/test.csv')
    df = test.copy()


    # cleaning
    df['question_title'] = df['question_title'].apply(lambda x : preprocessing(x))
    df['question_body'] = df['question_body'].apply(lambda x : preprocessing(x))
    df['answer'] = df['answer'].apply(lambda x : preprocessing(x))
    
    # label encode
    le_category = LabelEncoder()
    le_category.fit(category_list)
    for c in set(df.category):
        if c not in category_list:
            df.category = df.category.replace(c, np.nan)
            df.category = df.category.fillna(train.category.mode()[0])
    df.category = le_category.transform(df.category)
    
    le_host = LabelEncoder()
    le_host.fit(host_list)
    for c in set(df.host):
        if c not in host_list:
            df.host = df.host.replace(c, np.nan)
            df.host = df.host.fillna(train.host.mode()[0])
    df.host = le_host.transform(df.host)


    ds_test = QuestDataset(df, train_mode=False, labeled=False)
    loader = torch.utils.data.DataLoader(ds_test, batch_size=batch_size, shuffle=False, num_workers=0, collate_fn=ds_test.collate_fn, drop_last=False)
    loader.num = len(df)
    
    return loader


def get_train_loader(batch_size=4):
    #df = pd.read_csv(f'{DATA_DIR}/test.csv')
    df = train.copy()


    # cleaning
    df['question_title'] = df['question_title'].apply(lambda x : preprocessing(x))
    df['question_body'] = df['question_body'].apply(lambda x : preprocessing(x))
    df['answer'] = df['answer'].apply(lambda x : preprocessing(x))
    
    # label encode
    le_category = LabelEncoder()
    le_category.fit(category_list)
    for c in set(df.category):
        if c not in category_list:
            df.category = df.category.replace(c, np.nan)
            df.category = df.category.fillna(train.category.mode()[0])
    df.category = le_category.transform(df.category)
    
    le_host = LabelEncoder()
    le_host.fit(host_list)
    for c in set(df.host):
        if c not in host_list:
            df.host = df.host.replace(c, np.nan)
            df.host = df.host.fillna(train.host.mode()[0])
    df.host = le_host.transform(df.host)


    ds_test = QuestDataset(df, train_mode=False, labeled=False)
    loader = torch.utils.data.DataLoader(ds_test, batch_size=batch_size, shuffle=False, num_workers=0, collate_fn=ds_test.collate_fn, drop_last=False)
    loader.num = len(df)
    
    return loader

In [0]:
class QuestModel(nn.Module):
    def __init__(self, n_classes=30):
        super(QuestModel, self).__init__()
        self.model_name = 'QuestModel'
        #self.bert_model = BertModel.from_pretrained(BERT_DIR+'/bert-base-uncased/')
        #self.bert_model = BertModel.from_pretrained('../input/bert-base-uncased/')
        self.bert_model = BertModel.from_pretrained('bert-base-uncased')
        
        self.emb_category = nn.Embedding(len(category_list), 3)
        self.emb_host = nn.Embedding(len(host_list), 5)
        
        self.fc = nn.Linear(768+3+5, n_classes)

    def forward(self, ids, seg_ids, category, host):
        attention_mask = (ids > 0)  # ids==0([PAD])部分だけFalseとなるので、そこだけattention_weightを0に
        layers, pool_out = self.bert_model(input_ids=ids, token_type_ids=seg_ids, attention_mask=attention_mask)
        #print(layers.size())  # (batch_size,sequence_length, 768)
        #print(pool_out.size())  # (batch_size, 768), first token of last layerをいじったもの
        
        out = F.avg_pool1d(layers.transpose(1,2), kernel_size=layers.size()[1]).squeeze()  # sequence方向は中央値だけ抽出
        
        emb_category = self.emb_category(category)
        emb_host = self.emb_host(host)
        
        #print(out.shape)
        #print(emb_category.shape)
        
        out = torch.cat([out, emb_category], dim=-1)
        out = torch.cat([out, emb_host], dim=-1)
        
        #print(out.shape)
        
        out = F.dropout(out, p=0.2, training=self.training)
        
#         out = F.dropout(layers[-1][:, 0, :], p=0.2, training=self.training)
#         out =  F.dropout(pool_out, p=0.2, training=self.training)
        logit = self.fc(out)
        return logit # 単に30種類の出力値を算出
    

In [0]:
def calc_spearman(preds, targets):
    score = 0
    for i in range(targets.shape[1]):
        score += np.nan_to_num(spearmanr(targets[:, i], preds[:, i]).correlation)
    return score/targets.shape[1]

In [0]:
ACCUM_STEPS = 1

In [0]:
def create_model(model_file):
    model = QuestModel(n_classes=1).to(device)
    model.load_state_dict(torch.load(model_file))
    model = model
    #model = DataParallel(model)
    return model

def create_models():
    models = []
    for fold in range(config.fold):
        model = create_model('/content/drive/My Drive/Colab Notebooks/GoogleQuest/best_param_score_{}_{}.pt'.format(config.name,(fold+1)))
        model.eval()
        models.append(model)
    return models
  
def predict(models, test_loader):
    all_scores = []
    with torch.no_grad():
        for ids, seg_ids, category, host in tqdm(test_loader, total=test_loader.num // test_loader.batch_size):
            ids, seg_ids, category, host = ids.cuda(), seg_ids.cuda(), category.cuda(), host.cuda()
            scores = []
            for model in models:
                outputs = torch.sigmoid(model(ids, seg_ids, category, host)).cpu()
                scores.append(outputs)
            all_scores.append(torch.mean(torch.stack(scores), 0))

    all_scores = torch.cat(all_scores, 0).numpy()
    
    return all_scores

In [0]:
train_loader = get_train_loader(batch_size=32)
models = create_models()

preds = predict(models, train_loader)
train['pred_not_really'] = preds.reshape(-1,)

HBox(children=(IntProgress(value=0, description='Downloading', max=361, style=ProgressStyle(description_width=…




190it [11:10,  3.46s/it]                         


In [0]:
train['pred_not_really'] = preds.reshape(-1,)

In [0]:
del models
torch.cuda.empty_cache()
gc.collect()

444

## Prediction

In [0]:
config = PipeLineConfig(lr=1e-5, \
                        warmup=0.01, \
                        epochs=20, \
                        patience=3, \
                        batch_size=12, \
                        seed=42, \
                        name='Prediction_not_really', \
                        question_weight=0.5, \
                        answer_weight=0.5, \
                        fold=5, \
                        train=True
                       )

In [0]:
MAX_LEN = 512
#MAX_Q_LEN = 250
#MAX_A_LEN = 259
SEP_TOKEN_ID = 102 # bert-base-uncasedにおけるvocabの'[SEP']が、102番目という意味

class QuestDataset(torch.utils.data.Dataset):
    def __init__(self, df, train_mode=True, labeled=True):
        self.df = df
        self.train_mode = train_mode
        self.labeled = labeled
        #self.tokenizer = BertTokenizer.from_pretrained(BERT_DIR+'/bert-base-uncased')
        #self.tokenizer = BertTokenizer.from_pretrained('../input/bert-base-uncased/')
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    def __getitem__(self, index):
        """
        token_id列
        segment_id列
        label列
        """
        row = self.df.iloc[index]
        token_ids, seg_ids = self.get_token_ids(row)
        if self.labeled:
            labels = self.get_label(row)
            return token_ids, seg_ids, torch.tensor(row.category), torch.tensor(row.host), torch.tensor(row.pred_not_really, dtype=torch.float), labels
        else:
            return token_ids, seg_ids, torch.tensor(row.category), torch.tensor(row.host), torch.tensor(row.pred_not_really, dtype=torch.float)

    def __len__(self):
        return len(self.df)


#     def select_tokens(self, tokens, max_num):
#         if len(tokens) <= max_num:
#             return tokens
#         if self.train_mode:
#             num_remove = len(tokens) - max_num
#             remove_start = random.randint(0, len(tokens)-num_remove-1)
#             return tokens[:remove_start] + tokens[remove_start + num_remove:]
#         else:
#             return tokens[:max_num//2] + tokens[-(max_num - max_num//2):]

    def trim_input(self, title, question, answer, max_sequence_length=MAX_LEN, 
                t_max_len=30, q_max_len=239, a_max_len=239):
        """
        title. question, answerそれぞれのセンテンスを、tokenizeする
        max_lengthに足りない分は、
        """
        t = self.tokenizer.tokenize(title)
        q = self.tokenizer.tokenize(question)
        a = self.tokenizer.tokenize(answer)

        t_len = len(t)
        q_len = len(q)
        a_len = len(a)

        if (t_len+q_len+a_len+4) > max_sequence_length:

            if t_max_len > t_len:
                """
                titleが短い場合、
                最大長に足りない長さを、半分ずつqとaに加える
                """
                t_new_len = t_len
                a_max_len = a_max_len + math.floor((t_max_len - t_len)/2) # 切り捨て
                q_max_len = q_max_len + math.ceil((t_max_len - t_len)/2) # 切り上げ
            else:
                """
                titleが長い場合、最大長で切る
                """
                t_new_len = t_max_len

            if a_max_len > a_len:
                """
                answerに加えても短い場合、
                最大長に足りない長さを、qに加える
                """
                a_new_len = a_len 
                q_new_len = q_max_len + (a_max_len - a_len)
            elif q_max_len > q_len:
                a_new_len = a_max_len + (q_max_len - q_len)
                q_new_len = q_len
            else:
                a_new_len = a_max_len
                q_new_len = q_max_len


            if t_new_len+a_new_len+q_new_len+4 != max_sequence_length:
                raise ValueError("New sequence length should be %d, but is %d" 
                                 % (max_sequence_length, (t_new_len+a_new_len+q_new_len+4)))

            t = t[:t_new_len]
            q = q[:q_new_len]
            a = a[:a_new_len]

        return t, q, a
        
    def get_token_ids(self, row):
        t_tokens, q_tokens, a_tokens = self.trim_input(row.question_title, row.question_body, row.answer)
        
        # BERTの入力タイプに変換([CLS]と[SEP]をつないで、１つのsetentenceに)
        tokens = ['[CLS]'] + t_tokens + ['[SEP]'] + q_tokens + ['[SEP]'] + a_tokens + ['[SEP]']
        token_ids = self.tokenizer.convert_tokens_to_ids(tokens)
        if len(token_ids) < MAX_LEN:
            """0で後ろからpadding"""
            token_ids += [0] * (MAX_LEN - len(token_ids))
        ids = torch.tensor(token_ids)
        seg_ids = self.get_seg_ids(ids)  # segment_embを区別するindex
        return ids, seg_ids
    
    def get_seg_ids(self, ids):
        """
        いくつめの文かを区別するsegment_idを、各文字に振る
        """
        seg_ids = torch.zeros_like(ids) # [max_len]のtorch_tensor
        seg_idx = 0
        first_sep = True
        for i, e in enumerate(ids):
            seg_ids[i] = seg_idx
            if e == SEP_TOKEN_ID: # [SEP]の場合
                if first_sep:
                    first_sep = False
                else:
                    seg_idx = 1
        pad_idx = torch.nonzero(ids == 0)  # bert-base_uncasedのvocabで、[PAD]は0番目であるので、PADの部分のindexだけ抽出
        seg_ids[pad_idx] = 0

        return seg_ids

    def get_label(self, row):
        #print(row[target_columns].values)
        return torch.tensor(row[target_columns].values.astype(np.float32))

    def collate_fn(self, batch):
        """
        labelデータを持つモードと、ない完全な推論モードでは、batchのshapeが異なるので(labelが2番目の要素にあるなし)
        """
        token_ids = torch.stack([x[0] for x in batch])
        seg_ids = torch.stack([x[1] for x in batch])
        category = torch.stack([x[2] for x in batch])
        host = torch.stack([x[3] for x in batch])
        pred = torch.stack([x[4] for x in batch])
    
        if self.labeled:
            labels = torch.stack([x[-1] for x in batch])
            return token_ids, seg_ids, category, host, pred, labels
        else:
            return token_ids, seg_ids, category, host, pred

In [0]:
class QuestModel(nn.Module):
    def __init__(self, n_classes=30):
        super(QuestModel, self).__init__()
        self.model_name = 'QuestModel'
        #self.bert_model = BertModel.from_pretrained(BERT_DIR+'/bert-base-uncased/')
        #self.bert_model = BertModel.from_pretrained('../input/bert-base-uncased/')
        self.bert_model = BertModel.from_pretrained('bert-base-uncased')
        
        self.emb_category = nn.Embedding(len(category_list), 3)
        self.emb_host = nn.Embedding(len(host_list), 5)
        
        self.fc = nn.Linear(768+3+5+1, n_classes)

    def forward(self, ids, seg_ids, category, host, pred):
        attention_mask = (ids > 0)  # ids==0([PAD])部分だけFalseとなるので、そこだけattention_weightを0に
        layers, pool_out = self.bert_model(input_ids=ids, token_type_ids=seg_ids, attention_mask=attention_mask)
        #print(layers.size())  # (batch_size,sequence_length, 768)
        #print(pool_out.size())  # (batch_size, 768), first token of last layerをいじったもの
        
        out = F.avg_pool1d(layers.transpose(1,2), kernel_size=layers.size()[1]).squeeze()  # sequence方向は中央値だけ抽出
        
        emb_category = self.emb_category(category)
        emb_host = self.emb_host(host)
        
        #print(out.shape)
        #print(emb_category.shape)
        
        out = torch.cat([out, emb_host], dim=-1)
        out = torch.cat([out, emb_category], dim=-1)
        
        # print(pred.size())
        # print(out.size())
        out = torch.cat([out, pred.unsqueeze(1)], dim=-1)
        
        #print(out.shape)
        
        out = F.dropout(out, p=0.2, training=self.training)
        
#         out = F.dropout(layers[-1][:, 0, :], p=0.2, training=self.training)
#         out =  F.dropout(pool_out, p=0.2, training=self.training)
        logit = self.fc(out)
        return logit # 単に30種類の出力値を算出

In [0]:
def train_model(train_loader, optimizer, criterion, scheduler):
    model.train()
    avg_loss = 0.    
    for idx, batch in enumerate(tqdm(train_loader)):
        ids_train, seg_ids_train, category_train, host_train, pred_train, label_ids_train = batch[0].to(device), batch[1].to(device), batch[2].to(device), batch[3].to(device), batch[4].to(device), batch[5].to(device)
        
        #print(host_train)
        
        logits = model(ids_train, seg_ids_train, category_train, host_train, pred_train)
        #logits = torch.sigmoid(model(ids_train, seg_ids_train))
        
        loss = criterion(logits, label_ids_train)
        #loss = config.question_weight*criterion(logits[:,0:21], label_ids_train[:,0:21]) + config.answer_weight*criterion(logits[:,21:30], label_ids_train[:,21:30])
        
        loss.backward()
        
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()
        
        avg_loss += loss.item() / len(train_loader)
        del ids_train, seg_ids_train, label_ids_train

    torch.cuda.empty_cache()
    gc.collect()
    return avg_loss

def val_model(val_loader, val_length, batch_size=8):
    model.eval() # eval mode  
    avg_val_loss = 0.
    
    valid_preds = np.zeros(val_length)
    original = np.zeros(val_length)
    
    with torch.no_grad():
        for idx, batch in enumerate(tqdm(val_loader)):
            ids_val, seg_ids_val, category_val, host_val, pred_val, labels = batch[0].to(device), batch[1].to(device), batch[2].to(device), batch[3].to(device), batch[4].to(device), batch[5].to(device)
            
            logits = torch.sigmoid(model(ids_val, seg_ids_val, category_val, host_val, pred_val))
            
            avg_val_loss += criterion(logits, labels).item() / len(val_loader)
            valid_preds[idx*batch_size : (idx+1)*batch_size] = logits.detach().cpu().squeeze().numpy()
            original[idx*batch_size : (idx+1)*batch_size]    = labels.detach().cpu().squeeze().numpy()
        
        preds = torch.tensor(valid_preds).numpy()
        #preds = torch.sigmoid(torch.tensor(valid_preds)).numpy()
        
        rho_val = spearmanr(original, preds).correlation
        print('\r val_spearman-rho: %s' % (str(round(rho_val, 5))), end = 100*' '+'\n')
        
        # for i in range(len(target_columns)):
        #     print(i, spearmanr(original[:,i], preds[:,i]))
        score = np.nan_to_num(spearmanr(original, preds).correlation)
        
    
    return avg_val_loss, score

In [0]:
model = QuestModel(n_classes=1).to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=3e-5, eps=4e-5)
criterion = nn.BCEWithLogitsLoss()

In [0]:
for fold in range(config.fold):
    print('---%d-Fold---'%(fold+1))
    
    patience = 0
    best_loss   = 100.0
    best_score      = -1.
    best_preds = 0
    best_param_loss = None
    best_param_score = None
    
    for epoch in range(config.epochs):
        
        torch.cuda.empty_cache()
        start_time   = time.time()
        
        train_loader, val_loader, val_length = get_train_val_loaders(batch_size=config.batch_size, val_batch_size=config.batch_size, ifold=fold)
        scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=config.warmup, num_training_steps= config.epochs*len(train_loader)//ACCUM_STEPS)
        
        loss_train = train_model(train_loader, optimizer, criterion, scheduler)
        loss_val, score_val = val_model(val_loader, val_length, batch_size=config.batch_size)
        print(f'Epoch {(epoch+1)}, train_loss: {loss_train}, val_loss: {loss_val}, score_val: {score_val}, time: {(time.time()-start_time)}')
        

        if score_val > best_score:
            best_score = score_val
            best_param_score = model.state_dict()
            print('best_param_score_{}_{}.pt'.format(config.name ,fold+1))
            torch.save(best_param_score, '/content/drive/My Drive/Colab Notebooks/GoogleQuest/best_param_score_{}_{}.pt'.format(config.name ,fold+1))
        else:
            patience += 1
            if patience >= config.patience:
                del train_loader, val_loader, loss_train, loss_val, score_val
                torch.cuda.empty_cache()
                gc.collect()
                break
    
        del train_loader, val_loader, loss_train, loss_val, score_val
        torch.cuda.empty_cache()
        gc.collect()
        
    model.load_state_dict(best_param_score)
    print('best_param_score_{}_{}.pt'.format(config.name ,fold+1))
    torch.save(best_param_score, '/content/drive/My Drive/Colab Notebooks/GoogleQuest/best_param_score_{}_{}.pt'.format(config.name ,fold+1))   

    torch.cuda.empty_cache()
    gc.collect()

---1-Fold---
train (4863, 44)
val (1216, 44)


100%|██████████| 405/405 [06:44<00:00,  1.00s/it]
100%|██████████| 102/102 [00:50<00:00,  2.40it/s]


 val_spearman-rho: 0.02522                                                                                                    
Epoch 1, train_loss: 0.03797166354229884, val_loss: 0.6943009706104505, score_val: 0.02521883379619639, time: 462.05274844169617
best_param_score_Prediction_not_really_1.pt
train (4863, 44)
val (1216, 44)


 17%|█▋        | 70/405 [01:10<05:29,  1.02it/s]