# ライブラリー

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import re
import itertools
import nltk
import os
import gc
import warnings
warnings.filterwarnings('ignore')
import random
from tqdm import tqdm
import pickle

from sklearn.model_selection import StratifiedKFold
import torch
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from torch import nn
import tokenizers
from transformers import RobertaConfig, RobertaTokenizer, RobertaModel, RobertaForSequenceClassification
from transformers import BertConfig, BertTokenizer, BertModel



# SEEDの固定

In [2]:
def seed_everything(seed_value):
    """Seed every random number generator used in this notebook.

    Seeds Python's ``random`` module, NumPy's legacy global RNG and PyTorch
    (CPU and, when available, all CUDA devices), and configures cuDNN for
    reproducible behaviour.

    Args:
        seed_value: Integer seed applied to all generators.
    """
    random.seed(seed_value)
    np.random.seed(seed_value)
    torch.manual_seed(seed_value)
    # Note: changing PYTHONHASHSEED at runtime only affects Python processes
    # started afterwards, not the hash randomisation of the current kernel.
    os.environ['PYTHONHASHSEED'] = str(seed_value)

    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed_value)
        torch.cuda.manual_seed_all(seed_value)

    # These are plain settings and are harmless when CUDA is absent.
    # benchmark must be False: with benchmark=True cuDNN auto-tunes and may
    # pick different kernels between runs, which defeats `deterministic`.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

# Seed all random number generators once, before any data or model is built.
seed = 42
seed_everything(seed_value=seed)

# TestDataの準備

In [3]:
# Common prefix of the input dataset directory; the version suffix ("1") is
# appended where the path is used.
DATA_PATH = '../input/tweet-disaster-original-train-ver'

# Load the pickled test DataFrame. The context manager guarantees the file
# handle is closed even if unpickling fails.
# NOTE: pickle can execute arbitrary code while loading; only use this with
# files from a trusted source.
with open(f'{DATA_PATH}1/test_df.pkl', 'rb') as test_file:
    test_df = pickle.load(test_file)
test_df

Unnamed: 0,id,keyword,text
0,0,other,"[just, happened, a, terrible, car, crash]"
1,2,other,"[heard, about, #, earthquake, is, different, c..."
2,3,other,"[there, is, a, forest, fire, at, spot, pond,, ..."
3,9,other,"[apocalypse, lighting, ., #, spokane, #, wildf..."
4,11,other,"[typhoon, soudelor, kills, 28, in, china, and,..."
...,...,...,...
3258,10861,other,"[earthquake, safety, los, angeles, safety, fas..."
3259,10865,other,"[storm, in, ri, worse, than, last, hurricane, ..."
3260,10868,other,"[green, line, derailment, in, chicago]"
3261,10874,other,"[meg, issues, hazardous, weather, outlook, (, ..."


In [4]:
class TweetDataset(Dataset):
    """Dataset that converts tweet rows into fixed-length token id tensors.

    Every row of ``df`` is read through its ``keyword`` and ``text`` entries;
    ``text`` is a sequence of tokens that is joined with single spaces before
    being encoded. When ``flag`` is ``'train'`` the row's ``target`` entry is
    returned as well.

    Args:
        df: DataFrame holding the rows to serve.
        max_len: Length every id sequence is padded or truncated to.
        flag: ``'train'`` to include the target in each sample; any other
            value yields inputs only.
    """

    def __init__(self, df, max_len=50, flag='train'):
        self.df, self.max_len, self.flag = df, max_len, flag
        self.tokenizer = RobertaTokenizer.from_pretrained('roberta-large')

    def __len__(self):
        """Number of rows in the wrapped DataFrame."""
        return len(self.df)

    def __getitem__(self, index):
        """Return a dict with ``ids`` and ``masks`` (plus ``target`` in train mode)."""
        row = self.df.iloc[index]
        input_ids, attention = self.get_input_data(row)

        sample = {'ids': input_ids, 'masks': attention}
        if self.flag == 'train':
            sample['target'] = torch.tensor(row['target'])
        return sample

    def get_input_data(self, row):
        """Encode one row into ``(ids, masks)`` tensors of length ``max_len``.

        The keyword encoding is kept whole and followed by the text encoding
        without its first id. The result is right-padded with id 1 or cut to
        ``max_len``; the mask is 0 wherever the id equals 1 and 1 elsewhere.
        """
        # Value used for padding (presumably RoBERTa's <pad> id; verify against the tokenizer).
        pad_id = 1

        sentence_ids = self.tokenizer.encode(' '.join(row['text']))
        keyword_ids = self.tokenizer.encode(row['keyword'])

        # Intended layout: [CLS][keyword][SEP][text][SEP]
        merged = keyword_ids + sentence_ids[1:]

        # Cut to max_len, then fill whatever is still missing with padding.
        missing = self.max_len - len(merged)
        merged = merged[:self.max_len] + [pad_id] * max(missing, 0)

        id_tensor = torch.tensor(merged)
        mask_tensor = (id_tensor != pad_id).long()
        return id_tensor, mask_tensor

In [5]:
def get_test_loader(df, batch_size=8):
    """Build a non-shuffled DataLoader over ``df`` for inference.

    Args:
        df: DataFrame passed to ``TweetDataset`` with ``flag='test'``, so the
            samples carry no target.
        batch_size: Number of samples per batch.

    Returns:
        A ``DataLoader`` that yields batches in the row order of ``df``.
    """
    inference_dataset = TweetDataset(df, flag='test')
    return DataLoader(inference_dataset, batch_size=batch_size, shuffle=False)

# Build the inference loader, then drop the notebook-level name of the frame.
# The dataset inside the loader keeps its own reference to the data.
test_loader = get_test_loader(df=test_df)

del test_df
gc.collect()

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=898823.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=456318.0, style=ProgressStyle(descripti…




29

# pre-trained model 読み込み

In [6]:
def create_model():
    """Instantiate a two-label ``roberta-large`` sequence classifier.

    The model is created from the ``roberta-large`` checkpoint with attention
    and hidden-state outputs disabled, and is returned in training mode;
    callers that only run inference must switch it to ``eval()`` themselves.
    """
    head_options = {
        'num_labels': 2,
        'output_attentions': False,
        'output_hidden_states': False,
    }
    classifier = RobertaForSequenceClassification.from_pretrained('roberta-large', **head_options)
    classifier.train()
    return classifier

In [7]:
def _predict_logits(model, loader):
    """Run ``model`` over every batch of ``loader`` and return all logits.

    Args:
        model: Classifier already placed on the GPU and in eval mode.
        loader: DataLoader yielding dicts with ``ids`` and ``masks`` tensors.

    Returns:
        A single NumPy array with the logits of all batches concatenated
        along the first axis, in loader order.
    """
    batch_logits = []
    with torch.no_grad():
        for data in loader:
            ids = data['ids'].cuda()
            masks = data['masks'].cuda()
            output = model(ids, attention_mask=masks)
            # The first element of the model output holds the logits.
            batch_logits.append(output[0].cpu().numpy())
    return np.concatenate(batch_logits, axis=0)


predictions = []
models = []

# Restore the fine-tuned weights of each fold into a freshly created model.
for fold in range(1, 4):
    model = create_model()
    model.cuda().eval()
    model.load_state_dict(torch.load(f'{DATA_PATH}1/roberta_fold{fold}.pth'))
    models.append(model)

for model in models:
    predictions.append(_predict_logits(model, test_loader))

# Average the raw outputs of the fold models (not their 0/1 decisions), then
# pick the most likely class. Stacking the per-fold arrays with NumPy avoids
# building a tensor from nested Python lists of small arrays, which is slow.
predictions = torch.from_numpy(np.stack(predictions))
predictions = torch.mean(predictions, dim=0)
predictions = torch.softmax(predictions, dim=1)
predictions = torch.argmax(predictions, dim=1)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=482.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1425941629.0, style=ProgressStyle(descr…




In [8]:
# Fill the competition's sample submission with the ensemble's predicted
# labels and write it out without the index column.
sample_submission_path = '../input/nlp-getting-started/sample_submission.csv'
submission = pd.read_csv(sample_submission_path)
submission['target'] = predictions
submission.to_csv(path_or_buf='submission.csv', index=False)