In [1]:
import numpy as np
import pandas as pd 
import re
import tqdm.notebook as tqdm
import transformers
from transformers import AutoTokenizer
import torch
import torch.optim as optim
import torch.nn as nn
from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
import os

In [2]:
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [3]:
train_data = pd.read_csv('../input/nlp-getting-started/train.csv')
test_data = pd.read_csv('../input/nlp-getting-started/test.csv')

In [4]:
DEVICE = torch.device('cuda')

MAX_LENGTH = 200
DOC_STRIDE = 128
BATCH_SIZE = 4

MODEL_PATH = 'roberta-large'

EPOCHS = 13

In [5]:
def clean_text(text):
    text = re.sub(r'^RT[\s]+', '', text)
    text = re.sub(r'https?://[^\s\n\r]+', '', text)
    text = re.sub(r'#', '', text)
    text = re.sub(r'@\w+', '', text)
    return text

def preprocess_data(data_to_process):
    data = data_to_process.copy()
    
    data['text'] = data['text'].apply(lambda x : clean_text(x))
    
    return data

In [6]:
TOKENIZER = AutoTokenizer.from_pretrained(MODEL_PATH)

Downloading:   0%|          | 0.00/482 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.29M [00:00<?, ?B/s]

In [7]:
p_train_data = preprocess_data(train_data)
p_test_data = preprocess_data(test_data)

In [8]:
p_train_data.head(10)

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this earthquake Ma...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive wildfires evacuation ord...",1
4,7,,,Just got sent this photo from Ruby Alaska as s...,1
5,8,,,RockyFire Update => California Hwy. 20 closed ...,1
6,10,,,flood disaster Heavy rain causes flash floodin...,1
7,13,,,I'm on top of the hill and I can see a fire in...,1
8,14,,,There's an emergency evacuation happening now ...,1
9,15,,,I'm afraid that the tornado is coming to our a...,1


In [9]:
def tokenize_train_data(datas):
    datas = datas.reset_index().to_dict(orient='index')
    
    tokenized_datas = {
                        'input_ids' : [],
                        'attention_mask' : [],
                        'target':[]
                      }
    for data_idx in tqdm.tqdm_notebook(range(len(datas))):
        data = datas[data_idx]

        tokenized_data = TOKENIZER(text = data['text'],
                                   padding='max_length', 
                                   max_length=MAX_LENGTH, 
                                   truncation = 'only_first',
                                   stride=DOC_STRIDE
                                      )
        tokenized_datas["input_ids"].append(tokenized_data["input_ids"])
        tokenized_datas["attention_mask"].append(tokenized_data["attention_mask"])
        tokenized_datas["target"].append(data['target'])
    return tokenized_datas

def tokenize_test_data(datas):
    datas = datas.reset_index().to_dict(orient='index')
    
    tokenized_datas = {
                        'input_ids' : [],
                        'attention_mask' : []
                      }
    for data_idx in tqdm.tqdm_notebook(range(len(datas))):
        data = datas[data_idx]

        tokenized_data = TOKENIZER(text = data['text'],
                                   padding='max_length', 
                                   max_length=MAX_LENGTH, 
                                   truncation = 'only_first',
                                   stride=DOC_STRIDE
                                      )
        tokenized_datas["input_ids"].append(tokenized_data["input_ids"])
        tokenized_datas["attention_mask"].append(tokenized_data["attention_mask"])
        
    return tokenized_datas

In [10]:
class TweetTrainTensorDataset:
    def __init__(self, tokenized_data):
        self.tokenized_data = tokenized_data

    def __len__(self):
        return len(self.tokenized_data['input_ids'])
    
    def __getitem__(self, idx):
        
        data = {}
        data['input_ids'] = torch.tensor(self.tokenized_data['input_ids'][idx])
        data['attention_mask'] = torch.tensor(self.tokenized_data['attention_mask'][idx])
        data['target'] = torch.tensor(self.tokenized_data['target'][idx], dtype=torch.float32)
        return data
    
class TweetTestTensorDataset:
    def __init__(self, tokenized_data):
        self.input_ids = tokenized_data['input_ids']
        self.attention_masks = tokenized_data['attention_mask']

    def __len__(self):
        return len(self.input_ids)

    
    def __getitem__(self, idx):
        
        data = {}
        data['input_ids'] = torch.tensor(self.input_ids[idx])
        data['attention_mask'] = torch.tensor(self.attention_masks[idx])
        return data

In [11]:
t_train_data = tokenize_train_data(p_train_data)
t_test_data = tokenize_test_data(p_test_data)

  0%|          | 0/7613 [00:00<?, ?it/s]

  0%|          | 0/3263 [00:00<?, ?it/s]

In [12]:
dataset = TweetTrainTensorDataset(t_train_data)
test_dataset = TweetTestTensorDataset(t_test_data)

In [13]:
train_dataset, valid_dataset = train_test_split(dataset, train_size=0.8, shuffle=True, random_state=1)

train_dl = DataLoader(train_dataset, BATCH_SIZE, pin_memory=True, shuffle=True, num_workers=os.cpu_count())
valid_dl = DataLoader(valid_dataset, BATCH_SIZE, pin_memory=True, num_workers=os.cpu_count())
test_dl = DataLoader(test_dataset, BATCH_SIZE)

In [14]:
def eval_fn(model, valid_dl, device):
    model.eval()
    losses =[]
    loss_sum = None
    f1_scores = []
    with torch.no_grad():
        for data in tqdm.tqdm_notebook(valid_dl, total=len(valid_dl)):
            for k, v in data.items():
                data[k] = v.to(device)
            preds, loss = model(data)
            
            if loss_sum == None:
                loss_sum = loss
            else:
                loss_sum += loss
                
            losses.append(loss.item())
            f1_scores.append(f1_score(data['target'].cpu(),preds, zero_division=1))
            
    print(f'Val_Loss : {sum(losses) / len(losses):.5f}, F1 : {sum(f1_scores) / len(f1_scores):.5f}')

    return loss_sum
    
def train_fn(model, train_dl, optimizer, device, scheduler = None):
    model.train()
    losses = []
    f1_scores = []
    optimizer.zero_grad()
    for i, data in tqdm.tqdm_notebook(enumerate(train_dl), total=len(train_dl)):
        for k, v in data.items():
            data[k] = v.to(device)

        preds, loss = model(data)
        
        loss.backward()
        
        optimizer.step()
        optimizer.zero_grad()

        losses.append(loss.item())
        f1_scores.append(f1_score(data['target'].cpu(),preds, zero_division = 1))
        print(f'Loss : {sum(losses) / len(losses):.5f}, F1 : {sum(f1_scores) / len(f1_scores):.5f}', end="\r")
        
    return losses

In [15]:
class TweetModel(nn.Module):
    def __init__(self,roberta_path):
        super(TweetModel, self).__init__()
        
        self.loss_fn = nn.functional.binary_cross_entropy_with_logits

        self.roberta = transformers.RobertaForSequenceClassification.from_pretrained(roberta_path, num_labels = 1)
        self.sigmoid = torch.sigmoid
     
    def forward(self, data):
        output= self.roberta(data['input_ids'], data['attention_mask'])
        logits = self.sigmoid(output[0])
        loss = self.loss_fn(logits, data['target'].reshape(-1,1))
        
        preds = logits.cpu().detach().numpy()
        
        preds[preds >= 0.5] = 1 
        preds[preds < 0.5] = 0
        
        return preds, loss
    
    def predict(self, data):
        
        output= self.roberta(data['input_ids'], data['attention_mask'])
        logits = self.sigmoid(output[0])
        preds = logits.cpu().detach().numpy()
        
        preds[preds > 0.5] = 1 
        preds[preds <= 0.5] = 0
        
        return preds

In [16]:
def train_model(epochs, model, train_dl, valid_dl, optimizer, scheduler, device):
    
    for epoch in tqdm.tqdm_notebook(range(epochs)):
        losses = []
        
        model.train()
        
        losses = train_fn(model, train_dl, optimizer, device)
        val_losses = eval_fn(model, valid_dl, device)
        
        scheduler.step(val_losses)

        print(f'EPOCH : {epoch}, Loss : {sum(losses) / len(losses):5f}') 

In [17]:
model = TweetModel(MODEL_PATH)

Downloading:   0%|          | 0.00/1.33G [00:00<?, ?B/s]

Some weights of the model checkpoint at roberta-large were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.decoder.weight', 'roberta.pooler.dense.weight', 'roberta.pooler.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.dense.bias', 'classifie

In [18]:
optimizer = optim.AdamW(model.parameters(), lr=1e-7, eps=1e-6, weight_decay=0.001)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', patience=2, verbose=True)

In [19]:
model.to(DEVICE)

TweetModel(
  (roberta): RobertaForSequenceClassification(
    (roberta): RobertaModel(
      (embeddings): RobertaEmbeddings(
        (word_embeddings): Embedding(50265, 1024, padding_idx=1)
        (position_embeddings): Embedding(514, 1024, padding_idx=1)
        (token_type_embeddings): Embedding(1, 1024)
        (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): RobertaEncoder(
        (layer): ModuleList(
          (0): RobertaLayer(
            (attention): RobertaAttention(
              (self): RobertaSelfAttention(
                (query): Linear(in_features=1024, out_features=1024, bias=True)
                (key): Linear(in_features=1024, out_features=1024, bias=True)
                (value): Linear(in_features=1024, out_features=1024, bias=True)
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (output): RobertaSelfOutput(
                (dense):

In [20]:
%%time
train_model(EPOCHS, model, train_dl, valid_dl, optimizer, scheduler, DEVICE)

  0%|          | 0/13 [00:00<?, ?it/s]

  0%|          | 0/1523 [00:00<?, ?it/s]

Loss : 0.73743, F1 : 0.13735

  0%|          | 0/381 [00:00<?, ?it/s]

Val_Loss : 0.72439, F1 : 0.11286
EPOCH : 0, Loss : 0.737433


  0%|          | 0/1523 [00:00<?, ?it/s]

Loss : 0.70577, F1 : 0.10768

  0%|          | 0/381 [00:00<?, ?it/s]

Val_Loss : 0.68947, F1 : 0.11286
EPOCH : 1, Loss : 0.705769


  0%|          | 0/1523 [00:00<?, ?it/s]

Loss : 0.69272, F1 : 0.10177

  0%|          | 0/381 [00:00<?, ?it/s]

Val_Loss : 0.68346, F1 : 0.11286
EPOCH : 2, Loss : 0.692723


  0%|          | 0/1523 [00:00<?, ?it/s]

Loss : 0.68535, F1 : 0.10177

  0%|          | 0/381 [00:00<?, ?it/s]

Val_Loss : 0.67519, F1 : 0.11286
EPOCH : 3, Loss : 0.685350


  0%|          | 0/1523 [00:00<?, ?it/s]

Loss : 0.67257, F1 : 0.09893

  0%|          | 0/381 [00:00<?, ?it/s]

Val_Loss : 0.66088, F1 : 0.11286
EPOCH : 4, Loss : 0.672574


  0%|          | 0/1523 [00:00<?, ?it/s]

Loss : 0.65266, F1 : 0.35161

  0%|          | 0/381 [00:00<?, ?it/s]

Val_Loss : 0.63089, F1 : 0.65957
EPOCH : 5, Loss : 0.652658


  0%|          | 0/1523 [00:00<?, ?it/s]

Loss : 0.61991, F1 : 0.67082

  0%|          | 0/381 [00:00<?, ?it/s]

Val_Loss : 0.61002, F1 : 0.74106
EPOCH : 6, Loss : 0.619913


  0%|          | 0/1523 [00:00<?, ?it/s]

Loss : 0.60840, F1 : 0.71082

  0%|          | 0/381 [00:00<?, ?it/s]

Val_Loss : 0.60610, F1 : 0.75352
EPOCH : 7, Loss : 0.608397


  0%|          | 0/1523 [00:00<?, ?it/s]

Loss : 0.60460, F1 : 0.72390

  0%|          | 0/381 [00:00<?, ?it/s]

Val_Loss : 0.60263, F1 : 0.74996
EPOCH : 8, Loss : 0.604604


  0%|          | 0/1523 [00:00<?, ?it/s]

Loss : 0.60113, F1 : 0.73347

  0%|          | 0/381 [00:00<?, ?it/s]

Val_Loss : 0.60048, F1 : 0.73977
EPOCH : 9, Loss : 0.601131


  0%|          | 0/1523 [00:00<?, ?it/s]

Loss : 0.59790, F1 : 0.74760

  0%|          | 0/381 [00:00<?, ?it/s]

Val_Loss : 0.60008, F1 : 0.75062
EPOCH : 10, Loss : 0.597902


  0%|          | 0/1523 [00:00<?, ?it/s]

Loss : 0.59706, F1 : 0.74066

  0%|          | 0/381 [00:00<?, ?it/s]

Val_Loss : 0.59912, F1 : 0.74458
EPOCH : 11, Loss : 0.597057


  0%|          | 0/1523 [00:00<?, ?it/s]

Loss : 0.59471, F1 : 0.75040

  0%|          | 0/381 [00:00<?, ?it/s]

Val_Loss : 0.59869, F1 : 0.74587
EPOCH : 12, Loss : 0.594710
CPU times: user 1h 34min 44s, sys: 22min 48s, total: 1h 57min 33s
Wall time: 1h 58min 39s


In [21]:
def create_submission(model, test_data, test_dl, device):
    model.eval()
    predictions = []
    with torch.no_grad():
        for data in tqdm.tqdm_notebook(test_dl, total=len(test_dl)):
            for k, v in data.items():
                data[k] = v.to(device)
            preds = model.predict(data)
            
            preds = [int(n) for n in preds]
            
            predictions.extend(preds)

            
    submission = {'id' : test_data['id'].to_list(),
                 'target': predictions}
    
    return submission
            

In [22]:
submission = create_submission(model, test_data, test_dl, DEVICE)

  0%|          | 0/816 [00:00<?, ?it/s]

In [23]:
df_submission = pd.DataFrame(submission)
df_submission.to_csv('submission.csv', index=False)
df_submission = pd.read_csv('submission.csv')
df_submission

Unnamed: 0,id,target
0,0,1
1,2,1
2,3,1
3,9,1
4,11,1
...,...,...
3258,10861,1
3259,10865,1
3260,10868,1
3261,10874,1
