In [41]:
import pandas as pd 
import tqdm
import multiprocessing
import sh

import pytorch_lightning as pl
import torch as th
from pytorch_lightning.loggers import TensorBoardLogger
from torch.utils.data import DataLoader, TensorDataset, RandomSampler

import transformers

from sklearn.model_selection import train_test_split

In [42]:
# th.cuda.get_device_name(0)
# th.cuda.is_available()

In [43]:
sh.rm('-r', '-f', 'logs')
sh.mkdir('logs')



In [44]:
model_name = 'bert-base-uncased'
batch_size = 8
lr = 1e-2
eps = 1e-8

In [45]:
df = pd.read_csv('../data/train.csv', index_col='id')

In [46]:
df.head()

Unnamed: 0_level_0,qid1,qid2,question1,question2,is_duplicate
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


In [47]:
df.dropna(inplace=True)

In [48]:
df.count()

qid1            404287
qid2            404287
question1       404287
question2       404287
is_duplicate    404287
dtype: int64

In [49]:
# tokenizer = transformers.BertTokenizer.from_pretrained(model)
# tokenize = lambda x: tokenizer.tokenize(x)

In [50]:
# df['q1_len'] = df['question1'].apply(lambda x: len(tokenize(x)))
# df['q2_len'] = df['question2'].apply(lambda x: len(tokenize(x)))

In [51]:
# df['join_length'] = df['q1_len'] + df['q2_len']

In [52]:
# max_length = df['join_length'].max()

In [53]:
df = df.sample(frac=0.05, replace=False, random_state=1)

In [74]:
class QuoraQuestionSimilarity(pl.LightningModule):
    def __init__(self, X, y):
        super().__init__()
        self.x = X
        self.y = y 
        self.model = transformers.BertForSequenceClassification.from_pretrained(model_name)
#         self.loss = th.nn.CrossEntropyLoss(reduction='none')
        
    def prepare_data(self):
        tokenizer = tokenizer = transformers.BertTokenizer.from_pretrained(model_name, num_labels=2)
        
        def _split_ds():
            self.x_train, self.x_val, self.y_train, self.y_val = train_test_split(self.x, self.y, test_size=0.2)
            self.x_train, self.x_test, self.y_train, self.y_test = train_test_split(self.x_train, self.y_train, test_size=0.25)
    
        def _tokenize(x):
            return tokenizer.encode_plus(x['question1'], x['question2'], 
                                         max_length=330, pad_to_max_length=True,
                                         return_attention_mask=True, return_tensors='pt', 
                                         truncation=True)
        
        def _prepare(x, y):
            input_ids = []
            attention_masks = []
            token_type_ids = []
            for _, row in x.iterrows():
                encoded_data = _tokenize(row)
                input_ids.append(encoded_data['input_ids'])
                token_type_ids.append(encoded_data["token_type_ids"])
                attention_masks.append(encoded_data['attention_mask'])
                
            input_ids = th.cat(input_ids, dim=0)
            token_type_ids = th.cat(token_type_ids, dim=0)
            attention_masks = th.cat(attention_masks, dim=0)
            labels = th.tensor(y.values)
            return TensorDataset(input_ids, attention_masks, token_type_ids, labels)
        
        _split_ds()
        self.train_ds = _prepare(self.x_train, self.y_train)
        self.val_ds = _prepare(self.x_val, self.y_val)
        self.test_ds = _prepare(self.x_test, self.y_test)
    
    def train_dataloader(self):
        return DataLoader(self.train_ds, sampler = RandomSampler(self.train_ds), 
                          batch_size = batch_size, pin_memory=True, 
                          num_workers=4)
    
    def val_dataloader(self):
        return DataLoader(self.val_ds, batch_size=batch_size, 
                          sampler = RandomSampler(self.test_ds), 
                          pin_memory=True, num_workers=4)  
    
    def test_dataloader(self):
        return DataLoader(self.test_ds, batch_size=batch_size, 
                          sampler = RandomSampler(self.test_ds), 
                          pin_memory=True, num_workers=4)
    
    def configure_optimizers(self):
        return th.optim.AdamW(
            self.parameters(),
            lr=lr,
            eps = eps
        )
    
    def training_step(self, batch, batch_idx):
        input_ids, attention_masks, token_type_ids, labels = batch
        loss, logits = self.forward(input_ids, token_type_ids, attention_masks, labels)
        return {'loss': loss, 'log': {'train_loss': loss}}

    def validation_step(self, batch, batch_idx):
        input_ids, attention_masks, token_type_ids, labels = batch
        loss, logits = self.forward(input_ids, token_type_ids, attention_masks, labels)
        labels_hat = th.argmax(logits, dim=1)
        hit_rate = th.sum(labels == labels_hat)
        return {'loss': loss, 'hit_rate': hit_rate}
    
    def test_step(self, batch, batch_idx):
        input_ids, attention_masks, token_type_ids, labels = batch
        loss, logits = self.forward(input_ids, token_type_ids, attention_masks, labels)
        labels_hat = th.argmax(logits, dim=1)
        hit_rate = th.sum(labels == labels_hat)
        return {'test_loss': loss, 'test_hit_rate': hit_rate}
        
    def validation_epoch_end(self, outputs):
        loss = th.mean(th.stack([o['loss'].float() for o in outputs]))
        acc = th.mean(th.stack([o['hit_rate'].float() for o in outputs]))
        out = {'val_loss': loss, 'val_acc': acc}
        return {**out, 'log':out}
    
    def test_epoch_end(self, outputs):
        loss = th.mean(th.stack([o['loss'].float() for o in outputs]))
        acc = th.mean(th.stack([o['hit_rate'].float() for o in outputs]))
        out = {'val_loss': loss, 'val_acc': acc}
        return {**out, 'log':out}

    
    def forward(self, input_ids, token_type_ids, attention_masks, labels):
        loss, logits = self.model(input_ids, token_type_ids=token_type_ids, 
                             attention_mask=attention_masks, 
                             labels=labels)
        return loss, logits
    

In [76]:
# (1 if th.cuda.is_available() else 0)
X = df[['question1', 'question2']]
y = df['is_duplicate']

model = QuoraQuestionSimilarity(X, y)
trainer = pl.Trainer(
    default_root_dir='logs',
#     gpus=(1 if th.cuda.is_available() else 0),
    gpus=0,
    max_epochs=2,
    logger=TensorBoardLogger('logs/', name='qqs', version=0)
)
trainer.fit(model)

GPU available: True, used: False
TPU available: False, using: 0 TPU cores

  | Name  | Type                          | Params
--------------------------------------------------------
0 | model | BertForSequenceClassification | 109 M 


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validation sanity check', layout=Layout…

[{'loss': tensor(1.2091), 'hit_rate': tensor(2)}, {'loss': tensor(0.2464), 'hit_rate': tensor(2)}]
{'val_loss': tensor(0.7277), 'val_acc': tensor(2.)}


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Training', layout=Layout(flex='2'), max…






1