<a href="https://colab.research.google.com/github/pastrop/kaggle/blob/master/Sntence_entailment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [7]:
import numpy as np
import pandas as pd

In [None]:
# Upload files from the local machine into the Google Colab runtime.
# The unzip cell that follows expects `train.zip` to be among the uploaded files.
# The value returned by files.upload() is kept in `uploaded`.
from google.colab import files
uploaded = files.upload()

In [None]:
!unzip train.zip

In [None]:
# Load the training set into a DataFrame.
# train.csv is presumably the file extracted from train.zip by the unzip cell - confirm.
train = pd.read_csv("train.csv")

In [None]:
# Preview the first rows to check which columns were loaded.
train.head()

In [None]:
# Show the premise and the hypothesis of one training example (row 1).
# A notebook cell only displays its last expression, so both values are
# returned together as a single (premise, hypothesis) tuple instead of
# as two separate statements (the first of which would be discarded).
tuple(train.iloc[1][['premise', 'hypothesis']])

# Model

In [1]:
%%capture
!pip install pytorch_lightning
!pip install transformers
!pip install nlp

In [3]:
import torch as th
import pytorch_lightning as pl
import nlp
import transformers

In [5]:
# Example: tokenizing a pair of sentences into a single BERT input.
# In the result, token_type_ids is 0 for the first sentence and 1 for the
# second, and attention_mask is 0 over the padding positions.
from transformers import BertTokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
sequence_a = "HuggingFace is based in NYC"
sequence_b = "Where is HuggingFace based?"
encoded_dict = tokenizer(sequence_a, sequence_b,
                         max_length=32,
                         truncation=True,
                         # `padding="max_length"` replaces the deprecated
                         # `pad_to_max_length=True`; the padded output is the same.
                         padding="max_length")
# Last expression of the cell -> displayed as the cell output.
encoded_dict
# To turn the ids back into text: tokenizer.decode(encoded_dict["input_ids"])

{'input_ids': [101, 20164, 10932, 2271, 7954, 1110, 1359, 1107, 17520, 102, 2777, 1110, 20164, 10932, 2271, 7954, 1359, 136, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}

In [None]:
# Load the 'imdb' dataset through the `nlp` library and print the returned
# object to inspect what it contains.
from nlp import load_dataset
datasets = load_dataset('imdb')
print(datasets)

In [None]:
class IMDBSentimentClassifier(pl.LightningModule):
    """Fine-tunes `bert-base-uncased` as a sequence classifier on IMDB reviews.

    The module owns its data: `prepare_data` downloads and tokenizes a small
    slice of the IMDB dataset, and the dataloader hooks serve it to the trainer.

    NOTE(review): the step/epoch-end hooks return dicts containing a 'log'
    entry and use `validation_epoch_end`; this looks like the convention of
    older pytorch_lightning releases - confirm against the installed version.
    """

    def __init__(self):
        super().__init__()
        self.model = transformers.BertForSequenceClassification.from_pretrained('bert-base-uncased')
        # reduction='none' keeps one loss value per example; training averages
        # them per batch, validation averages them over the whole epoch.
        self.loss = th.nn.CrossEntropyLoss(reduction='none')

    def prepare_data(self):
        """Download, tokenize and tensor-format the train and test subsets.

        Stores the results on `self.train_ds` and `self.test_ds`.

        NOTE(review): 'train[:500]' / 'test[:20]' take the first rows of each
        split; if the dataset is ordered by label these slices may contain a
        single class - confirm.
        """
        tokenizer = transformers.BertTokenizer.from_pretrained('bert-base-uncased')
        tensor_columns = ['input_ids', 'label', 'attention_mask']

        def _tokenize(x):
            # Every review is truncated / padded to exactly 20 tokens.
            # `padding='max_length'` replaces the deprecated
            # `pad_to_max_length=True` with the same result.
            return tokenizer(
                x['text'],
                max_length=20,
                truncation=True,
                padding='max_length')

        def _prepare(ds):
            # Same preprocessing for both splits: tokenize in batches, then
            # expose only the columns the model needs as torch tensors.
            ds = ds.map(_tokenize, batched=True)
            ds.set_format(type='torch', columns=tensor_columns)
            return ds

        ds_train, ds_test = nlp.load_dataset('imdb', split=['train[:500]', 'test[:20]'])
        self.train_ds, self.test_ds = _prepare(ds_train), _prepare(ds_test)

    def forward(self, input_ids, masks):
        """Return the classification logits for a batch.

        Args:
            input_ids: token ids, as produced by the tokenizer.
            masks: the matching attention mask.
        """
        # Single forward pass (the model was previously invoked twice per call).
        outputs = self.model(input_ids, attention_mask=masks)
        # No labels are passed, so the first element of the output is the
        # logits. Indexing (instead of tuple-unpacking) works whether the
        # model returns a plain tuple or an output object.
        return outputs[0]

    def training_step(self, batch, batch_idx):
        """Compute the mean cross-entropy loss of one training batch."""
        logits = self.forward(batch['input_ids'], batch['attention_mask'])
        loss = self.loss(logits, batch['label']).mean()

        return {'loss': loss, 'log': {'train_loss': loss}}

    def validation_step(self, batch, batch_idx):
        """Return per-example loss and correctness (1.0 / 0.0) for one batch.

        Values are left unreduced so `validation_epoch_end` can average over
        examples rather than over batches.
        """
        logits = self.forward(batch['input_ids'], batch['attention_mask'])
        loss = self.loss(logits, batch['label'])
        acc = (logits.argmax(-1) == batch['label']).float()

        return {'loss': loss, 'acc': acc}

    def validation_epoch_end(self, outputs):
        """Average the per-example validation loss and accuracy over the epoch.

        Args:
            outputs: list of the dicts returned by `validation_step`.
        """
        loss = th.cat([o['loss'] for o in outputs], 0).mean()
        acc = th.cat([o['acc'] for o in outputs], 0).mean()
        out = {'val_loss': loss, 'val_acc': acc}
        return {**out, 'log': out}

    def train_dataloader(self):
        """DataLoader over the training subset (shuffled, batch size 1)."""
        return th.utils.data.DataLoader(
            self.train_ds,
            batch_size=1,
            drop_last=True,
            shuffle=True,
        )

    def val_dataloader(self):
        """DataLoader over the test subset, used for validation."""
        return th.utils.data.DataLoader(
            self.test_ds,
            batch_size=32,
            drop_last=False,
            # The epoch metrics are means over all examples, so ordering does
            # not affect them; a fixed order keeps validation deterministic.
            shuffle=False,
        )

    def configure_optimizers(self):
        """SGD with momentum over all module parameters."""
        return th.optim.SGD(
            self.parameters(),
            lr=0.01,
            momentum=0.9,
        )