<a href="https://colab.research.google.com/github/pastrop/kaggle/blob/master/classifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
%%capture
!pip install pytorch_lightning
!pip install transformers
!pip install nlp

In [21]:
%%capture
import torch as th
import pytorch_lightning as pl
import nlp
import transformers
import pandas as pd

In [None]:
class IMDBSentimentClassifier(pl.LightningModule):
    def __init__(self):
        super().__init__()
        self.model = transformers.BertForSequenceClassification.from_pretrained('bert-base-uncased')
        self.loss = th.nn.CrossEntropyLoss(reduction='none')

    def prepare_data(self):
        tokenizer = transformers.BertTokenizer.from_pretrained('bert-base-uncased')

        def _tokenize(x):
            return tokenizer(
                    x['text'], 
                    max_length=20, 
                    truncation = True,
                    pad_to_max_length=True)
            
        def _prepare_ds():
            ds_train, ds_test = nlp.load_dataset('imdb', split=['train[:500]', 'test[:20]'])
            ds_train = ds_train.map(_tokenize, batched=True)
            ds_train.set_format(type='torch',columns = ['input_ids','label','attention_mask'])
            ds_test = ds_test.map(_tokenize, batched=True)
            ds_test.set_format(type='torch',columns = ['input_ids','label','attention_mask'])
            #print('ds_train type - {}'.format(type(ds_train)))
            #print('ds_test type - {}'.format(type(ds_test)))

            return ds_train, ds_test

        self.train_ds, self.test_ds = _prepare_ds()      

    def forward(self, input_ids, masks):
        #print('forward function')
        out = self.model(input_ids, masks)
        #print('out type & length -- {}--{}'.format(type(out),len(out)))
        #print('out -- {}'.format(out))
        logits, = self.model(input_ids, masks)
        #print ('logits- {} -- other stuff'.format(logits))
        return logits

    def training_step(self, batch, batch_idx):
        #print('training step function')
        logits = self.forward(batch['input_ids'],batch['attention_mask'])
        loss = self.loss(logits, batch['label']).mean()

        return {'loss': loss, 'log': {'train_loss': loss}}

    def validation_step(self, batch, batch_idx):
        #print('validation step function')
        logits = self.forward(batch['input_ids'], batch['attention_mask'])
        loss = self.loss(logits, batch['label'])
        acc = (logits.argmax(-1) == batch['label']).float()

        return {'loss': loss, 'acc': acc}

    def validation_epoch_end(self, outputs):
        loss = th.cat([o['loss'] for o in outputs], 0).mean()
        acc = th.cat([o['acc'] for o in outputs], 0).mean()
        out = {'val_loss': loss, 'val_acc': acc}
        return {**out, 'log': out}

    def train_dataloader(self):
        #print('train_dataloader')
        return th.utils.data.DataLoader(
                self.train_ds,
                batch_size=1,
                drop_last=True,
                shuffle=True,
                )

    def val_dataloader(self):
        #print('val_dataloader')
        return th.utils.data.DataLoader(
                self.test_ds,
                batch_size=32,
                drop_last=False,
                shuffle=True,
                )

    def configure_optimizers(self):
        return th.optim.SGD(
            self.parameters(),
            lr=0.01,
            momentum=0.9,
        )

# Code Snippets

In [47]:
tokenizer = transformers.BertTokenizer.from_pretrained('bert-base-uncased')
def _tokenize(x):
    return tokenizer(
            x['text'], 
            max_length=30, 
            truncation = True,
            pad_to_max_length=True)
    
ds_train, ds_test = nlp.load_dataset('imdb', split=['train[:500]', 'test[:20]'])
#ds_train = ds_train.map(_tokenize, batched=True)
#ds_train.set_format(type='torch',columns = ['input_ids','label','attention_mask'])
ds_test = ds_test.map(_tokenize, batched=True)
ds_test_final = ds_test.set_format(type='torch',columns = ['input_ids','label','attention_mask'])
#print('ds_train type - {}'.format(type(ds_train)))
#print('ds_test type - {}'.format(type(ds_test)))



  0%|          | 0/1 [00:00<?, ?it/s]

In [234]:
ds_train_t = ds_train.map(_tokenize, batched=True)



In [None]:
ds_train_t[3]

In [19]:
# file upload while using Google Colab
from google.colab import files
uploaded = files.upload()

Saving accnt_issues.csv to accnt_issues.csv


In [22]:
# Dataframe processing
df = pd.read_csv('accnt_issues.csv')
df.head(2)

Unnamed: 0,text,label
0,"Hello, Is there any way possible to link an Xb...",Account Linking Issues - Xbox
1,I'm trying to link my xbox one live account to...,Account Linking Issues - Xbox


In [268]:
int(df.shape[0]*.8)

1067

In [271]:
# splitting dataframe by row index
df_train = df.iloc[:int(df.shape[0]*.8),:]
df_val = df.iloc[int((df.shape[0]*.8))+1:,:]

In [273]:
df_val.shape

(266, 2)

In [147]:
class MyData(th.utils.data.Dataset):
  def __init__(self, data):
    self.data = data
  def __len__(self):
        return len(df['label'].tolist())
  def __getitem__(self, idx):
    return {'label': df['label'].tolist()[idx], 'text':df['text'].tolist()[idx]}

In [148]:
mydataset = MyData(df)

In [149]:
mydataset[:6]

{'label': ['Account Linking Issues - Xbox',
  'Account Linking Issues - Xbox',
  'Account Linking Issues - Nintendo Switch',
  'Account Linking Issues - Xbox',
  'Account Linking Issues - Xbox',
  'Account Linking Issues - PS4'],
 'text': ["Hello, Is there any way possible to link an Xbox account to my Epic? I went to link my account and I already 'previously linked to a different Xbox Live account' over a year ago that I have completely forgotten, I need to link my account in time for the Xbox Cup that's coming up soon.. Otherwise I have to make a new Epic just for that",
  "I'm trying to link my xbox one live account to my epic account.But i keep getting a massage stating it is link to a different account. I am unsure of what account its linked to. \n\nThanks\n\nDaniel",
  'I have recently made an epic account to connect my nintendo switch Fortnite account. Whenever I try to link the accounts it states "Already linked, your nintendo account has been associated with another epic accou

In [253]:
def tok(item):
    return tokenizer(
            item['text'], 
            max_length=20,
            truncation = True,
            padding=True,
            return_tensors="pt")

In [254]:
mydataset_tok = mydataset.map(tok)

In [275]:
mydataset_tok[3]

{'input_ids': tensor([[  101,  2026, 12202,  2444,  4070,  2003,  2025,  4957,  2000,  2505,
          2021,  2009,  7906,  3038,  2009,  4957,  2021,  1045,  2196,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [276]:
mydataset[3]

{'label': 'Account Linking Issues - Xbox',
 'text': "My Xbox live account is not link to anything but it keeps saying it link but I never linked it to any of my accounts and it keeps its linked to a account but it's bot"}

In [262]:
dataloader = th.utils.data.DataLoader(
                                      mydataset_tok,
                                      batch_size=3,
                                      drop_last=True,
                                      shuffle=True,
                                      )

In [263]:
my_iter = iter(dataloader)

In [264]:
next(my_iter)

{'attention_mask': tensor([[[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]],
 
         [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]],
 
         [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]]),
 'input_ids': tensor([[[  101,  7632,  2045,  1010,  1045,  2106,  2025,  3322,  4957,  2039,
            2026,  8680,  4070,  2000,  4830, 16671,  3238,  2043,  1045,   102]],
 
         [[  101,  7632,  2045,  1010,  2651,  1045,  2001,  2633,  2583,  2000,
           13590,  2026,  2048,  8680,  6115,  1006, 12202,  2007,  8827,   102]],
 
         [[  101,  1045,  2253,  2000,  4957,  2026, 12202,  4070,  1998,  2023,
            4471,  2596,  1024,  1000,  2115, 12202,  2444,  4070,  2038,   102]]]),
 'token_type_ids': tensor([[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]],
 
         [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]],
 
         [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]])}

In [70]:
df_dict_test = list(map(tok,df['text'][:5]))

