<a href="https://colab.research.google.com/github/pastrop/kaggle/blob/master/classifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
%%capture
!pip install pytorch_lightning
!pip install transformers
!pip install nlp

In [2]:
%%capture
import torch as th
import pytorch_lightning as pl
import nlp
import transformers
import pandas as pd

In [None]:
# file upload while using Google Colab
from google.colab import files
uploaded = files.upload()

In [None]:
# Dataframe processing
df = pd.read_csv('accnt_issues.csv')
df.head(2)

In [None]:
class MulticlassClassifier(pl.LightningModule):
    def __init__(self,df):
        super().__init__()
        self.df = df
        self.model = transformers.BertForSequenceClassification.from_pretrained('bert-base-uncased')
        self.loss = th.nn.CrossEntropyLoss(reduction='none')

    def prepare_data(self):
        tokenizer = transformers.BertTokenizer.from_pretrained('bert-base-uncased')

        def _tokenize(x):
            tmp = {'label':item['label']}
            return {**tokenizer.batch_encode_plus(
                                                  item['text'], 
                                                  max_length=20,
                                                  truncation = True,
                                                  padding=True,
                                                  return_tensors="pt"),**tmp}
                                          
        def _prepare_ds(self):
          df_sm = self.df.iloc[:6,:]
          df_train = self.df.iloc[:int(df.shape[0]*.8),:]
          df_test = self.df.iloc[int((df.shape[0]*.8))+1:,:]

          #digitizing labels
          def labels(self,data):
            labels = data['label'].unique()

            label_dict = {}
            for index, possible_label in enumerate(labels):
                label_dict[possible_label] = index
            label_dict

          #custom torch dataset class
          class MyData(th.utils.data.Dataset):
            def __init__(self, data):
              self.data = data
            def __len__(self):
                  return len(self.data['label'].tolist())
            def __getitem__(self, idx):
              return {'label': self.data['label'].tolist()[idx], 'text':self.data['text'].tolist()[idx]}

          df_train['label'] = df_train['label'].replace(labels(df_train))
          df_test['label'] = df_test['label'].replace(labels(df_test))
          df_sm['label'] = df_sm['label'].replace(labels(df_sm))
          
          mydataset_train = MyData(df_train)
          mydataset_test = MyData(df_test)
          mydataset_sm = MyData(df_sm)

          mydataset_train = mydataset_train.map(_tokenize)
          mydataset_test = mydataset_test.map(_tokenize)
          mydataset_sm = mydataset_sm.map(_tokenize)

          return mydataset_train, mydataset_test, mydataset_sm

        self.train_ds, self.test_ds, self.small_ds = _prepare_ds()      

    def train_dataloader(self):
        #print('train_dataloader')
        return th.utils.data.DataLoader(
                self.train_ds,
                batch_size=1,
                drop_last=True,
                shuffle=True,
                )

    def val_dataloader(self):
        #print('val_dataloader')
        return th.utils.data.DataLoader(
                self.test_ds,
                batch_size=32,
                drop_last=False,
                shuffle=True,
                )


    def forward(self, input_ids, masks):
        #print('forward function')
        out = self.model(input_ids, masks)
        #print('out type & length -- {}--{}'.format(type(out),len(out)))
        #print('out -- {}'.format(out))
        logits, = self.model(input_ids, masks)
        #print ('logits- {} -- other stuff'.format(logits))
        return logits

    def training_step(self, batch, batch_idx):
        #print('training step function')
        logits = self.forward(batch['input_ids'],batch['attention_mask'])
        loss = self.loss(logits, batch['label']).mean()

        return {'loss': loss, 'log': {'train_loss': loss}}

    def validation_step(self, batch, batch_idx):
        #print('validation step function')
        logits = self.forward(batch['input_ids'], batch['attention_mask'])
        loss = self.loss(logits, batch['label'])
        acc = (logits.argmax(-1) == batch['label']).float()

        return {'loss': loss, 'acc': acc}

    def validation_epoch_end(self, outputs):
        loss = th.cat([o['loss'] for o in outputs], 0).mean()
        acc = th.cat([o['acc'] for o in outputs], 0).mean()
        out = {'val_loss': loss, 'val_acc': acc}
        return {**out, 'log': out}

    def configure_optimizers(self):
        return th.optim.SGD(
            self.parameters(),
            lr=0.01,
            momentum=0.9,
        )

In [None]:
def model_fit():
    model = MulticlassClassifier()
    trainer = pl.Trainer(
        #default_root_dir='logs',
        gpus=(1 if th.cuda.is_available() else 0),
        max_epochs=4,
        fast_dev_run=False,
        logger=pl.loggers.TensorBoardLogger('logs/', name='imdb'),
    )
    trainer.fit(model)

!rm -rf ./logs/

model_fit()

In [None]:
#model
model = BertForSequenceClassification.from_pretrained("bert-base-uncased",
                                                      num_labels=len(label_dict),
                                                      output_attentions=False,
                                                      output_hidden_states=False)

# Implementation 

In [None]:
# file upload while using Google Colab
from google.colab import files
uploaded = files.upload()

In [None]:
# Dataframe processing
df = pd.read_csv('accnt_issues.csv')
df.head(2)

In [148]:
# splitting dataframe by row index
df_sm = df.iloc[:6,:]
df_train = df.iloc[:int(df.shape[0]*.8),:]
df_val = df.iloc[int((df.shape[0]*.8))+1:,:]

In [149]:
#digitizing labels
labels = df_sm['label'].unique()

label_dict = {}
for index, possible_label in enumerate(labels):
    label_dict[possible_label] = index
label_dict

{0: 0, 1: 1, 2: 2}

In [150]:
#tokenizer
tokenizer = transformers.BertTokenizer.from_pretrained('bert-base-uncased')

In [159]:
encoded_data_train = tokenizer.batch_encode_plus(
    df_sm['text'].values, 
    add_special_tokens=True, 
    return_attention_mask=True, 
    pad_to_max_length=True, 
    max_length=20, 
    return_tensors='pt'
)



In [164]:
input_ids_train = encoded_data_train['input_ids']
attention_masks_train = encoded_data_train['attention_mask']
labels_train = th.tensor(df_sm['label'].values.tolist())
dataset_train = th.utils.data.TensorDataset(input_ids_train, attention_masks_train, labels_train)

In [167]:
dataloader_train = th.utils.data.DataLoader(
                                      dataset_train,
                                      batch_size=2,
                                      drop_last=True,
                                      shuffle=False,
                                      )

In [177]:
inp = []
for batch in dataloader_train:  
    inputs = {'input_ids':      batch[0],
              'attention_mask': batch[1],
              'labels':         batch[2],
    }
    inp.append(inputs)

In [None]:
import random

seed_val = 17
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

def evaluate(dataloader_val):

    model.eval()
    
    loss_val_total = 0
    predictions, true_vals = [], []
    
    for batch in dataloader_val:
        
        batch = tuple(b.to(device) for b in batch)
        
        inputs = {'input_ids':      batch[0],
                  'attention_mask': batch[1],
                  'labels':         batch[2],
                 }

        with torch.no_grad():        
            outputs = model(**inputs)
            
        loss = outputs[0]
        logits = outputs[1]
        loss_val_total += loss.item()

        logits = logits.detach().cpu().numpy()
        label_ids = inputs['labels'].cpu().numpy()
        predictions.append(logits)
        true_vals.append(label_ids)
    
    loss_val_avg = loss_val_total/len(dataloader_val) 
    
    predictions = np.concatenate(predictions, axis=0)
    true_vals = np.concatenate(true_vals, axis=0)
            
    return loss_val_avg, predictions, true_vals
    
for epoch in tqdm(range(1, epochs+1)):
    
    model.train()
    
    loss_train_total = 0

    progress_bar = tqdm(dataloader_train, desc='Epoch {:1d}'.format(epoch), leave=False, disable=False)
    for batch in progress_bar:

        model.zero_grad()
        
        batch = tuple(b.to(device) for b in batch)
        
        inputs = {'input_ids':      batch[0],
                  'attention_mask': batch[1],
                  'labels':         batch[2],
                 }       

        outputs = model(**inputs)
        
        loss = outputs[0]
        loss_train_total += loss.item()
        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()
        scheduler.step()
        
        progress_bar.set_postfix({'training_loss': '{:.3f}'.format(loss.item()/len(batch))})
         
        
    torch.save(model.state_dict(), f'data_volume/finetuned_BERT_epoch_{epoch}.model')
        
    tqdm.write(f'\nEpoch {epoch}')
    
    loss_train_avg = loss_train_total/len(dataloader_train)            
    tqdm.write(f'Training loss: {loss_train_avg}')
    
    val_loss, predictions, true_vals = evaluate(dataloader_validation)
    val_f1 = f1_score_func(predictions, true_vals)
    tqdm.write(f'Validation loss: {val_loss}')
    tqdm.write(f'F1 Score (Weighted): {val_f1}')

# Code Snippets

In [3]:
tokenizer = transformers.BertTokenizer.from_pretrained('bert-base-uncased')
def _tokenize(x):
    return tokenizer(
            x['text'], 
            max_length=30, 
            truncation = True,
            pad_to_max_length=True)
    
ds_train, ds_test = nlp.load_dataset('imdb', split=['train[:500]', 'test[:20]'])
#ds_train = ds_train.map(_tokenize, batched=True)
#ds_train.set_format(type='torch',columns = ['input_ids','label','attention_mask'])
ds_test = ds_test.map(_tokenize, batched=True)
ds_test_final = ds_test.set_format(type='torch',columns = ['input_ids','label','attention_mask'])
#print('ds_train type - {}'.format(type(ds_train)))
#print('ds_test type - {}'.format(type(ds_test)))

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/4.56k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.07k [00:00<?, ?B/s]

Downloading and preparing dataset imdb/plain_text (download: 80.23 MiB, generated: 127.06 MiB, post-processed: Unknown sizetotal: 207.28 MiB) to /root/.cache/huggingface/datasets/imdb/plain_text/1.0.0/76cdbd7249ea3548c928bbf304258dab44d09cd3638d9da8d42480d1d1be3743...


Downloading:   0%|          | 0.00/84.1M [00:00<?, ?B/s]

0 examples [00:00, ? examples/s]

0 examples [00:00, ? examples/s]

0 examples [00:00, ? examples/s]

Dataset imdb downloaded and prepared to /root/.cache/huggingface/datasets/imdb/plain_text/1.0.0/76cdbd7249ea3548c928bbf304258dab44d09cd3638d9da8d42480d1d1be3743. Subsequent calls will reuse this data.




  0%|          | 0/1 [00:00<?, ?it/s]

In [234]:
ds_train_t = ds_train.map(_tokenize, batched=True)



In [None]:
ds_train_t[3]

In [3]:
# file upload while using Google Colab
from google.colab import files
uploaded = files.upload()

Saving accnt_issues.csv to accnt_issues.csv


In [4]:
# Dataframe processing
df = pd.read_csv('accnt_issues.csv')
df.head(2)

Unnamed: 0,text,label
0,"Hello, Is there any way possible to link an Xb...",Account Linking Issues - Xbox
1,I'm trying to link my xbox one live account to...,Account Linking Issues - Xbox


In [5]:
int(df.shape[0]*.8)

1067

In [6]:
# splitting dataframe by row index
df_sm = df.iloc[:6,:]
df_train = df.iloc[:int(df.shape[0]*.8),:]
df_val = df.iloc[int((df.shape[0]*.8))+1:,:]

In [7]:
df_sm.shape

(6, 2)

In [8]:
labels = df_train['label'].unique().tolist()

In [9]:
labels = df_sm['label'].unique()

label_dict = {}
for index, possible_label in enumerate(labels):
    label_dict[possible_label] = index
label_dict

{'Account Linking Issues - Nintendo Switch': 1,
 'Account Linking Issues - PS4': 2,
 'Account Linking Issues - Xbox': 0}

In [10]:
df_sm['label'] = df_sm['label'].replace(label_dict)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [11]:
df_sm

Unnamed: 0,text,label
0,"Hello, Is there any way possible to link an Xb...",0
1,I'm trying to link my xbox one live account to...,0
2,I have recently made an epic account to connec...,1
3,My Xbox live account is not link to anything b...,0
4,Hi there\n Someone got hold off my account so ...,0
5,hey. so my epic account was linked to my ps4 a...,2


In [12]:
class MyData(th.utils.data.Dataset):
  def __init__(self, data):
    self.data = data
  def __len__(self):
        return len(self.data['label'].tolist())
  def __getitem__(self, idx):
    return {'label': self.data['label'].tolist()[idx], 'text':self.data['text'].tolist()[idx]}

In [13]:
mydataset = MyData(df_sm)

In [14]:
mydataset[4]

{'label': 0,
 'text': "Hi there\n Someone got hold off my account so they toke my epic games account and linked it to a Xbox live account, I recently recovered my account and it doesn't let me link to my Xbox live account it says it can only be linked back to the old account which is not mine it was the person who had access to my account can you help me link back to my account please I would like to continue using my account and I spent a lot of money and time on. I would really appreciate if you can help me link it back to Xbox live account."}

In [None]:
tokenizer = transformers.BertTokenizer.from_pretrained('bert-base-uncased')
def tok(item):
    tmp = {'label':item['label']}
    return {**tokenizer(
            item['text'], 
            max_length=20,
            truncation = True,
            padding=True,
            return_tensors="pt"),**tmp}

In [17]:
mydataset_tok = mydataset.map(tok)

In [19]:
mydataset_tok[:3]

{'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
         [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
         [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]),
 'input_ids': tensor([[  101,  7592,  1010,  2003,  2045,  2151,  2126,  2825,  2000,  4957,
           2019, 12202,  4070,  2000,  2026,  8680,  1029,  1045,  2253,   102],
         [  101,  1045,  1005,  1049,  2667,  2000,  4957,  2026, 12202,  2028,
           2444,  4070,  2000,  2026,  8680,  4070,  1012,  2021,  1045,   102],
         [  101,  1045,  2031,  3728,  2081,  2019,  8680,  4070,  2000,  7532,
           2026, 10022,  6942,  3481,  3490,  2618,  4070,  1012,  7188,   102]]),
 'label': [0, 0, 1],
 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])}

In [20]:
mydataset_tok[4]

{'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]),
 'input_ids': tensor([[ 101, 7632, 2045, 2619, 2288, 2907, 2125, 2026, 4070, 2061, 2027, 2000,
          3489, 2026, 8680, 2399, 4070, 1998, 5799,  102]]),
 'label': 0,
 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])}

In [21]:
dataloader = th.utils.data.DataLoader(
                                      mydataset_tok,
                                      batch_size=3,
                                      drop_last=True,
                                      shuffle=False,
                                      )

In [22]:
out = []
for batch in dataloader:  
    out.append(batch)

In [23]:
out

[{'attention_mask': tensor([[[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]],
  
          [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]],
  
          [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]]),
  'input_ids': tensor([[[  101,  7592,  1010,  2003,  2045,  2151,  2126,  2825,  2000,  4957,
             2019, 12202,  4070,  2000,  2026,  8680,  1029,  1045,  2253,   102]],
  
          [[  101,  1045,  1005,  1049,  2667,  2000,  4957,  2026, 12202,  2028,
             2444,  4070,  2000,  2026,  8680,  4070,  1012,  2021,  1045,   102]],
  
          [[  101,  1045,  2031,  3728,  2081,  2019,  8680,  4070,  2000,  7532,
             2026, 10022,  6942,  3481,  3490,  2618,  4070,  1012,  7188,   102]]]),
  'label': tensor([0, 0, 1]),
  'token_type_ids': tensor([[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]],
  
          [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]],
  
          [[0, 0, 0, 0, 0, 0, 0

In [70]:
df_dict_test = list(map(tok,df['text'][:5]))



In [None]:
def train(self, num_epochs=3, batch_size=32):
        """
        Trains the LR model.

        :param num_epochs: (int) number of epochs.
        """
        batch_losses = []
        # We train only on cells that do not have their initial value as NULL.
        X_train, Y_train = self._X.index_select(0, self._train_idx), self._Y.index_select(0, self._train_idx)
        torch_ds = TensorDataset(X_train, Y_train)

        # Main training loop.
        for epoch_idx in range(1, num_epochs+1):
            logging.debug("Logistic: epoch %d", epoch_idx)
            batch_cnt = 0
            for batch_X, batch_Y in tqdm(DataLoader(torch_ds, batch_size=batch_size)):
                batch_pred = self.forward(batch_X)
                batch_loss = self._loss(batch_pred, batch_Y.reshape(-1,1))
                batch_losses.append(float(batch_loss))
                self.zero_grad()
                batch_loss.backward()
                self._optimizer.step()
                batch_cnt += 1
            logging.debug('Logistic: average batch loss: %f', sum(batch_losses[-1 * batch_cnt:]) / batch_cnt)
        return batch_losses 