In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re



In [2]:
# Load data into DataFrame 
df = pd.read_csv('balanced_data_combined.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,text,class
0,0,Drasko they didn't cook half a bird you idiot ...,1
1,1,Hopefully someone cooks Drasko in the next ep ...,1
2,2,of course you were born in serbia...you're as ...,1
3,3,These girls are the equivalent of the irritati...,1
4,4,RT @YesYoureRacist: At least you're only a tin...,1


In [3]:
df['class'].unique()

array([1, 0])

In [4]:
df1 = df.rename(columns={'text': 'tweet', 'class': 'target'})
df1.head()

Unnamed: 0.1,Unnamed: 0,tweet,target
0,0,Drasko they didn't cook half a bird you idiot ...,1
1,1,Hopefully someone cooks Drasko in the next ep ...,1
2,2,of course you were born in serbia...you're as ...,1
3,3,These girls are the equivalent of the irritati...,1
4,4,RT @YesYoureRacist: At least you're only a tin...,1


In [5]:
df1= df1.drop('Unnamed: 0', axis=1)
df1.dropna(subset=['tweet'], inplace=True)

# Clean tweets

In [6]:
def preprocess_tweet(df, col):
    
    df[col] = df[col].apply(lambda x: re.sub(r'@[\S]+', ' ', str(x)))
    df[col] = df[col].apply(lambda x: re.sub(r'&[\S]+?;', ' ', str(x)))
    df[col] = df[col].apply(lambda x: re.sub(r'#', ' ', str(x)))
    df[col] = df[col].apply(lambda x: re.sub(r'(\bRT\b|\bQT\b)', ' ', str(x)))
    df[col] = df[col].apply(lambda x: re.sub(r'http[\S]+', ' ', str(x)))
    df[col] = df[col].apply(lambda x: re.sub(r'[^\w\s]', r'', str(x)))
    df[col] = df[col].apply(lambda x: " ".join(x.lower() for x in x.split()))
    df[col] = df[col].apply(lambda x: re.sub(r'\w*\d\w*', r' ', str(x)))
    df[col] = df[col].apply(lambda x: re.sub(r'\s\s+', ' ', str(x)))

In [7]:
preprocess_tweet(df1, 'tweet')
df1.head()

Unnamed: 0,tweet,target
0,drasko they didnt cook half a bird you idiot mkr,1
1,hopefully someone cooks drasko in the next ep ...,1
2,of course you were born in serbiayoure as fuck...,1
3,these girls are the equivalent of the irritati...,1
4,at least youre only a tiny bit racist im not r...,1


# Bert tokenizer 

In [8]:
from transformers import BertTokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', 
                                          do_lower_case=True)

In [9]:
df1["sent_bert_token_length"] = df1["tweet"].apply(lambda x: len(tokenizer(x, add_special_tokens=False)["input_ids"]))

In [10]:
df1.head()

Unnamed: 0,tweet,target,sent_bert_token_length
0,drasko they didnt cook half a bird you idiot mkr,1,14
1,hopefully someone cooks drasko in the next ep ...,1,13
2,of course you were born in serbiayoure as fuck...,1,17
3,these girls are the equivalent of the irritati...,1,18
4,at least youre only a tiny bit racist im not r...,1,16


# Bert Stuff

In [11]:
import torch
from tqdm import tqdm

from transformers import BertTokenizer
from torch.utils.data import TensorDataset

from transformers import BertForSequenceClassification
import json

In [12]:
class Config():
    seed_val = 17
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    epochs = 5 
    batch_size = 6
    seq_length = 512
    lr = 2e-5
    eps = 1e-8
    pretrained_model = 'bert-base-uncased'
    test_size=0.15
    random_state=42
    add_special_tokens=True 
    return_attention_mask=True 
    pad_to_max_length=True 
    do_lower_case=False
    return_tensors='pt'

config = Config()

In [13]:
params = {"seed_val": config.seed_val,
    "device":str(config.device),
    "epochs":config.epochs, 
    "batch_size":config.batch_size,
    "seq_length":config.seq_length,
    "lr":config.lr,
    "eps":config.eps,
    "pretrained_model": config.pretrained_model,
    "test_size":config.test_size,
    "random_state":config.random_state,
    "add_special_tokens":config.add_special_tokens,
    "return_attention_mask":config.return_attention_mask,
    "pad_to_max_length":config.pad_to_max_length,
    "do_lower_case":config.do_lower_case,
    "return_tensors":config.return_tensors,
         }

In [14]:
import random

device = config.device

random.seed(config.seed_val)
np.random.seed(config.seed_val)
torch.manual_seed(config.seed_val)
torch.cuda.manual_seed_all(config.seed_val)

In [15]:
#split train test
from sklearn.model_selection import train_test_split

train_df_, val_df = train_test_split(df1, 
                                    test_size=0.20, 
                                    random_state=config.random_state, stratify=df1['target'].values)

In [16]:
train_df_.head()

Unnamed: 0,tweet,target,sent_bert_token_length
1370,and that proves what,1,4
5844,my teacher always says has a man ever abused h...,0,26
1620,whut btw im no sexist i jus mean same woman ba...,1,29
4878,charlie sheen wants to return to two and a hal...,0,11
3162,waits for fanduel to hit me with the jig,0,12


In [17]:
tokenizer = BertTokenizer.from_pretrained(config.pretrained_model, 
                                          do_lower_case=config.do_lower_case)

In [18]:
encoded_data_train = tokenizer.batch_encode_plus(
    train_df_['tweet'].values, 
    add_special_tokens=config.add_special_tokens, 
    return_attention_mask=config.return_attention_mask, 
    pad_to_max_length=config.pad_to_max_length, 
    max_length=config.seq_length, 
    return_tensors=config.return_tensors
)
encoded_data_val = tokenizer.batch_encode_plus(
    val_df['tweet'].values, 
    add_special_tokens=config.add_special_tokens, 
    return_attention_mask=config.return_attention_mask, 
    pad_to_max_length=config.pad_to_max_length,
    max_length=config.seq_length, 
    return_tensors=config.return_tensors
)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [19]:
input_ids_train = encoded_data_train['input_ids']
attention_masks_train = encoded_data_train['attention_mask']
labels_train = torch.tensor(train_df_['target'].values)

input_ids_val = encoded_data_val['input_ids']
attention_masks_val = encoded_data_val['attention_mask']
labels_val = torch.tensor(val_df['target'].values)

In [20]:
dataset_train = TensorDataset(input_ids_train, attention_masks_train, labels_train)
dataset_val = TensorDataset(input_ids_val, attention_masks_val, labels_val)

In [21]:
len(dataset_train)

6668

In [22]:
len(dataset_val)

1667

In [23]:
label_dict ={'hate': 1, 'not_hate':0}

In [24]:
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels = len(label_dict),
    output_attentions = False,
    output_hidden_states = False
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [25]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

In [26]:
batch_size = 32

# We Need two different dataloder
dataloader_train = DataLoader(dataset_train, 
                              sampler=RandomSampler(dataset_train),
                              batch_size=batch_size)

dataloader_validation = DataLoader(dataset_val, 
                              sampler=RandomSampler(dataset_val),
                              batch_size=batch_size)

In [27]:
from transformers import AdamW, get_linear_schedule_with_warmup

In [28]:
optimizer = AdamW(
    model.parameters(),
    lr = 1e-5, 
    eps = 1e-8
)



In [29]:
epochs = 3

scheduler = get_linear_schedule_with_warmup(
    optimizer, 
    num_warmup_steps = 0,
    num_training_steps = len(dataloader_train)*epochs
)

In [30]:
from sklearn.metrics import f1_score

In [31]:
def f1_score_func(preds, labels):
    preds_flat = np.argmax(preds, axis = 1).flatten()
    labels_flat = labels.flatten()
    return f1_score(labels_flat, preds_flat, average = 'weighted')

In [32]:
def accuracy_per_class(preds, labels):
    label_dict_inverse = {v: k for k, v in label_dict.items()}
    
    preds_flat = np.argmax(preds, axis = 1).flatten()
    labels_flat = labels.flatten()

    for label in np.unique(labels_flat):
        y_preds = preds_flat[labels_flat == label]
        y_true = labels_flat[labels_flat == label]
        print(f'Class: {label_dict_inverse[label]}')
        print(f'Accuracy: {len(y_preds[y_preds == label])}/{len(y_true)}\n')

In [33]:
print(config.device)

cpu


In [34]:
def evaluate(dataloader_val):

    model.eval()
    
    loss_val_total = 0
    predictions, true_vals = [], []
    
    for batch in tqdm(dataloader_val):
        
        batch = tuple(b.to(device) for b in batch)
        
        inputs = {'input_ids':      batch[0],
                  'attention_mask': batch[1],
                  'labels':         batch[2],
                 }

        with torch.no_grad():        
            outputs = model(**inputs)
            
        loss = outputs[0]
        logits = outputs[1]
        loss_val_total += loss.item()

        logits = logits.detach().cpu().numpy()
        label_ids = inputs['labels'].cpu().numpy()
        predictions.append(logits)
        true_vals.append(label_ids)
    
    loss_val_avg = loss_val_total/len(dataloader_val) 
    
    predictions = np.concatenate(predictions, axis=0)
    true_vals = np.concatenate(true_vals, axis=0)
            
    return loss_val_avg, predictions, true_vals

In [35]:
for epoch in tqdm(range(1, epochs+1)):
    
    model.train()          # Sending our model in Training mode
    
    loss_train_total = 0   # Setting the training loss to zero initially

    # Setting up the Progress bar to Moniter the progress of training
    progress_bar = tqdm(dataloader_train, desc='Epoch {:1d}'.format(epoch), leave=False, disable=False)
    for batch in progress_bar:

        model.zero_grad() # As we not working with thew RNN's
        
        # As our dataloader has '3' iteams so batches will be the Tuple of '3'
        batch = tuple(b.to(device) for b in batch)
        
        # INPUTS
        # Pulling out the inputs in the form of dictionary
        inputs = {'input_ids':      batch[0],
                  'attention_mask': batch[1],
                  'labels':         batch[2],
                 }       

        # OUTPUTS
        outputs = model(**inputs) # '**' Unpacking the dictionary stright into the input
        
        loss = outputs[0]
        loss_train_total += loss.item()
        loss.backward()           # backpropagation

        # Gradient Clipping -- Taking the Grad. & gives it a NORM value ~ 1 
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()
        scheduler.step()
        
        progress_bar.set_postfix({'training_loss': '{:.3f}'.format(loss.item()/len(batch))})
         
        
    torch.save(model.state_dict(), f'finetuned_BERT_epoch_{epoch}.model')
        
    tqdm.write(f'\nEpoch {epoch}')
    
    loss_train_avg = loss_train_total/len(dataloader_train)            
    tqdm.write(f'Training loss: {loss_train_avg}')
    
    val_loss, predictions, true_vals = evaluate(dataloader_validation)
    val_f1 = f1_score_func(predictions, true_vals)
    tqdm.write(f'Validation loss: {val_loss}')
    tqdm.write(f'F1 Score (Weighted): {val_f1}')

  0%|                                                     | 0/3 [00:00<?, ?it/s]
Epoch 1:   0%|                                          | 0/209 [00:00<?, ?it/s][A
Epoch 1:   0%|                     | 0/209 [00:54<?, ?it/s, training_loss=0.271][A
Epoch 1:   0%|           | 1/209 [00:54<3:10:31, 54.96s/it, training_loss=0.271][A
Epoch 1:   0%|           | 1/209 [01:47<3:10:31, 54.96s/it, training_loss=0.233][A
Epoch 1:   1%|           | 2/209 [01:47<3:04:40, 53.53s/it, training_loss=0.233][A
Epoch 1:   1%|           | 2/209 [02:39<3:04:40, 53.53s/it, training_loss=0.231][A
Epoch 1:   1%|▏          | 3/209 [02:39<3:01:07, 52.75s/it, training_loss=0.231][A
Epoch 1:   1%|▏          | 3/209 [03:30<3:01:07, 52.75s/it, training_loss=0.228][A
Epoch 1:   2%|▏          | 4/209 [03:30<2:58:38, 52.28s/it, training_loss=0.228][A
Epoch 1:   2%|▏          | 4/209 [04:21<2:58:38, 52.28s/it, training_loss=0.244][A
Epoch 1:   2%|▎          | 5/209 [04:21<2:55:37, 51.65s/it, training_loss=0.244

Epoch 1:  23%|██▎       | 48/209 [41:46<2:20:01, 52.18s/it, training_loss=0.204][A
Epoch 1:  23%|██▎       | 48/209 [42:41<2:20:01, 52.18s/it, training_loss=0.165][A
Epoch 1:  23%|██▎       | 49/209 [42:41<2:21:46, 53.17s/it, training_loss=0.165][A
Epoch 1:  23%|██▎       | 49/209 [45:35<2:21:46, 53.17s/it, training_loss=0.192][A
Epoch 1:  24%|██▍       | 50/209 [45:35<3:56:18, 89.17s/it, training_loss=0.192][A
Epoch 1:  24%|██▍       | 50/209 [46:28<3:56:18, 89.17s/it, training_loss=0.190][A
Epoch 1:  24%|██▍       | 51/209 [46:28<3:26:51, 78.55s/it, training_loss=0.190][A
Epoch 1:  24%|██▍       | 51/209 [47:17<3:26:51, 78.55s/it, training_loss=0.195][A
Epoch 1:  25%|██▍       | 52/209 [47:17<3:02:12, 69.63s/it, training_loss=0.195][A
Epoch 1:  25%|██▍       | 52/209 [48:08<3:02:12, 69.63s/it, training_loss=0.160][A
Epoch 1:  25%|██▌       | 53/209 [48:08<2:46:01, 63.85s/it, training_loss=0.160][A
Epoch 1:  25%|██▌       | 53/209 [48:58<2:46:01, 63.85s/it, training_loss=0.

Epoch 1:  46%|███▋    | 96/209 [1:27:22<1:40:10, 53.19s/it, training_loss=0.127][A
Epoch 1:  46%|███▋    | 97/209 [1:27:22<1:40:02, 53.60s/it, training_loss=0.127][A
Epoch 1:  46%|███▋    | 97/209 [1:28:19<1:40:02, 53.60s/it, training_loss=0.128][A
Epoch 1:  47%|███▊    | 98/209 [1:28:19<1:41:13, 54.72s/it, training_loss=0.128][A
Epoch 1:  47%|███▊    | 98/209 [1:29:13<1:41:13, 54.72s/it, training_loss=0.125][A
Epoch 1:  47%|███▊    | 99/209 [1:29:13<1:39:54, 54.49s/it, training_loss=0.125][A
Epoch 1:  47%|███▊    | 99/209 [1:30:08<1:39:54, 54.49s/it, training_loss=0.108][A
Epoch 1:  48%|███▎   | 100/209 [1:30:08<1:39:15, 54.64s/it, training_loss=0.108][A
Epoch 1:  48%|███▎   | 100/209 [1:31:01<1:39:15, 54.64s/it, training_loss=0.165][A
Epoch 1:  48%|███▍   | 101/209 [1:31:01<1:37:16, 54.04s/it, training_loss=0.165][A
Epoch 1:  48%|███▍   | 101/209 [1:31:50<1:37:16, 54.04s/it, training_loss=0.112][A
Epoch 1:  49%|███▍   | 102/209 [1:31:50<1:33:51, 52.63s/it, training_loss=0.

Epoch 1:  69%|████▊  | 145/209 [2:10:52<1:03:20, 59.38s/it, training_loss=0.055][A
Epoch 1:  69%|████▊  | 145/209 [2:11:48<1:03:20, 59.38s/it, training_loss=0.131][A
Epoch 1:  70%|████▉  | 146/209 [2:11:48<1:01:24, 58.49s/it, training_loss=0.131][A
Epoch 1:  70%|████▉  | 146/209 [2:12:45<1:01:24, 58.49s/it, training_loss=0.182][A
Epoch 1:  70%|██████▎  | 147/209 [2:12:45<59:53, 57.97s/it, training_loss=0.182][A
Epoch 1:  70%|██████▎  | 147/209 [2:13:43<59:53, 57.97s/it, training_loss=0.103][A
Epoch 1:  71%|██████▎  | 148/209 [2:13:43<58:46, 57.81s/it, training_loss=0.103][A
Epoch 1:  71%|██████▎  | 148/209 [2:14:36<58:46, 57.81s/it, training_loss=0.092][A
Epoch 1:  71%|██████▍  | 149/209 [2:14:36<56:30, 56.52s/it, training_loss=0.092][A
Epoch 1:  71%|██████▍  | 149/209 [2:15:27<56:30, 56.52s/it, training_loss=0.093][A
Epoch 1:  72%|██████▍  | 150/209 [2:15:27<53:48, 54.72s/it, training_loss=0.093][A
Epoch 1:  72%|██████▍  | 150/209 [2:16:19<53:48, 54.72s/it, training_loss=0.

Epoch 1:  92%|████████▎| 193/209 [2:54:28<14:07, 52.99s/it, training_loss=0.085][A
Epoch 1:  93%|████████▎| 194/209 [2:54:28<13:25, 53.71s/it, training_loss=0.085][A
Epoch 1:  93%|████████▎| 194/209 [2:55:30<13:25, 53.71s/it, training_loss=0.058][A
Epoch 1:  93%|████████▍| 195/209 [2:55:30<13:07, 56.22s/it, training_loss=0.058][A
Epoch 1:  93%|████████▍| 195/209 [2:56:27<13:07, 56.22s/it, training_loss=0.109][A
Epoch 1:  94%|████████▍| 196/209 [2:56:27<12:16, 56.69s/it, training_loss=0.109][A
Epoch 1:  94%|████████▍| 196/209 [2:57:21<12:16, 56.69s/it, training_loss=0.094][A
Epoch 1:  94%|████████▍| 197/209 [2:57:21<11:10, 55.91s/it, training_loss=0.094][A
Epoch 1:  94%|████████▍| 197/209 [2:58:12<11:10, 55.91s/it, training_loss=0.076][A
Epoch 1:  95%|████████▌| 198/209 [2:58:12<09:56, 54.21s/it, training_loss=0.076][A
Epoch 1:  95%|████████▌| 198/209 [2:59:04<09:56, 54.21s/it, training_loss=0.081][A
Epoch 1:  95%|████████▌| 199/209 [2:59:04<08:57, 53.77s/it, training_loss=0.


Epoch 1
Training loss: 0.42449829189115734



  0%|                                                    | 0/53 [00:00<?, ?it/s][A
  2%|▊                                           | 1/53 [00:07<06:45,  7.81s/it][A
  4%|█▋                                          | 2/53 [00:15<06:33,  7.71s/it][A
  6%|██▍                                         | 3/53 [00:23<06:33,  7.86s/it][A
  8%|███▎                                        | 4/53 [00:31<06:25,  7.86s/it][A
  9%|████▏                                       | 5/53 [00:39<06:14,  7.79s/it][A
 11%|████▉                                       | 6/53 [00:47<06:09,  7.87s/it][A
 13%|█████▊                                      | 7/53 [00:54<05:57,  7.78s/it][A
 15%|██████▋                                     | 8/53 [01:02<05:48,  7.74s/it][A
 17%|███████▍                                    | 9/53 [01:09<05:39,  7.72s/it][A
 19%|████████                                   | 10/53 [01:17<05:32,  7.72s/it][A
 21%|████████▉                                  | 11/53 [01:25<05:23,  7.71

Validation loss: 0.2428572193350432
F1 Score (Weighted): 0.9044577623612672



Epoch 2:   0%|                                          | 0/209 [00:00<?, ?it/s][A
Epoch 2:   0%|                     | 0/209 [00:54<?, ?it/s, training_loss=0.091][A
Epoch 2:   0%|           | 1/209 [00:54<3:09:26, 54.65s/it, training_loss=0.091][A
Epoch 2:   0%|           | 1/209 [01:56<3:09:26, 54.65s/it, training_loss=0.033][A
Epoch 2:   1%|           | 2/209 [01:56<3:23:43, 59.05s/it, training_loss=0.033][A
Epoch 2:   1%|           | 2/209 [02:50<3:23:43, 59.05s/it, training_loss=0.072][A
Epoch 2:   1%|▏          | 3/209 [02:50<3:13:55, 56.48s/it, training_loss=0.072][A
Epoch 2:   1%|▏          | 3/209 [03:44<3:13:55, 56.48s/it, training_loss=0.095][A
Epoch 2:   2%|▏          | 4/209 [03:44<3:09:32, 55.48s/it, training_loss=0.095][A
Epoch 2:   2%|▏          | 4/209 [04:37<3:09:32, 55.48s/it, training_loss=0.078][A
Epoch 2:   2%|▎          | 5/209 [04:37<3:05:59, 54.70s/it, training_loss=0.078][A
Epoch 2:   2%|▎          | 5/209 [05:27<3:05:59, 54.70s/it, training_loss=0

Epoch 2:  23%|██▎       | 48/209 [43:42<2:27:26, 54.94s/it, training_loss=0.019][A
Epoch 2:  23%|██▎       | 49/209 [43:42<2:25:35, 54.60s/it, training_loss=0.019][A
Epoch 2:  23%|██▎       | 49/209 [44:37<2:25:35, 54.60s/it, training_loss=0.064][A
Epoch 2:  24%|██▍       | 50/209 [44:37<2:25:00, 54.72s/it, training_loss=0.064][A
Epoch 2:  24%|██▍       | 50/209 [45:28<2:25:00, 54.72s/it, training_loss=0.040][A
Epoch 2:  24%|██▍       | 51/209 [45:28<2:20:51, 53.49s/it, training_loss=0.040][A
Epoch 2:  24%|██▍       | 51/209 [46:20<2:20:51, 53.49s/it, training_loss=0.089][A
Epoch 2:  25%|██▍       | 52/209 [46:20<2:19:10, 53.19s/it, training_loss=0.089][A
Epoch 2:  25%|██▍       | 52/209 [47:11<2:19:10, 53.19s/it, training_loss=0.041][A
Epoch 2:  25%|██▌       | 53/209 [47:11<2:16:21, 52.45s/it, training_loss=0.041][A
Epoch 2:  25%|██▌       | 53/209 [48:01<2:16:21, 52.45s/it, training_loss=0.069][A
Epoch 2:  26%|██▌       | 54/209 [48:01<2:13:34, 51.71s/it, training_loss=0.

Epoch 2:  46%|███▏   | 97/209 [2:06:15<6:05:02, 195.56s/it, training_loss=0.021][A
Epoch 2:  46%|███▏   | 97/209 [2:07:05<6:05:02, 195.56s/it, training_loss=0.112][A
Epoch 2:  47%|███▎   | 98/209 [2:07:05<4:40:56, 151.86s/it, training_loss=0.112][A
Epoch 2:  47%|███▎   | 98/209 [2:07:53<4:40:56, 151.86s/it, training_loss=0.034][A
Epoch 2:  47%|███▎   | 99/209 [2:07:53<3:41:22, 120.75s/it, training_loss=0.034][A
Epoch 2:  47%|███▎   | 99/209 [2:08:43<3:41:22, 120.75s/it, training_loss=0.097][A
Epoch 2:  48%|███▎   | 100/209 [2:08:43<3:00:23, 99.30s/it, training_loss=0.097][A
Epoch 2:  48%|███▎   | 100/209 [2:09:32<3:00:23, 99.30s/it, training_loss=0.099][A
Epoch 2:  48%|███▍   | 101/209 [2:09:32<2:31:47, 84.33s/it, training_loss=0.099][A
Epoch 2:  48%|███▍   | 101/209 [2:10:21<2:31:47, 84.33s/it, training_loss=0.081][A
Epoch 2:  49%|███▍   | 102/209 [2:10:21<2:11:25, 73.70s/it, training_loss=0.081][A
Epoch 2:  49%|███▍   | 102/209 [2:11:10<2:11:25, 73.70s/it, training_loss=0.

Epoch 2:  69%|██████▏  | 145/209 [2:47:46<54:52, 51.45s/it, training_loss=0.040][A
Epoch 2:  70%|██████▎  | 146/209 [2:47:46<54:49, 52.22s/it, training_loss=0.040][A
Epoch 2:  70%|██████▎  | 146/209 [2:48:41<54:49, 52.22s/it, training_loss=0.043][A
Epoch 2:  70%|██████▎  | 147/209 [2:48:41<54:41, 52.93s/it, training_loss=0.043][A
Epoch 2:  70%|██████▎  | 147/209 [2:49:36<54:41, 52.93s/it, training_loss=0.048][A
Epoch 2:  71%|██████▎  | 148/209 [2:49:36<54:29, 53.60s/it, training_loss=0.048][A
Epoch 2:  71%|██████▎  | 148/209 [2:50:29<54:29, 53.60s/it, training_loss=0.075][A
Epoch 2:  71%|██████▍  | 149/209 [2:50:29<53:29, 53.50s/it, training_loss=0.075][A
Epoch 2:  71%|██████▍  | 149/209 [2:51:18<53:29, 53.50s/it, training_loss=0.125][A
Epoch 2:  72%|██████▍  | 150/209 [2:51:18<51:17, 52.16s/it, training_loss=0.125][A
Epoch 2:  72%|██████▍  | 150/209 [2:52:08<51:17, 52.16s/it, training_loss=0.068][A
Epoch 2:  72%|██████▌  | 151/209 [2:52:08<49:36, 51.31s/it, training_loss=0.

Epoch 2:  93%|████████▎| 194/209 [3:45:48<12:28, 49.87s/it, training_loss=0.055][A
Epoch 2:  93%|████████▎| 194/209 [3:46:36<12:28, 49.87s/it, training_loss=0.035][A
Epoch 2:  93%|████████▍| 195/209 [3:46:36<11:31, 49.40s/it, training_loss=0.035][A
Epoch 2:  93%|████████▍| 195/209 [3:47:26<11:31, 49.40s/it, training_loss=0.070][A
Epoch 2:  94%|████████▍| 196/209 [3:47:26<10:42, 49.43s/it, training_loss=0.070][A
Epoch 2:  94%|████████▍| 196/209 [3:48:15<10:42, 49.43s/it, training_loss=0.068][A
Epoch 2:  94%|████████▍| 197/209 [3:48:15<09:52, 49.39s/it, training_loss=0.068][A
Epoch 2:  94%|████████▍| 197/209 [3:49:09<09:52, 49.39s/it, training_loss=0.081][A
Epoch 2:  95%|████████▌| 198/209 [3:49:09<09:18, 50.81s/it, training_loss=0.081][A
Epoch 2:  95%|████████▌| 198/209 [3:50:04<09:18, 50.81s/it, training_loss=0.076][A
Epoch 2:  95%|████████▌| 199/209 [3:50:04<08:39, 51.93s/it, training_loss=0.076][A
Epoch 2:  95%|████████▌| 199/209 [3:50:57<08:39, 51.93s/it, training_loss=0.


Epoch 2
Training loss: 0.2120625183307098



  0%|                                                    | 0/53 [00:00<?, ?it/s][A
  2%|▊                                           | 1/53 [00:07<06:34,  7.59s/it][A
  4%|█▋                                          | 2/53 [00:15<06:25,  7.56s/it][A
  6%|██▍                                         | 3/53 [00:22<06:19,  7.59s/it][A
  8%|███▎                                        | 4/53 [00:30<06:11,  7.58s/it][A
  9%|████▏                                       | 5/53 [00:37<06:04,  7.58s/it][A
 11%|████▉                                       | 6/53 [00:45<05:56,  7.58s/it][A
 13%|█████▊                                      | 7/53 [00:53<05:48,  7.57s/it][A
 15%|██████▋                                     | 8/53 [01:00<05:41,  7.59s/it][A
 17%|███████▍                                    | 9/53 [01:08<05:34,  7.61s/it][A
 19%|████████                                   | 10/53 [01:15<05:27,  7.61s/it][A
 21%|████████▉                                  | 11/53 [01:23<05:19,  7.61

Validation loss: 0.21118896328053385
F1 Score (Weighted): 0.9178040137386667



Epoch 3:   0%|                                          | 0/209 [00:00<?, ?it/s][A
Epoch 3:   0%|                     | 0/209 [00:50<?, ?it/s, training_loss=0.086][A
Epoch 3:   0%|           | 1/209 [00:50<2:55:22, 50.59s/it, training_loss=0.086][A
Epoch 3:   0%|           | 1/209 [01:39<2:55:22, 50.59s/it, training_loss=0.044][A
Epoch 3:   1%|           | 2/209 [01:39<2:50:59, 49.56s/it, training_loss=0.044][A
Epoch 3:   1%|           | 2/209 [02:33<2:50:59, 49.56s/it, training_loss=0.039][A
Epoch 3:   1%|▏          | 3/209 [02:33<2:57:02, 51.57s/it, training_loss=0.039][A
Epoch 3:   1%|▏          | 3/209 [03:28<2:57:02, 51.57s/it, training_loss=0.053][A
Epoch 3:   2%|▏          | 4/209 [03:28<3:01:12, 53.04s/it, training_loss=0.053][A
Epoch 3:   2%|▏          | 4/209 [04:22<3:01:12, 53.04s/it, training_loss=0.092][A
Epoch 3:   2%|▎          | 5/209 [04:22<3:01:38, 53.42s/it, training_loss=0.092][A
Epoch 3:   2%|▎          | 5/209 [05:16<3:01:38, 53.42s/it, training_loss=0

Epoch 3:  23%|██▎       | 48/209 [58:35<2:16:33, 50.89s/it, training_loss=0.017][A
Epoch 3:  23%|██▎       | 49/209 [58:35<2:12:55, 49.85s/it, training_loss=0.017][A
Epoch 3:  23%|██▎       | 49/209 [59:23<2:12:55, 49.85s/it, training_loss=0.104][A
Epoch 3:  24%|██▍       | 50/209 [59:23<2:11:08, 49.49s/it, training_loss=0.104][A
Epoch 3:  24%|█▉      | 50/209 [1:00:11<2:11:08, 49.49s/it, training_loss=0.044][A
Epoch 3:  24%|█▉      | 51/209 [1:00:11<2:08:59, 48.99s/it, training_loss=0.044][A
Epoch 3:  24%|█▉      | 51/209 [1:01:01<2:08:59, 48.99s/it, training_loss=0.068][A
Epoch 3:  25%|█▉      | 52/209 [1:01:01<2:08:51, 49.25s/it, training_loss=0.068][A
Epoch 3:  25%|█▉      | 52/209 [1:01:51<2:08:51, 49.25s/it, training_loss=0.025][A
Epoch 3:  25%|██      | 53/209 [1:01:51<2:08:18, 49.35s/it, training_loss=0.025][A
Epoch 3:  25%|██      | 53/209 [1:02:40<2:08:18, 49.35s/it, training_loss=0.033][A
Epoch 3:  26%|██      | 54/209 [1:02:40<2:07:27, 49.34s/it, training_loss=0.

Epoch 3:  46%|███▋    | 97/209 [1:49:49<1:41:44, 54.50s/it, training_loss=0.044][A
Epoch 3:  46%|███▋    | 97/209 [1:50:38<1:41:44, 54.50s/it, training_loss=0.060][A
Epoch 3:  47%|███▊    | 98/209 [1:50:38<1:38:05, 53.02s/it, training_loss=0.060][A
Epoch 3:  47%|███▊    | 98/209 [1:51:29<1:38:05, 53.02s/it, training_loss=0.073][A
Epoch 3:  47%|███▊    | 99/209 [1:51:29<1:35:32, 52.12s/it, training_loss=0.073][A
Epoch 3:  47%|███▊    | 99/209 [1:52:18<1:35:32, 52.12s/it, training_loss=0.024][A
Epoch 3:  48%|███▎   | 100/209 [1:52:18<1:33:18, 51.36s/it, training_loss=0.024][A
Epoch 3:  48%|███▎   | 100/209 [1:53:08<1:33:18, 51.36s/it, training_loss=0.094][A
Epoch 3:  48%|███▍   | 101/209 [1:53:08<1:31:43, 50.96s/it, training_loss=0.094][A
Epoch 3:  48%|███▍   | 101/209 [1:53:58<1:31:43, 50.96s/it, training_loss=0.106][A
Epoch 3:  49%|███▍   | 102/209 [1:53:58<1:30:21, 50.66s/it, training_loss=0.106][A
Epoch 3:  49%|███▍   | 102/209 [1:55:16<1:30:21, 50.66s/it, training_loss=0.

Epoch 3:  69%|██████▏  | 145/209 [2:36:14<56:56, 53.39s/it, training_loss=0.107][A
Epoch 3:  70%|██████▎  | 146/209 [2:36:14<54:46, 52.16s/it, training_loss=0.107][A
Epoch 3:  70%|██████▎  | 146/209 [2:37:04<54:46, 52.16s/it, training_loss=0.043][A
Epoch 3:  70%|██████▎  | 147/209 [2:37:04<53:15, 51.54s/it, training_loss=0.043][A
Epoch 3:  70%|██████▎  | 147/209 [2:37:54<53:15, 51.54s/it, training_loss=0.052][A
Epoch 3:  71%|██████▎  | 148/209 [2:37:54<51:53, 51.04s/it, training_loss=0.052][A
Epoch 3:  71%|██████▎  | 148/209 [2:38:45<51:53, 51.04s/it, training_loss=0.019][A
Epoch 3:  71%|██████▍  | 149/209 [2:38:45<51:02, 51.04s/it, training_loss=0.019][A
Epoch 3:  71%|██████▍  | 149/209 [2:39:35<51:02, 51.04s/it, training_loss=0.037][A
Epoch 3:  72%|██████▍  | 150/209 [2:39:35<49:47, 50.64s/it, training_loss=0.037][A
Epoch 3:  72%|██████▍  | 150/209 [2:40:24<49:47, 50.64s/it, training_loss=0.058][A
Epoch 3:  72%|██████▌  | 151/209 [2:40:24<48:31, 50.21s/it, training_loss=0.

Epoch 3:  93%|████████▎| 194/209 [3:18:29<13:19, 53.28s/it, training_loss=0.045][A
Epoch 3:  93%|████████▎| 194/209 [3:19:22<13:19, 53.28s/it, training_loss=0.038][A
Epoch 3:  93%|████████▍| 195/209 [3:19:22<12:26, 53.34s/it, training_loss=0.038][A
Epoch 3:  93%|████████▍| 195/209 [3:20:17<12:26, 53.34s/it, training_loss=0.020][A
Epoch 3:  94%|████████▍| 196/209 [3:20:17<11:39, 53.83s/it, training_loss=0.020][A
Epoch 3:  94%|████████▍| 196/209 [3:21:09<11:39, 53.83s/it, training_loss=0.064][A
Epoch 3:  94%|████████▍| 197/209 [3:21:09<10:38, 53.21s/it, training_loss=0.064][A
Epoch 3:  94%|████████▍| 197/209 [3:21:59<10:38, 53.21s/it, training_loss=0.038][A
Epoch 3:  95%|████████▌| 198/209 [3:21:59<09:33, 52.14s/it, training_loss=0.038][A
Epoch 3:  95%|████████▌| 198/209 [3:22:48<09:33, 52.14s/it, training_loss=0.048][A
Epoch 3:  95%|████████▌| 199/209 [3:22:48<08:32, 51.24s/it, training_loss=0.048][A
Epoch 3:  95%|████████▌| 199/209 [3:23:38<08:32, 51.24s/it, training_loss=0.


Epoch 3
Training loss: 0.16193805759745922



  0%|                                                    | 0/53 [00:00<?, ?it/s][A
  2%|▊                                           | 1/53 [00:07<06:29,  7.50s/it][A
  4%|█▋                                          | 2/53 [00:15<06:25,  7.55s/it][A
  6%|██▍                                         | 3/53 [00:22<06:23,  7.67s/it][A
  8%|███▎                                        | 4/53 [00:30<06:20,  7.78s/it][A
  9%|████▏                                       | 5/53 [00:38<06:13,  7.78s/it][A
 11%|████▉                                       | 6/53 [00:46<06:03,  7.73s/it][A
 13%|█████▊                                      | 7/53 [00:53<05:54,  7.72s/it][A
 15%|██████▋                                     | 8/53 [01:01<05:46,  7.70s/it][A
 17%|███████▍                                    | 9/53 [01:09<05:38,  7.69s/it][A
 19%|████████                                   | 10/53 [01:16<05:30,  7.69s/it][A
 21%|████████▉                                  | 11/53 [01:24<05:22,  7.68

Validation loss: 0.2106118011727648
F1 Score (Weighted): 0.9226037793597665





In [36]:
model = BertForSequenceClassification.from_pretrained("bert-base-uncased",
                                                      num_labels=len(label_dict),
                                                      output_attentions=False,
                                                      output_hidden_states=False)

model.to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [37]:
model.load_state_dict(
    torch.load(
        "finetuned_BERT_epoch_1.model", 
        map_location 2= torch.device('cpu')
    )
)

<All keys matched successfully>

In [38]:
_, predictions, true_vals = evaluate(dataloader_validation)

100%|███████████████████████████████████████████| 53/53 [07:02<00:00,  7.98s/it]


In [39]:
model.load_state_dict(
    torch.load(
        "finetuned_BERT_epoch_3.model", 
        map_location = torch.device('cpu')
    )
)

<All keys matched successfully>

In [40]:
_, predictions, true_vals = evaluate(dataloader_validation)

100%|███████████████████████████████████████████| 53/53 [06:59<00:00,  7.91s/it]


In [41]:
accuracy_per_class(predictions, true_vals)

Class: not_hate
Accuracy: 779/833

Class: hate
Accuracy: 759/834



In [42]:
accuracy_per_class(predictions, true_vals)

Class: not_hate
Accuracy: 779/833

Class: hate
Accuracy: 759/834

