In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re

In [2]:
import string
import nltk
from nltk.corpus import stopwords

from transformers import BertTokenizer

import torch
from tqdm.notebook import tqdm

from transformers import BertTokenizer


from transformers import BertForSequenceClassification
from transformers import AdamW, get_linear_schedule_with_warmup

from torch.utils.data import TensorDataset
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
import json
import random

from sklearn.metrics import f1_score

In [3]:
# Load data into DataFrame 
df = pd.read_csv('labeled_data.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,count,hate_speech,offensive_language,neither,class,tweet
0,0,3,0,0,3,2,!!! RT @mayasolovely: As a woman you shouldn't...
1,1,3,0,3,0,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...
2,2,3,0,3,0,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...
3,3,3,0,2,1,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...
4,4,6,0,6,0,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...


In [4]:
df['class'] = df['class'].replace(2,1)
df['class'] = df['class'].replace([0,1],[1,0])
df.head()

Unnamed: 0.1,Unnamed: 0,count,hate_speech,offensive_language,neither,class,tweet
0,0,3,0,0,3,0,!!! RT @mayasolovely: As a woman you shouldn't...
1,1,3,0,3,0,0,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...
2,2,3,0,3,0,0,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...
3,3,3,0,2,1,0,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...
4,4,6,0,6,0,0,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...


In [6]:
df = df.drop(['Unnamed: 0', 'count','hate_speech','offensive_language', 'neither'], axis = 'columns' )
df.head()

Unnamed: 0,class,tweet
0,0,!!! RT @mayasolovely: As a woman you shouldn't...
1,0,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...
2,0,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...
3,0,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...
4,0,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...


In [7]:
df = df.rename(columns={ 'class': 'target'})

In [8]:
df.head()

Unnamed: 0,target,tweet
0,0,!!! RT @mayasolovely: As a woman you shouldn't...
1,0,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...
2,0,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...
3,0,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...
4,0,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...


In [9]:
def preprocess_tweet(df, col):
    
    df[col] = df[col].apply(lambda x: re.sub(r'@[\S]+', ' ', str(x)))
    df[col] = df[col].apply(lambda x: re.sub(r'&[\S]+?;', ' ', str(x)))
    df[col] = df[col].apply(lambda x: re.sub(r'#', ' ', str(x)))
    df[col] = df[col].apply(lambda x: re.sub(r'(\bRT\b|\bQT\b)', ' ', str(x)))
    df[col] = df[col].apply(lambda x: re.sub(r'http[\S]+', ' ', str(x)))
    df[col] = df[col].apply(lambda x: re.sub(r'[^\w\s]', r'', str(x)))
    df[col] = df[col].apply(lambda x: " ".join(x.lower() for x in x.split()))
    df[col] = df[col].apply(lambda x: re.sub(r'\w*\d\w*', r' ', str(x)))
    df[col] = df[col].apply(lambda x: re.sub(r'\s\s+', ' ', str(x)))

In [10]:
preprocess_tweet(df, 'tweet')
df.head()

Unnamed: 0,target,tweet
0,0,as a woman you shouldnt complain about cleanin...
1,0,boy dats coldtyga dwn bad for cuffin dat hoe i...
2,0,dawg you ever fuck a bitch and she start to cr...
3,0,she look like a tranny
4,0,the shit you hear about me might be true or it...


In [11]:
def tokenize(df, col):
    """
        Function to tokenize column of strings without punctuation
        Input into word_tokenize() must be string with spaces only
        Output is a list of tokenized words
    """
    tokens = pd.DataFrame(df[col].apply(lambda x:nltk.word_tokenize(x)))
    return tokens
    
def no_stopwords(text):
    """This function returns the words in text if they are not a stop words 
       as per nltk.corpus.stopwords
    """
    lst = [word for word in text if word not in stop_words]
    return lst

In [12]:
stop_words = set(stopwords.words('english'))
stop_list = [''.join(c for c in s if c not in string.punctuation) for s in stop_words]

token_tweets = tokenize(df,'tweet')

In [13]:
token_tweets.head()

Unnamed: 0,tweet
0,"[as, a, woman, you, shouldnt, complain, about,..."
1,"[boy, dats, coldtyga, dwn, bad, for, cuffin, d..."
2,"[dawg, you, ever, fuck, a, bitch, and, she, st..."
3,"[she, look, like, a, tranny]"
4,"[the, shit, you, hear, about, me, might, be, t..."


In [14]:
token_tweets = token_tweets.rename(columns={"tweet": "tweet_tokens"})

In [15]:
df1= pd.concat([df, token_tweets], axis=1)
df1.head()

Unnamed: 0,target,tweet,tweet_tokens
0,0,as a woman you shouldnt complain about cleanin...,"[as, a, woman, you, shouldnt, complain, about,..."
1,0,boy dats coldtyga dwn bad for cuffin dat hoe i...,"[boy, dats, coldtyga, dwn, bad, for, cuffin, d..."
2,0,dawg you ever fuck a bitch and she start to cr...,"[dawg, you, ever, fuck, a, bitch, and, she, st..."
3,0,she look like a tranny,"[she, look, like, a, tranny]"
4,0,the shit you hear about me might be true or it...,"[the, shit, you, hear, about, me, might, be, t..."


In [16]:
df1['tweet_tokens']=df1['tweet_tokens'].apply(lambda x: no_stopwords(x))

In [17]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', 
                                          do_lower_case=True)

In [18]:
df1["sent_bert_token_length"] = df1["tweet"].apply(lambda x: len(tokenizer(x, add_special_tokens=False)["input_ids"]))
df1.head()

Unnamed: 0,target,tweet,tweet_tokens,sent_bert_token_length
0,0,as a woman you shouldnt complain about cleanin...,"[woman, shouldnt, complain, cleaning, house, m...",22
1,0,boy dats coldtyga dwn bad for cuffin dat hoe i...,"[boy, dats, coldtyga, dwn, bad, cuffin, dat, h...",18
2,0,dawg you ever fuck a bitch and she start to cr...,"[dawg, ever, fuck, bitch, start, cry, confused...",17
3,0,she look like a tranny,"[look, like, tranny]",6
4,0,the shit you hear about me might be true or it...,"[shit, hear, might, true, might, faker, bitch,...",23


In [19]:
class Config():
    seed_val = 17
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    epochs = 5 
    batch_size = 6
    seq_length = 512
    lr = 2e-5
    eps = 1e-8
    pretrained_model = 'bert-base-uncased'
    test_size=0.15
    random_state=42
    add_special_tokens=True 
    return_attention_mask=True 
    pad_to_max_length=True 
    do_lower_case=False
    return_tensors='pt'

config = Config()

In [20]:
params = {"seed_val": config.seed_val,
    "device":str(config.device),
    "epochs":config.epochs, 
    "batch_size":config.batch_size,
    "seq_length":config.seq_length,
    "lr":config.lr,
    "eps":config.eps,
    "pretrained_model": config.pretrained_model,
    "test_size":config.test_size,
    "random_state":config.random_state,
    "add_special_tokens":config.add_special_tokens,
    "return_attention_mask":config.return_attention_mask,
    "pad_to_max_length":config.pad_to_max_length,
    "do_lower_case":config.do_lower_case,
    "return_tensors":config.return_tensors,
         }

In [21]:
device = config.device

random.seed(config.seed_val)
np.random.seed(config.seed_val)
torch.manual_seed(config.seed_val)
torch.cuda.manual_seed_all(config.seed_val)

In [22]:
from sklearn.model_selection import train_test_split

train_df_, val_df = train_test_split(df1, 
                                    test_size=0.20, 
                                    random_state=config.random_state, stratify=df1['target'].values)

In [23]:
train_df_.head()

Unnamed: 0,target,tweet,tweet_tokens,sent_bert_token_length
7571,0,all i want is cheesy bread and brownies,"[want, cheesy, bread, brownies]",11
8047,0,bitch fuk wat a hater gotta say brickk ent ya ...,"[bitch, fuk, wat, hater, got, ta, say, brickk,...",15
5315,0,back off ya queer bag,"[back, ya, queer, bag]",5
20699,0,shantrell tryna fight bitches in da clinic n s...,"[shantrell, tryna, fight, bitches, da, clinic,...",24
18820,0,boys get thirsty and grimey for a girl no puss...,"[boys, get, thirsty, grimey, girl, pussy, gett...",16


In [24]:
tokenizer = BertTokenizer.from_pretrained(config.pretrained_model, 
                                          do_lower_case=config.do_lower_case)

In [25]:
encoded_data_train = tokenizer.batch_encode_plus(
    train_df_['tweet'].values, 
    add_special_tokens=config.add_special_tokens, 
    return_attention_mask=config.return_attention_mask, 
    pad_to_max_length=config.pad_to_max_length, 
    max_length=config.seq_length, 
    return_tensors=config.return_tensors
)
encoded_data_val = tokenizer.batch_encode_plus(
    val_df['tweet'].values, 
    add_special_tokens=config.add_special_tokens, 
    return_attention_mask=config.return_attention_mask, 
    pad_to_max_length=config.pad_to_max_length,
    max_length=config.seq_length, 
    return_tensors=config.return_tensors
)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [26]:
input_ids_train = encoded_data_train['input_ids']
attention_masks_train = encoded_data_train['attention_mask']
labels_train = torch.tensor(train_df_['target'].values)

input_ids_val = encoded_data_val['input_ids']
attention_masks_val = encoded_data_val['attention_mask']
labels_val = torch.tensor(val_df['target'].values)

In [27]:
dataset_train = TensorDataset(input_ids_train, attention_masks_train, labels_train)
dataset_val = TensorDataset(input_ids_val, attention_masks_val, labels_val)

In [28]:
len(dataset_train)

19826

In [29]:
len(dataset_val)

4957

In [30]:
label_dict ={'hate': 1, 'neutral':0}

In [31]:
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels = len(label_dict),
    output_attentions = False,
    output_hidden_states = False
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [32]:
batch_size = 32

# We Need two different dataloder
dataloader_train = DataLoader(dataset_train, 
                              sampler=RandomSampler(dataset_train),
                              batch_size=batch_size)

dataloader_validation = DataLoader(dataset_val, 
                              sampler=RandomSampler(dataset_val),
                              batch_size=batch_size)

In [33]:
optimizer = AdamW(
    model.parameters(),
    lr = 1e-5, 
    eps = 1e-8
)



In [34]:
epochs = 3

scheduler = get_linear_schedule_with_warmup(
    optimizer, 
    num_warmup_steps = 0,
    num_training_steps = len(dataloader_train)*epochs
)

In [35]:
def f1_score_func(preds, labels):
    preds_flat = np.argmax(preds, axis = 1).flatten()
    labels_flat = labels.flatten()
    return f1_score(labels_flat, preds_flat, average = 'weighted')

In [36]:
def accuracy_per_class(preds, labels):
    label_dict_inverse = {v: k for k, v in label_dict.items()}
    
    preds_flat = np.argmax(preds, axis = 1).flatten()
    labels_flat = labels.flatten()

    for label in np.unique(labels_flat):
        y_preds = preds_flat[labels_flat == label]
        y_true = labels_flat[labels_flat == label]
        print(f'Class: {label_dict_inverse[label]}')
        print(f'Accuracy: {len(y_preds[y_preds == label])}/{len(y_true)}\n')

In [37]:
print(config.device)

cpu


In [38]:
def evaluate(dataloader_val):

    model.eval()
    
    loss_val_total = 0
    predictions, true_vals = [], []
    
    for batch in tqdm(dataloader_val):
        
        batch = tuple(b.to(device) for b in batch)
        
        inputs = {'input_ids':      batch[0],
                  'attention_mask': batch[1],
                  'labels':         batch[2],
                 }

        with torch.no_grad():        
            outputs = model(**inputs)
            
        loss = outputs[0]
        logits = outputs[1]
        loss_val_total += loss.item()

        logits = logits.detach().cpu().numpy()
        label_ids = inputs['labels'].cpu().numpy()
        predictions.append(logits)
        true_vals.append(label_ids)
    
    loss_val_avg = loss_val_total/len(dataloader_val) 
    
    predictions = np.concatenate(predictions, axis=0)
    true_vals = np.concatenate(true_vals, axis=0)
            
    return loss_val_avg, predictions, true_vals

In [39]:
for epoch in tqdm(range(1, epochs+1)):
    
    model.train()          # Sending our model in Training mode
    
    loss_train_total = 0   # Setting the training loss to zero initially

    # Setting up the Progress bar to Moniter the progress of training
    progress_bar = tqdm(dataloader_train, desc='Epoch {:1d}'.format(epoch), leave=False, disable=False)
    for batch in progress_bar:

        model.zero_grad() # As we not working with thew RNN's
        
        # As our dataloader has '3' iteams so batches will be the Tuple of '3'
        batch = tuple(b.to(device) for b in batch)
        
        # INPUTS
        # Pulling out the inputs in the form of dictionary
        inputs = {'input_ids':      batch[0],
                  'attention_mask': batch[1],
                  'labels':         batch[2],
                 }       

        # OUTPUTS
        outputs = model(**inputs) # '**' Unpacking the dictionary stright into the input
        
        loss = outputs[0]
        loss_train_total += loss.item()
        loss.backward()           # backpropagation

        # Gradient Clipping -- Taking the Grad. & gives it a NORM value ~ 1 
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()
        scheduler.step()
        
        progress_bar.set_postfix({'training_loss': '{:.3f}'.format(loss.item()/len(batch))})
         
        
    torch.save(model.state_dict(), f'finetuned_BERT_epoch_{epoch}.model')
        
    tqdm.write(f'\nEpoch {epoch}')
    
    loss_train_avg = loss_train_total/len(dataloader_train)            
    tqdm.write(f'Training loss: {loss_train_avg}')
    
    val_loss, predictions, true_vals = evaluate(dataloader_validation)
    val_f1 = f1_score_func(predictions, true_vals)
    tqdm.write(f'Validation loss: {val_loss}')
    tqdm.write(f'F1 Score (Weighted): {val_f1}')

  0%|          | 0/3 [00:00<?, ?it/s]

Epoch 1:   0%|          | 0/620 [00:00<?, ?it/s]


Epoch 1
Training loss: 0.18700850727487237


  0%|          | 0/155 [00:00<?, ?it/s]

Validation loss: 0.15255569796408378
F1 Score (Weighted): 0.9333159909143244


Epoch 2:   0%|          | 0/620 [00:00<?, ?it/s]


Epoch 2
Training loss: 0.1422959870118047


  0%|          | 0/155 [00:00<?, ?it/s]

Validation loss: 0.15118884638312363
F1 Score (Weighted): 0.9394823505050859


Epoch 3:   0%|          | 0/620 [00:00<?, ?it/s]


Epoch 3
Training loss: 0.12168487738442398


  0%|          | 0/155 [00:00<?, ?it/s]

Validation loss: 0.15133715338644482
F1 Score (Weighted): 0.9393559222718971


In [40]:
model = BertForSequenceClassification.from_pretrained("bert-base-uncased",
                                                      num_labels=len(label_dict),
                                                      output_attentions=False,
                                                      output_hidden_states=False)

model.to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [41]:
model.load_state_dict(
    torch.load(
        "finetuned_BERT_epoch_2.model", 
        map_location = torch.device('cpu')
    )
)

<All keys matched successfully>

In [42]:
_, predictions, true_vals = evaluate(dataloader_validation)

  0%|          | 0/155 [00:00<?, ?it/s]

In [43]:
accuracy_per_class(predictions, true_vals)

Class: neutral
Accuracy: 4526/4671

Class: hate
Accuracy: 133/286



In [44]:
model.load_state_dict(
    torch.load(
        "finetuned_BERT_epoch_3.model", 
        map_location = torch.device('cpu')
    )
)

<All keys matched successfully>

In [45]:
_, predictions, true_vals = evaluate(dataloader_validation)

  0%|          | 0/155 [00:00<?, ?it/s]

In [46]:
accuracy_per_class(predictions, true_vals)

Class: neutral
Accuracy: 4565/4671

Class: hate
Accuracy: 110/286

