In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re



In [3]:
# Load data into DataFrame 
df = pd.read_csv('balanced_data_combined.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,text,class
0,0,Drasko they didn't cook half a bird you idiot ...,1
1,1,Hopefully someone cooks Drasko in the next ep ...,1
2,2,of course you were born in serbia...you're as ...,1
3,3,These girls are the equivalent of the irritati...,1
4,4,RT @YesYoureRacist: At least you're only a tin...,1


In [4]:
df['class'].unique()

array([1, 0])

In [5]:
df1 = df.rename(columns={'text': 'tweet', 'class': 'target'})
df1.head()

Unnamed: 0.1,Unnamed: 0,tweet,target
0,0,Drasko they didn't cook half a bird you idiot ...,1
1,1,Hopefully someone cooks Drasko in the next ep ...,1
2,2,of course you were born in serbia...you're as ...,1
3,3,These girls are the equivalent of the irritati...,1
4,4,RT @YesYoureRacist: At least you're only a tin...,1


In [6]:
df1= df1.drop('Unnamed: 0', axis=1)
df1.dropna(subset=['tweet'], inplace=True)

# Clean tweets

In [7]:
def preprocess_tweet(df, col):
    
    df[col] = df[col].apply(lambda x: re.sub(r'@[\S]+', ' ', str(x)))
    df[col] = df[col].apply(lambda x: re.sub(r'&[\S]+?;', ' ', str(x)))
    df[col] = df[col].apply(lambda x: re.sub(r'#', ' ', str(x)))
    df[col] = df[col].apply(lambda x: re.sub(r'(\bRT\b|\bQT\b)', ' ', str(x)))
    df[col] = df[col].apply(lambda x: re.sub(r'http[\S]+', ' ', str(x)))
    df[col] = df[col].apply(lambda x: re.sub(r'[^\w\s]', r'', str(x)))
    df[col] = df[col].apply(lambda x: " ".join(x.lower() for x in x.split()))
    df[col] = df[col].apply(lambda x: re.sub(r'\w*\d\w*', r' ', str(x)))
    df[col] = df[col].apply(lambda x: re.sub(r'\s\s+', ' ', str(x)))

In [8]:
preprocess_tweet(df1, 'tweet')
df1.head()

Unnamed: 0,tweet,target
0,drasko they didnt cook half a bird you idiot mkr,1
1,hopefully someone cooks drasko in the next ep ...,1
2,of course you were born in serbiayoure as fuck...,1
3,these girls are the equivalent of the irritati...,1
4,at least youre only a tiny bit racist im not r...,1


In [10]:
import string
import nltk
from nltk.corpus import stopwords
#nltk.download('stopwords')
#nltk.download('punkt')

In [11]:
def tokenize(df, col):
    """
        Function to tokenize column of strings without punctuation
        Input into word_tokenize() must be string with spaces only
        Output is a list of tokenized words
    """
    tokens = pd.DataFrame(df[col].apply(lambda x:nltk.word_tokenize(x)))
    return tokens
    
        
    
 

In [12]:
def no_stopwords(text):
    """This function returns the words in text if they are not a stop words 
       as per nltk.corpus.stopwords
    """
    lst = [word for word in text if word not in stop_words]
    return lst

In [13]:
#df1["tokenized_tweet"] = df1.tweet.apply(lambda x: tokenize(df1, 'tweet'))
stop_words = set(stopwords.words('english'))
stop_list = [''.join(c for c in s if c not in string.punctuation) for s in stop_words]

token_tweets = tokenize(df1,'tweet')

In [14]:
type(token_tweets)

pandas.core.frame.DataFrame

In [15]:
token_tweets.head()

Unnamed: 0,tweet
0,"[drasko, they, didnt, cook, half, a, bird, you..."
1,"[hopefully, someone, cooks, drasko, in, the, n..."
2,"[of, course, you, were, born, in, serbiayoure,..."
3,"[these, girls, are, the, equivalent, of, the, ..."
4,"[at, least, youre, only, a, tiny, bit, racist,..."


In [16]:
token_tweets = token_tweets.rename(columns={"tweet": "tweet_tokens"})

In [17]:
df2= pd.concat([df1, token_tweets], axis=1)
df2.head()

Unnamed: 0,tweet,target,tweet_tokens
0,drasko they didnt cook half a bird you idiot mkr,1,"[drasko, they, didnt, cook, half, a, bird, you..."
1,hopefully someone cooks drasko in the next ep ...,1,"[hopefully, someone, cooks, drasko, in, the, n..."
2,of course you were born in serbiayoure as fuck...,1,"[of, course, you, were, born, in, serbiayoure,..."
3,these girls are the equivalent of the irritati...,1,"[these, girls, are, the, equivalent, of, the, ..."
4,at least youre only a tiny bit racist im not r...,1,"[at, least, youre, only, a, tiny, bit, racist,..."


In [18]:
df2['tweet_tokens']=df2['tweet_tokens'].apply(lambda x: no_stopwords(x))

In [19]:
df2.head()

Unnamed: 0,tweet,target,tweet_tokens
0,drasko they didnt cook half a bird you idiot mkr,1,"[drasko, didnt, cook, half, bird, idiot, mkr]"
1,hopefully someone cooks drasko in the next ep ...,1,"[hopefully, someone, cooks, drasko, next, ep, ..."
2,of course you were born in serbiayoure as fuck...,1,"[course, born, serbiayoure, fucked, serbian, f..."
3,these girls are the equivalent of the irritati...,1,"[girls, equivalent, irritating, asian, girls, ..."
4,at least youre only a tiny bit racist im not r...,1,"[least, youre, tiny, bit, racist, im, racist, ..."


# Bert tokenizer 

In [20]:
from transformers import BertTokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', 
                                          do_lower_case=True)

In [21]:
df2["sent_bert_token_length"] = df2["tweet"].apply(lambda x: len(tokenizer(x, add_special_tokens=False)["input_ids"]))

In [22]:
df2.head()

Unnamed: 0,tweet,target,tweet_tokens,sent_bert_token_length
0,drasko they didnt cook half a bird you idiot mkr,1,"[drasko, didnt, cook, half, bird, idiot, mkr]",14
1,hopefully someone cooks drasko in the next ep ...,1,"[hopefully, someone, cooks, drasko, next, ep, ...",13
2,of course you were born in serbiayoure as fuck...,1,"[course, born, serbiayoure, fucked, serbian, f...",17
3,these girls are the equivalent of the irritati...,1,"[girls, equivalent, irritating, asian, girls, ...",18
4,at least youre only a tiny bit racist im not r...,1,"[least, youre, tiny, bit, racist, im, racist, ...",16


In [23]:
len(df2['tweet_tokens'][0])

7

In [24]:
df2['tweet_tokens'][0]

['drasko', 'didnt', 'cook', 'half', 'bird', 'idiot', 'mkr']

# Bert Stuff

In [25]:
import torch
from tqdm.notebook import tqdm

from transformers import BertTokenizer
from torch.utils.data import TensorDataset

from transformers import BertForSequenceClassification
import json

In [26]:
class Config():
    seed_val = 17
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    epochs = 5 
    batch_size = 6
    seq_length = 512
    lr = 2e-5
    eps = 1e-8
    pretrained_model = 'bert-base-uncased'
    test_size=0.15
    random_state=42
    add_special_tokens=True 
    return_attention_mask=True 
    pad_to_max_length=True 
    do_lower_case=False
    return_tensors='pt'

config = Config()

In [27]:
params = {"seed_val": config.seed_val,
    "device":str(config.device),
    "epochs":config.epochs, 
    "batch_size":config.batch_size,
    "seq_length":config.seq_length,
    "lr":config.lr,
    "eps":config.eps,
    "pretrained_model": config.pretrained_model,
    "test_size":config.test_size,
    "random_state":config.random_state,
    "add_special_tokens":config.add_special_tokens,
    "return_attention_mask":config.return_attention_mask,
    "pad_to_max_length":config.pad_to_max_length,
    "do_lower_case":config.do_lower_case,
    "return_tensors":config.return_tensors,
         }

In [28]:
import random

device = config.device

random.seed(config.seed_val)
np.random.seed(config.seed_val)
torch.manual_seed(config.seed_val)
torch.cuda.manual_seed_all(config.seed_val)

In [29]:
#split train test
from sklearn.model_selection import train_test_split

train_df_, val_df = train_test_split(df2, 
                                    test_size=0.10, 
                                    random_state=config.random_state, stratify=df2['target'].values)

In [30]:
train_df_.head()

Unnamed: 0,tweet,target,tweet_tokens,sent_bert_token_length
1655,call me sexist but i dont think i wanna see a ...,1,"[call, sexist, dont, think, wan, na, see, wome...",21
5286,i cant watch charlie st cloud because i cry li...,0,"[cant, watch, charlie, st, cloud, cry, like, l...",17
1050,feminism,1,[feminism],1
8046,you a hoe if you crying because yo ass pregnan...,1,"[hoe, crying, yo, ass, pregnant, smiles, ridin...",22
7217,if jeremy lin dunked on lebron in a yankees ca...,0,"[jeremy, lin, dunked, lebron, yankees, cap, ce...",29


In [31]:
tokenizer = BertTokenizer.from_pretrained(config.pretrained_model, 
                                          do_lower_case=config.do_lower_case)

In [32]:
encoded_data_train = tokenizer.batch_encode_plus(
    train_df_['tweet'].values, 
    add_special_tokens=config.add_special_tokens, 
    return_attention_mask=config.return_attention_mask, 
    pad_to_max_length=config.pad_to_max_length, 
    max_length=config.seq_length, 
    return_tensors=config.return_tensors
)
encoded_data_val = tokenizer.batch_encode_plus(
    val_df['tweet'].values, 
    add_special_tokens=config.add_special_tokens, 
    return_attention_mask=config.return_attention_mask, 
    pad_to_max_length=config.pad_to_max_length,
    max_length=config.seq_length, 
    return_tensors=config.return_tensors
)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [34]:
input_ids_train = encoded_data_train['input_ids']
attention_masks_train = encoded_data_train['attention_mask']
labels_train = torch.tensor(train_df_['target'].values)

input_ids_val = encoded_data_val['input_ids']
attention_masks_val = encoded_data_val['attention_mask']
labels_val = torch.tensor(val_df['target'].values)

In [35]:
dataset_train = TensorDataset(input_ids_train, attention_masks_train, labels_train)
dataset_val = TensorDataset(input_ids_val, attention_masks_val, labels_val)

In [36]:
len(dataset_train)

7501

In [37]:
len(dataset_val)

834

In [39]:
label_dict ={'hate': 1, 'not_hate':0}

In [40]:
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels = len(label_dict),
    output_attentions = False,
    output_hidden_states = False
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [41]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

In [42]:
batch_size = 32

# We Need two different dataloder
dataloader_train = DataLoader(dataset_train, 
                              sampler=RandomSampler(dataset_train),
                              batch_size=batch_size)

dataloader_validation = DataLoader(dataset_val, 
                              sampler=RandomSampler(dataset_val),
                              batch_size=batch_size)

In [43]:
from transformers import AdamW, get_linear_schedule_with_warmup

In [44]:
optimizer = AdamW(
    model.parameters(),
    lr = 1e-5, 
    eps = 1e-8
)



In [45]:
epochs = 3

scheduler = get_linear_schedule_with_warmup(
    optimizer, 
    num_warmup_steps = 0,
    num_training_steps = len(dataloader_train)*epochs
)

In [46]:
from sklearn.metrics import f1_score

In [47]:
def f1_score_func(preds, labels):
    preds_flat = np.argmax(preds, axis = 1).flatten()
    labels_flat = labels.flatten()
    return f1_score(labels_flat, preds_flat, average = 'weighted')

In [48]:
def accuracy_per_class(preds, labels):
    label_dict_inverse = {v: k for k, v in label_dict.items()}
    
    preds_flat = np.argmax(preds, axis = 1).flatten()
    labels_flat = labels.flatten()

    for label in np.unique(labels_flat):
        y_preds = preds_flat[labels_flat == label]
        y_true = labels_flat[labels_flat == label]
        print(f'Class: {label_dict_inverse[label]}')
        print(f'Accuracy: {len(y_preds[y_preds == label])}/{len(y_true)}\n')

In [49]:
print(config.device)

cpu


In [50]:
def evaluate(dataloader_val):

    model.eval()
    
    loss_val_total = 0
    predictions, true_vals = [], []
    
    for batch in tqdm(dataloader_val):
        
        batch = tuple(b.to(device) for b in batch)
        
        inputs = {'input_ids':      batch[0],
                  'attention_mask': batch[1],
                  'labels':         batch[2],
                 }

        with torch.no_grad():        
            outputs = model(**inputs)
            
        loss = outputs[0]
        logits = outputs[1]
        loss_val_total += loss.item()

        logits = logits.detach().cpu().numpy()
        label_ids = inputs['labels'].cpu().numpy()
        predictions.append(logits)
        true_vals.append(label_ids)
    
    loss_val_avg = loss_val_total/len(dataloader_val) 
    
    predictions = np.concatenate(predictions, axis=0)
    true_vals = np.concatenate(true_vals, axis=0)
            
    return loss_val_avg, predictions, true_vals

In [51]:
for epoch in tqdm(range(1, epochs+1)):
    
    model.train()          # Sending our model in Training mode
    
    loss_train_total = 0   # Setting the training loss to zero initially

    # Setting up the Progress bar to Moniter the progress of training
    progress_bar = tqdm(dataloader_train, desc='Epoch {:1d}'.format(epoch), leave=False, disable=False)
    for batch in progress_bar:

        model.zero_grad() # As we not working with thew RNN's
        
        # As our dataloader has '3' iteams so batches will be the Tuple of '3'
        batch = tuple(b.to(device) for b in batch)
        
        # INPUTS
        # Pulling out the inputs in the form of dictionary
        inputs = {'input_ids':      batch[0],
                  'attention_mask': batch[1],
                  'labels':         batch[2],
                 }       

        # OUTPUTS
        outputs = model(**inputs) # '**' Unpacking the dictionary stright into the input
        
        loss = outputs[0]
        loss_train_total += loss.item()
        loss.backward()           # backpropagation

        # Gradient Clipping -- Taking the Grad. & gives it a NORM value ~ 1 
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()
        scheduler.step()
        
        progress_bar.set_postfix({'training_loss': '{:.3f}'.format(loss.item()/len(batch))})
         
        
    torch.save(model.state_dict(), f'finetuned_BERT_epoch_{epoch}.model')
        
    tqdm.write(f'\nEpoch {epoch}')
    
    loss_train_avg = loss_train_total/len(dataloader_train)            
    tqdm.write(f'Training loss: {loss_train_avg}')
    
    val_loss, predictions, true_vals = evaluate(dataloader_validation)
    val_f1 = f1_score_func(predictions, true_vals)
    tqdm.write(f'Validation loss: {val_loss}')
    tqdm.write(f'F1 Score (Weighted): {val_f1}')

  0%|          | 0/3 [00:00<?, ?it/s]

Epoch 1:   0%|          | 0/235 [00:00<?, ?it/s]


Epoch 1
Training loss: 0.40034622274180676


  0%|          | 0/27 [00:00<?, ?it/s]

Validation loss: 0.25415504861761024
F1 Score (Weighted): 0.9002833530446407


Epoch 2:   0%|          | 0/235 [00:00<?, ?it/s]


Epoch 2
Training loss: 0.19893070310354233


  0%|          | 0/27 [00:00<?, ?it/s]

Validation loss: 0.20397831527171312
F1 Score (Weighted): 0.9316380662853923


Epoch 3:   0%|          | 0/235 [00:00<?, ?it/s]


Epoch 3
Training loss: 0.14719439873352963


  0%|          | 0/27 [00:00<?, ?it/s]

Validation loss: 0.20411062406169045
F1 Score (Weighted): 0.9316262673188861


In [52]:
model = BertForSequenceClassification.from_pretrained("bert-base-uncased",
                                                      num_labels=len(label_dict),
                                                      output_attentions=False,
                                                      output_hidden_states=False)

model.to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [59]:
model.load_state_dict(
    torch.load(
        "finetuned_BERT_epoch_1.model", 
        map_location = torch.device('cpu')
    )
)

<All keys matched successfully>

In [60]:
_, predictions, true_vals = evaluate(dataloader_validation)

  0%|          | 0/27 [00:00<?, ?it/s]

Class: not_hate
Accuracy: 394/417

Class: hate
Accuracy: 357/417



In [1]:
model.load_state_dict(
    torch.load(
        "finetuned_BERT_epoch_3.model", 
        map_location = torch.device('cpu')
    )
)

NameError: name 'model' is not defined

In [63]:
_, predictions, true_vals = evaluate(dataloader_validation)

  0%|          | 0/27 [00:00<?, ?it/s]

In [64]:
accuracy_per_class(predictions, true_vals)

Class: not_hate
Accuracy: 397/417

Class: hate
Accuracy: 380/417



In [None]:
accuracy_per_class(predictions, true_vals)