In [1]:
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertConfig
from transformers import AdamW,BertForSequenceClassification, get_linear_schedule_with_warmup
from tqdm import tqdm, trange
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cpu')

In [7]:
df = pd.read_csv('data/train.tsv', delimiter='\t', header=None, names=['unique_id', 'label', 'extra_label', 'text'])
df

Unnamed: 0,unique_id,label,extra_label,text
0,33395,1,a,JEW Get the fuck out of here you jewish son of...
1,28713,1,a,GET RID OF THOSE FUCKING MUSLIMS! the nasty on...
2,25664,0,a,@Balwant58644969 @gushi22 @Insaniat_parast @ma...
3,28811,1,a,There are not any jew signatures on our Declar...
4,24408,1,a,"Put them in a airplane, and take them back to ..."
...,...,...,...,...
34234,14377,0,a,These regional ministers...this is why our chu...
34235,8035,0,a,"HAPPY BIRTHDAY jaden,enjoy life being gay and ..."
34236,33918,0,a,@BeyLegion This exactly why bey did this song....
34237,21084,1,a,"If I were you, I'd reclaim her body again. Sho..."


In [8]:
sentences = df.text.values

# Adding CLS and SEP tokens at the beginning and end of each sentence for BERT
sentences = ["[CLS] " + sentence + " [SEP]" for sentence in sentences]
labels = df.label.values

In [9]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased',do_lower_case=True)
tokenized_texts = [tokenizer.tokenize(sent) for sent in sentences]

print(tokenized_texts[0])

['[CLS]', 'jew', 'get', 'the', 'fuck', 'out', 'of', 'here', 'you', 'jewish', 'son', 'of', 'a', 'bitch', ',', 'i', "'", 'll', 'rape', 'your', 'fucking', 'family', 'if', 'you', 'don', "'", 't', 'leave', 'you', 'semi', '##te', 'bastard', '.', 'i', 'will', 'shoot', 'you', 'if', 'you', 'return', ',', 'because', 'you', "'", 're', 'a', 'dirty', 'semi', '##te', ',', 'i', 'hope', 'you', 'choke', 'on', 'a', 'fucking', 'bag', '##el', ',', 'prick', '.', 'sha', '##lom', '.', 'we', 'came', 'in', '?', '[SEP]']


In [10]:
MAX_LEN = 128

# Use the BERT Tokenizer to convert the tokens to their index numbers in the BERT vocalubary
input_ids = [tokenizer.convert_tokens_to_ids(x) for x in tokenized_texts]

# Pad our input tokens to max length
input_ids = pad_sequences(input_ids,maxlen=MAX_LEN,dtype="long",truncating="post",padding="post")

In [11]:
# Create attention masks
attention_masks = []

# Create a mask of 1s for each token followed by 0s for padding
for seq in input_ids:
    seq_mask = [float(i>0) for i in seq]
    attention_masks.append(seq_mask)

In [12]:
train_inputs, validation_inputs, train_labels, validation_labels = train_test_split(input_ids, 
                                                                                    labels, 
                                                                                    random_state=2018, 
                                                                                    test_size=0.1)

train_masks, validation_masks, _, _ = train_test_split(attention_masks, 
                                                       input_ids,
                                                        random_state=2018, 
                                                       test_size=0.1)

In [13]:
#Torch tensors are the required datatype for our model

train_inputs = torch.tensor(train_inputs)
validation_inputs = torch.tensor(validation_inputs)

train_labels = torch.tensor(train_labels)
validation_labels = torch.tensor(validation_labels)

train_masks = torch.tensor(train_masks)
validation_masks = torch.tensor(validation_masks)

In [14]:
batch_size = 32

# Create an iterator of our data with torch DataLoader. This helps save on memory during training because, unlike a for loop, 
# with an iterator the entire dataset does not need to be loaded into memory

train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)

In [15]:
# Initializing a BERT bert-base-uncased style configuration
from transformers import BertModel, BertConfig
configuration = BertConfig()

# Initializing a model from the bert-base-uncased style configuration
model = BertModel(configuration)

# Accessing the model configuration
configuration = model.config
print(configuration)

BertConfig {
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.34.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}



In [17]:
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)
#model.cuda()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [20]:
model.config

BertConfig {
  "_name_or_path": "bert-base-uncased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.34.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

In [21]:
# Don't apply weight decay to any parameters whose names include these tokens.
# (Here, the BERT doesn't have `gamma` or `beta` parameters, only `bias` terms)
param_optimizer = list(model.named_parameters())

no_decay = ['bias', 'LayerNorm.weight']
# Separate the `weight` parameters from the `bias` parameters. 
# - For the `weight` parameters, this specifies a 'weight_decay_rate' of 0.01. 
# - For the `bias` parameters, the 'weight_decay_rate' is 0.0. 

optimizer_grouped_parameters = [
    # Filter for all parameters which *don't* include 'bias', 'gamma', 'beta'.
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
     'weight_decay_rate': 0.1},
    
    # Filter for parameters which *do* include those.
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
     'weight_decay_rate': 0.0}
]

In [22]:
# Number of training epochs (authors recommend between 2 and 4)
epochs = 4

optimizer = AdamW(optimizer_grouped_parameters,
                  lr = 2e-5, # args.learning_rate - default is 5e-5, our notebook had 2e-5
                  eps = 1e-8 # args.adam_epsilon  - default is 1e-8.
                  )

# Total number of training steps is number of batches * number of epochs.
# `train_dataloader` contains batched data so `len(train_dataloader)` gives 
# us the number of batches.

total_steps = len(train_dataloader) * epochs

# Create the learning rate scheduler.
scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps = 0, # Default value in run_glue.py
                                            num_training_steps = total_steps)



In [23]:
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

In [24]:
#The Training Loop
t = [] 

# Store our loss and accuracy for plotting
train_loss_set = []

In [25]:
for _ in trange(epochs, desc="Epoch"):

    # Set our model to training mode (as opposed to evaluation mode)
    model.train()

    # Tracking variables
    tr_loss = 0
    nb_tr_examples, nb_tr_steps = 0, 0
    # Train the data for one epoch
    for step, batch in enumerate(train_dataloader):
        batch = tuple(t.to(device) for t in batch)
        # Unpack the inputs from our dataloader
        b_input_ids, b_input_mask, b_labels = batch
        # Clear out the gradients (by default they accumulate)
        optimizer.zero_grad()
        # Forward pass
        outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
        loss = outputs['loss']
        train_loss_set.append(loss.item())    
        # Backward pass
        loss.backward()
        # Update parameters and take a step using the computed gradient
        optimizer.step()
        
        # Update the learning rate.
        scheduler.step()

        # Update tracking variables
        tr_loss += loss.item()
        nb_tr_examples += b_input_ids.size(0)
        nb_tr_steps += 1
        
    print("Train loss: {}".format(tr_loss/nb_tr_steps))
    
    # Validation

    # Put model in evaluation mode to evaluate loss on the validation set
    model.eval()

    # Tracking variables 
    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps, nb_eval_examples = 0, 0
    
    for batch in validation_dataloader:
        batch = tuple(t.to(device) for t in batch)
        # Unpack the inputs from our dataloader
        b_input_ids, b_input_mask, b_labels = batch
        # Telling the model not to compute or store gradients, saving memory and speeding up validation
        with torch.no_grad():
            # Forward pass, calculate logit predictions
            logits = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)

        # Move logits and labels to CPU
        logits = logits['logits'].detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        tmp_eval_accuracy = flat_accuracy(logits, label_ids)

        eval_accuracy += tmp_eval_accuracy
        nb_eval_steps += 1
        
    print("Validation Accuracy: {}".format(eval_accuracy/nb_eval_steps))

Epoch:   0%|                                                                                                                                   | 0/4 [00:00<?, ?it/s]

Train loss: 0.054611350566538985


Epoch:  25%|█████████████████████████████▎                                                                                       | 1/4 [1:33:00<4:39:02, 5580.98s/it]

Validation Accuracy: 0.9883177570093458
Train loss: 0.009348601092961482


Epoch:  50%|██████████████████████████████████████████████████████████▌                                                          | 2/4 [3:03:14<3:02:44, 5482.48s/it]

Validation Accuracy: 0.9894859813084113
Train loss: 0.0021725765699531587


Epoch:  75%|███████████████████████████████████████████████████████████████████████████████████████▊                             | 3/4 [4:33:13<1:30:44, 5444.56s/it]

Validation Accuracy: 0.991822429906542
Train loss: 0.00037843999737051755


Epoch: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [6:04:08<00:00, 5462.02s/it]

Validation Accuracy: 0.991822429906542





In [30]:
df_test = pd.read_csv('data/test_labels.tsv', delimiter='\t', header=None, 
                      names=['unique_id', 'text', 'label'])
df_test

Unnamed: 0,unique_id,text,label
0,0,JEW Get the fuck out of here you jewish son of...,1
1,1,This is how I too feel about #LGBTQ #Pride It'...,0
2,2,"You're right, I didn't think about that. And w...",0
3,3,I am transgender but I'm also mentally ill but...,0
4,4,trump is the only jews true friend. I hope tru...,1
...,...,...,...
8642,8642,Hitler didn't finish it. Can u. If a nigger ur...,1
8643,8643,"Ah yeah, I'm actually Dominican myself! And mu...",0
8644,8644,"""@iamkrause: No need to thank me, killing nigg...",1
8645,8645,+Kinda Linda yea I know I just wish people wou...,0


In [31]:
sentences = df.text.values

# Adding special tokens at the start and end of each sentence for BERT to work properly
sentences = ["[CLS] " + sentence + " [SEP]" for sentence in sentences]
labels = df.label.values

tokenized_texts = [tokenizer.tokenize(sent) for sent in sentences]

In [32]:
MAX_LEN = 128

# Use the BERT tokenizer to convert the tokens to their index numbers in the BERT vocabulary
input_ids = [tokenizer.convert_tokens_to_ids(x) for x in tokenized_texts]
# Pad our input tokens
input_ids = pad_sequences(input_ids, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")
# Create attention masks
attention_masks = []

# Create a mask of 1s for each token followed by 0s for padding
for seq in input_ids:
    seq_mask = [float(i>0) for i in seq]
    attention_masks.append(seq_mask) 

In [33]:
prediction_inputs = torch.tensor(input_ids)
prediction_masks = torch.tensor(attention_masks)
prediction_labels = torch.tensor(labels)
  
batch_size = 32  

prediction_data = TensorDataset(prediction_inputs, prediction_masks, prediction_labels)
prediction_sampler = SequentialSampler(prediction_data)
prediction_dataloader = DataLoader(prediction_data, sampler=prediction_sampler, batch_size=batch_size)

In [34]:
model.eval()

# Tracking variables 
predictions , true_labels = [], []

# Predict 
for batch in prediction_dataloader:
    # Add batch to CPU
    batch = tuple(t.to(device) for t in batch)
    # Unpack the inputs from our dataloader
    b_input_ids, b_input_mask, b_labels = batch
    # Telling the model not to compute or store gradients, saving memory and speeding up prediction
    with torch.no_grad():
        # Forward pass, calculate logit predictions
        logits = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)

    # Move logits and labels to CPU
    logits = logits['logits'].detach().cpu().numpy()
    label_ids = b_labels.to('cpu').numpy()

    # Store predictions and true labels
    predictions.append(logits)
    true_labels.append(label_ids)

In [35]:
from sklearn.metrics import matthews_corrcoef
matthews_set = []

for i in range(len(true_labels)):
    matthews = matthews_corrcoef(true_labels[i],
                 np.argmax(predictions[i], axis=1).flatten())
    matthews_set.append(matthews)

In [36]:
flat_predictions = [item for sublist in predictions for item in sublist]
# print(flat_predictions)
flat_predictions = np.argmax(flat_predictions, axis=1).flatten()
print(min(flat_predictions))
flat_true_labels = [item for sublist in true_labels for item in sublist]
# print(flat_true_labels)
matthews_corrcoef(flat_true_labels, flat_predictions)

0


0.9983003660816576

In [49]:
sentence = 'This bitch needs to be silent'

sentence = "[CLS] " + sentence + " [SEP]"
tokenized_sentence = [tokenizer.tokenize(sentence)]

MAX_LEN = 128
input_ids = [tokenizer.convert_tokens_to_ids(x) for x in tokenized_sentence]
input_ids = pad_sequences(input_ids, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")

attention_masks = []

# Create a mask of 1s for each token followed by 0s for padding
for seq in input_ids:
    seq_mask = [float(i>0) for i in seq]
    attention_masks.append(seq_mask) 

In [50]:
print(input_ids)

[[ 101 2023 7743 3791 2000 2022 4333  102    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0]]


In [51]:
print(attention_masks)

[[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]]


In [52]:
prediction_inputs = torch.tensor(input_ids)
prediction_masks = torch.tensor(attention_masks)

In [53]:
model.eval()
logits = model(prediction_inputs, token_type_ids=None, attention_mask=prediction_masks)

In [54]:
logits

SequenceClassifierOutput(loss=None, logits=tensor([[-3.7897,  4.3276]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

In [55]:
logits['logits'].detach().cpu().numpy()

array([[-3.7897427,  4.3276453]], dtype=float32)

In [45]:
np.argmax(pred, axis=1).flatten()

array([1])

In [86]:
sentence = 'She loves to cook'

sentence = "[CLS] " + sentence + " [SEP]"
tokenized_sentence = [tokenizer.tokenize(sentence)]

MAX_LEN = 128
input_ids = [tokenizer.convert_tokens_to_ids(x) for x in tokenized_sentence]
input_ids = pad_sequences(input_ids, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")

attention_masks = []

# Create a mask of 1s for each token followed by 0s for padding
for seq in input_ids:
    seq_mask = [float(i>0) for i in seq]
    attention_masks.append(seq_mask) 

In [87]:
prediction_inputs = torch.tensor(input_ids)
prediction_masks = torch.tensor(attention_masks)

model.eval()
logits = model(prediction_inputs, token_type_ids=None, attention_mask=prediction_masks)

logits['logits'].detach().cpu().numpy()

array([[ 4.303495, -4.609913]], dtype=float32)

In [88]:
predictions = []
predictions.append(logits['logits'].detach().cpu().numpy())

In [89]:
flat_predictions = [item for sublist in predictions for item in sublist]

In [90]:
flat_predictions

[array([ 4.303495, -4.609913], dtype=float32)]

In [91]:
flat_predictions = np.argmax(flat_predictions)
print(flat_predictions)

0


In [92]:
sentence = 'This bitch needs to be silent'

sentence = "[CLS] " + sentence + " [SEP]"
tokenized_sentence = [tokenizer.tokenize(sentence)]

MAX_LEN = 128
input_ids = [tokenizer.convert_tokens_to_ids(x) for x in tokenized_sentence]
input_ids = pad_sequences(input_ids, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")

attention_masks = []

# Create a mask of 1s for each token followed by 0s for padding
for seq in input_ids:
    seq_mask = [float(i>0) for i in seq]
    attention_masks.append(seq_mask)

In [93]:
prediction_inputs = torch.tensor(input_ids)
prediction_masks = torch.tensor(attention_masks)

model.eval()
logits = model(prediction_inputs, token_type_ids=None, attention_mask=prediction_masks)

logits['logits'].detach().cpu().numpy()

array([[-3.7897427,  4.3276453]], dtype=float32)

In [94]:
predictions = []
predictions.append(logits['logits'].detach().cpu().numpy())

In [95]:
flat_predictions = [item for sublist in predictions for item in sublist]
flat_predictions

[array([-3.7897427,  4.3276453], dtype=float32)]

In [96]:
flat_predictions = np.argmax(flat_predictions)
print(flat_predictions)

1


In [97]:
def predict_intesity(sentence):
    sentence = "[CLS] " + sentence + " [SEP]"
    tokenized_sentence = [tokenizer.tokenize(sentence)]

    MAX_LEN = 128
    input_ids = [tokenizer.convert_tokens_to_ids(x) for x in tokenized_sentence]
    input_ids = pad_sequences(input_ids, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")

    attention_masks = []

    # Create a mask of 1s for each token followed by 0s for padding
    for seq in input_ids:
        seq_mask = [float(i>0) for i in seq]
        attention_masks.append(seq_mask)
        
    prediction_inputs = torch.tensor(input_ids)
    prediction_masks = torch.tensor(attention_masks)

    model.eval()
    logits = model(prediction_inputs, token_type_ids=None, attention_mask=prediction_masks)
    predictions = []
    predictions.append(logits['logits'].detach().cpu().numpy())
    
    flat_predictions = [item for sublist in predictions for item in sublist]
    flat_predictions = np.argmax(flat_predictions)
    
    if flat_predictions == 1:
        return "Hateful"
    
    return "Not Hateful"

In [116]:
print(predict_intesity("She is a gay black bitch"))

Hateful


In [123]:
model.save_pretrained("bert-hate-speech")

In [124]:
from transformers import BertModel, BertConfig
model = BertForSequenceClassification.from_pretrained("bert-hate-speech", num_labels=2)

In [126]:
model.config

BertConfig {
  "_name_or_path": "bert-hate-speech",
  "architectures": [
    "BertForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "problem_type": "single_label_classification",
  "torch_dtype": "float32",
  "transformers_version": "4.34.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}