# Fine-tune BERT for Offensive Language Detect

## 1.  Class Distribution
Load the training set `(olid-train.csv)` and analyze the number of instances for each of the two classification labels.

In [72]:
import pandas as pd
import matplotlib.pyplot as plt
from random import randint
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
import random
import spacy

In [3]:
data = pd.read_csv("./data/olid-train.csv")
data

Unnamed: 0,id,text,labels
0,86426,@USER She should ask a few native Americans wh...,1
1,90194,@USER @USER Go home you’re drunk!!! @USER #MAG...,1
2,16820,Amazon is investigating Chinese employees who ...,0
3,62688,"@USER Someone should'veTaken"" this piece of sh...",1
4,43605,@USER @USER Obama wanted liberals &amp; illega...,0
...,...,...,...
13235,95338,@USER Sometimes I get strong vibes from people...,1
13236,67210,Benidorm ✅ Creamfields ✅ Maga ✅ Not too sh...,0
13237,82921,@USER And why report this garbage. We don't g...,1
13238,27429,@USER Pussy,1


In [4]:
data['labels'].unique()

array([1, 0], dtype=int64)

## Number of Instances

In [5]:
label_zero = data[(data["labels"] == 0)]
number_of_instances_0 = len(label_zero)

label_one = data[(data["labels"] == 1)]
number_of_instances_1 = len(label_one)

print("Class 0:", number_of_instances_0)
print("Class 1:", number_of_instances_1)

Class 0: 8840
Class 1: 4400


## Relative Label Frequency (%)

In [6]:
total = len(data)
freq_zero = (number_of_instances_0 / total)
freq_one = (number_of_instances_1 / total)

print("0 Label Frequency:", freq_zero)
print("1 Label Frequency:", freq_one)

0 Label Frequency: 0.6676737160120846
1 Label Frequency: 0.3323262839879154


## Example Tweet with This Label

In [7]:
random_0 = randint(0, number_of_instances_0 - 1)
data["text"][(data["labels"] == 0)][4]

'@USER @USER Obama wanted liberals &amp; illegals to move into red states'

In [8]:
random_1 = randint(0, number_of_instances_1 - 1)
data["text"][(data["labels"] == 1)][0]

'@USER She should ask a few native Americans what their take on this is.'

## 2. Baselines

Calculate two baselines and evaluate their performance on the test set (olid-test.csv): <br>

- The first baseline is a random baseline that randomly assigns one of the 2 classification 
labels. <br>
- The second baseline is a majority baseline that always assigns the majority class. <br>

Calculate the results on the test set and fill them into the two tables below. Round the results to 
two decimals.

In [9]:
test_data = pd.read_csv("./data/olid-test.csv")
test_data.head()

Unnamed: 0,id,text,labels
0,15923,#WhoIsQ #WheresTheServer #DumpNike #DECLASFISA...,1
1,27014,"#ConstitutionDay is revered by Conservatives, ...",0
2,30530,#FOXNews #NRA #MAGA #POTUS #TRUMP #2ndAmendmen...,0
3,13876,#Watching #Boomer getting the news that she is...,0
4,60133,#NoPasaran: Unity demo to oppose the far-right...,1


In [10]:
print(test_data.shape)
print(data.shape)

(860, 3)
(13240, 3)


## Random baseline

In [95]:
def random_baseline(train_data, test_data_text,test_data_labels):
    possible_labels = [0, 1]

    predictions = []
    for instance in test_data_text:
        instance_predictions = [random.choice(possible_labels)]
        predictions.append(instance_predictions)
        
    
    predictions = [item for sublist in predictions for item in sublist]
        
    test_labels_list = []
    for element in test_data_labels:
        test_labels_list.append(element)
                
    
    prediction_df = pd.DataFrame({"predictions":predictions})

    accuracy = accuracy_score(test_labels_list,predictions)
    print("Accuracy: ",accuracy)
    
    print(classification_report(test_labels_list, predictions,zero_division=1))

In [96]:
random_baseline(train_data=data['text'], test_data_text = test_data['text'],test_data_labels =test_data['labels'])

Accuracy:  0.48604651162790696
              precision    recall  f1-score   support

           0       0.71      0.49      0.58       620
           1       0.26      0.47      0.34       240

    accuracy                           0.49       860
   macro avg       0.49      0.48      0.46       860
weighted avg       0.58      0.49      0.51       860



## Majority baseline

In [97]:
def majority_baseline(train_data_labels, test_data_text,test_data_labels):
    train_labels_list = []

    for element in train_data_labels:
        train_labels_list.append(element)
        
    label_one_occurences = train_labels_list.count(1)
    label_zero_occurences = train_labels_list.count(0)
    
    if label_one_occurences > label_zero_occurences:
        majority_baseline_class = 1
    else:
        majority_baseline_class = 0
        
        
    predictions = []
    test_labels_list = []
    for text in test_data_text:
        predictions.append(majority_baseline_class)

    test_labels_list = []
    for element in test_data_labels:
        test_labels_list.append(element)
        
    final_df = pd.DataFrame({"predictions":predictions})
    
    accuracy = accuracy_score(test_labels_list,predictions)
    print("Accuracy: ",accuracy)
    print(classification_report(test_labels_list, predictions,zero_division=1))

In [98]:
majority_baseline(train_data_labels=data['text'], test_data_text = test_data['text'],test_data_labels =test_data['labels'])

Accuracy:  0.7209302325581395
              precision    recall  f1-score   support

           0       0.72      1.00      0.84       620
           1       1.00      0.00      0.00       240

    accuracy                           0.72       860
   macro avg       0.86      0.50      0.42       860
weighted avg       0.80      0.72      0.60       860

