In [None]:
%pip install ipywidgets
%pip install pandas
%pip install scikit-learn
%pip install huggingface_hub
%pip install datasets

In [1]:
from collections import Counter

import numpy as np
import pandas as pd

Load the Dataset from Hugging Face

In [2]:
# File name of each OLID split inside the Hugging Face dataset repository.
splits = dict(train='train.csv', test='test.csv')

# Only the training CSV is loaded here; the 'test' entry is not read in this cell.
olid_train_url = "hf://datasets/christophsonntag/OLID/" + splits["train"]
df = pd.read_csv(olid_train_url)

Create Training and Testing Split on Data

In [3]:
# Raw tweet text, plus a binary target: 1 where subtask_a is 'OFF', 0 otherwise.
tweets = np.array(df['tweet'].values)
is_offensive = df['subtask_a'].values == 'OFF'
labels = np.where(is_offensive, 1, 0)

# Positional 80/20 split: the first 80% of rows train the model, the rest
# are held out. Rows are taken in file order (no shuffling).
split = len(tweets) * 0.8
cut = int(split)  # truncate the float boundary to a usable index
train_tweets, test_tweets = tweets[:cut], tweets[cut:]
train_labels, test_labels = labels[:cut], labels[cut:]


Create a dictionary of word counts from the offensive training tweets

In [4]:
# Count how often each whitespace-delimited token appears in the training
# tweets labelled offensive (label == 1). Counter is a dict subclass, so the
# later `.items()` sort works unchanged, and keys are still inserted in
# first-seen order.
offensive_dict = Counter()

for tweet, label in zip(train_tweets, train_labels):
    if label == 1:
        offensive_dict.update(tweet.split())

# Report only the vocabulary size: printing the full dictionary floods the
# cell output with thousands of entries.
print(f"Distinct tokens in offensive training tweets: {len(offensive_dict)}")



In [5]:
# Rank (token, count) pairs from most to least frequent. Negating the count
# sorts descending while the stable sort keeps tied tokens in dictionary order.
sorted_offensive_array = sorted(offensive_dict.items(), key=lambda pair: -pair[1])

# Preview the 20 most frequent tokens.
top_preview = sorted_offensive_array[:20]
print(top_preview)

[('@USER', 7096), ('the', 2274), ('is', 1772), ('to', 1746), ('a', 1690), ('and', 1303), ('of', 1059), ('are', 968), ('you', 963), ('that', 706), ('I', 698), ('in', 682), ('for', 630), ('he', 495), ('with', 436), ('on', 421), ('URL', 419), ('it', 391), ('not', 385), ('have', 384)]


Filter Out Common Everyday Words

In [6]:
# Hand-picked everyday words to exclude from the feature vocabulary.
# All entries are lowercase; candidates are lowercased before the lookup.
stopwords = {
    'the', 'is', 'to', 'a', 'and', 'of', 'are', 'you', 'that', 'i', 'in',
    'for', 'he', 'with', 'on', 'it', 'not', 'have', 'be', 'this', 'was', 'as', 'by',
    'at', 'from', 'or', 'an', 'but', 'all', 'they', 'we', 'there', 'if', 'so',
    'about', 'my', 'your', 'just', 'like', 'what', 'more', 'than', 'when', 'who', 'do', 'can',
}

# Number of top-ranked tokens used as model features.
feature_l = 75

# Walk the frequency-ranked pairs and keep each token (original casing)
# unless its lowercase form is a stopword; ranking order is preserved.
filtered_offensive_counts = []
for token, _count in sorted_offensive_array:
    if token.lower() in stopwords:
        continue
    filtered_offensive_counts.append(token)

print(filtered_offensive_counts[:feature_l])

['@USER', 'URL', 'she', 'gun', 'control', 'her', 'will', 'She', 'their', 'his', 'people', '&amp;', 'has', 'liberals', 'out', 'shit', 'up', 'no', 'know', 'how', 'because', '#MAGA', 'get', 'think', 'one', 'should', 'me', 'would', 'Trump', 'Liberals', 'them', 'our', "don't", 'him', 'going', 'why', 'some', 'these', 'been', 'don’t', 'need', 'want', 'Antifa', 'being', 'only', 'even', 'believe', 'go', 'ass', 'never', 'any', 'make', 'conservatives', 'other', 'fucking', 'really', 'say', 'were', 'right', 'good', 'see', 'those', 'us', 'still', 'then', 'fuck', 'now', '-', 'had', 'time', 'left', "I'm", 'stupid', 'Why', 'did']


Turn tweets into vectors of offensive word counts.

In [7]:
def _vectorize_tweets(tweets, word_index, n_features):
    """Convert tweets into fixed-length token-count vectors.

    Args:
        tweets: Iterable of tweet strings; each is tokenized with ``str.split()``.
        word_index: Mapping from token to its column index.
        n_features: Length of every output vector. Tokens that are missing
            from ``word_index`` or whose index is ``>= n_features`` are ignored.

    Returns:
        A list containing, for each tweet, a list of ``n_features`` integer
        counts (one slot per indexed token).
    """
    vectors = []
    for tweet in tweets:
        counts = [0] * n_features
        for word in tweet.split():
            idx = word_index.get(word)
            # Skip unknown tokens and tokens ranked outside the feature window.
            if idx is not None and idx < n_features:
                counts[idx] += 1
        vectors.append(counts)
    return vectors


# Position of every retained token in the frequency ranking (0 = most frequent).
# Lookup is case-sensitive, so e.g. 'she' and 'She' occupy separate columns.
filtered_word_index = {item: idx for idx, item in enumerate(filtered_offensive_counts)}

# Train and test tweets go through the same helper so both feature matrices
# are guaranteed to share one column layout.
train_tweets_vec = _vectorize_tweets(train_tweets, filtered_word_index, feature_l)
test_tweets_vec = _vectorize_tweets(test_tweets, filtered_word_index, feature_l)

Fit a logistic regression model on the training data and predict on the test data

In [8]:
from sklearn.linear_model import LogisticRegression

In [9]:
# Logistic regression with a raised iteration cap (max_iter=1000) and a
# fixed random_state so repeated runs use the same seed.
model = LogisticRegression(random_state=42, max_iter=1000)

# fit() returns the fitted estimator, so training and held-out prediction
# can be chained; `model` itself is the fitted object afterwards.
predictions = model.fit(train_tweets_vec, train_labels).predict(test_tweets_vec)

Evaluate model performance using accuracy, precision, recall, and F1 score

In [10]:
from sklearn.metrics import (accuracy_score, f1_score, precision_score,
                             recall_score)

In [11]:
# Score the held-out predictions against the true labels with each metric,
# in the order: accuracy, F1, precision, recall.
accuracy, f1, precision, recall = (
    metric(test_labels, predictions)
    for metric in (accuracy_score, f1_score, precision_score, recall_score)
)

# Labels are padded so the printed values line up in one column.
metric_report = (
    ("Accuracy:  ", accuracy),
    ("F1 Score:  ", f1),
    ("Precision: ", precision),
    ("Recall:    ", recall),
)
for label, value in metric_report:
    print(f"{label}{round(value, 16)}")

Accuracy:  0.7073262839879154
F1 Score:  0.2736644798500469
Precision: 0.7684210526315789
Recall:    0.1664766248574686
