In [None]:
%pip install ipywidgets
%pip install pandas
%pip install scikit-learn
%pip install huggingface_hub
%pip install datasets

In [2]:
import pandas as pd
import numpy as np

Load in Dataset Using Huggingface

In [4]:
splits = {'train': 'train.csv', 'test': 'test.csv'}
df = pd.read_csv("hf://datasets/christophsonntag/OLID/" + splits["train"])
df

Unnamed: 0,id,tweet,cleaned_tweet,subtask_a,subtask_b,subtask_c
0,86426,@USER She should ask a few native Americans wh...,ask native americans take,OFF,UNT,
1,90194,@USER @USER Go home you’re drunk!!! @USER #MAG...,go home drunk maga trump 2020,OFF,TIN,IND
2,16820,Amazon is investigating Chinese employees who ...,amazon investigating chinese employees selling...,NOT,,
3,62688,"@USER Someone should'veTaken"" this piece of sh...",someone vetaken piece shit volcano,OFF,UNT,
4,43605,@USER @USER Obama wanted liberals &amp; illega...,obama wanted liberals illegals move red states,NOT,,
...,...,...,...,...,...,...
13235,95338,@USER Sometimes I get strong vibes from people...,sometimes get strong vibes people man vibe ten...,OFF,TIN,IND
13236,67210,Benidorm ✅ Creamfields ✅ Maga ✅ Not too sh...,benidorm ✅ creamfields ✅ maga ✅ shabby summer,NOT,,
13237,82921,@USER And why report this garbage. We don't g...,report garbage give crap,OFF,TIN,OTH
13238,27429,@USER Pussy,pussy,OFF,UNT,


Create Training and Testing Split on Data

In [5]:
tweets = np.array(df['tweet'].values)
labels = np.where(df['subtask_a'].values == 'OFF', 1, 0)

split = len(tweets) * 0.8
train_tweets = tweets[:int(split)]
train_labels = labels[:int(split)]
test_tweets = tweets[int(split):]
test_labels = labels[int(split):]


Create a dictionary of most common words in offensive tweets

In [6]:
offensive_dict = {}

for i in range(len(train_tweets)):
    if train_labels[i] == 1:
        tweet = train_tweets[i]
        for word in tweet.split():
            if word not in offensive_dict:
                offensive_dict[word] = 1
            else:
                offensive_dict[word] += 1

print(offensive_dict)



In [7]:
sorted_offensive_array = sorted(offensive_dict.items(), key=lambda x: x[1], reverse=True)
print(sorted_offensive_array[:20])

[('@USER', 7096), ('the', 2274), ('is', 1772), ('to', 1746), ('a', 1690), ('and', 1303), ('of', 1059), ('are', 968), ('you', 963), ('that', 706), ('I', 698), ('in', 682), ('for', 630), ('he', 495), ('with', 436), ('on', 421), ('URL', 419), ('it', 391), ('not', 385), ('have', 384)]


Filter Out Common Everyday Words

In [8]:
stopwords = {'the', 'is', 'to', 'a', 'and', 'of', 'are', 'you', 'that', 'i', 'in', 
             'for', 'he', 'with', 'on', 'it', 'not', 'have', 'be', 'this', 'was', 'as', 'by',
             'at', 'from', 'or', 'an', 'but', 'all', 'they', 'we', 'there', 'if', 'so',
             'about', 'my', 'your', 'just', 'like', 'what', 'more', 'than', 'when', 'who', 'do', 'can'}

feature_l = 75

filtered_offensive_counts = [item[0] for item in sorted_offensive_array if item[0].lower() not in stopwords]
print(filtered_offensive_counts[:feature_l])

['@USER', 'URL', 'she', 'gun', 'control', 'her', 'will', 'She', 'their', 'his', 'people', '&amp;', 'has', 'liberals', 'out', 'shit', 'up', 'no', 'know', 'how', 'because', '#MAGA', 'get', 'think', 'one', 'should', 'me', 'would', 'Trump', 'Liberals', 'them', 'our', "don't", 'him', 'going', 'why', 'some', 'these', 'been', 'don’t', 'need', 'want', 'Antifa', 'being', 'only', 'even', 'believe', 'go', 'ass', 'never', 'any', 'make', 'conservatives', 'other', 'fucking', 'really', 'say', 'were', 'right', 'good', 'see', 'those', 'us', 'still', 'then', 'fuck', 'now', '-', 'had', 'time', 'left', "I'm", 'stupid', 'Why', 'did']


Turn tweets into vectors of offensive word counts.

In [9]:
filtered_word_index = {item: idx for idx, item in enumerate(filtered_offensive_counts)}

train_tweets_vec = []

for tweet in train_tweets:
    tweet_vec = [0] * feature_l
    for word in tweet.split():
        if word in filtered_word_index:
            idx = filtered_word_index[word]
            if idx < feature_l:
                tweet_vec[idx] += 1
    train_tweets_vec.append(tweet_vec)

test_tweets_vec = []
for tweet in test_tweets:
    tweet_vec = [0] * feature_l
    for word in tweet.split():
        if word in filtered_word_index:
            idx = filtered_word_index[word]
            if idx < feature_l:
                tweet_vec[idx] += 1
    test_tweets_vec.append(tweet_vec)

Create a Logistic Regression Model to fit to training data and test on testing data

In [10]:
from sklearn.linear_model import LogisticRegression

In [11]:
model = LogisticRegression(max_iter=1000, random_state=42)

model.fit(train_tweets_vec, train_labels)
predictions = model.predict(test_tweets_vec)

Evaluate model performance using accuracy, precision, recall, and F1 score

In [12]:
from sklearn.metrics import (accuracy_score, f1_score, precision_score,
                             recall_score)

In [13]:
accuracy = accuracy_score(test_labels, predictions)
f1 = f1_score(test_labels, predictions)
precision = precision_score(test_labels, predictions)
recall = recall_score(test_labels, predictions)

print(f"Accuracy:  {round(accuracy, 16)}")
print(f"F1 Score:  {round(f1, 16)}")
print(f"Precision: {round(precision, 16)}")
print(f"Recall:    {round(recall, 16)}")

Accuracy:  0.7073262839879154
F1 Score:  0.2736644798500469
Precision: 0.7684210526315789
Recall:    0.1664766248574686


Print confusion matrix for further exploration

In [14]:
from sklearn.metrics import confusion_matrix

In [15]:
print("Confusion Matrix:")
confusion_matrix(test_labels, predictions)

Confusion Matrix:


array([[1727,   44],
       [ 731,  146]])

Predict Whether Offensive Tweets are Targeted or Untargeted

Creating training and testing split

In [28]:
tweets = np.array(df.loc[df['subtask_b'].notnull(), 'tweet'])
labels = np.array(df.loc[df['subtask_b'].notnull(), 'subtask_b'].values)
labels = np.where(labels == 'TIN', 1, 0)  # Convert 'OFF' to 1 and others to 0

split = len(tweets) * 0.8
train_tweets = tweets[:int(split)]
train_labels = labels[:int(split)]
test_tweets = tweets[int(split):]
test_labels = labels[int(split):]

Create Dictionary of most common targeted tweets

In [30]:
offensive_dict = {}

for i in range(len(train_tweets)):
    if train_labels[i] == 1:
        tweet = train_tweets[i]
        for word in tweet.split():
            if word not in offensive_dict:
                offensive_dict[word] = 1
            else:
                offensive_dict[word] += 1

print(offensive_dict)



In [31]:
sorted_offensive_array = sorted(offensive_dict.items(), key=lambda x: x[1], reverse=True)
print(sorted_offensive_array[:20])

[('@USER', 6306), ('the', 2078), ('is', 1621), ('to', 1596), ('a', 1558), ('and', 1192), ('of', 972), ('you', 892), ('are', 891), ('that', 643), ('in', 625), ('I', 615), ('for', 576), ('he', 462), ('with', 405), ('on', 376), ('URL', 370), ('have', 357), ('be', 352), ('not', 350)]


In [32]:
stopwords = {'the', 'is', 'to', 'a', 'and', 'of', 'are', 'you', 'that', 'i', 'in', 
             'for', 'he', 'with', 'on', 'it', 'not', 'have', 'be', 'this', 'was', 'as', 'by',
             'at', 'from', 'or', 'an', 'but', 'all', 'they', 'we', 'there', 'if', 'so',
             'about', 'my', 'your', 'just', 'like', 'what', 'more', 'than', 'when', 'who', 'do', 'can'}

feature_l = 75

filtered_offensive_counts = [item[0] for item in sorted_offensive_array if item[0].lower() not in stopwords]
print(filtered_offensive_counts[:feature_l])

['@USER', 'URL', 'she', 'gun', 'will', 'their', 'her', 'She', 'his', 'control', '&amp;', 'people', 'liberals', 'has', 'out', 'up', 'no', 'shit', 'know', 'how', 'because', '#MAGA', 'get', 'think', 'one', 'should', 'Trump', 'Liberals', 'them', 'would', 'me', "don't", 'our', 'him', 'going', 'why', 'these', 'don’t', 'Antifa', 'want', 'even', 'need', 'some', 'been', 'believe', 'go', 'any', 'never', 'only', 'being', 'conservatives', 'make', 'other', 'ass', 'say', 'see', 'really', 'were', 'us', 'good', 'right', 'now', 'those', 'then', 'Why', '-', 'left', 'stupid', 'still', 'had', "I'm", 'time', "it's", 'fucking', 'against']


Turn tweets into vectors of targeted word counts

In [33]:
filtered_word_index = {item: idx for idx, item in enumerate(filtered_offensive_counts)}

train_tweets_vec = []

for tweet in train_tweets:
    tweet_vec = [0] * feature_l
    for word in tweet.split():
        if word in filtered_word_index:
            idx = filtered_word_index[word]
            if idx < feature_l:
                tweet_vec[idx] += 1
    train_tweets_vec.append(tweet_vec)

test_tweets_vec = []
for tweet in test_tweets:
    tweet_vec = [0] * feature_l
    for word in tweet.split():
        if word in filtered_word_index:
            idx = filtered_word_index[word]
            if idx < feature_l:
                tweet_vec[idx] += 1
    test_tweets_vec.append(tweet_vec)

Create logistic regression model to fit data

In [34]:
model = LogisticRegression(max_iter=1000, random_state=42)

model.fit(train_tweets_vec, train_labels)
predictions = model.predict(test_tweets_vec)

Evaluate model

In [35]:
accuracy = accuracy_score(test_labels, predictions)
f1 = f1_score(test_labels, predictions)
precision = precision_score(test_labels, predictions)
recall = recall_score(test_labels, predictions)

print(f"Accuracy:  {round(accuracy, 16)}")
print(f"F1 Score:  {round(f1, 16)}")
print(f"Precision: {round(precision, 16)}")
print(f"Recall:    {round(recall, 16)}")

Accuracy:  0.8818181818181818
F1 Score:  0.9371980676328503
Precision: 0.8828213879408419
Recall:    0.9987129987129987


Predict whether offensive tweets target individual groups or other

In [38]:
tweets = np.array(df.loc[df['subtask_c'].notnull(), 'tweet'])
labels = np.array(df.loc[df['subtask_c'].notnull(), 'subtask_c'].map({'IND': 0, 'GRP': 1, 'OTH': 2}).values)

split = len(tweets) * 0.8
train_tweets = tweets[:int(split)]
train_labels = labels[:int(split)]
test_tweets = tweets[int(split):]
test_labels = labels[int(split):]

Create dictionary of common groups tweet

In [39]:
offensive_dict = {}

for i in range(len(train_tweets)):
    if train_labels[i] == 1:
        tweet = train_tweets[i]
        for word in tweet.split():
            if word not in offensive_dict:
                offensive_dict[word] = 1
            else:
                offensive_dict[word] += 1

print(offensive_dict)

{'@USER': 2017, 'was': 66, 'literally': 4, 'just': 68, 'talking': 8, 'about': 91, 'this': 109, 'lol': 6, 'all': 129, 'mass': 6, 'shootings': 3, 'like': 86, 'that': 200, 'have': 139, 'been': 28, 'set': 4, 'ups.': 1, 'it’s': 9, 'propaganda': 2, 'used': 7, 'to': 536, 'divide': 2, 'us': 31, 'on': 123, 'major': 2, 'issues': 3, 'gun': 115, 'control': 81, 'and': 400, 'terrorism': 1, 'Kind': 1, 'of': 343, 'when': 33, 'conservatives': 43, 'wanna': 3, 'associate': 1, 'everyone': 8, 'their': 119, 'left': 26, 'as': 68, 'communist': 4, 'antifa': 23, 'members?': 1, 'Da': 1, 'fuck': 7, 'is': 359, 'going': 25, 'people?': 1, "There's": 5, 'the': 698, "men's": 1, 'room': 3, "women's": 2, 'Pick': 1, 'one': 37, 'stick': 2, 'w': 2, 'it': 93, '🤔': 2, 'The': 97, 'only': 37, 'thing': 16, 'Democrats': 28, 'lying': 14, 'stalling': 1, 'stop': 16, 'Trump': 24, 'from': 59, 'being': 18, '#President.': 1, 'What': 20, 'they': 179, 'done': 11, 'for': 197, 'you': 183, 'lately.': 1, '#Trump': 2, '#Kavanaugh': 6, '#MAGA'

In [40]:
sorted_offensive_array = sorted(offensive_dict.items(), key=lambda x: x[1], reverse=True)
print(sorted_offensive_array[:20])

[('@USER', 2017), ('the', 698), ('to', 536), ('and', 400), ('is', 359), ('of', 343), ('are', 340), ('a', 335), ('that', 200), ('for', 197), ('in', 191), ('you', 183), ('they', 179), ('I', 145), ('have', 139), ('all', 129), ('on', 123), ('URL', 123), ('liberals', 123), ('their', 119)]


In [41]:
stopwords = {'the', 'is', 'to', 'a', 'and', 'of', 'are', 'you', 'that', 'i', 'in', 
             'for', 'he', 'with', 'on', 'it', 'not', 'have', 'be', 'this', 'was', 'as', 'by',
             'at', 'from', 'or', 'an', 'but', 'all', 'they', 'we', 'there', 'if', 'so',
             'about', 'my', 'your', 'just', 'like', 'what', 'more', 'than', 'when', 'who', 'do', 'can'}

feature_l = 75

filtered_offensive_counts = [item[0] for item in sorted_offensive_array if item[0].lower() not in stopwords]
print(filtered_offensive_counts[:feature_l])

['@USER', 'URL', 'liberals', 'their', 'gun', 'people', '&amp;', 'control', 'Liberals', 'will', 'them', 'these', 'up', 'out', 'no', '#MAGA', 'she', 'get', 'has', 'know', 'how', 'conservatives', 'Antifa', 'our', "don't", 'because', 'one', 'only', 'why', 'want', 'think', 'his', 'believe', 'any', 'should', 'some', 'us', 'would', 'go', 'need', 'other', 'Conservatives', 'really', 'been', 'Democrats', 'make', 'shit', "it's", 'white', 'way', 'her', 'women', 'left', 'most', 'hate', 'going', 'don’t', 'even', 'never', 'Trump', 'many', 'right', 'me', 'see', 'those', 'antifa', 'into', 'No', 'stupid', 'now', 'had', 'same', 'good', 'liberal', 'against']


Turn tweets into vectors

In [42]:
filtered_word_index = {item: idx for idx, item in enumerate(filtered_offensive_counts)}

train_tweets_vec = []

for tweet in train_tweets:
    tweet_vec = [0] * feature_l
    for word in tweet.split():
        if word in filtered_word_index:
            idx = filtered_word_index[word]
            if idx < feature_l:
                tweet_vec[idx] += 1
    train_tweets_vec.append(tweet_vec)

test_tweets_vec = []
for tweet in test_tweets:
    tweet_vec = [0] * feature_l
    for word in tweet.split():
        if word in filtered_word_index:
            idx = filtered_word_index[word]
            if idx < feature_l:
                tweet_vec[idx] += 1
    test_tweets_vec.append(tweet_vec)

Create model

In [43]:
model = LogisticRegression(max_iter=1000, random_state=42)

model.fit(train_tweets_vec, train_labels)
predictions = model.predict(test_tweets_vec)

Evaluate model

In [49]:
accuracy = accuracy_score(test_labels, predictions)
f1 = f1_score(test_labels, predictions, average='weighted')
precision = precision_score(test_labels, predictions, average='weighted')
recall = recall_score(test_labels, predictions, average='weighted')

print(f"Accuracy:  {round(accuracy, 16)}")
print(f"F1 Score:  {round(f1, 16)}")
print(f"Precision: {round(precision, 16)}")
print(f"Recall:    {round(recall, 16)}")

Accuracy:  0.663659793814433
F1 Score:  0.6157377577319587
Precision: 0.5880180979154522
Recall:    0.663659793814433
