In [15]:
import torch
from tqdm import tqdm
import numpy as np
import pandas as pd
from torch.nn.functional import softmax
from sklearn.metrics import classification_report
# Run on the GPU when PyTorch reports one as available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'


In [4]:
# Load the tokenizer and the fine-tuned sequence-classification model from the Hugging Face Hub.
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# NOTE(review): the tokenizer is taken from the base BERT checkpoint, not from the
# fine-tuned model repository below — presumably both share the same vocabulary; confirm.
tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")
model = AutoModelForSequenceClassification.from_pretrained("arash-rasouli/BERT-offensive-tweet-classification")
# Move the weights to the selected device so they match the tokenized inputs sent there later.
model.to(device)
# Switch to inference mode; as the last expression this also displays the model in the cell output.
model.eval()

In [10]:
def classify_tweets(tweets, batch_size=32):
    """Predict a class index for every text in ``tweets``.

    Relies on the notebook-level ``tokenizer``, ``model`` and ``device``.
    Texts are tokenized batch by batch (padding and truncation enabled), run
    through the model without gradient tracking, and the index of the most
    probable class is kept for each text.

    Args:
        tweets: Texts to classify. May be a pandas Series or NumPy array
            (anything exposing ``.tolist()``) or any other iterable of
            strings, such as a plain list or tuple.
        batch_size: Number of texts per forward pass; must be at least 1.

    Returns:
        list: One predicted class index per input text, in input order.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    # A non-positive step would either crash inside range() or silently yield no batches.
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

    # Materialize once: keeps Series/ndarray behaviour and also lets plain lists/tuples work,
    # which the previous per-batch `.tolist()` call rejected.
    texts = tweets.tolist() if hasattr(tweets, "tolist") else list(tweets)
    if not texts:
        return []

    all_preds = []
    for start in tqdm(range(0, len(texts), batch_size)):
        batch = texts[start:start + batch_size]
        inputs = tokenizer(batch, padding=True, truncation=True, return_tensors="pt").to(device)
        # Inference only: skip building the autograd graph.
        with torch.no_grad():
            logits = model(**inputs).logits
        probs = softmax(logits, dim=1).cpu().numpy()
        all_preds.extend(probs.argmax(axis=1))
    return all_preds

### OffensiveLang

In [5]:
# Fetch the OffensiveLang repository and switch the kernel's working directory into the clone,
# so the relative paths used by later cells resolve inside it.
# NOTE(review): this cell changes process state (cwd) and expects the target folder not to exist yet;
# confirm it is only run once per kernel session.
!git clone https://github.com/AmitDasRup123/OffensiveLang.git
%cd OffensiveLang

Cloning into 'OffensiveLang'...
remote: Enumerating objects: 41, done.[K
remote: Counting objects: 100% (41/41), done.[K
remote: Compressing objects: 100% (39/39), done.[K
remote: Total 41 (delta 21), reused 0 (delta 0), pack-reused 0 (from 0)[K
Unpacking objects: 100% (41/41), 339.34 KiB | 2.29 MiB/s, done.
/home/mb05005/mudit/Offensive/OffensiveLang


In [6]:
# Read the dataset CSV (path is relative to the current working directory) and preview the first rows.
offlang = pd.read_csv(filepath_or_buffer="OffensiveLang.csv")
offlang.head(n=5)

Unnamed: 0,Text,Category,Target Group,Final Annotation,OpenAI_Annotation,Annotator1,Annotator2,Annotator3
0,Your recent performances are lacking the spark...,Occupation,Actor,Offensive,Not Offensive,Not Offensive,Offensive,Offensive
1,I heard your agent is struggling to find you q...,Occupation,Actor,Offensive,Offensive,Offensive,Offensive,Offensive
2,Your character lacked depth; it felt like a ca...,Occupation,Actor,Offensive,Not Offensive,Offensive,Offensive,Offensive
3,You were overshadowed by the set design; it wa...,Occupation,Actor,Offensive,Offensive,Offensive,Offensive,Not Offensive
4,You're not as talented as [another actor] in t...,Occupation,Actor,Offensive,Offensive,Offensive,Not Offensive,Offensive


In [7]:
# Score every OffensiveLang text with the classifier and keep the predicted class next to the text.
offlang['prediction'] = classify_tweets(tweets=offlang['Text'])
offlang.head(n=5)

100%|█████████████████████████████████████████| 259/259 [00:06<00:00, 41.49it/s]


Unnamed: 0,Text,Category,Target Group,Final Annotation,OpenAI_Annotation,Annotator1,Annotator2,Annotator3,prediction
0,Your recent performances are lacking the spark...,Occupation,Actor,Offensive,Not Offensive,Not Offensive,Offensive,Offensive,0
1,I heard your agent is struggling to find you q...,Occupation,Actor,Offensive,Offensive,Offensive,Offensive,Offensive,0
2,Your character lacked depth; it felt like a ca...,Occupation,Actor,Offensive,Not Offensive,Offensive,Offensive,Offensive,0
3,You were overshadowed by the set design; it wa...,Occupation,Actor,Offensive,Offensive,Offensive,Offensive,Not Offensive,0
4,You're not as talented as [another actor] in t...,Occupation,Actor,Offensive,Offensive,Offensive,Not Offensive,Offensive,0


In [8]:
# How many texts fall into each predicted class, most frequent first.
offlang['prediction'].value_counts(sort=True, ascending=False)

prediction
0    7286
1     984
Name: count, dtype: int64

In [9]:
# Encode the gold annotation as integers so it can be compared with the classifier's 0/1 predictions.
offlang['true_label'] = offlang['Final Annotation'].map({"Offensive" : 1, "Not Offensive" : 0})
# Series.map leaves NaN for any value missing from the mapping; stop here instead of
# letting NaN labels flow into the metrics below.
assert offlang['true_label'].notna().all(), "Unmapped value(s) in 'Final Annotation'; extend the label mapping"
offlang.head()

Unnamed: 0,Text,Category,Target Group,Final Annotation,OpenAI_Annotation,Annotator1,Annotator2,Annotator3,prediction,true_label
0,Your recent performances are lacking the spark...,Occupation,Actor,Offensive,Not Offensive,Not Offensive,Offensive,Offensive,0,1
1,I heard your agent is struggling to find you q...,Occupation,Actor,Offensive,Offensive,Offensive,Offensive,Offensive,0,1
2,Your character lacked depth; it felt like a ca...,Occupation,Actor,Offensive,Not Offensive,Offensive,Offensive,Offensive,0,1
3,You were overshadowed by the set design; it wa...,Occupation,Actor,Offensive,Offensive,Offensive,Offensive,Not Offensive,0,1
4,You're not as talented as [another actor] in t...,Occupation,Actor,Offensive,Offensive,Offensive,Not Offensive,Offensive,0,1


In [11]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 of the predictions against the gold labels.
# `labels` pins the class order that `target_names` is matched against, so the report
# stays correctly labelled even if one class is absent from both columns.
print(classification_report(
    y_true=offlang['true_label'],
    y_pred=offlang['prediction'],
    labels=[0, 1],
    target_names=['Not Offensive', 'Offensive'],
))

               precision    recall  f1-score   support

Not Offensive       0.22      0.93      0.36      1748
    Offensive       0.88      0.13      0.23      6522

     accuracy                           0.30      8270
    macro avg       0.55      0.53      0.29      8270
 weighted avg       0.74      0.30      0.26      8270



In [15]:
# Save texts, gold labels and predictions; the path is relative, so the file lands in the current working directory.
offlang.to_csv(path_or_buf='arash_rasouli_offlang.csv', index=False)

### TDavidson

In [1]:
from datasets import load_dataset

# Download (or load from the local cache) the hate-speech / offensive-language tweet dataset.
ds = load_dataset(path="tdavidson/hate_speech_offensive")

In [2]:
# Display the loaded DatasetDict to inspect its splits, feature names and row counts.
ds

DatasetDict({
    train: Dataset({
        features: ['count', 'hate_speech_count', 'offensive_language_count', 'neither_count', 'class', 'tweet'],
        num_rows: 24783
    })
})

In [6]:
# Convert the only split ("train") into a DataFrame for column-wise work, then preview it.
td = ds["train"].to_pandas()
td.head(n=5)

Unnamed: 0,count,hate_speech_count,offensive_language_count,neither_count,class,tweet
0,3,0,0,3,2,!!! RT @mayasolovely: As a woman you shouldn't...
1,3,0,3,0,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...
2,3,0,3,0,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...
3,3,0,2,1,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...
4,6,0,6,0,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...


In [7]:
# Binary gold label: 1 where `class` equals 1, 0 for every other class value.
# Vectorized comparison instead of a per-row Python lambda; values and dtype are unchanged.
# NOTE(review): judging by the *_count columns, class 0 looks like hate speech and class 2 like
# "neither"; both end up as 0 ("Not Offensive") here — confirm that hate speech is meant to
# count as not offensive for this evaluation.
td['true_label'] = (td['class'] == 1).astype('int64')
td.head()

Unnamed: 0,count,hate_speech_count,offensive_language_count,neither_count,class,tweet,true_label
0,3,0,0,3,2,!!! RT @mayasolovely: As a woman you shouldn't...,0
1,3,0,3,0,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...,1
2,3,0,3,0,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...,1
3,3,0,2,1,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...,1
4,6,0,6,0,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...,1


In [13]:
# Score every tweet with the classifier and keep the predicted class next to the gold label.
td['prediction'] = classify_tweets(tweets=td['tweet'])
td.head(n=5)

100%|█████████████████████████████████████████| 775/775 [00:32<00:00, 23.80it/s]


Unnamed: 0,count,hate_speech_count,offensive_language_count,neither_count,class,tweet,true_label,prediction
0,3,0,0,3,2,!!! RT @mayasolovely: As a woman you shouldn't...,0,1
1,3,0,3,0,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...,1,1
2,3,0,3,0,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...,1,1
3,3,0,2,1,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...,1,0
4,6,0,6,0,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...,1,1


In [17]:
# Per-class precision / recall / F1 of the predictions against the binary gold labels.
# `labels` pins the class order that `target_names` is matched against, so the report
# stays correctly labelled even if one class is absent from both columns.
print(classification_report(
    y_true=td['true_label'],
    y_pred=td['prediction'],
    labels=[0, 1],
    target_names=['Not Offensive', 'Offensive'],
))

               precision    recall  f1-score   support

Not Offensive       0.53      0.63      0.57      5593
    Offensive       0.88      0.84      0.86     19190

     accuracy                           0.79     24783
    macro avg       0.71      0.73      0.72     24783
 weighted avg       0.80      0.79      0.80     24783

