In [2]:
import pandas as pd
from datasets import load_dataset

dataset_name = "krishan-CSE/HatEval-Relabeled"
dataset = load_dataset(dataset_name)

df_train = dataset['train'].to_pandas()
df_valid = dataset['validation'].to_pandas()
df_test = dataset['test'].to_pandas()

Downloading readme: 100%|██████████| 28.0/28.0 [00:00<00:00, 28.0kB/s]
Downloading data: 100%|██████████| 1.13M/1.13M [00:01<00:00, 847kB/s]
Downloading data: 100%|██████████| 145k/145k [00:00<00:00, 180kB/s]t]
Downloading data: 100%|██████████| 340k/340k [00:00<00:00, 483kB/s]t]
Downloading data files: 100%|██████████| 3/3 [00:02<00:00,  1.01it/s]
Extracting data files: 100%|██████████| 3/3 [00:00<00:00, 166.41it/s]
Generating train split: 9088 examples [00:00, 53755.84 examples/s]
Generating validation split: 1168 examples [00:00, 36184.23 examples/s]
Generating test split: 2724 examples [00:00, 49060.61 examples/s]


In [3]:
print(df_train.shape)
print("train value_counts:\n", df_train['labels'].value_counts())
print(df_train.head())
print("=========================================="
      "==========================================")
print(df_valid.shape)
print("validation value_counts:\n", df_valid['labels'].value_counts())
print(df_valid.head())
print("=========================================="
      "==========================================")
print(df_test.shape)
print("test value_counts:\n", df_test['labels'].value_counts())
print(df_test.head())

(9088, 2)
train value_counts:
 labels
0    4811
1    4277
Name: count, dtype: int64
                                                text  labels
0  This human-elephant conflict has seen 13 refug...       0
1  The awkward moment when Lexus is showing you h...       0
2            People- why are you so fucking mean Me-       0
3  After EU uses Turkey as buffer to stop refugee...       0
4                           Immigration in a picture       0
(1168, 2)
validation value_counts:
 labels
0    618
1    550
Name: count, dtype: int64
                                                text  labels
0  President Jokowi: it's not true millions of Ch...       0
1  So you created the problem by mass immigration...       1
2  I though in a free country you could worship w...       0
3  WELP. Bitch IM JUST NOW FUCKING SEEING DUMB WHORE       1
4  .Considering THIS , the filth on the streets o...       1
(2724, 2)
test value_counts:
 labels
0    1442
1    1282
Name: count, dtype: int64
              

In [4]:
import re
from textblob import TextBlob
from nltk import pos_tag, word_tokenize
from nltk.corpus import stopwords
import emoji 
def average_word_length(tweet):
    words = tweet.split()
    return sum(len(word) for word in words) / len(words)

# def average_sentence_length(tweet):
#     sentences = re.split(r'[.!?]+', tweet)
#     return sum(len(word_tokenize(sentence)) for sentence in sentences) / len(sentences)

def lexical_diversity(tweet):
    words = tweet.split()
    unique_words = set(words)
    return len(unique_words) / len(words)

def count_capital_letters(tweet):
    return sum(1 for char in tweet if char.isupper())

def count_words_surrounded_by_colons(tweet):
    # Define a regular expression pattern to match words surrounded by ':'
    pattern = r':(\w+):'

    # Use re.findall to find all matches in the tweet
    matches = re.findall(pattern, tweet)

    # Return the count of matched words
    return len(matches)

def count_emojis(tweet):
    # Convert emoji symbols to their corresponding names
    tweet_with_names = emoji.demojize(tweet)
    return count_words_surrounded_by_colons(tweet_with_names)

def hashtag_frequency(tweet):
    hashtags = re.findall(r'#\w+', tweet)
    return len(hashtags)

def mention_frequency(tweet):
    mentions = re.findall(r'@\w+', tweet)
    return len(mentions)

import string

def count_special_characters(tweet):
    special_characters = [char for char in tweet if char in string.punctuation]
    return len(special_characters)

# def capitalization_pattern(tweet):
#     if tweet.islower():
#         return 'All Lowercase'
#     elif tweet.isupper():
#         return 'All Uppercase'
#     elif tweet.istitle():
#         return 'Title Case'
#     else:
#         return 'Mixed Case'

def stop_word_frequency(tweet):
    stop_words = set(stopwords.words('english'))
    words = [word for word in tweet.split() if word.lower() in stop_words]
    return len(words)

import nltk
from nltk import pos_tag
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')

def get_linguistic_features(tweet):
    # Tokenize the tweet
    words = word_tokenize(tweet)

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    filtered_words = [word.lower() for word in words if word.isalnum() and word.lower() not in stop_words]

    # Get parts of speech tags
    pos_tags = pos_tag(filtered_words)

    # Count various linguistic features
    noun_count = sum(1 for word, pos in pos_tags if pos.startswith('N'))
    verb_count = sum(1 for word, pos in pos_tags if pos.startswith('V'))
    participle_count = sum(1 for word, pos in pos_tags if pos.startswith('V') and ('ing' in word or 'ed' in word))
    interjection_count = sum(1 for word, pos in pos_tags if pos == 'UH')
    pronoun_count = sum(1 for word, pos in pos_tags if pos.startswith('PRP'))
    preposition_count = sum(1 for word, pos in pos_tags if pos.startswith('IN'))
    adverb_count = sum(1 for word, pos in pos_tags if pos.startswith('RB'))
    conjunction_count = sum(1 for word, pos in pos_tags if pos.startswith('CC'))

    return {
        'Noun_Count': noun_count,
        'Verb_Count': verb_count,
        'Participle_Count': participle_count,
        'Interjection_Count': interjection_count,
        'Pronoun_Count': pronoun_count,
        'Preposition_Count': preposition_count,
        'Adverb_Count': adverb_count,
        'Conjunction_Count': conjunction_count
    }

import textstat
def readability_score(tweet):
    return textstat.flesch_reading_ease(tweet)

def get_url_frequency(tweet):
    urls = re.findall(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', tweet)
    return len(urls)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Owner\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Owner\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Owner\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Training

In [5]:
# !pip install scikit-learn textblob nltk

In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Extract features and target variable
X = df_train.drop('labels', axis=1)
y = df_train['labels']

In [7]:
y

0       0
1       0
2       0
3       0
4       0
       ..
9083    1
9084    1
9085    0
9086    1
9087    1
Name: labels, Length: 9088, dtype: int64

In [8]:
import pandas as pd

# Define a function to extract features from a single tweet
def extract_features(tweet):
    features = {
        'Average_Word_Length': average_word_length(tweet),
        # 'Average_Sentence_Length': average_sentence_length(tweet),
        'Lexical_Diversity': lexical_diversity(tweet),
        'Capital_Letters_Count': count_capital_letters(tweet),  # Uncomment if you want to include this feature
        'Hashtag_Frequency': hashtag_frequency(tweet),
        'Mention_Frequency': mention_frequency(tweet),
        'count_emojis': count_emojis(tweet),
        'special_chars_count': count_special_characters(tweet),
        'Stop_Word_Frequency': stop_word_frequency(tweet),
        **get_linguistic_features(tweet),  # Include linguistic features
        'Readability_Score': readability_score(tweet),
        'URL_Frequency': get_url_frequency(tweet)  # Assuming you have the correct function for this
    }
    return features

# Extract features for all tweets
features_list = [extract_features(tweet) for tweet in X['text']]

# Create a Pandas DataFrame
X_new = pd.DataFrame(features_list)

# # Add the labels to the DataFrame
# df['Label'] = labels

# # Display the DataFrame
# print(df)

# The progress from beginning
# The initial phase is to do a comprehensive literature review on the domain of hate speech detection. Since our group consists of 3 people, we could collectively gothrough more than 50 papers and as a group we discussed and share knowledge between ourselves to collectively gain and improve our knowledge in this domain. While going through the papers, we collected the openly avaialable datasets to help in our initial experiments. We went through different models, features, intepreatbility techniques, evaluationmetrics, and pre-processing techniques. We also went through the different types of hate speech and 

In [9]:
X_new

Unnamed: 0,Average_Word_Length,Lexical_Diversity,Capital_Letters_Count,Hashtag_Frequency,Mention_Frequency,count_emojis,special_chars_count,Stop_Word_Frequency,Noun_Count,Verb_Count,Participle_Count,Interjection_Count,Pronoun_Count,Preposition_Count,Adverb_Count,Conjunction_Count,Readability_Score,URL_Frequency
0,6.894737,1.000000,3,4,0,0,6,3,6,2,1,0,0,1,1,0,78.75,0
1,4.437500,1.000000,2,0,0,0,3,7,4,3,1,0,0,0,0,0,80.62,0
2,4.000000,1.000000,2,0,0,0,2,4,1,1,1,0,0,0,0,0,88.74,0
3,4.777778,0.844444,10,0,0,0,6,15,14,11,4,0,0,0,0,0,42.72,0
4,5.250000,1.000000,1,0,0,0,0,2,2,0,0,0,0,0,0,0,33.58,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9083,4.619048,0.857143,4,1,0,0,1,9,6,2,1,0,0,0,2,0,67.08,0
9084,8.700000,1.000000,9,3,0,0,4,4,4,2,0,0,0,0,0,0,10.56,0
9085,6.000000,0.970588,10,0,0,0,14,10,8,5,2,0,0,0,3,0,54.22,0
9086,3.428571,0.952381,3,0,0,0,2,11,4,1,0,0,0,0,1,0,100.92,0


In [10]:
X_train, X_test, y_train, y_test = train_test_split(X_new, y, test_size=0.2, random_state=42)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

model = RandomForestClassifier(random_state=42)
model.fit(X_train_scaled, y_train)


In [11]:
y_pred = model.predict(X_test_scaled)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

Accuracy: 0.62

Classification Report:
              precision    recall  f1-score   support

           0       0.61      0.74      0.67       948
           1       0.64      0.49      0.55       870

    accuracy                           0.62      1818
   macro avg       0.63      0.62      0.61      1818
weighted avg       0.62      0.62      0.62      1818



# For test dataset

In [12]:
df_test.head()

Unnamed: 0,text,labels
0,We have got to get these Obama DACA illegal al...,1
1,The same bitch is all on my boos shit like gir...,0
2,BS WILSON IS A SKANK WHORE AND A LIAR . DIDDN'...,1
3,Immigration Expert: Trudeau Has Lost Track Of ...,1
4,I like to delete comments that say 'first' to ...,0


In [13]:
features_list_test = [extract_features(tweet) for tweet in df_test['text']]
X_test_new = pd.DataFrame(features_list_test)
X_test_scaled = scaler.transform(X_test_new)
y_test_pred = model.predict(X_test_scaled)
accuracy_test = accuracy_score(df_test['labels'], y_test_pred)
print(f"Test Accuracy: {accuracy_test:.2f}")
print("\nTest Classification Report:")
print(classification_report(df_test['labels'], y_test_pred))

Test Accuracy: 0.66

Test Classification Report:
              precision    recall  f1-score   support

           0       0.65      0.76      0.70      1442
           1       0.67      0.54      0.60      1282

    accuracy                           0.66      2724
   macro avg       0.66      0.65      0.65      2724
weighted avg       0.66      0.66      0.65      2724

