In [1]:
import pandas as pd
from datasets import load_dataset

dataset_name = "krishan-CSE/HatEval-Relabeled"
dataset = load_dataset(dataset_name)

df_train = dataset['train'].to_pandas()
df_valid = dataset['validation'].to_pandas()
df_test = dataset['test'].to_pandas()

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
print(df_train.shape)
print("train value_counts:\n", df_train['labels'].value_counts())
print(df_train.head())
print("=========================================="
      "==========================================")
print(df_valid.shape)
print("validation value_counts:\n", df_valid['labels'].value_counts())
print(df_valid.head())
print("=========================================="
      "==========================================")
print(df_test.shape)
print("test value_counts:\n", df_test['labels'].value_counts())
print(df_test.head())

(9088, 2)
train value_counts:
 labels
0    4811
1    4277
Name: count, dtype: int64
                                                text  labels
0  This human-elephant conflict has seen 13 refug...       0
1  The awkward moment when Lexus is showing you h...       0
2            People- why are you so fucking mean Me-       0
3  After EU uses Turkey as buffer to stop refugee...       0
4                           Immigration in a picture       0
(1168, 2)
validation value_counts:
 labels
0    618
1    550
Name: count, dtype: int64
                                                text  labels
0  President Jokowi: it's not true millions of Ch...       0
1  So you created the problem by mass immigration...       1
2  I though in a free country you could worship w...       0
3  WELP. Bitch IM JUST NOW FUCKING SEEING DUMB WHORE       1
4  .Considering THIS , the filth on the streets o...       1
(2724, 2)
test value_counts:
 labels
0    1442
1    1282
Name: count, dtype: int64
              

In [3]:
import re
from textblob import TextBlob
from nltk import pos_tag, word_tokenize
from nltk.corpus import stopwords
import emoji 
def average_word_length(tweet):
    words = tweet.split()
    return sum(len(word) for word in words) / len(words)

# def average_sentence_length(tweet):
#     sentences = re.split(r'[.!?]+', tweet)
#     return sum(len(word_tokenize(sentence)) for sentence in sentences) / len(sentences)

def lexical_diversity(tweet):
    words = tweet.split()
    unique_words = set(words)
    return len(unique_words) / len(words)

def count_capital_letters(tweet):
    return sum(1 for char in tweet if char.isupper())

def count_words_surrounded_by_colons(tweet):
    # Define a regular expression pattern to match words surrounded by ':'
    pattern = r':(\w+):'

    # Use re.findall to find all matches in the tweet
    matches = re.findall(pattern, tweet)

    # Return the count of matched words
    return len(matches)

def count_emojis(tweet):
    # Convert emoji symbols to their corresponding names
    tweet_with_names = emoji.demojize(tweet)
    return count_words_surrounded_by_colons(tweet_with_names)

def hashtag_frequency(tweet):
    hashtags = re.findall(r'#\w+', tweet)
    return len(hashtags)

def mention_frequency(tweet):
    mentions = re.findall(r'@\w+', tweet)
    return len(mentions)

import string

def count_special_characters(tweet):
    special_characters = [char for char in tweet if char in string.punctuation]
    return len(special_characters)

# def capitalization_pattern(tweet):
#     if tweet.islower():
#         return 'All Lowercase'
#     elif tweet.isupper():
#         return 'All Uppercase'
#     elif tweet.istitle():
#         return 'Title Case'
#     else:
#         return 'Mixed Case'

def stop_word_frequency(tweet):
    stop_words = set(stopwords.words('english'))
    words = [word for word in tweet.split() if word.lower() in stop_words]
    return len(words)

import nltk
from nltk import pos_tag
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')

def get_linguistic_features(tweet):
    # Tokenize the tweet
    words = word_tokenize(tweet)

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    filtered_words = [word.lower() for word in words if word.isalnum() and word.lower() not in stop_words]

    # Get parts of speech tags
    pos_tags = pos_tag(filtered_words)

    # Count various linguistic features
    noun_count = sum(1 for word, pos in pos_tags if pos.startswith('N'))
    verb_count = sum(1 for word, pos in pos_tags if pos.startswith('V'))
    participle_count = sum(1 for word, pos in pos_tags if pos.startswith('V') and ('ing' in word or 'ed' in word))
    interjection_count = sum(1 for word, pos in pos_tags if pos == 'UH')
    pronoun_count = sum(1 for word, pos in pos_tags if pos.startswith('PRP'))
    preposition_count = sum(1 for word, pos in pos_tags if pos.startswith('IN'))
    adverb_count = sum(1 for word, pos in pos_tags if pos.startswith('RB'))
    conjunction_count = sum(1 for word, pos in pos_tags if pos.startswith('CC'))

    return {
        'Noun_Count': noun_count,
        'Verb_Count': verb_count,
        'Participle_Count': participle_count,
        'Interjection_Count': interjection_count,
        'Pronoun_Count': pronoun_count,
        'Preposition_Count': preposition_count,
        'Adverb_Count': adverb_count,
        'Conjunction_Count': conjunction_count
    }

import textstat
def readability_score(tweet):
    return textstat.flesch_reading_ease(tweet)

def get_url_frequency(tweet):
    urls = re.findall(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', tweet)
    return len(urls)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Owner\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Owner\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Owner\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [7]:
df_train

Unnamed: 0,text,labels
0,This human-elephant conflict has seen 13 refug...,0
1,The awkward moment when Lexus is showing you h...,0
2,People- why are you so fucking mean Me-,0
3,After EU uses Turkey as buffer to stop refugee...,0
4,Immigration in a picture,0
...,...,...
9083,Ladies the moment you start hanging around wit...,1
9084,The #AmericanCommunists who live in our countr...,1
9085,_x0081__x0081_ Babels are also looking for a p...,0
9086,"I can lose weight, but youll always be a cunt ...",1


In [8]:
import pandas as pd

# Define a function to extract features from a single tweet
def extract_features(tweet):
    features = {
        'Average_Word_Length': average_word_length(tweet),
        # 'Average_Sentence_Length': average_sentence_length(tweet),
        'Lexical_Diversity': lexical_diversity(tweet),
        'Capital_Letters_Count': count_capital_letters(tweet),  # Uncomment if you want to include this feature
        'Hashtag_Frequency': hashtag_frequency(tweet),
        'Mention_Frequency': mention_frequency(tweet),
        'count_emojis': count_emojis(tweet),
        'special_chars_count': count_special_characters(tweet),
        'Stop_Word_Frequency': stop_word_frequency(tweet),
        **get_linguistic_features(tweet),  # Include linguistic features
        'Readability_Score': readability_score(tweet),
        'URL_Frequency': get_url_frequency(tweet)  # Assuming you have the correct function for this
    }
    return features

# Extract features for all tweets
features_list = [extract_features(tweet) for tweet in df_train['text']]

# Create a Pandas DataFrame
features_list = pd.DataFrame(features_list)




In [9]:
features_list

Unnamed: 0,Average_Word_Length,Lexical_Diversity,Capital_Letters_Count,Hashtag_Frequency,Mention_Frequency,count_emojis,special_chars_count,Stop_Word_Frequency,Noun_Count,Verb_Count,Participle_Count,Interjection_Count,Pronoun_Count,Preposition_Count,Adverb_Count,Conjunction_Count,Readability_Score,URL_Frequency
0,6.894737,1.000000,3,4,0,0,6,3,6,2,1,0,0,1,1,0,78.75,0
1,4.437500,1.000000,2,0,0,0,3,7,4,3,1,0,0,0,0,0,80.62,0
2,4.000000,1.000000,2,0,0,0,2,4,1,1,1,0,0,0,0,0,88.74,0
3,4.777778,0.844444,10,0,0,0,6,15,14,11,4,0,0,0,0,0,42.72,0
4,5.250000,1.000000,1,0,0,0,0,2,2,0,0,0,0,0,0,0,33.58,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9083,4.619048,0.857143,4,1,0,0,1,9,6,2,1,0,0,0,2,0,67.08,0
9084,8.700000,1.000000,9,3,0,0,4,4,4,2,0,0,0,0,0,0,10.56,0
9085,6.000000,0.970588,10,0,0,0,14,10,8,5,2,0,0,0,3,0,54.22,0
9086,3.428571,0.952381,3,0,0,0,2,11,4,1,0,0,0,0,1,0,100.92,0


In [10]:
# merge the features with the original dataframe
df_train = pd.concat([df_train, features_list], axis=1)

Unnamed: 0,text,labels,Average_Word_Length,Lexical_Diversity,Capital_Letters_Count,Hashtag_Frequency,Mention_Frequency,count_emojis,special_chars_count,Stop_Word_Frequency,Noun_Count,Verb_Count,Participle_Count,Interjection_Count,Pronoun_Count,Preposition_Count,Adverb_Count,Conjunction_Count,Readability_Score,URL_Frequency
0,This human-elephant conflict has seen 13 refug...,0,6.894737,1.000000,3,4,0,0,6,3,6,2,1,0,0,1,1,0,78.75,0
1,The awkward moment when Lexus is showing you h...,0,4.437500,1.000000,2,0,0,0,3,7,4,3,1,0,0,0,0,0,80.62,0
2,People- why are you so fucking mean Me-,0,4.000000,1.000000,2,0,0,0,2,4,1,1,1,0,0,0,0,0,88.74,0
3,After EU uses Turkey as buffer to stop refugee...,0,4.777778,0.844444,10,0,0,0,6,15,14,11,4,0,0,0,0,0,42.72,0
4,Immigration in a picture,0,5.250000,1.000000,1,0,0,0,0,2,2,0,0,0,0,0,0,0,33.58,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9083,Ladies the moment you start hanging around wit...,1,4.619048,0.857143,4,1,0,0,1,9,6,2,1,0,0,0,2,0,67.08,0
9084,The #AmericanCommunists who live in our countr...,1,8.700000,1.000000,9,3,0,0,4,4,4,2,0,0,0,0,0,0,10.56,0
9085,_x0081__x0081_ Babels are also looking for a p...,0,6.000000,0.970588,10,0,0,0,14,10,8,5,2,0,0,0,3,0,54.22,0
9086,"I can lose weight, but youll always be a cunt ...",1,3.428571,0.952381,3,0,0,0,2,11,4,1,0,0,0,0,1,0,100.92,0


In [11]:
# save the dataframe
df_train.to_csv('train.csv', index=False)

In [12]:
# do the same for the validation set
features_list = [extract_features(tweet) for tweet in df_valid['text']]
features_list = pd.DataFrame(features_list)
df_valid = pd.concat([df_valid, features_list], axis=1)
df_valid

Unnamed: 0,text,labels,Average_Word_Length,Lexical_Diversity,Capital_Letters_Count,Hashtag_Frequency,Mention_Frequency,count_emojis,special_chars_count,Stop_Word_Frequency,Noun_Count,Verb_Count,Participle_Count,Interjection_Count,Pronoun_Count,Preposition_Count,Adverb_Count,Conjunction_Count,Readability_Score,URL_Frequency
0,President Jokowi: it's not true millions of Ch...,0,5.333333,0.717949,9,0,0,0,12,13,10,3,1,0,0,0,0,0,69.99,0
1,So you created the problem by mass immigration...,1,5.866667,0.966667,4,0,0,0,13,13,10,2,1,0,0,0,1,0,46.94,0
2,I though in a free country you could worship w...,0,4.043478,0.869565,3,0,0,0,3,10,5,2,0,0,0,1,0,0,89.04,0
3,WELP. Bitch IM JUST NOW FUCKING SEEING DUMB WHORE,1,4.555556,1.000000,36,0,0,0,1,2,4,2,2,0,0,0,0,0,96.18,0
4,".Considering THIS , the filth on the streets o...",1,5.838710,0.935484,15,2,0,0,13,13,6,3,1,0,0,0,0,0,52.87,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1163,Basically you don't care about rape or victims...,0,4.560000,0.960000,5,0,0,0,3,12,7,3,0,0,0,0,1,0,75.71,0
1164,How is #AndrewGillum plan on paying for the Mi...,1,5.131579,0.894737,15,6,0,0,10,20,9,6,3,0,0,1,0,0,67.04,0
1165,Really do tell! Hysterical? Strange tweet you ...,0,5.357143,1.000000,7,1,0,0,4,3,2,3,1,0,0,0,1,0,58.24,0
1166,In NY? Check out Immigrant Arts Coalition Summit,0,5.125000,1.000000,8,0,0,0,1,2,4,0,0,0,0,0,1,0,71.82,0


In [13]:
df_valid.to_csv('valid.csv', index=False)

In [14]:
# Do the same for the test set
features_list = [extract_features(tweet) for tweet in df_test['text']]
features_list = pd.DataFrame(features_list)
df_test = pd.concat([df_test, features_list], axis=1)
df_test.head()

Unnamed: 0,text,labels,Average_Word_Length,Lexical_Diversity,Capital_Letters_Count,Hashtag_Frequency,Mention_Frequency,count_emojis,special_chars_count,Stop_Word_Frequency,Noun_Count,Verb_Count,Participle_Count,Interjection_Count,Pronoun_Count,Preposition_Count,Adverb_Count,Conjunction_Count,Readability_Score,URL_Frequency
0,We have got to get these Obama DACA illegal al...,1,5.794118,0.970588,19,4,0,0,8,16,9,5,0,0,0,0,0,0,54.22,0
1,The same bitch is all on my boos shit like gir...,0,3.333333,1.0,1,0,0,0,0,6,5,0,0,0,0,1,0,0,110.06,0
2,BS WILSON IS A SKANK WHORE AND A LIAR . DIDDN'...,1,3.625,0.9375,56,0,0,0,2,6,4,3,0,0,0,0,1,0,89.24,0
3,Immigration Expert: Trudeau Has Lost Track Of ...,1,5.033333,0.933333,20,0,0,0,4,10,10,3,1,0,0,0,0,0,49.49,0
4,I like to delete comments that say 'first' to ...,0,4.5,0.916667,1,0,0,0,2,5,2,1,0,0,0,1,0,0,84.68,0


In [15]:
df_test.to_csv('test.csv', index=False)