In [2]:
import pandas as pd
from datasets import load_dataset

dataset_name = "krishan-CSE/HatEval_New"
dataset = load_dataset(dataset_name)

df_train = dataset['train'].to_pandas()
df_valid = dataset['validation'].to_pandas()
df_test = dataset['test'].to_pandas()

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
print(df_train.shape)
print("train value_counts:\n", df_train['labels'].value_counts())
print(df_train.head())
print("=========================================="
      "==========================================")
print(df_valid.shape)
print("validation value_counts:\n", df_valid['labels'].value_counts())
print(df_valid.head())
print("=========================================="
      "==========================================")
print(df_test.shape)
print("test value_counts:\n", df_test['labels'].value_counts())
print(df_test.head())

(8982, 2)
train value_counts:
 labels
0    5201
1    3781
Name: count, dtype: int64
                                                text  labels
0  Hurray, saving us $$$ in so many ways #LockThe...       1
1  Why would young fighting age men be the vast m...       1
2  Illegals Dump their Kids at the border like Ro...       1
3  NY Times: 'Nearly All White' States Pose 'an A...       0
4  Orban in Brussels: European leaders are ignori...       0
(998, 2)
validation value_counts:
 labels
0    571
1    427
Name: count, dtype: int64
                                                text  labels
0  I swear Im getting to places just in the nick ...       0
1  Im an immigrant and Trump is right on immigrat...       0
2  #IllegalImmigrants #IllegalAliens #ElectoralSy...       1
3  We have our own invasion issues with Mexicans....       1
4  Worker Charged With Sexually Molesting Eight C...       0
(3000, 2)
test value_counts:
 labels
1    1901
0    1099
Name: count, dtype: int64
               

In [3]:
import re
from textblob import TextBlob
from nltk import pos_tag, word_tokenize
from nltk.corpus import stopwords
import emoji 
def average_word_length(tweet):
    words = tweet.split()
    return sum(len(word) for word in words) / len(words)

# def average_sentence_length(tweet):
#     sentences = re.split(r'[.!?]+', tweet)
#     return sum(len(word_tokenize(sentence)) for sentence in sentences) / len(sentences)

def lexical_diversity(tweet):
    words = tweet.split()
    unique_words = set(words)
    return len(unique_words) / len(words)

def count_capital_letters(tweet):
    return sum(1 for char in tweet if char.isupper())

def count_words_surrounded_by_colons(tweet):
    # Define a regular expression pattern to match words surrounded by ':'
    pattern = r':(\w+):'

    # Use re.findall to find all matches in the tweet
    matches = re.findall(pattern, tweet)

    # Return the count of matched words
    return len(matches)

def count_emojis(tweet):
    # Convert emoji symbols to their corresponding names
    tweet_with_names = emoji.demojize(tweet)
    return count_words_surrounded_by_colons(tweet_with_names)

def hashtag_frequency(tweet):
    hashtags = re.findall(r'#\w+', tweet)
    return len(hashtags)

def mention_frequency(tweet):
    mentions = re.findall(r'@\w+', tweet)
    return len(mentions)

import string

def count_special_characters(tweet):
    special_characters = [char for char in tweet if char in string.punctuation]
    return len(special_characters)

# def capitalization_pattern(tweet):
#     if tweet.islower():
#         return 'All Lowercase'
#     elif tweet.isupper():
#         return 'All Uppercase'
#     elif tweet.istitle():
#         return 'Title Case'
#     else:
#         return 'Mixed Case'

def stop_word_frequency(tweet):
    stop_words = set(stopwords.words('english'))
    words = [word for word in tweet.split() if word.lower() in stop_words]
    return len(words)

import nltk
from nltk import pos_tag
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')

def get_linguistic_features(tweet):
    # Tokenize the tweet
    words = word_tokenize(tweet)

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    filtered_words = [word.lower() for word in words if word.isalnum() and word.lower() not in stop_words]

    # Get parts of speech tags
    pos_tags = pos_tag(filtered_words)

    # Count various linguistic features
    noun_count = sum(1 for word, pos in pos_tags if pos.startswith('N'))
    verb_count = sum(1 for word, pos in pos_tags if pos.startswith('V'))
    participle_count = sum(1 for word, pos in pos_tags if pos.startswith('V') and ('ing' in word or 'ed' in word))
    interjection_count = sum(1 for word, pos in pos_tags if pos == 'UH')
    pronoun_count = sum(1 for word, pos in pos_tags if pos.startswith('PRP'))
    preposition_count = sum(1 for word, pos in pos_tags if pos.startswith('IN'))
    adverb_count = sum(1 for word, pos in pos_tags if pos.startswith('RB'))
    conjunction_count = sum(1 for word, pos in pos_tags if pos.startswith('CC'))

    return {
        'Noun_Count': noun_count,
        'Verb_Count': verb_count,
        'Participle_Count': participle_count,
        'Interjection_Count': interjection_count,
        'Pronoun_Count': pronoun_count,
        'Preposition_Count': preposition_count,
        'Adverb_Count': adverb_count,
        'Conjunction_Count': conjunction_count
    }

import textstat
def readability_score(tweet):
    return textstat.flesch_reading_ease(tweet)

def get_url_frequency(tweet):
    urls = re.findall(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', tweet)
    return len(urls)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Owner\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Owner\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Owner\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Training

In [4]:
# !pip install scikit-learn textblob nltk

In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Extract features and target variable
X = df_train.drop('labels', axis=1)
y = df_train['labels']

In [6]:
y

0       1
1       1
2       1
3       0
4       0
       ..
8977    0
8978    0
8979    1
8980    0
8981    0
Name: labels, Length: 8982, dtype: int64

In [7]:
import pandas as pd

# Define a function to extract features from a single tweet
def extract_features(tweet):
    features = {
        'Average_Word_Length': average_word_length(tweet),
        # 'Average_Sentence_Length': average_sentence_length(tweet),
        'Lexical_Diversity': lexical_diversity(tweet),
        'Capital_Letters_Count': count_capital_letters(tweet),  # Uncomment if you want to include this feature
        'Hashtag_Frequency': hashtag_frequency(tweet),
        'Mention_Frequency': mention_frequency(tweet),
        'count_emojis': count_emojis(tweet),
        'special_chars_count': count_special_characters(tweet),
        'Stop_Word_Frequency': stop_word_frequency(tweet),
        **get_linguistic_features(tweet),  # Include linguistic features
        'Readability_Score': readability_score(tweet),
        'URL_Frequency': get_url_frequency(tweet)  # Assuming you have the correct function for this
    }
    return features

# Extract features for all tweets
features_list = [extract_features(tweet) for tweet in X['text']]

# Create a Pandas DataFrame
X_new = pd.DataFrame(features_list)

# # Add the labels to the DataFrame
# df['Label'] = labels

# # Display the DataFrame
# print(df)


In [8]:
X_new

Unnamed: 0,Average_Word_Length,Lexical_Diversity,Capital_Letters_Count,Hashtag_Frequency,Mention_Frequency,count_emojis,special_chars_count,Stop_Word_Frequency,Noun_Count,Verb_Count,Participle_Count,Interjection_Count,Pronoun_Count,Preposition_Count,Adverb_Count,Conjunction_Count,Readability_Score,URL_Frequency
0,6.538462,1.000000,18,5,0,0,9,2,5,2,1,0,1,0,0,0,42.38,0
1,4.893617,0.808511,3,0,0,0,10,21,11,6,3,0,0,1,2,0,56.08,0
2,5.205128,0.923077,34,4,0,0,6,15,12,4,0,0,0,1,1,0,60.14,0
3,5.076923,1.000000,11,0,0,0,5,3,4,1,0,0,0,0,0,0,75.20,0
4,4.823529,0.941176,3,0,0,0,2,9,4,2,1,0,0,0,0,0,62.68,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8977,3.625000,1.000000,1,0,0,0,1,5,1,0,0,0,0,0,0,0,71.82,0
8978,4.466667,1.000000,1,0,0,0,1,7,6,0,0,0,0,0,0,0,64.71,0
8979,4.322581,0.903226,6,0,0,0,3,11,7,7,5,0,0,0,2,0,69.48,0
8980,6.333333,1.000000,1,0,0,0,0,0,1,0,0,0,0,1,0,0,9.21,0


In [9]:
X_train, X_test, y_train, y_test = train_test_split(X_new, y, test_size=0.2, random_state=42)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

model = RandomForestClassifier(random_state=42)
model.fit(X_train_scaled, y_train)


In [10]:
y_pred = model.predict(X_test_scaled)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

Accuracy: 0.66

Classification Report:
              precision    recall  f1-score   support

           0       0.67      0.81      0.73      1042
           1       0.63      0.44      0.52       755

    accuracy                           0.66      1797
   macro avg       0.65      0.63      0.63      1797
weighted avg       0.65      0.66      0.64      1797



# For test dataset

In [11]:
df_test.head()

Unnamed: 0,text,labels
0,"Oh, I could have gone on about taxes. Since th...",0
1,Several of the wild fires in #california and #...,1
2,My question is how do you resettle a refugee a...,0
3,"#Europe, you've got a problem! We must hurry a...",1
4,This is outrageous! #StopIllegalImmigration #M...,1


In [12]:
features_list_test = [extract_features(tweet) for tweet in df_test['text']]
X_test_new = pd.DataFrame(features_list_test)
X_test_scaled = scaler.transform(X_test_new)
y_test_pred = model.predict(X_test_scaled)
accuracy_test = accuracy_score(df_test['labels'], y_test_pred)
print(f"Test Accuracy: {accuracy_test:.2f}")
print("\nTest Classification Report:")
print(classification_report(df_test['labels'], y_test_pred))

Test Accuracy: 0.64

Test Classification Report:
              precision    recall  f1-score   support

           0       0.50      0.58      0.54      1099
           1       0.73      0.67      0.70      1901

    accuracy                           0.64      3000
   macro avg       0.62      0.62      0.62      3000
weighted avg       0.65      0.64      0.64      3000

