In [1]:
import pandas as pd
from datasets import load_dataset

# Pull the relabeled HatEval corpus from the Hugging Face hub and convert
# each of its three splits into a pandas DataFrame.
dataset_name = "krishan-CSE/HatEval-Relabeled"
dataset = load_dataset(dataset_name)

df_train, df_valid, df_test = (
    dataset[split_name].to_pandas()
    for split_name in ('train', 'validation', 'test')
)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Print shape, label distribution and a preview for every split, with a
# ruler line between consecutive splits.
ruler = ("=========================================="
         "==========================================")
named_splits = [
    ("train", df_train),
    ("validation", df_valid),
    ("test", df_test),
]
for position, (split_label, split_frame) in enumerate(named_splits):
    if position > 0:
        print(ruler)
    print(split_frame.shape)
    print(f"{split_label} value_counts:\n", split_frame['labels'].value_counts())
    print(split_frame.head())

(9088, 2)
train value_counts:
 labels
0    4811
1    4277
Name: count, dtype: int64
                                                text  labels
0  This human-elephant conflict has seen 13 refug...       0
1  The awkward moment when Lexus is showing you h...       0
2            People- why are you so fucking mean Me-       0
3  After EU uses Turkey as buffer to stop refugee...       0
4                           Immigration in a picture       0
(1168, 2)
validation value_counts:
 labels
0    618
1    550
Name: count, dtype: int64
                                                text  labels
0  President Jokowi: it's not true millions of Ch...       0
1  So you created the problem by mass immigration...       1
2  I though in a free country you could worship w...       0
3  WELP. Bitch IM JUST NOW FUCKING SEEING DUMB WHORE       1
4  .Considering THIS , the filth on the streets o...       1
(2724, 2)
test value_counts:
 labels
0    1442
1    1282
Name: count, dtype: int64
              

In [3]:
# Modified from code by Kwan-Yuet (Stephen) Ho:
# https://datawarrior.wordpress.com/2016/03/29/flesch-kincaid-readability-measure/
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import cmudict
import nltk
# from common import getCorpusJson
# from features.feature import Feature
# Fetch the NLTK resources used below. Newer NLTK releases load the
# 'punkt_tab' tables in sent_tokenize/word_tokenize, while older releases
# read the pickled 'punkt' models, so both are requested to keep the
# notebook runnable on a fresh kernel regardless of the installed version.
for resource in ('cmudict', 'punkt', 'punkt_tab'):
  nltk.download(resource, quiet=True)


# class FleschKincaid(Feature):
def FleschKincaid(text: str) -> float:
  """Return the Flesch-Kincaid score of `text`.

  The (words, sentences, syllables) triple produced by `text_statistics`
  is passed straight to `fk_formula`.
  """
  return fk_formula(*text_statistics(text))


# class Flesch(Feature):
def Flesch(text: str) -> float:
  """Return the Flesch score of `text`.

  The (words, sentences, syllables) triple produced by `text_statistics`
  is passed straight to `flesch_formula`.
  """
  return flesch_formula(*text_statistics(text))


# class WordCount(Feature):
def WordCount(text: str) -> float:
  """Return the word count of `text` (see `get_word_count`) as a float."""
  n_words = get_word_count(text)
  return float(n_words)


# class SentenceCount(Feature):
def SentenceCount(text: str) -> float:
  """Return the sentence count of `text` (see `get_sent_count`) as a float."""
  n_sentences = get_sent_count(text)
  return float(n_sentences)


# class SyllableCount(Feature):
def SyllableCount(text: str) -> float:
  """Return the syllable count of `text` (see `get_syllable_count`) as a float."""
  n_syllables = get_syllable_count(text)
  return float(n_syllables)

# private
def not_punctuation(w):
  """Return False only for a one-character token that is not alphabetic.

  Every other token (longer, empty, or a single letter) counts as a word.
  """
  return len(w) != 1 or w.isalpha()

# private
def get_word_count(text: str):
  """Count the tokens of `text` that `not_punctuation` accepts as words."""
  return sum(1 for token in word_tokenize(text) if not_punctuation(token))

# private
def get_sent_count(text):
  """Return how many sentences `sent_tokenize` finds in `text`."""
  sentences = sent_tokenize(text)
  return len(sentences)


# Pronunciation lookup built once from the NLTK cmudict corpus;
# `numsyllables` indexes it with the lower-cased word.
prondict = cmudict.dict()


def numsyllables_pronlist(l):
  """Count the entries of pronunciation `l` whose last character is a digit.

  NOTE(review): presumably the digit is the stress marker that cmudict
  attaches to vowel phonemes, so this equals the syllable count - confirm.
  """
  return sum(1 for phoneme in l if phoneme[-1].isdigit())


def numsyllables(word):
  """Return the distinct syllable counts over all pronunciations of `word`.

  The word is lower-cased before the `prondict` lookup; a word that is not
  in the dictionary yields `[0]`.
  """
  try:
    distinct_counts = {
        numsyllables_pronlist(pronunciation)
        for pronunciation in prondict[word.lower()]
    }
  except KeyError:
    return [0]
  return list(distinct_counts)

# private
def text_statistics(text):
  """Return `(word_count, sent_count, syllable_count)` for `text`.

  * word_count: tokens accepted by `not_punctuation`.
  * sent_count: sentences reported by `get_sent_count`.
  * syllable_count: for every token (punctuation included), the largest
    syllable count among its pronunciations; tokens unknown to the
    pronunciation dictionary contribute 0.

  The text is word-tokenized once and the token list is reused for both
  the word and the syllable counts, instead of tokenizing twice.
  """
  tokens = word_tokenize(text)
  word_count = sum(1 for token in tokens if not_punctuation(token))
  sent_count = get_sent_count(text)
  syllable_count = sum(max(numsyllables(token)) for token in tokens)
  return word_count, sent_count, syllable_count


def get_syllable_count(text):
  """Sum, over every token of `text`, the largest syllable count that
  `numsyllables` reports for that token."""
  return sum(max(numsyllables(token)) for token in word_tokenize(text))

# private
def flesch_formula(word_count, sent_count, syllable_count):
  """Compute the Flesch score from raw counts.

  score = 206.835 - 1.015 * words / sentences - 84.6 * syllables / words

  A term whose denominator is zero (empty or punctuation-only text)
  contributes 0 instead of raising ZeroDivisionError, so one degenerate
  input cannot abort a whole feature-extraction pass.
  """
  sentence_term = 1.015 * word_count / sent_count if sent_count else 0.0
  syllable_term = 84.6 * syllable_count / word_count if word_count else 0.0
  return 206.835 - sentence_term - syllable_term

# private
def fk_formula(word_count, sent_count, syllable_count):
  """Compute the Flesch-Kincaid score from raw counts.

  score = 0.39 * words / sentences + 11.8 * syllables / words - 15.59

  A term whose denominator is zero (empty or punctuation-only text)
  contributes 0 instead of raising ZeroDivisionError, so one degenerate
  input cannot abort a whole feature-extraction pass.
  """
  sentence_term = 0.39 * word_count / sent_count if sent_count else 0.0
  syllable_term = 11.8 * syllable_count / word_count if word_count else 0.0
  return sentence_term + syllable_term - 15.59


def syllablesPerWord(text):
  """Return the average number of syllables per word in `text`.

  Returns 0.0 when `text` contains no words (e.g. empty or
  punctuation-only input) rather than raising ZeroDivisionError.
  """
  n_words, _, n_syllables = text_statistics(text)
  return n_syllables / n_words if n_words else 0.0


def avgSenLength(text):
  """Return the average number of words per sentence in `text`.

  Returns 0.0 when `text` contains no sentences (e.g. empty input)
  rather than raising ZeroDivisionError.
  """
  n_words, n_sentences, _ = text_statistics(text)
  return n_words / n_sentences if n_sentences else 0.0


def wordCount(text):
  """Return the word count of `text`, i.e. the first item of `text_statistics`."""
  return text_statistics(text)[0]


def sent_count(text):
  """Return the sentence count of `text`, i.e. the second item of `text_statistics`."""
  return text_statistics(text)[1]


def syllable_count(text):
  """Return the syllable count of `text`, i.e. the third item of `text_statistics`."""
  return text_statistics(text)[2]

## Testing the features

In [4]:
tweet = "Excited for the conference! @user1@user4, @user2, and @user3 will be presenting. #ConferenceTime"

# Smoke-test every feature function on one sample tweet. Each label is the
# name of the function it reports (the first label was previously
# misspelled "FlenshKincaid").
feature_checks = [
    ("FleschKincaid", FleschKincaid),
    ("Flesch", Flesch),
    ("WordCount", WordCount),
    ("SentenceCount", SentenceCount),
    ("SyllableCount", SyllableCount),
    ("syllablesPerWord", syllablesPerWord),
    ("avgSenLength", avgSenLength),
    ("wordCount", wordCount),
    ("sent_count", sent_count),
    ("syllable_count", syllable_count),
]
for feature_label, feature_fn in feature_checks:
    print(f"{feature_label}:", feature_fn(tweet))


FlenshKincaid: -1.1923076923076916
Flesch: 111.32897435897436
WordCount: 13.0
SentenceCount: 3.0
SyllableCount: 14.0
syllablesPerWord: 1.0769230769230769
avgSenLength: 4.333333333333333
wordCount: 13
sent_count: 3
syllable_count: 14


# Training

In [5]:
# !pip install scikit-learn textblob nltk

In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Extract features and target variable
# Separate the training frame into the target column and everything else.
y = df_train['labels']
X = df_train.drop(columns=['labels'])

In [7]:
# Display the target labels taken from the training frame.
y

0       0
1       0
2       0
3       0
4       0
       ..
9083    1
9084    1
9085    0
9086    1
9087    1
Name: labels, Length: 9088, dtype: int64

In [None]:
import pandas as pd

# Define a function to extract features from a single tweet
def extract_features(tweet):
    """Return the readability / length features of one tweet as a dict.

    The word, sentence and syllable counts are computed once via
    `text_statistics` and every feature is derived from them, instead of
    re-tokenizing the tweet for each of the ten features.

    The keys and their values match the individual feature functions:
    'WordCount', 'SentenceCount' and 'SyllableCount' are floats, while
    'wordCount', 'sent_count' and 'syllable_count' hold the same counts
    as ints. The two ratio features fall back to 0.0 when their
    denominator is zero.
    """
    n_words, n_sentences, n_syllables = text_statistics(tweet)
    return {
        'FleschKincaid': fk_formula(n_words, n_sentences, n_syllables),
        'Flesch': flesch_formula(n_words, n_sentences, n_syllables),
        'WordCount': float(n_words),
        'SentenceCount': float(n_sentences),
        'SyllableCount': float(n_syllables),
        'syllablesPerWord': n_syllables / n_words if n_words else 0.0,
        'avgSenLength': n_words / n_sentences if n_sentences else 0.0,
        'wordCount': n_words,
        'sent_count': n_sentences,
        'syllable_count': n_syllables,
    }

# Extract features for all tweets
# Build one feature record per training tweet, then tabulate the records
# (one row per tweet, one column per feature).
features_list = list(map(extract_features, X['text']))
X_new = pd.DataFrame(data=features_list)

# # Add the labels to the DataFrame
# df['Label'] = labels

# # Display the DataFrame
# print(df)


In [9]:
# Display the feature table built from the training tweets.
X_new

Unnamed: 0,FleschKincaid,Flesch,WordCount,SentenceCount,SyllableCount,syllablesPerWord,avgSenLength,wordCount,sent_count,syllable_count
0,6.725263,80.686842,19.0,1.0,24.0,1.263158,19.000000,19,1,24
1,5.616471,85.074118,17.0,1.0,21.0,1.235294,17.000000,17,1,21
2,-2.145000,124.690000,8.0,1.0,7.0,0.875000,8.000000,8,1,7
3,19.004444,38.960000,45.0,1.0,65.0,1.444444,45.000000,45,1,65
4,9.570000,33.575000,4.0,1.0,8.0,2.000000,4.000000,4,1,8
...,...,...,...,...,...,...,...,...,...,...
9083,8.895238,68.691429,21.0,1.0,29.0,1.380952,21.000000,21,1,29
9084,-0.093333,117.105000,12.0,1.0,11.0,0.916667,12.000000,12,1,11
9085,5.141765,78.384608,34.0,3.0,47.0,1.382353,11.333333,34,3,47
9086,3.838095,104.948571,21.0,1.0,20.0,0.952381,21.000000,21,1,20


In [10]:
# Hold out 20% of the training features for a quick sanity check.
X_train, X_test, y_train, y_test = train_test_split(
    X_new, y, test_size=0.2, random_state=42
)

# Learn the scaling statistics from the training part only, then apply the
# same transformation to both parts.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

model = RandomForestClassifier(random_state=42)
model.fit(X_train_scaled, y_train)


In [11]:
# Score the fitted model on the 20% hold-out carved from the training data.
y_pred = model.predict(X_test_scaled)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")
print("\nClassification Report:", classification_report(y_test, y_pred), sep="\n")

Accuracy: 0.59

Classification Report:
              precision    recall  f1-score   support

           0       0.59      0.65      0.62       948
           1       0.58      0.52      0.55       870

    accuracy                           0.59      1818
   macro avg       0.59      0.58      0.58      1818
weighted avg       0.59      0.59      0.58      1818



# For test dataset

In [12]:
# Preview the first rows of the test split.
df_test.head()

Unnamed: 0,text,labels
0,We have got to get these Obama DACA illegal al...,1
1,The same bitch is all on my boos shit like gir...,0
2,BS WILSON IS A SKANK WHORE AND A LIAR . DIDDN'...,1
3,Immigration Expert: Trudeau Has Lost Track Of ...,1
4,I like to delete comments that say 'first' to ...,0


In [13]:
# Evaluate the fitted model on the dataset's own test split, reusing the
# scaler that was fitted on the training features.
features_list_test = [extract_features(tweet) for tweet in df_test['text']]
X_test_new = pd.DataFrame(features_list_test)

# Keep the scaled test-split features under their own name: reusing
# `X_test_scaled` would overwrite the hold-out features from the
# train/test split cell, so re-running the hold-out evaluation afterwards
# would pair predictions for these rows with the hold-out labels `y_test`.
X_test_new_scaled = scaler.transform(X_test_new)
y_test_pred = model.predict(X_test_new_scaled)

accuracy_test = accuracy_score(df_test['labels'], y_test_pred)
print(f"Test Accuracy: {accuracy_test:.2f}")
print("\nTest Classification Report:")
print(classification_report(df_test['labels'], y_test_pred))

Test Accuracy: 0.56

Test Classification Report:
              precision    recall  f1-score   support

           0       0.58      0.61      0.59      1442
           1       0.53      0.49      0.51      1282

    accuracy                           0.56      2724
   macro avg       0.55      0.55      0.55      2724
weighted avg       0.55      0.56      0.55      2724

