In [None]:
import os
import re
import string

import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split

nltk.download('punkt_tab')
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger_eng')
nltk.download('maxent_ne_chunker')
nltk.download('words')
nltk.download('maxent_ne_chunker_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping chunkers/maxent_ne_chunker.zip.
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.
[nltk_data] Downloading package maxent_ne_chunker_tab to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping chunkers/maxent_ne_chunker_tab.zip.


In [None]:
import kagglehub

# Download the dataset through kagglehub; `path` is the local directory that
# holds the downloaded files.
path = kagglehub.dataset_download("edwardombui/hatespeech-kenya")

print("Path to dataset files:", path)

Downloading from https://www.kaggle.com/api/v1/datasets/download/edwardombui/hatespeech-kenya?dataset_version_number=1...


100%|██████████| 2.19M/2.19M [00:00<00:00, 35.2MB/s]

Extracting files...
Path to dataset files: /root/.cache/kagglehub/datasets/edwardombui/hatespeech-kenya/versions/1





In [None]:
files = os.listdir(path)
print("Downloaded files:", files)

Downloaded files: ['HateSpeech_Kenya.csv']


In [None]:
# Pick the first CSV among the downloaded files and build its full path.
# Fail with an explicit message instead of a bare IndexError when the
# download contains no CSV at all.
csv_files = [f for f in files if f.endswith(".csv")]
if not csv_files:
  raise FileNotFoundError(f"No CSV file found in {path}: {files}")
csv_file = csv_files[0]
csv_path = os.path.join(path, csv_file)

Loading the data

In [None]:
df=pd.read_csv(csv_path)

Understanding the structure of the data


In [None]:
df.shape

(48076, 5)

In [None]:
df.head()

Unnamed: 0,hate_speech,offensive_language,neither,Class,Tweet
0,0,0,3,0,['The political elite are in desperation. Ordi...
1,0,0,3,0,"[""Am just curious the only people who are call..."
2,0,0,3,0,['USERNAME_3 the area politicians are the one ...
3,0,0,3,0,['War expected in Nakuru if something is not d...
4,0,0,3,0,['USERNAME_4 tells kikuyus activists that they...


In [None]:

df.dtypes

Unnamed: 0,0
hate_speech,int64
offensive_language,int64
neither,int64
Class,int64
Tweet,object


Preprocessing

In [None]:
#Necessary conversions to use string methods
df['Tweet']=df['Tweet'].astype(str)

In [None]:
tweets=df['Tweet']

In [None]:
stop_words=set(stopwords.words('english'))
lemmatizer=WordNetLemmatizer()

In [None]:
print(tweets)

0        ['The political elite are in desperation. Ordi...
1        ["Am just curious the only people who are call...
2        ['USERNAME_3 the area politicians are the one ...
3        ['War expected in Nakuru if something is not d...
4        ['USERNAME_4 tells kikuyus activists that they...
                               ...                        
48071    ['This is exactly what Moses Kuria & ilk are d...
48072    ['This is exactly what Kenyans are going throu...
48073    ["This is exactly what is wrong with this coun...
48074    ["This is exactly the same thing. Well the onl...
48075    ['This is exactly how Luos feel in this countr...
Name: Tweet, Length: 48076, dtype: object


In [None]:
# Lower-case every tweet so that later comparisons are case-insensitive.
tweets = tweets.apply(str.lower)

In [None]:
def remove_punctuation(x):
  """Return ``x`` without punctuation.

  Every character that is neither a word character (letters, digits,
  underscore) nor whitespace is deleted; everything else is kept as is.

  Args:
    x: The text to clean.

  Returns:
    The cleaned string.
  """
  # The previous version also lemmatized the whole tweet string and threw the
  # result away; that call had no effect on the output and was removed.
  return re.sub(r'[^\w\s]', '', x)

def tokenize(txt):
  """Tokenize ``txt``, POS-tag the tokens and run NLTK's named-entity chunker.

  Args:
    txt: The text to process.

  Returns:
    Whatever ``nltk.ne_chunk`` returns for the POS-tagged tokens of ``txt``.
  """
  words = nltk.word_tokenize(txt)
  tagged_words = nltk.pos_tag(words)
  return nltk.ne_chunk(tagged_words)

def remove_stops(txt):
  """Drop stop words from a tokenized tweet and join the rest with spaces.

  Args:
    txt: The tokenized tweet. It may be a sequence of plain word strings, a
      sequence of ``(word, tag)`` pairs, or a chunk tree exposing ``leaves()``
      (such as the value returned by ``tokenize``).

  Returns:
    A single string with the remaining words separated by one space.
  """
  # A chunk tree nests named entities in subtrees; leaves() flattens it into
  # its (word, tag) pairs so every word is visited exactly once.
  items = txt.leaves() if hasattr(txt, "leaves") else txt
  # Compare the word itself with the stop-word set: a (word, tag) tuple is
  # never a member of a set of strings and cannot be joined as text.
  words = (item if isinstance(item, str) else item[0] for item in items)
  return " ".join(word for word in words if word not in stop_words)

In [None]:
# Strip punctuation from every tweet.
nopunc_tweets = tweets.apply(remove_punctuation)

In [None]:
# Tokenize, POS-tag and chunk each punctuation-free tweet.
t_tweets = nopunc_tweets.apply(tokenize)

In [None]:
# Remove stop words and turn each tweet back into one string.
nostops_tweets = t_tweets.apply(remove_stops)

In [None]:
# Re-tokenize each stop-word-free tweet string and attach a POS tag to every token.
tagged_tweets = nostops_tweets.apply(lambda tweet: nltk.pos_tag(nltk.word_tokenize(tweet)))

In [None]:
print(tagged_tweets)

In [None]:

# Clean every tweet token by token and keep one list of (token, POS tag)
# pairs per tweet, in the same order as `tweets`.
preprocessed=[]

stop_words=set(stopwords.words('english'))
lemmatizer=WordNetLemmatizer()
for tweet in tweets:
  cleaned=[]
  for t in nltk.word_tokenize(tweet):
    #lower
    t_low=t.lower()
    #removing punctuation marks: keep only the word characters of the token
    t_clean="".join(re.findall(r"\w+", t_low))
    #remove stop words and tokens that were punctuation only; collect the
    #survivors in a new list instead of removing from the list being iterated,
    #which silently skips the token after each removal
    if not t_clean or t_low in stop_words or t_clean in stop_words:
      continue
    cleaned.append(lemmatizer.lemmatize(t_clean))
  # The following cell unpacks every tweet as (token, pos) pairs, so store the
  # flat POS-tagged list; nltk.ne_chunk would nest named entities in subtrees
  # that cannot be unpacked that way.
  preprocessed.append(nltk.pos_tag(cleaned))
print(preprocessed[1])

In [None]:

# Rebuild each tweet as a plain space-separated string of its tokens and store
# that text (not the tagged structures) back on the frame, so the later
# astype(str)/TF-IDF step sees words instead of the repr of (token, tag) pairs.
# A chunk tree is flattened with leaves() first so nested entities unpack too.
preprocessed_strings = [
  ' '.join(token for token, pos in (tweet.leaves() if hasattr(tweet, 'leaves') else tweet))
  for tweet in preprocessed
]
df['Tweet']=preprocessed_strings

In [None]:

df['Tweet'].head()

In [None]:
df.head()

In [None]:

# Drop the three columns that are not needed from here on, selected by name.
df = df.drop(columns=['hate_speech', 'offensive_language', 'neither'])


In [None]:

df.head()

In [None]:

#reducing classes to hatespeech and not hatespeech
# Map in one pass (1 -> 0, 2 -> 1) and assign the result back to the column.
# Calling replace(..., inplace=True) on df['Class'] acts on a temporary Series
# and is not guaranteed to update df (it does not under pandas Copy-on-Write).
# NOTE: run this cell once per fresh df - a second run would map the new 1s to 0.
df['Class']=df['Class'].replace({1: 0, 2: 1})

In [None]:

#feature extraction
# Cast the Tweet column to str, then fit a TF-IDF vectorizer on it and build
# the document-term matrix X.
# NOTE(review): the vectorizer is fitted on the full column before the
# train/test split further down, so its vocabulary and IDF weights also
# reflect the rows that end up in the test set.
df['Tweet']=df['Tweet'].astype(str)
tfidf = TfidfVectorizer()
X = tfidf.fit_transform(df['Tweet'])

In [None]:

#declaring the feature and target
x=X
y=df['Class']

In [None]:

x_train, x_test, y_train, y_test=train_test_split(x, y, test_size=0.25, random_state=42)

In [None]:

classifier=LogisticRegression()
classifier.fit(x_train, y_train)

In [None]:

prediction=classifier.predict(x_test)

In [None]:

# scikit-learn metrics take (y_true, y_pred) in that order. Accuracy is
# symmetric, but with the arguments swapped the classification report mixes up
# precision and recall and the confusion matrix comes out transposed.
accuracy = accuracy_score(y_test, prediction)
report = classification_report(y_test, prediction)
conf_matrix = confusion_matrix(y_test, prediction)

print("Accuracy score:", accuracy)
print(" Report:\n", report)
print("Confusion Matrix:\n", conf_matrix)