In [1]:
# Importing the necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split

pd.set_option("mode.copy_on_write", True)

In [2]:
# Downloading the datasets
nltk.download("stopwords")
nltk.download("punkt")
nltk.download("wordnet")

[nltk_data] Downloading package stopwords to /home/vscode/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /home/vscode/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /home/vscode/nltk_data...


True

In [3]:
# Importing the dataset
df = pd.read_csv("../../01_Data/01_Raw/raw_tweets.csv")
df.sample(5)

Unnamed: 0.1,Unnamed: 0,count,hate_speech,offensive_language,neither,class,tweet
167,169,3,0,3,0,1,"""@LODYCASH: dry pussy bitches always blame it ..."
1136,1161,9,1,8,0,1,"&#8220;@Benkasso: I'll beat the pussy up, that..."
9929,10201,3,1,2,0,1,"How my bestf gon call me "" People "" like ima r..."
22647,23125,3,0,3,0,1,What's up with all these bitches in bed clothe...
14372,14715,3,0,2,1,1,RT @BigDaddyTise: &#8220;@Mr_popular: Bald hea...


In [4]:
# Dropping the columns that are not needed
df = df[["class", "tweet"]]
df.sample(5)

Unnamed: 0,class,tweet
12455,1,Lil cuz need to help a bitch out
3160,1,@Fapplebee dude tell me why that's so true my ...
20187,1,"RT @tic14tac: Guys be like ""Man, fuck that bit..."
22532,1,We need to start having a fantasy team of UT h...
21276,2,Taking out a second mortgage and betting it al...


# Data Cleaning Process Starts Here

In [5]:
# create helper colums for cleaning
df["clean_tweet"] = df["tweet"].astype(str)

In [6]:
# Convert all the text to lower case
df["clean_tweet"] = df["clean_tweet"].str.lower()

In [7]:
# removing URLS
def remove_URL(text):
    text = re.sub(r"http\S+", "", text)
    return text


df["clean_tweet"] = df["clean_tweet"].str.replace(r"http\S+", "")

In [8]:
# removing mentions
def remove_mentions(text):
    return re.sub(r"@\w+", "", text)


df["clean_tweet"] = df["clean_tweet"].apply(remove_mentions)

In [9]:
# removing hashtags
def remove_hashtags(text):
    return re.sub(r"#\w+", "", text)


df["clean_tweet"] = df["clean_tweet"].apply(remove_hashtags)

In [10]:
# removing special characters and numbers
spl_chrs = "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~"


def remove_spl_chrs(text):
    text = "".join([_ for _ in text if _ not in spl_chrs])
    text = re.sub("[0-9]+", "", text)
    return text


df["clean_tweet"] = df["clean_tweet"].apply(lambda x: remove_spl_chrs(x))

In [11]:
# removing stopwords
stop = stopwords.words("english")
df["clean_tweet"] = df["clean_tweet"].apply(
    lambda x: " ".join([word for word in x.split() if word not in (stop)])
)

In [12]:
# removing retweet
def remove_rt(text):
    return re.sub("^[rt]+", "", text)


df["clean_tweet"] = df["clean_tweet"].apply(remove_rt)

In [13]:
df["tokenized"] = df["clean_tweet"].apply(word_tokenize)

In [14]:
df.sample(5)

Unnamed: 0,class,tweet,clean_tweet,tokenized
3197,1,"@FreddieGibbs Lol, always with the complaining...",lol always complaining crying like jeezy broke...,"[lol, always, complaining, crying, like, jeezy..."
8925,1,Eat her pussy when she mad\nEat her pussy when...,eat pussy mad eat pussy sad eat pussy sleep ea...,"[eat, pussy, mad, eat, pussy, sad, eat, pussy,..."
1542,0,&#8220;@TP_Three: @WestSideFlee @KekePalmer da...,damn got wifey fall back nigga dont share hoes,"[damn, got, wifey, fall, back, nigga, dont, sh..."
3231,2,"@GWDrums @samzbikowski Oh, wait! *""can I get t...",oh wait get photo playin creeds side project b...,"[oh, wait, get, photo, playin, creeds, side, p..."
13162,1,My worse bitch looks better than your main bit...,worse bitch looks better main bitch thats bad ...,"[worse, bitch, looks, better, main, bitch, tha..."


# Data Stemming and Lematization

In [15]:
# Stemming
ps = nltk.PorterStemmer()


def stemming(text):
    text = [ps.stem(word) for word in text]
    return text


df["tokenized"] = df["tokenized"].apply(lambda x: stemming(x))

In [16]:
# Lemmatization
wn = nltk.WordNetLemmatizer()


def lemmatizer(text):
    text = [wn.lemmatize(word) for word in text]
    return text


df["tokenized"] = df["tokenized"].apply(lambda x: lemmatizer(x))

# Data Selection

In [17]:
# Dropping category 0 from the dataset since it has low values as seen in the EDA
df = df[df["class"] != 0]

In [18]:
df["Length"] = df["tokenized"].apply(len)

In [19]:
# Dropping tweets with 0 length
df = df[df["Length"] != 0]

In [20]:
# Generating simple label for easy identification
df["offensive"] = df["class"].apply(lambda x: "Yes" if x == 1 else "No")

In [21]:
df.sample(5)

Unnamed: 0,class,tweet,clean_tweet,tokenized,Length,offensive
19812,1,RT @oweeSheREDD: you tell ya business to ah bi...,tell ya business ah bitch cant trust ya,"[tell, ya, busi, ah, bitch, cant, trust, ya]",8,Yes
19483,1,RT @kylegotjokes: Me when your bitch favorites...,bitch favorites tweets httptcomaokzqlaos,"[bitch, favorit, tweet, httptcomaokzqlao]",4,Yes
7784,1,Any nicca gettin it n da ass is a fag! &#8220;...,nicca gettin n da ass fag howw please explain ...,"[nicca, gettin, n, da, as, fag, howw, plea, ex...",13,Yes
6822,1,@peighxo @baestation bitch got arms like piccolo,bitch got arms like piccolo,"[bitch, got, arm, like, piccolo]",5,Yes
10089,1,I broke baby outta his thug stage n turned him...,broke baby outta thug stage n turned pussy har...,"[broke, babi, outta, thug, stage, n, turn, pus...",13,Yes


In [22]:
# dropping extra columns
df2 = df[["offensive", "class", "tokenized", "clean_tweet"]]

In [23]:
# View Distribution of the classes
df2["offensive"].value_counts()

offensive
Yes    19184
No      4160
Name: count, dtype: int64

In [24]:
# create a new random sample with 4000 rows from each class
df3 = (
    df2.groupby("offensive")
    .apply(lambda x: x.sample(n=4000, random_state=42))
    .reset_index(drop=True)
)

In [25]:
df3["offensive"].value_counts()

offensive
No     4000
Yes    4000
Name: count, dtype: int64

In [26]:
df3.sample(5)

Unnamed: 0,offensive,class,tokenized,clean_tweet
1729,No,2,"[run, differentialsmet, yanke]",run differentialsmets yankees
3427,No,2,"[would, spend, money, trash, like]",would spend money trash like
1616,No,2,"[leav, whine, infrastructur, r, one, wont, pay...",leave whine infrastructure r ones wont pay wai...
6787,Yes,1,"[witter, say, ridin, niggah, that, realli, fuc...",witter say ridin niggah thats really fuck pull...
7145,Yes,1,"[lil, bitch, bu, stop, adida, flipflop, amp, s...",lil bitch bus stop adidas flipflops amp socks ...


In [27]:
# Splitting the dataset into train and test sets
train, test = train_test_split(
    df3, test_size=0.20, stratify=df3["offensive"], random_state=42
)

In [28]:
train["offensive"].value_counts()

offensive
Yes    3200
No     3200
Name: count, dtype: int64

In [29]:
test["offensive"].value_counts()

offensive
Yes    800
No     800
Name: count, dtype: int64

In [30]:
# Saving the datasets
train.to_csv("../../01_Data/02_Processed/train.csv", index=False)
test.to_csv("../../01_Data/02_Processed/test.csv", index=False)