In [1]:
#Importing the necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split

pd.set_option("mode.copy_on_write", True)


In [2]:
#Downloading the datasets
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')


[nltk_data] Downloading package stopwords to /home/vscode/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/vscode/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/vscode/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
#Importing the dataset
df = pd.read_csv('../../01_Data/01_Raw/raw_tweets.csv')
df.sample(5)

Unnamed: 0.1,Unnamed: 0,count,hate_speech,offensive_language,neither,class,tweet
5014,5163,3,0,3,0,1,@Todd__Kincannon A+ Would expect nothing less ...
6503,6686,3,0,3,0,1,@king__saan I feel like I just got married bru...
1796,1833,6,1,5,0,1,&#8220;@itsDeSha__: I don't take people niggas...
12526,12838,3,2,1,0,0,Lmaooo RT @Handsomeesco_55 She a tranny if she...
8036,8261,3,0,3,0,1,Bitch I want my number back yo pussy mustard.....


In [4]:
#Dropping the columns that are not needed
df = df[['class', 'tweet']]
df.sample(5)

Unnamed: 0,class,tweet
23763,0,do you think all the rice fags are at McDonald's?
18323,1,"RT @_MoonWP_: When dykes hear ""ladies free bef..."
16370,1,RT @Manstagram_: This bitch went full on retar...
9866,1,Hoes will be hoes.
12419,1,Like a woman to tell you what she wants you to...


# Data Cleaning Process Starts Here

In [5]:
#create helper colums for cleaning
df['clean_tweet'] = df['tweet'].astype(str)

In [6]:
# Convert all the text to lower case
df['clean_tweet'] = df['clean_tweet'].str.lower()

In [7]:
#removing URLS
def remove_URL(text):
    text = re.sub(r'http\S+', '', text)
    return text
df['clean_tweet'] = df['clean_tweet'].str.replace(r"http\S+", "")

In [8]:
#removing mentions
def remove_mentions(text):
    return re.sub(r'@\w+', '', text)
df['clean_tweet'] = df['clean_tweet'].apply(remove_mentions)

In [9]:
#removing hashtags
def remove_hashtags(text):
    return re.sub(r'#\w+', '', text)
df['clean_tweet'] = df['clean_tweet'].apply(remove_hashtags)

In [10]:
#removing special characters and numbers
spl_chrs =  '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'
def remove_spl_chrs(text):
        text  = "".join([_ for _ in text if _ not in spl_chrs])
        text = re.sub('[0-9]+', '', text)
        return text
df['clean_tweet'] = df['clean_tweet'].apply(lambda x: remove_spl_chrs(x))

In [11]:
#removing stopwords
stop = stopwords.words('english')
df['clean_tweet'] = df['clean_tweet'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))

In [12]:
#removing retweet
def remove_rt(text):
    return re.sub('^[rt]+', '', text)
df['clean_tweet'] = df['clean_tweet'].apply(remove_rt)


In [13]:
df['tokenized'] = df["clean_tweet"].apply(word_tokenize)

In [14]:
df.sample(5)

Unnamed: 0,class,tweet,clean_tweet,tokenized
4758,0,@StephyRae7 faggot.,faggot,[faggot]
2655,1,@BosleyXavier I trip on acid not bitches.,ip acid bitches,"[ip, acid, bitches]"
3648,1,@JerGucci I can trap anywhere I wanna just bec...,ap anywhere wanna aint bitch n rarely solo,"[ap, anywhere, wan, na, aint, bitch, n, rarely..."
20863,1,Sis said go to sleep I said bitch it's our b d...,sis said go sleep said bitch b day aint sleeping,"[sis, said, go, sleep, said, bitch, b, day, ai..."
607,1,"""You can pull more pussy with a dodge and coll...",pull pussy dodge collect panties alright storm...,"[pull, pussy, dodge, collect, panties, alright..."


# Data Stemming and Lematization

In [15]:
#Stemming
ps = nltk.PorterStemmer()
def stemming(text):
    text = [ps.stem(word) for word in text]
    return text

df["tokenized"] = df["tokenized"].apply(lambda x: stemming(x))

In [16]:
#Lemmatization
wn = nltk.WordNetLemmatizer()
def lemmatizer(text):
    text = [wn.lemmatize(word) for word in text]
    return text
df["tokenized"] = df["tokenized"].apply(lambda x: lemmatizer(x))

# Data Selection

In [17]:
#Dropping category 0 from the dataset since it has low values as seen in the EDA
df = df[df['class'] != 0]

In [18]:
df['Length'] = df['tokenized'].apply(len)

In [19]:
#Dropping tweets with 0 length
df = df[df['Length'] != 0]

In [20]:
#Generating simple label for easy identification
df['offensive'] = df['class'].apply(lambda x: 'Yes' if x == 1 else 'No')

In [21]:
df.sample(5)

Unnamed: 0,class,tweet,clean_tweet,tokenized,Length,offensive
7484,1,"Abby doesnt understand how beautiful she is, b...",abby doesnt understand beautiful bitch straigh...,"[abbi, doesnt, understand, beauti, bitch, stra...",14,Yes
205,2,"""@OSAY_it_aint_so: &#8220;@IgnoreAllLaws: Fost...",fosters home imaginary trash whoa chill show e...,"[foster, home, imaginari, trash, whoa, chill, ...",9,No
12841,1,MiMi was so worried bout Stevie J marriage tha...,mimi worried bout stevie j marriage bitch aint...,"[mimi, worri, bout, stevi, j, marriag, bitch, ...",15,Yes
22960,1,Woke up feeling like a bitch. Gonna stop actin...,woke feeling like bitch gonna stop acting like...,"[woke, feel, like, bitch, gon, na, stop, act, ...",15,Yes
9385,1,Gave dat bitch da Spirit of the Dragon,gave dat bitch da spirit dragon,"[gave, dat, bitch, da, spirit, dragon]",6,Yes


In [22]:
#dropping extra columns
df2 = df[['offensive', 'class', 'tokenized']]

In [23]:
#View Distribution of the classes
df2['offensive'].value_counts()

offensive
Yes    19184
No      4160
Name: count, dtype: int64

In [24]:
#create a new random sample with 4000 rows from each class
df3 = df2.groupby('offensive').apply(lambda x: x.sample(n=4000, random_state=42)).reset_index(drop = True)

In [25]:
df3['offensive'].value_counts()

offensive
No     4000
Yes    4000
Name: count, dtype: int64

In [26]:
df3.sample(5)

Unnamed: 0,offensive,class,tokenized
4089,Yes,1,"[girl, ive, thinkin, way, thinkin, new, way, b..."
2479,No,2,"[dont, like, trash, talk, either, way, fair, t..."
1024,No,2,"[femal, grand, champion, jess, heard, bird, ou..."
3260,No,2,"[oseann, realli, bum, show, woman, note, time,..."
5800,Yes,1,"[lmfao, rightrt, knew, sidechick, stop, hate, ..."


In [27]:
#Splitting the dataset into train and test sets
train, test = train_test_split(df3, test_size=0.20, stratify=df3['offensive'], random_state=42)

In [28]:
train['offensive'].value_counts()

offensive
Yes    3200
No     3200
Name: count, dtype: int64

In [29]:
test['offensive'].value_counts()

offensive
Yes    800
No     800
Name: count, dtype: int64

In [30]:
#Saving the datasets
train.to_csv('../../01_Data/02_Processed/train.csv', index=False)
test.to_csv('../../01_Data/02_Processed/test.csv', index=False)