In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re

import plotly
from plotly import graph_objs 
import plotly.graph_objs as go
from plotly.offline import init_notebook_mode, iplot
init_notebook_mode(connected=True)

from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Flatten
from tensorflow.keras.layers import Embedding

In [2]:
# Load data into DataFrame 
df = pd.read_csv('labeled_data.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,count,hate_speech,offensive_language,neither,class,tweet
0,0,3,0,0,3,2,!!! RT @mayasolovely: As a woman you shouldn't...
1,1,3,0,3,0,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...
2,2,3,0,3,0,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...
3,3,3,0,2,1,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...
4,4,6,0,6,0,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...


In [3]:
"""Data Preprocessing Steps : 1. First column is unnecessary and can be removed
2. Check the number of records and null values, if any
"""
df= df.drop("Unnamed: 0", axis=1)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24783 entries, 0 to 24782
Data columns (total 6 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   count               24783 non-null  int64 
 1   hate_speech         24783 non-null  int64 
 2   offensive_language  24783 non-null  int64 
 3   neither             24783 non-null  int64 
 4   class               24783 non-null  int64 
 5   tweet               24783 non-null  object
dtypes: int64(5), object(1)
memory usage: 1.1+ MB


In [4]:
"""Checking if data is balanced between the 3 classes - hate_speech, offensive and neutral"""

hate = len(df[df['class'] == 0])
off = len(df[df['class'] == 1])
neu = len(df[df['class'] == 2])
dist = [
    graph_objs.Bar(
        x=["hate_speech","offensive_language","neutral"],
        y=[hate, off, neu],
)]
plotly.offline.iplot({"data":dist, "layout":graph_objs.Layout(title="Class Distribution")})


In [6]:
df['class'].unique()

array([2, 1, 0])

In [7]:
df = df.rename(columns={"class": "target"})

In [8]:
def preprocess_tweet(df, col):
    """
        Remove callouts, character references (HTML characters, emojis), # in hashtags, 
        Remove Twitter code RT and QT, URL links, punctuation, excess whitespace between
        Lowercase all words and remove leading and trailing whitespaces
    """
    df[col] = df[col].apply(lambda x: re.sub(r'@[\S]+', ' ', str(x)))
    df[col] = df[col].apply(lambda x: re.sub(r'&[\S]+?;', ' ', str(x)))
    df[col] = df[col].apply(lambda x: re.sub(r'#', ' ', str(x)))
    df[col] = df[col].apply(lambda x: re.sub(r'(\bRT\b|\bQT\b)', ' ', str(x)))
    df[col] = df[col].apply(lambda x: re.sub(r'http[\S]+', ' ', str(x)))
    df[col] = df[col].apply(lambda x: re.sub(r'[^\w\s]', r'', str(x)))
    df[col] = df[col].apply(lambda x: " ".join(x.lower() for x in x.split()))
    df[col] = df[col].apply(lambda x: re.sub(r'\w*\d\w*', r' ', str(x)))
    df[col] = df[col].apply(lambda x: re.sub(r'\s\s+', ' ', str(x)))

In [9]:
preprocess_tweet(df, 'tweet')
df.head()

Unnamed: 0,count,hate_speech,offensive_language,neither,target,tweet
0,3,0,0,3,2,as a woman you shouldnt complain about cleanin...
1,3,0,3,0,1,boy dats coldtyga dwn bad for cuffin dat hoe i...
2,3,0,3,0,1,dawg you ever fuck a bitch and she start to cr...
3,3,0,2,1,1,she look like a tranny
4,6,0,6,0,1,the shit you hear about me might be true or it...


In [11]:
from sklearn.model_selection import train_test_split
#Train Test Split
X = df.tweet
y = df.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [12]:
train = pd.concat([X_train, y_train], axis=1).reset_index()
train = train.drop(columns=['index'], axis=1)
train.head()

Unnamed: 0,tweet,target
0,well how else will white ppl get us to forget ...,0
1,funny thing isits not just the people doing it...,2
2,nigga messed with the wrong bitch,1
3,bitch ass nigggaaa,1
4,so that real bitch,1


In [13]:
test = pd.concat([X_test, y_test], axis=1).reset_index()
test = test.drop(columns=['index'], axis=1)
test.head()

Unnamed: 0,tweet,target
0,i got a missed call from yo bitch,1
1,fucking with a bad bitch you gone need some mo...,1
2,lol my credit aint no where near good but i kn...,2
3,wipe the cum out of them faggot contact lens i...,1
4,niggas cheat on they bitch and dont expect no ...,1


In [22]:
#stop word removal- This will have to be moved along with the following 2 util functionw
import string
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/manveerkaur/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/manveerkaur/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [23]:
def tokenize(df, col):
    """
        Function to tokenize column of strings without punctuation
        Input into word_tokenize() must be string with spaces only
        Output is a list of tokenized words
    """
    text = ' '.join(df[col].to_list())
    tokens = nltk.word_tokenize(text)
    return tokens

In [25]:
def no_stopwords(text):
    lst = [word for word in text if word not in stop_words]
    return lst

In [26]:
stop_words = set(stopwords.words('english'))
stop_list = [''.join(c for c in s if c not in string.punctuation) for s in stop_words]

train.tweet = train.tweet.apply(lambda x: re.sub(r'\b\w{1,2}\b', '', str(x)))
test.tweet = test.tweet.apply(lambda x: re.sub(r'\b\w{1,2}\b', '', str(x)))

train_tokens = tokenize(train, 'tweet')
test_tokens = tokenize(test, 'tweet')
train_tokenz = no_stopwords(train_tokens)
test_tokenz = no_stopwords(test_tokens)

In [30]:
"""Trying something very preliminary - this might not make it to the final project
Going to use one hot encoding to convert tweets into encoded vectors


"""

def encode_tweets(df,col,vocab_size):
    df[col]= df[col].apply(lambda x: one_hot(x,vocab_size))

In [32]:
vocab_size = 50
encode_tweets(train, 'tweet', vocab_size)

In [33]:
train.head()

Unnamed: 0,tweet,target
0,"[23, 24, 3, 4, 35, 9, 37, 16, 31, 29, 42, 24, ...",0
1,"[19, 14, 48, 36, 21, 10, 48, 39, 1, 10, 48, 24...",2
2,"[8, 32, 3, 10, 35, 49]",1
3,"[49, 4, 25]",1
4,"[41, 15, 49]",1


In [34]:
encode_tweets(test, 'tweet', vocab_size)

In [35]:
#Finding the maximum length of tweet
for column in train:
    print(column,"->", df[column].astype(str).str.len().max())

tweet -> 140
target -> 1


In [36]:
max_length = 140
padded_tweets= pad_sequences(train['tweet'], maxlen= max_length, padding= 'post')


In [38]:
padded_tweets

array([[23, 24,  3, ...,  0,  0,  0],
       [19, 14, 48, ...,  0,  0,  0],
       [ 8, 32,  3, ...,  0,  0,  0],
       ...,
       [14, 49,  6, ...,  0,  0,  0],
       [21, 28, 32, ...,  0,  0,  0],
       [13, 36, 35, ...,  0,  0,  0]], dtype=int32)

In [41]:
train.shape

(19826, 2)

In [44]:
len(padded_tweets)

19826

In [45]:
embedded_vector_size= len(padded_tweets)
model= Sequential()
model.add(Embedding(vocab_size, embedded_vector_size, input_length=max_length, name = 'embedding'))
model.add(Flatten())
model.add(Dense(1, activation='sigmoid'))

Metal device set to: Apple M1


In [46]:
X= padded_tweets
y = train['target']

In [None]:
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.summary()

In [None]:
model.fit(X,y,epochs = 50, verbose = 0)