In [1]:
from __future__ import print_function

import pandas as pd 
import numpy as np 
import sklearn

# NLTK/NLP
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import nltk
from nltk import FreqDist, word_tokenize
import string, re
import urllib
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords
from gensim.models import word2vec
from nltk.collocations import *
import gensim

# Classifiers 
from sklearn.ensemble import RandomForestClassifier 
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import GridSearchCV

# Metrics
from sklearn import metrics
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_curve, auc, precision_recall_curve
from sklearn.model_selection import train_test_split


#Sampling
from imblearn.pipeline import make_pipeline, Pipeline
from imblearn.over_sampling import SMOTE
from sklearn.utils import resample

import sklearn.decomposition as decomposition

#Visualization
from wordcloud import WordCloud
import matplotlib.pyplot as plt 
import seaborn as sns
%matplotlib inline

import warnings 
warnings.filterwarnings("ignore")

from nltk.stem.wordnet import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

Using TensorFlow backend.


In [2]:
%run custom_functions.ipynb

In [8]:
df=pd.read_csv('data/dataset2.csv')
df.drop(['Unnamed: 0'], axis=1, inplace = True)
df.head(10)

Unnamed: 0,count,hate_speech,offensive_language,neither,class,tweet
0,3,0,0,3,2,!!! RT @mayasolovely: As a woman you shouldn't...
1,3,0,3,0,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...
2,3,0,3,0,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...
3,3,0,2,1,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...
4,6,0,6,0,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...
5,3,1,2,0,1,"!!!!!!!!!!!!!!!!!!""@T_Madison_x: The shit just..."
6,3,0,3,0,1,"!!!!!!""@__BrighterDays: I can not just sit up ..."
7,3,0,3,0,1,!!!!&#8220;@selfiequeenbri: cause I'm tired of...
8,3,0,3,0,1,""" &amp; you might not get ya bitch back &amp; ..."
9,3,1,2,0,1,""" @rhythmixx_ :hobbies include: fighting Maria..."


In [9]:
df.shape

(24783, 6)

In [10]:
def remove_pattern(input_txt, pattern):
    r = re.findall(pattern, input_txt)
    for i in r:
        input_txt = re.sub(i, '', input_txt)
        
    return input_txt    

In [11]:
df['tidy_tweet'] = np.vectorize(remove_pattern)(df['tweet'], "@[\w]*")


In [12]:
df['tidy_tweet'] = df['tidy_tweet'].str.replace("[^a-zA-Z#]", " ")


In [13]:
df['tidy_tweet']= df['tidy_tweet'].apply(lambda x: ' '.join([w for w in x.split() if len(w)>3]))

In [14]:
df['no_hash_tweet']= df['tidy_tweet'].str.replace("#", "")

In [15]:
df['tokenized_tweet'] = df['no_hash_tweet'].apply(lambda x: x.split())
df.head()

Unnamed: 0,count,hate_speech,offensive_language,neither,class,tweet,tidy_tweet,no_hash_tweet,tokenized_tweet
0,3,0,0,3,2,!!! RT @mayasolovely: As a woman you shouldn't...,woman shouldn complain about cleaning your hou...,woman shouldn complain about cleaning your hou...,"[woman, shouldn, complain, about, cleaning, yo..."
1,3,0,3,0,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...,dats cold tyga cuffin place,dats cold tyga cuffin place,"[dats, cold, tyga, cuffin, place]"
2,3,0,3,0,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...,Dawg ever fuck bitch start confused shit,Dawg ever fuck bitch start confused shit,"[Dawg, ever, fuck, bitch, start, confused, shit]"
3,3,0,2,1,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...,look like tranny,look like tranny,"[look, like, tranny]"
4,6,0,6,0,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...,shit hear about might true might faker than bi...,shit hear about might true might faker than bi...,"[shit, hear, about, might, true, might, faker,..."


In [16]:
stemmer = SnowballStemmer("english")
df['stemmed_tokens'] = df.tokenized_tweet.apply(lambda x: [stemmer.stem(i) for i in x]) # stemming
df.head()


Unnamed: 0,count,hate_speech,offensive_language,neither,class,tweet,tidy_tweet,no_hash_tweet,tokenized_tweet,stemmed_tokens
0,3,0,0,3,2,!!! RT @mayasolovely: As a woman you shouldn't...,woman shouldn complain about cleaning your hou...,woman shouldn complain about cleaning your hou...,"[woman, shouldn, complain, about, cleaning, yo...","[woman, shouldn, complain, about, clean, your,..."
1,3,0,3,0,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...,dats cold tyga cuffin place,dats cold tyga cuffin place,"[dats, cold, tyga, cuffin, place]","[dat, cold, tyga, cuffin, place]"
2,3,0,3,0,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...,Dawg ever fuck bitch start confused shit,Dawg ever fuck bitch start confused shit,"[Dawg, ever, fuck, bitch, start, confused, shit]","[dawg, ever, fuck, bitch, start, confus, shit]"
3,3,0,2,1,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...,look like tranny,look like tranny,"[look, like, tranny]","[look, like, tranni]"
4,6,0,6,0,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...,shit hear about might true might faker than bi...,shit hear about might true might faker than bi...,"[shit, hear, about, might, true, might, faker,...","[shit, hear, about, might, true, might, faker,..."


In [17]:
df['lemmatized_tokens'] = df.tokenized_tweet.apply(lambda x: [lemmatizer.lemmatize(i) for i in x]) # lemmatizing
# [lemmatizer.lemmatize(word) for word in df.no_hash_tweet]
df.head()

Unnamed: 0,count,hate_speech,offensive_language,neither,class,tweet,tidy_tweet,no_hash_tweet,tokenized_tweet,stemmed_tokens,lemmatized_tokens
0,3,0,0,3,2,!!! RT @mayasolovely: As a woman you shouldn't...,woman shouldn complain about cleaning your hou...,woman shouldn complain about cleaning your hou...,"[woman, shouldn, complain, about, cleaning, yo...","[woman, shouldn, complain, about, clean, your,...","[woman, shouldn, complain, about, cleaning, yo..."
1,3,0,3,0,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...,dats cold tyga cuffin place,dats cold tyga cuffin place,"[dats, cold, tyga, cuffin, place]","[dat, cold, tyga, cuffin, place]","[dat, cold, tyga, cuffin, place]"
2,3,0,3,0,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...,Dawg ever fuck bitch start confused shit,Dawg ever fuck bitch start confused shit,"[Dawg, ever, fuck, bitch, start, confused, shit]","[dawg, ever, fuck, bitch, start, confus, shit]","[Dawg, ever, fuck, bitch, start, confused, shit]"
3,3,0,2,1,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...,look like tranny,look like tranny,"[look, like, tranny]","[look, like, tranni]","[look, like, tranny]"
4,6,0,6,0,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...,shit hear about might true might faker than bi...,shit hear about might true might faker than bi...,"[shit, hear, about, might, true, might, faker,...","[shit, hear, about, might, true, might, faker,...","[shit, hear, about, might, true, might, faker,..."


In [18]:
df.stemmed_tokens[0][0]

'woman'

In [19]:
df['lem_tweet'] = [lemmatizer.lemmatize(word) for word in df.no_hash_tweet]

In [20]:
df['stem_tweet'] = [stemmer.stem(word) for word in df.no_hash_tweet]

In [21]:
df.head()

Unnamed: 0,count,hate_speech,offensive_language,neither,class,tweet,tidy_tweet,no_hash_tweet,tokenized_tweet,stemmed_tokens,lemmatized_tokens,lem_tweet,stem_tweet
0,3,0,0,3,2,!!! RT @mayasolovely: As a woman you shouldn't...,woman shouldn complain about cleaning your hou...,woman shouldn complain about cleaning your hou...,"[woman, shouldn, complain, about, cleaning, yo...","[woman, shouldn, complain, about, clean, your,...","[woman, shouldn, complain, about, cleaning, yo...",woman shouldn complain about cleaning your hou...,woman shouldn complain about cleaning your hou...
1,3,0,3,0,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...,dats cold tyga cuffin place,dats cold tyga cuffin place,"[dats, cold, tyga, cuffin, place]","[dat, cold, tyga, cuffin, place]","[dat, cold, tyga, cuffin, place]",dats cold tyga cuffin place,dats cold tyga cuffin plac
2,3,0,3,0,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...,Dawg ever fuck bitch start confused shit,Dawg ever fuck bitch start confused shit,"[Dawg, ever, fuck, bitch, start, confused, shit]","[dawg, ever, fuck, bitch, start, confus, shit]","[Dawg, ever, fuck, bitch, start, confused, shit]",Dawg ever fuck bitch start confused shit,dawg ever fuck bitch start confused shit
3,3,0,2,1,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...,look like tranny,look like tranny,"[look, like, tranny]","[look, like, tranni]","[look, like, tranny]",look like tranny,look like tranni
4,6,0,6,0,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...,shit hear about might true might faker than bi...,shit hear about might true might faker than bi...,"[shit, hear, about, might, true, might, faker,...","[shit, hear, about, might, true, might, faker,...","[shit, hear, about, might, true, might, faker,...",shit hear about might true might faker than bi...,shit hear about might true might faker than bi...


In [26]:
df.columns

Index(['count', 'hate_speech', 'offensive_language', 'neither', 'class',
       'tweet', 'tidy_tweet', 'no_hash_tweet', 'tokenized_tweet',
       'stemmed_tokens', 'lemmatized_tokens', 'lem_tweet', 'stem_tweet'],
      dtype='object')

In [30]:
df['class'].value_counts(normalize=True)

1    0.774321
2    0.167978
0    0.057701
Name: class, dtype: float64

In [None]:
df_0=df[df['label']==0]
df_0.head()

In [None]:
df_1= df[df['label']==1]
df_1.head()

In [None]:
df.head()

In [None]:
df.lem_tweet = df.lem_tweet.apply(str)

In [23]:
df.stem_tweet = df.stem_tweet.apply(str)

In [24]:
df.to_csv('data/data2-cleaned.csv')

In [25]:
# df=pd.read_csv('data/cleaned-reshuffled.csv')
# df.drop(['Unnamed: 0'], axis = 1, inplace= True)

## Data Visualization

In [None]:
# from PIL import Image
# cloud_mask = np.array(Image.open("twitter.png"))
# cloud_mask

In [None]:
from wordcloud import WordCloud

df_0_words = ' '.join([text for text in df['tidy_tweet'][df['label']==0]])
wordcloud = WordCloud(width=800, height=500, random_state=10, max_font_size=110).generate(df_0_words)

plt.figure(figsize=(10, 7))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis('off')
plt.show()

In [None]:
df_0_words

In [None]:
df_1_words = ' '.join([text for text in df['tidy_tweet'][df['label']==1]])

wordcloud = WordCloud(width=800, height=500, random_state=210, max_font_size=110).generate(df_1_words)

plt.figure(figsize=(10, 7))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis('off')
plt.show()

In [None]:
df_1_words

In [None]:
# function to collect hashtags
def hashtag_extract(tweet):
    hashtags = []
    # Loop over the words in the tweet
    for word in tweet:
        ht = re.findall(r"#(\w+)", word)
        hashtags.append(ht)

    return hashtags

In [None]:
# extracting hashtags from non racist/sexist tweets
HT_0 = hashtag_extract(df['tidy_tweet_2'][df['label']==0])

# extracting hashtags from racist/sexist tweets
HT_1 = hashtag_extract(df['tidy_tweet_2'][df['label']==1])


In [None]:
HT_0

In [None]:
# unnesting lists
HT_0 = sum(HT_0,[])
HT_1 = sum(HT_1,[])

In [None]:
HT_0

In [None]:
a = nltk.FreqDist(HT_0)
d = pd.DataFrame({'Hashtag': list(a.keys()),
                  'Count': list(a.values())})
# selecting top 10 most frequent hashtags     
d = d.nlargest(columns="Count", n = 10) 
plt.figure(figsize=(16,5))
ax = sns.barplot(data=d, x= "Hashtag", y = "Count")
ax.set(ylabel = 'Count')
plt.show()

In [None]:
b = nltk.FreqDist(HT_1)
e = pd.DataFrame({'Hashtag': list(b.keys()), 'Count': list(b.values())})
# selecting top 10 most frequent hashtags
e = e.nlargest(columns="Count", n = 10)   
plt.figure(figsize=(16,5))
ax = sns.barplot(data=e, x= "Hashtag", y = "Count")
ax.set(ylabel = 'Count')
plt.show()

In [None]:
meta_freqdist = FreqDist(HT_1)
meta_freqdist.most_common(10)

In [None]:
meta_freqdist.plot(10,cumulative=False)

In [None]:
meta_freqdist = FreqDist(HT_0)
meta_freqdist.most_common(10)

In [None]:
meta_freqdist.plot(10,cumulative=False)

## Bigram

In [None]:
bigram_measures = nltk.collocations.BigramAssocMeasures()
meta_finder = BigramCollocationFinder.from_words(df['no_hash_tweet'])


In [None]:
bigram_scored = meta_finder.score_ngrams(bigram_measures.raw_freq)


In [None]:
bigram_scored