In [1]:
from __future__ import print_function

import pandas as pd 
import numpy as np 
import sklearn

# NLTK/NLP
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import nltk
from nltk import FreqDist, word_tokenize
import string, re
import urllib
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords
from gensim.models import word2vec
from nltk.collocations import *
import gensim

# Classifiers 
from sklearn.ensemble import RandomForestClassifier 
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import GridSearchCV

# Metrics
from sklearn import metrics
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_curve, auc, precision_recall_curve
from sklearn.model_selection import train_test_split


#Sampling
from imblearn.pipeline import make_pipeline, Pipeline
from imblearn.over_sampling import SMOTE
from sklearn.utils import resample

import sklearn.decomposition as decomposition

#Visualization
from wordcloud import WordCloud
import matplotlib.pyplot as plt 
import seaborn as sns
%matplotlib inline

import warnings 
warnings.filterwarnings("ignore")

from nltk.stem.wordnet import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

Using TensorFlow backend.


In [2]:
%run custom_functions.ipynb

In [24]:
# df=pd.read_csv('train-file.csv')
# df = df.sample(frac=1).reset_index(drop=True)
# # df.to_csv('train-reshuffled.csv')

df=pd.read_csv('data/train-reshuffled.csv')

df.head(10)

Unnamed: 0,id,label,tweet
0,29727,0,sad to see the scenes of hooligans pre #engrus...
1,14466,0,#gooddyeyoung #yoyoyo !! super happy to be ap...
2,18194,0,queen evil's bihdayð#lnic #lnicjustanevilbd...
3,18283,1,@user you might be a libtard if... #libtard #...
4,25845,0,what are your goals? find out here... #smile...
5,14297,0,retweets @user #nuascannan
6,14016,0,a classic trump follower.
7,7009,0,"in the mixture of emotions, here's a one fro..."
8,18926,0,@user the meps bear that travelled to sandy ho...
9,28292,0,just because it affected her son? sonâs ab...


In [25]:
df.shape

(31962, 3)

In [26]:
df_10=df[0:10]
df_10.head()

Unnamed: 0,id,label,tweet
0,29727,0,sad to see the scenes of hooligans pre #engrus...
1,14466,0,#gooddyeyoung #yoyoyo !! super happy to be ap...
2,18194,0,queen evil's bihdayð#lnic #lnicjustanevilbd...
3,18283,1,@user you might be a libtard if... #libtard #...
4,25845,0,what are your goals? find out here... #smile...


In [27]:
def remove_pattern(input_txt, pattern):
    r = re.findall(pattern, input_txt)
    for i in r:
        input_txt = re.sub(i, '', input_txt)
        
    return input_txt    

In [28]:
df['tidy_tweet'] = np.vectorize(remove_pattern)(df['tweet'], "@[\w]*")


In [29]:
df['tidy_tweet'] = df['tidy_tweet'].str.replace("[^a-zA-Z#]", " ")


In [34]:
df['tidy_tweet']= df['tidy_tweet'].apply(lambda x: ' '.join([w for w in x.split()]))

In [35]:
df['no_hash_tweet']= df['tidy_tweet'].str.replace("#", "")

In [36]:
df['tokenized_tweet'] = df['no_hash_tweet'].apply(lambda x: x.split())
df.head()

Unnamed: 0,id,label,tweet,tidy_tweet,no_hash_tweet,tokenized_tweet
0,29727,0,sad to see the scenes of hooligans pre #engrus...,sad see the scenes hooligans pre #engrus why s...,sad see the scenes hooligans pre engrus why sp...,"[sad, see, the, scenes, hooligans, pre, engrus..."
1,14466,0,#gooddyeyoung #yoyoyo !! super happy to be ap...,#gooddyeyoung #yoyoyo super happy apa the move...,gooddyeyoung yoyoyo super happy apa the movement,"[gooddyeyoung, yoyoyo, super, happy, apa, the,..."
2,18194,0,queen evil's bihdayð#lnic #lnicjustanevilbd...,queen evil bihday #lnic #lnicjustanevilbday #b...,queen evil bihday lnic lnicjustanevilbday bihd...,"[queen, evil, bihday, lnic, lnicjustanevilbday..."
3,18283,1,@user you might be a libtard if... #libtard #...,you might libtard #libtard #sjw #liberal #poli...,you might libtard libtard sjw liberal politics,"[you, might, libtard, libtard, sjw, liberal, p..."
4,25845,0,what are your goals? find out here... #smile...,what are your goals find out here #smile,what are your goals find out here smile,"[what, are, your, goals, find, out, here, smile]"


In [37]:
stemmer = SnowballStemmer("english")
df['stemmed_tokens'] = df.tokenized_tweet.apply(lambda x: [stemmer.stem(i) for i in x]) # stemming
df.head()


Unnamed: 0,id,label,tweet,tidy_tweet,no_hash_tweet,tokenized_tweet,stemmed_tokens
0,29727,0,sad to see the scenes of hooligans pre #engrus...,sad see the scenes hooligans pre #engrus why s...,sad see the scenes hooligans pre engrus why sp...,"[sad, see, the, scenes, hooligans, pre, engrus...","[sad, see, the, scene, hooligan, pre, engrus, ..."
1,14466,0,#gooddyeyoung #yoyoyo !! super happy to be ap...,#gooddyeyoung #yoyoyo super happy apa the move...,gooddyeyoung yoyoyo super happy apa the movement,"[gooddyeyoung, yoyoyo, super, happy, apa, the,...","[gooddyeyoung, yoyoyo, super, happi, apa, the,..."
2,18194,0,queen evil's bihdayð#lnic #lnicjustanevilbd...,queen evil bihday #lnic #lnicjustanevilbday #b...,queen evil bihday lnic lnicjustanevilbday bihd...,"[queen, evil, bihday, lnic, lnicjustanevilbday...","[queen, evil, bihday, lnic, lnicjustanevilbday..."
3,18283,1,@user you might be a libtard if... #libtard #...,you might libtard #libtard #sjw #liberal #poli...,you might libtard libtard sjw liberal politics,"[you, might, libtard, libtard, sjw, liberal, p...","[you, might, libtard, libtard, sjw, liber, polit]"
4,25845,0,what are your goals? find out here... #smile...,what are your goals find out here #smile,what are your goals find out here smile,"[what, are, your, goals, find, out, here, smile]","[what, are, your, goal, find, out, here, smile]"


In [38]:
df['lemmatized_tokens'] = df.tokenized_tweet.apply(lambda x: [lemmatizer.lemmatize(i) for i in x]) # lemmatizing
# [lemmatizer.lemmatize(word) for word in df.no_hash_tweet]
df.head()

Unnamed: 0,id,label,tweet,tidy_tweet,no_hash_tweet,tokenized_tweet,stemmed_tokens,lemmatized_tokens
0,29727,0,sad to see the scenes of hooligans pre #engrus...,sad see the scenes hooligans pre #engrus why s...,sad see the scenes hooligans pre engrus why sp...,"[sad, see, the, scenes, hooligans, pre, engrus...","[sad, see, the, scene, hooligan, pre, engrus, ...","[sad, see, the, scene, hooligan, pre, engrus, ..."
1,14466,0,#gooddyeyoung #yoyoyo !! super happy to be ap...,#gooddyeyoung #yoyoyo super happy apa the move...,gooddyeyoung yoyoyo super happy apa the movement,"[gooddyeyoung, yoyoyo, super, happy, apa, the,...","[gooddyeyoung, yoyoyo, super, happi, apa, the,...","[gooddyeyoung, yoyoyo, super, happy, apa, the,..."
2,18194,0,queen evil's bihdayð#lnic #lnicjustanevilbd...,queen evil bihday #lnic #lnicjustanevilbday #b...,queen evil bihday lnic lnicjustanevilbday bihd...,"[queen, evil, bihday, lnic, lnicjustanevilbday...","[queen, evil, bihday, lnic, lnicjustanevilbday...","[queen, evil, bihday, lnic, lnicjustanevilbday..."
3,18283,1,@user you might be a libtard if... #libtard #...,you might libtard #libtard #sjw #liberal #poli...,you might libtard libtard sjw liberal politics,"[you, might, libtard, libtard, sjw, liberal, p...","[you, might, libtard, libtard, sjw, liber, polit]","[you, might, libtard, libtard, sjw, liberal, p..."
4,25845,0,what are your goals? find out here... #smile...,what are your goals find out here #smile,what are your goals find out here smile,"[what, are, your, goals, find, out, here, smile]","[what, are, your, goal, find, out, here, smile]","[what, are, your, goal, find, out, here, smile]"


In [39]:
df.stemmed_tokens[0][0]

'sad'

In [40]:
df['lem_tweet'] = [lemmatizer.lemmatize(word) for word in df.no_hash_tweet]

In [41]:
df['stem_tweet'] = [stemmer.stem(word) for word in df.no_hash_tweet]

In [42]:
df.head()

Unnamed: 0,id,label,tweet,tidy_tweet,no_hash_tweet,tokenized_tweet,stemmed_tokens,lemmatized_tokens,lem_tweet,stem_tweet
0,29727,0,sad to see the scenes of hooligans pre #engrus...,sad see the scenes hooligans pre #engrus why s...,sad see the scenes hooligans pre engrus why sp...,"[sad, see, the, scenes, hooligans, pre, engrus...","[sad, see, the, scene, hooligan, pre, engrus, ...","[sad, see, the, scene, hooligan, pre, engrus, ...",sad see the scenes hooligans pre engrus why sp...,sad see the scenes hooligans pre engrus why sp...
1,14466,0,#gooddyeyoung #yoyoyo !! super happy to be ap...,#gooddyeyoung #yoyoyo super happy apa the move...,gooddyeyoung yoyoyo super happy apa the movement,"[gooddyeyoung, yoyoyo, super, happy, apa, the,...","[gooddyeyoung, yoyoyo, super, happi, apa, the,...","[gooddyeyoung, yoyoyo, super, happy, apa, the,...",gooddyeyoung yoyoyo super happy apa the movement,gooddyeyoung yoyoyo super happy apa the mov
2,18194,0,queen evil's bihdayð#lnic #lnicjustanevilbd...,queen evil bihday #lnic #lnicjustanevilbday #b...,queen evil bihday lnic lnicjustanevilbday bihd...,"[queen, evil, bihday, lnic, lnicjustanevilbday...","[queen, evil, bihday, lnic, lnicjustanevilbday...","[queen, evil, bihday, lnic, lnicjustanevilbday...",queen evil bihday lnic lnicjustanevilbday bihd...,queen evil bihday lnic lnicjustanevilbday bihd...
3,18283,1,@user you might be a libtard if... #libtard #...,you might libtard #libtard #sjw #liberal #poli...,you might libtard libtard sjw liberal politics,"[you, might, libtard, libtard, sjw, liberal, p...","[you, might, libtard, libtard, sjw, liber, polit]","[you, might, libtard, libtard, sjw, liberal, p...",you might libtard libtard sjw liberal politics,you might libtard libtard sjw liberal polit
4,25845,0,what are your goals? find out here... #smile...,what are your goals find out here #smile,what are your goals find out here smile,"[what, are, your, goals, find, out, here, smile]","[what, are, your, goal, find, out, here, smile]","[what, are, your, goal, find, out, here, smile]",what are your goals find out here smile,what are your goals find out here smil


In [43]:
df.label.value_counts(normalize=True)

0    0.929854
1    0.070146
Name: label, dtype: float64

In [44]:
df_0=df[df['label']==0]
df_0.head()

Unnamed: 0,id,label,tweet,tidy_tweet,no_hash_tweet,tokenized_tweet,stemmed_tokens,lemmatized_tokens,lem_tweet,stem_tweet
0,29727,0,sad to see the scenes of hooligans pre #engrus...,sad see the scenes hooligans pre #engrus why s...,sad see the scenes hooligans pre engrus why sp...,"[sad, see, the, scenes, hooligans, pre, engrus...","[sad, see, the, scene, hooligan, pre, engrus, ...","[sad, see, the, scene, hooligan, pre, engrus, ...",sad see the scenes hooligans pre engrus why sp...,sad see the scenes hooligans pre engrus why sp...
1,14466,0,#gooddyeyoung #yoyoyo !! super happy to be ap...,#gooddyeyoung #yoyoyo super happy apa the move...,gooddyeyoung yoyoyo super happy apa the movement,"[gooddyeyoung, yoyoyo, super, happy, apa, the,...","[gooddyeyoung, yoyoyo, super, happi, apa, the,...","[gooddyeyoung, yoyoyo, super, happy, apa, the,...",gooddyeyoung yoyoyo super happy apa the movement,gooddyeyoung yoyoyo super happy apa the mov
2,18194,0,queen evil's bihdayð#lnic #lnicjustanevilbd...,queen evil bihday #lnic #lnicjustanevilbday #b...,queen evil bihday lnic lnicjustanevilbday bihd...,"[queen, evil, bihday, lnic, lnicjustanevilbday...","[queen, evil, bihday, lnic, lnicjustanevilbday...","[queen, evil, bihday, lnic, lnicjustanevilbday...",queen evil bihday lnic lnicjustanevilbday bihd...,queen evil bihday lnic lnicjustanevilbday bihd...
4,25845,0,what are your goals? find out here... #smile...,what are your goals find out here #smile,what are your goals find out here smile,"[what, are, your, goals, find, out, here, smile]","[what, are, your, goal, find, out, here, smile]","[what, are, your, goal, find, out, here, smile]",what are your goals find out here smile,what are your goals find out here smil
5,14297,0,retweets @user #nuascannan,retweets #nuascannan,retweets nuascannan,"[retweets, nuascannan]","[retweet, nuascannan]","[retweets, nuascannan]",retweets nuascannan,retweets nuascannan


In [45]:
df_1= df[df['label']==1]
df_1.head()

Unnamed: 0,id,label,tweet,tidy_tweet,no_hash_tweet,tokenized_tweet,stemmed_tokens,lemmatized_tokens,lem_tweet,stem_tweet
3,18283,1,@user you might be a libtard if... #libtard #...,you might libtard #libtard #sjw #liberal #poli...,you might libtard libtard sjw liberal politics,"[you, might, libtard, libtard, sjw, liberal, p...","[you, might, libtard, libtard, sjw, liber, polit]","[you, might, libtard, libtard, sjw, liberal, p...",you might libtard libtard sjw liberal politics,you might libtard libtard sjw liberal polit
22,18207,1,rise up today is out! stories of police bruta...,rise today out stories police brutality from #...,rise today out stories police brutality from p...,"[rise, today, out, stories, police, brutality,...","[rise, today, out, stori, polic, brutal, from,...","[rise, today, out, story, police, brutality, f...",rise today out stories police brutality from p...,rise today out stories police brutality from p...
33,20139,1,@user ouch woow! @user @user #sexy @user @user...,ouch woow #sexy,ouch woow sexy,"[ouch, woow, sexy]","[ouch, woow, sexi]","[ouch, woow, sexy]",ouch woow sexy,ouch woow sexi
37,14894,1,"""vandals turned a jewish family's menorah into...",vandals turned jewish family menorah into swas...,vandals turned jewish family menorah into swas...,"[vandals, turned, jewish, family, menorah, int...","[vandal, turn, jewish, famili, menorah, into, ...","[vandal, turned, jewish, family, menorah, into...",vandals turned jewish family menorah into swas...,vandals turned jewish family menorah into swas...
44,24776,1,"""the reality is that the tech industry is made...",the reality that the tech industry made regula...,the reality that the tech industry made regula...,"[the, reality, that, the, tech, industry, made...","[the, realiti, that, the, tech, industri, made...","[the, reality, that, the, tech, industry, made...",the reality that the tech industry made regula...,the reality that the tech industry made regula...


In [46]:
df.head()

Unnamed: 0,id,label,tweet,tidy_tweet,no_hash_tweet,tokenized_tweet,stemmed_tokens,lemmatized_tokens,lem_tweet,stem_tweet
0,29727,0,sad to see the scenes of hooligans pre #engrus...,sad see the scenes hooligans pre #engrus why s...,sad see the scenes hooligans pre engrus why sp...,"[sad, see, the, scenes, hooligans, pre, engrus...","[sad, see, the, scene, hooligan, pre, engrus, ...","[sad, see, the, scene, hooligan, pre, engrus, ...",sad see the scenes hooligans pre engrus why sp...,sad see the scenes hooligans pre engrus why sp...
1,14466,0,#gooddyeyoung #yoyoyo !! super happy to be ap...,#gooddyeyoung #yoyoyo super happy apa the move...,gooddyeyoung yoyoyo super happy apa the movement,"[gooddyeyoung, yoyoyo, super, happy, apa, the,...","[gooddyeyoung, yoyoyo, super, happi, apa, the,...","[gooddyeyoung, yoyoyo, super, happy, apa, the,...",gooddyeyoung yoyoyo super happy apa the movement,gooddyeyoung yoyoyo super happy apa the mov
2,18194,0,queen evil's bihdayð#lnic #lnicjustanevilbd...,queen evil bihday #lnic #lnicjustanevilbday #b...,queen evil bihday lnic lnicjustanevilbday bihd...,"[queen, evil, bihday, lnic, lnicjustanevilbday...","[queen, evil, bihday, lnic, lnicjustanevilbday...","[queen, evil, bihday, lnic, lnicjustanevilbday...",queen evil bihday lnic lnicjustanevilbday bihd...,queen evil bihday lnic lnicjustanevilbday bihd...
3,18283,1,@user you might be a libtard if... #libtard #...,you might libtard #libtard #sjw #liberal #poli...,you might libtard libtard sjw liberal politics,"[you, might, libtard, libtard, sjw, liberal, p...","[you, might, libtard, libtard, sjw, liber, polit]","[you, might, libtard, libtard, sjw, liberal, p...",you might libtard libtard sjw liberal politics,you might libtard libtard sjw liberal polit
4,25845,0,what are your goals? find out here... #smile...,what are your goals find out here #smile,what are your goals find out here smile,"[what, are, your, goals, find, out, here, smile]","[what, are, your, goal, find, out, here, smile]","[what, are, your, goal, find, out, here, smile]",what are your goals find out here smile,what are your goals find out here smil


In [48]:
df.to_csv('data/cleaned-reshuffled.csv')

In [None]:
# .apply(eval)

In [25]:
# df=pd.read_csv('data/cleaned-reshuffled.csv')
# df.drop(['Unnamed: 0'], axis = 1, inplace= True)

## Data Visualization

In [None]:
# from PIL import Image
# cloud_mask = np.array(Image.open("twitter.png"))
# cloud_mask

In [None]:
from wordcloud import WordCloud

df_0_words = ' '.join([text for text in df['tidy_tweet'][df['label']==0]])
wordcloud = WordCloud(width=800, height=500, random_state=10, max_font_size=110).generate(df_0_words)

plt.figure(figsize=(10, 7))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis('off')
plt.show()

In [None]:
df_0_words

In [None]:
df_1_words = ' '.join([text for text in df['tidy_tweet'][df['label']==1]])

wordcloud = WordCloud(width=800, height=500, random_state=210, max_font_size=110).generate(df_1_words)

plt.figure(figsize=(10, 7))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis('off')
plt.show()

In [None]:
df_1_words

In [None]:
# function to collect hashtags
def hashtag_extract(tweet):
    hashtags = []
    # Loop over the words in the tweet
    for word in tweet:
        ht = re.findall(r"#(\w+)", word)
        hashtags.append(ht)

    return hashtags

In [None]:
# extracting hashtags from non racist/sexist tweets
HT_0 = hashtag_extract(df['tidy_tweet_2'][df['label']==0])

# extracting hashtags from racist/sexist tweets
HT_1 = hashtag_extract(df['tidy_tweet_2'][df['label']==1])


In [None]:
HT_0

In [None]:
# unnesting lists
HT_0 = sum(HT_0,[])
HT_1 = sum(HT_1,[])

In [None]:
HT_0

In [None]:
a = nltk.FreqDist(HT_0)
d = pd.DataFrame({'Hashtag': list(a.keys()),
                  'Count': list(a.values())})
# selecting top 10 most frequent hashtags     
d = d.nlargest(columns="Count", n = 10) 
plt.figure(figsize=(16,5))
ax = sns.barplot(data=d, x= "Hashtag", y = "Count")
ax.set(ylabel = 'Count')
plt.show()

In [None]:
b = nltk.FreqDist(HT_1)
e = pd.DataFrame({'Hashtag': list(b.keys()), 'Count': list(b.values())})
# selecting top 10 most frequent hashtags
e = e.nlargest(columns="Count", n = 10)   
plt.figure(figsize=(16,5))
ax = sns.barplot(data=e, x= "Hashtag", y = "Count")
ax.set(ylabel = 'Count')
plt.show()

In [None]:
meta_freqdist = FreqDist(HT_1)
meta_freqdist.most_common(10)

In [None]:
meta_freqdist.plot(10,cumulative=False)

In [None]:
meta_freqdist = FreqDist(HT_0)
meta_freqdist.most_common(10)

In [None]:
meta_freqdist.plot(10,cumulative=False)

## Bigram

In [None]:
bigram_measures = nltk.collocations.BigramAssocMeasures()
meta_finder = BigramCollocationFinder.from_words(df['no_hash_tweet'])


In [None]:
bigram_scored = meta_finder.score_ngrams(bigram_measures.raw_freq)


In [None]:
bigram_scored