In [32]:
from __future__ import print_function

import pandas as pd 
import numpy as np 
import sklearn

# NLTK/NLP
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import nltk
from nltk import FreqDist, word_tokenize
import string, re
import urllib
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords
from gensim.models import word2vec
from nltk.collocations import *

#Visualization
from wordcloud import WordCloud
import matplotlib.pyplot as plt 
import seaborn as sns
%matplotlib inline

import requests
from bs4 import BeautifulSoup

import pickle

import warnings 
warnings.filterwarnings("ignore")


In [2]:
%run my_functions.ipynb

In [11]:
def get_tweets (handle):

    tweet_list = []
             
    twitter_url = requests.get('https://twitter.com/' + handle).text

    soup = BeautifulSoup(twitter_url,'lxml')

    tweets = soup.find_all("p")

    tweets = soup.find_all("p", {"class" : "TweetTextSize"})

    for tweet in tweets:
            
        tweet_list.append((handle, tweet.text))
            
    df = pd.DataFrame(tweet_list, columns=['handle', 'tweet'])
    
    return df

In [14]:
df = get_tweets('realDonaldTrump')
df

Unnamed: 0,handle,tweet
0,realDonaldTrump,MAKE AMERICA GREAT AGAIN!
1,realDonaldTrump,With all that this Administration has accompli...
2,realDonaldTrump,.....”Journalism” has reached a new low in the...
3,realDonaldTrump,"The Failing New York Times, in one of the most..."
4,realDonaldTrump,pic.twitter.com/JDS4zVfyBe
5,realDonaldTrump,I read that Trump rally was rambling/half-empt...
6,realDonaldTrump,Coming up this Sunday on Full Measure - the me...
7,realDonaldTrump,"From what I can tell, he was not there, it was..."
8,realDonaldTrump,SAVED &Peace & Prosperity @realDonaldTrump OUR...
9,realDonaldTrump,Sunday on Full Measure: Claims of media bias a...


In [18]:
# with open('scraped_trup_tweet.pkl', 'wb') as f:
# 	pickle.dump(df, f)

with open('scraped_trup_tweet.pkl', 'rb') as f:
	df = pickle.load(f)

In [19]:
df.head(10)

Unnamed: 0,handle,tweet
0,realDonaldTrump,MAKE AMERICA GREAT AGAIN!
1,realDonaldTrump,With all that this Administration has accompli...
2,realDonaldTrump,.....”Journalism” has reached a new low in the...
3,realDonaldTrump,"The Failing New York Times, in one of the most..."
4,realDonaldTrump,pic.twitter.com/JDS4zVfyBe
5,realDonaldTrump,I read that Trump rally was rambling/half-empt...
6,realDonaldTrump,Coming up this Sunday on Full Measure - the me...
7,realDonaldTrump,"From what I can tell, he was not there, it was..."
8,realDonaldTrump,SAVED &Peace & Prosperity @realDonaldTrump OUR...
9,realDonaldTrump,Sunday on Full Measure: Claims of media bias a...


In [22]:
df.drop('handle', axis=1, inplace = True)

In [53]:
df.head()

Unnamed: 0,tweet,tidy_tweet,no_hash_tweet,tokenized_tweet,stemmed_tokens,lemmatized_tokens,lem_tweet,stem_tweet
0,MAKE AMERICA GREAT AGAIN!,MAKE AMERICA GREAT AGAIN,MAKE AMERICA GREAT AGAIN,"[MAKE, AMERICA, GREAT, AGAIN]","[make, america, great, again]","[MAKE, AMERICA, GREAT, AGAIN]",MAKE AMERICA GREAT AGAIN,make america great again
1,With all that this Administration has accompli...,With all that this Administration has accompli...,With all that this Administration has accompli...,"[With, all, that, this, Administration, has, a...","[with, all, that, this, administr, has, accomp...","[With, all, that, this, Administration, ha, ac...",With all that this Administration has accompli...,with all that this administration has accompli...
2,.....”Journalism” has reached a new low in the...,Journalism has reached a new low in the histor...,Journalism has reached a new low in the histor...,"[Journalism, has, reached, a, new, low, in, th...","[journal, has, reach, a, new, low, in, the, hi...","[Journalism, ha, reached, a, new, low, in, the...",Journalism has reached a new low in the histor...,journalism has reached a new low in the histor...
3,"The Failing New York Times, in one of the most...",The Failing New York Times in one of the most ...,The Failing New York Times in one of the most ...,"[The, Failing, New, York, Times, in, one, of, ...","[the, fail, new, york, time, in, one, of, the,...","[The, Failing, New, York, Times, in, one, of, ...",The Failing New York Times in one of the most ...,the failing new york times in one of the most ...
4,pic.twitter.com/JDS4zVfyBe,pic twitter com JDS zVfyBe,pic twitter com JDS zVfyBe,"[pic, twitter, com, JDS, zVfyBe]","[pic, twitter, com, jds, zvfybe]","[pic, twitter, com, JDS, zVfyBe]",pic twitter com JDS zVfyBe,pic twitter com jds zvfyb


In [40]:
def remove_pattern(input_txt, pattern):
    r = re.findall(pattern, input_txt)
    for i in r:
        input_txt = re.sub(i, '', input_txt)
        
    return input_txt    

In [41]:
df['tidy_tweet'] = np.vectorize(remove_pattern)(df['tweet'], "@[\w]*")


In [42]:
df['tidy_tweet'] = df['tidy_tweet'].str.replace("[^a-zA-Z#]", " ")


In [43]:
df['tidy_tweet']= df['tidy_tweet'].apply(lambda x: ' '.join([w for w in x.split()]))

In [54]:
df['tidy_tweet']= df['tidy_tweet'].apply(lambda x: x.lower())

In [56]:
df['no_hash_tweet']= df['tidy_tweet'].str.replace("#", "")

In [57]:
df['tokenized_tweet'] = df['no_hash_tweet'].apply(lambda x: x.split())
df.head()

Unnamed: 0,tweet,tidy_tweet,no_hash_tweet,tokenized_tweet,stemmed_tokens,lemmatized_tokens,lem_tweet,stem_tweet
0,MAKE AMERICA GREAT AGAIN!,make america great again,make america great again,"[make, america, great, again]","[make, america, great, again]","[MAKE, AMERICA, GREAT, AGAIN]",MAKE AMERICA GREAT AGAIN,make america great again
1,With all that this Administration has accompli...,with all that this administration has accompli...,with all that this administration has accompli...,"[with, all, that, this, administration, has, a...","[with, all, that, this, administr, has, accomp...","[With, all, that, this, Administration, ha, ac...",With all that this Administration has accompli...,with all that this administration has accompli...
2,.....”Journalism” has reached a new low in the...,journalism has reached a new low in the histor...,journalism has reached a new low in the histor...,"[journalism, has, reached, a, new, low, in, th...","[journal, has, reach, a, new, low, in, the, hi...","[Journalism, ha, reached, a, new, low, in, the...",Journalism has reached a new low in the histor...,journalism has reached a new low in the histor...
3,"The Failing New York Times, in one of the most...",the failing new york times in one of the most ...,the failing new york times in one of the most ...,"[the, failing, new, york, times, in, one, of, ...","[the, fail, new, york, time, in, one, of, the,...","[The, Failing, New, York, Times, in, one, of, ...",The Failing New York Times in one of the most ...,the failing new york times in one of the most ...
4,pic.twitter.com/JDS4zVfyBe,pic twitter com jds zvfybe,pic twitter com jds zvfybe,"[pic, twitter, com, jds, zvfybe]","[pic, twitter, com, jds, zvfybe]","[pic, twitter, com, JDS, zVfyBe]",pic twitter com JDS zVfyBe,pic twitter com jds zvfyb


In [58]:
stemmer = SnowballStemmer("english")
df['stemmed_tokens'] = df.tokenized_tweet.apply(lambda x: [stemmer.stem(i) for i in x]) # stemming
df.head()


Unnamed: 0,tweet,tidy_tweet,no_hash_tweet,tokenized_tweet,stemmed_tokens,lemmatized_tokens,lem_tweet,stem_tweet
0,MAKE AMERICA GREAT AGAIN!,make america great again,make america great again,"[make, america, great, again]","[make, america, great, again]","[MAKE, AMERICA, GREAT, AGAIN]",MAKE AMERICA GREAT AGAIN,make america great again
1,With all that this Administration has accompli...,with all that this administration has accompli...,with all that this administration has accompli...,"[with, all, that, this, administration, has, a...","[with, all, that, this, administr, has, accomp...","[With, all, that, this, Administration, ha, ac...",With all that this Administration has accompli...,with all that this administration has accompli...
2,.....”Journalism” has reached a new low in the...,journalism has reached a new low in the histor...,journalism has reached a new low in the histor...,"[journalism, has, reached, a, new, low, in, th...","[journal, has, reach, a, new, low, in, the, hi...","[Journalism, ha, reached, a, new, low, in, the...",Journalism has reached a new low in the histor...,journalism has reached a new low in the histor...
3,"The Failing New York Times, in one of the most...",the failing new york times in one of the most ...,the failing new york times in one of the most ...,"[the, failing, new, york, times, in, one, of, ...","[the, fail, new, york, time, in, one, of, the,...","[The, Failing, New, York, Times, in, one, of, ...",The Failing New York Times in one of the most ...,the failing new york times in one of the most ...
4,pic.twitter.com/JDS4zVfyBe,pic twitter com jds zvfybe,pic twitter com jds zvfybe,"[pic, twitter, com, jds, zvfybe]","[pic, twitter, com, jds, zvfybe]","[pic, twitter, com, JDS, zVfyBe]",pic twitter com JDS zVfyBe,pic twitter com jds zvfyb


In [59]:
lemmatizer = WordNetLemmatizer()
df['lemmatized_tokens'] = df.tokenized_tweet.apply(lambda x: [lemmatizer.lemmatize(i) for i in x]) # lemmatizing
# [lemmatizer.lemmatize(word) for word in df.no_hash_tweet]
df.head()

Unnamed: 0,tweet,tidy_tweet,no_hash_tweet,tokenized_tweet,stemmed_tokens,lemmatized_tokens,lem_tweet,stem_tweet
0,MAKE AMERICA GREAT AGAIN!,make america great again,make america great again,"[make, america, great, again]","[make, america, great, again]","[make, america, great, again]",MAKE AMERICA GREAT AGAIN,make america great again
1,With all that this Administration has accompli...,with all that this administration has accompli...,with all that this administration has accompli...,"[with, all, that, this, administration, has, a...","[with, all, that, this, administr, has, accomp...","[with, all, that, this, administration, ha, ac...",With all that this Administration has accompli...,with all that this administration has accompli...
2,.....”Journalism” has reached a new low in the...,journalism has reached a new low in the histor...,journalism has reached a new low in the histor...,"[journalism, has, reached, a, new, low, in, th...","[journal, has, reach, a, new, low, in, the, hi...","[journalism, ha, reached, a, new, low, in, the...",Journalism has reached a new low in the histor...,journalism has reached a new low in the histor...
3,"The Failing New York Times, in one of the most...",the failing new york times in one of the most ...,the failing new york times in one of the most ...,"[the, failing, new, york, times, in, one, of, ...","[the, fail, new, york, time, in, one, of, the,...","[the, failing, new, york, time, in, one, of, t...",The Failing New York Times in one of the most ...,the failing new york times in one of the most ...
4,pic.twitter.com/JDS4zVfyBe,pic twitter com jds zvfybe,pic twitter com jds zvfybe,"[pic, twitter, com, jds, zvfybe]","[pic, twitter, com, jds, zvfybe]","[pic, twitter, com, jds, zvfybe]",pic twitter com JDS zVfyBe,pic twitter com jds zvfyb


In [60]:
df.stemmed_tokens[0][0]

'make'

In [61]:
df['lem_tweet'] = [lemmatizer.lemmatize(word) for word in df.no_hash_tweet]

In [62]:
df['stem_tweet'] = [stemmer.stem(word) for word in df.no_hash_tweet]

In [63]:
df.head()

Unnamed: 0,tweet,tidy_tweet,no_hash_tweet,tokenized_tweet,stemmed_tokens,lemmatized_tokens,lem_tweet,stem_tweet
0,MAKE AMERICA GREAT AGAIN!,make america great again,make america great again,"[make, america, great, again]","[make, america, great, again]","[make, america, great, again]",make america great again,make america great again
1,With all that this Administration has accompli...,with all that this administration has accompli...,with all that this administration has accompli...,"[with, all, that, this, administration, has, a...","[with, all, that, this, administr, has, accomp...","[with, all, that, this, administration, ha, ac...",with all that this administration has accompli...,with all that this administration has accompli...
2,.....”Journalism” has reached a new low in the...,journalism has reached a new low in the histor...,journalism has reached a new low in the histor...,"[journalism, has, reached, a, new, low, in, th...","[journal, has, reach, a, new, low, in, the, hi...","[journalism, ha, reached, a, new, low, in, the...",journalism has reached a new low in the histor...,journalism has reached a new low in the histor...
3,"The Failing New York Times, in one of the most...",the failing new york times in one of the most ...,the failing new york times in one of the most ...,"[the, failing, new, york, times, in, one, of, ...","[the, fail, new, york, time, in, one, of, the,...","[the, failing, new, york, time, in, one, of, t...",the failing new york times in one of the most ...,the failing new york times in one of the most ...
4,pic.twitter.com/JDS4zVfyBe,pic twitter com jds zvfybe,pic twitter com jds zvfybe,"[pic, twitter, com, jds, zvfybe]","[pic, twitter, com, jds, zvfybe]","[pic, twitter, com, jds, zvfybe]",pic twitter com jds zvfybe,pic twitter com jds zvfyb


In [65]:
with open('trump_tweet_cleaned.pkl', 'wb') as f:
	pickle.dump(df, f)

## Data Visualization

In [None]:
# from PIL import Image
# cloud_mask = np.array(Image.open("twitter.png"))
# cloud_mask

In [None]:
df_0_words = ' '.join([text for text in df['tidy_tweet'][df['label']==0]])
wordcloud = WordCloud(width=800, height=500, random_state=10, max_font_size=110).generate(df_0_words)

plt.figure(figsize=(10, 7))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis('off')
plt.show()

In [None]:
df_1_words = ' '.join([text for text in df['tidy_tweet'][df['label']==1]])

wordcloud = WordCloud(width=800, height=500, random_state=210, max_font_size=110).generate(df_1_words)

plt.figure(figsize=(10, 7))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis('off')
plt.show()

In [None]:
# function to collect hashtags
def hashtag_extract(tweet):
    hashtags = []
    # Loop over the words in the tweet
    for word in tweet:
        ht = re.findall(r"#(\w+)", word)
        hashtags.append(ht)

    return hashtags

In [None]:
# extracting hashtags from non racist/sexist tweets
HT_0 = hashtag_extract(df['tidy_tweet_2'][df['label']==0])

# extracting hashtags from racist/sexist tweets
HT_1 = hashtag_extract(df['tidy_tweet_2'][df['label']==1])


In [None]:
HT_0

In [None]:
# unnesting lists
HT_0 = sum(HT_0,[])
HT_1 = sum(HT_1,[])

In [None]:
HT_0

In [None]:
a = nltk.FreqDist(HT_0)
d = pd.DataFrame({'Hashtag': list(a.keys()),
                  'Count': list(a.values())})
# selecting top 10 most frequent hashtags     
d = d.nlargest(columns="Count", n = 10) 
plt.figure(figsize=(16,5))
ax = sns.barplot(data=d, x= "Hashtag", y = "Count")
ax.set(ylabel = 'Count')
plt.show()

In [None]:
b = nltk.FreqDist(HT_1)
e = pd.DataFrame({'Hashtag': list(b.keys()), 'Count': list(b.values())})
# selecting top 10 most frequent hashtags
e = e.nlargest(columns="Count", n = 10)   
plt.figure(figsize=(16,5))
ax = sns.barplot(data=e, x= "Hashtag", y = "Count")
ax.set(ylabel = 'Count')
plt.show()

In [None]:
meta_freqdist = FreqDist(HT_1)
meta_freqdist.most_common(10)

In [None]:
meta_freqdist.plot(10,cumulative=False)

In [None]:
meta_freqdist = FreqDist(HT_0)
meta_freqdist.most_common(10)

In [None]:
meta_freqdist.plot(10,cumulative=False)

## Bigram

In [None]:
bigram_measures = nltk.collocations.BigramAssocMeasures()
meta_finder = BigramCollocationFinder.from_words(df['no_hash_tweet'])


In [None]:
bigram_scored = meta_finder.score_ngrams(bigram_measures.raw_freq)


In [None]:
bigram_scored