In [306]:
import pandas as pd
import re
import string
import spacy
from collections import Counter

import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
stop_words = stopwords.words()
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem.lancaster import LancasterStemmer
from nltk.corpus import wordnet
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('stopwords')

from gensim.parsing.preprocessing import remove_stopwords

from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.decomposition import NMF
from sklearn.decomposition import LatentDirichletAllocation

%matplotlib inline

[nltk_data] Downloading package wordnet to /Users/user/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package punkt to /Users/user/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/user/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [282]:
#read in data
df = pd.read_csv('realdonaldtrump.csv')

In [283]:
#convert date to datetime object
df['date']= pd.to_datetime(df['date'])

In [284]:
#check for nulls in columns of interest
df.isna().sum()


id               0
link             0
content          0
date             0
retweets         0
favorites        0
mentions     22966
hashtags     37769
dtype: int64

In [285]:
#since mentions, hashtags, and links dont have much for us to analyze, lets drop
df=df.drop(['link','mentions','hashtags'], axis=1)


In [286]:
df.head()

Unnamed: 0,id,content,date,retweets,favorites
0,1698308935,Be sure to tune in and watch Donald Trump on L...,2009-05-04 13:54:25,510,917
1,1701461182,Donald Trump will be appearing on The View tom...,2009-05-04 20:00:10,34,267
2,1737479987,Donald Trump reads Top Ten Financial Tips on L...,2009-05-08 08:38:08,13,19
3,1741160716,New Blog Post: Celebrity Apprentice Finale and...,2009-05-08 15:40:15,11,26
4,1773561338,"""My persona will never be that of a wallflower...",2009-05-12 09:07:28,1375,1945


In [297]:
#im really only interested in trump tweets after he became a political figure, 
#so I'm dropping tweets from before he anncounced his canidacy
mask = (df['date'] > '2015-06-16')
df = df.loc[mask]
df

Unnamed: 0,id,content,date,retweets,favorites
23278,610704372415229952,""" @ realJoeMurray: Hopefully tomorrow is the d...",2015-06-16 02:04:03,40,68
23279,610704441872883712,"""@brentcfritz: Today is the day America become...",2015-06-16 02:04:19,55,99
23280,610710557285556224,""" @ insuraider: @ realDonaldTrump is going to ...",2015-06-16 02:28:37,32,59
23281,610710626269306880,""" @ DONJUBBER: @ realDonaldTrump Shock the wor...",2015-06-16 02:28:54,17,33
23282,610710697140441088,""" @ PianoBecca: @ realDonaldTrump No Amercian,...",2015-06-16 02:29:11,23,33
...,...,...,...,...,...
43347,1273405198698975232,Joe Biden was a TOTAL FAILURE in Government. H...,2020-06-17 19:00:32,23402,116377
43348,1273408026968457216,Will be interviewed on @ seanhannity tonight a...,2020-06-17 19:11:47,11810,56659
43349,1273442195161387008,pic.twitter.com/3lm1spbU8X,2020-06-17 21:27:33,4959,19344
43350,1273442469066276864,pic.twitter.com/vpCE5MadUz,2020-06-17 21:28:38,4627,17022


In [300]:
df.reset_index()

Unnamed: 0,index,id,content,date,retweets,favorites
0,23278,610704372415229952,""" @ realJoeMurray: Hopefully tomorrow is the d...",2015-06-16 02:04:03,40,68
1,23279,610704441872883712,"""@brentcfritz: Today is the day America become...",2015-06-16 02:04:19,55,99
2,23280,610710557285556224,""" @ insuraider: @ realDonaldTrump is going to ...",2015-06-16 02:28:37,32,59
3,23281,610710626269306880,""" @ DONJUBBER: @ realDonaldTrump Shock the wor...",2015-06-16 02:28:54,17,33
4,23282,610710697140441088,""" @ PianoBecca: @ realDonaldTrump No Amercian,...",2015-06-16 02:29:11,23,33
...,...,...,...,...,...,...
20069,43347,1273405198698975232,Joe Biden was a TOTAL FAILURE in Government. H...,2020-06-17 19:00:32,23402,116377
20070,43348,1273408026968457216,Will be interviewed on @ seanhannity tonight a...,2020-06-17 19:11:47,11810,56659
20071,43349,1273442195161387008,pic.twitter.com/3lm1spbU8X,2020-06-17 21:27:33,4959,19344
20072,43350,1273442469066276864,pic.twitter.com/vpCE5MadUz,2020-06-17 21:28:38,4627,17022


## Cleaning ##

In [303]:
def clean(text):
    #make lowercase, remove links, characters, punctuation, etc.
    text = text.lower()
    text = re.sub('https?://\S+|www\.\S+', '', text)
    text = re.sub('<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\n', '', text)
    text = re.sub('[’“”…]', '', text)
    
    # removing the stop-words          
    text_tokens = word_tokenize(text)
    tokens_without_sw = [word for word in text_tokens if not word in stop_words]
    filtered_sentence = (" ").join(tokens_without_sw)
    text = filtered_sentence
    
    return text

In [304]:
# ls = LancasterStemmer()
# lem = WordNetLemmatizer()
# def lexicon_normalization(text):
#     words = word_tokenize(text)
#     words_stem = [ls.stem(w) for w in words]
#     words_lem = [lem.lemmatize(w) for w in words_stem]
    
#     return words_lem

In [314]:
df['clean'] = df['content'].apply(clean)
df

Unnamed: 0,id,content,date,retweets,favorites,clean
23278,610704372415229952,""" @ realJoeMurray: Hopefully tomorrow is the d...",2015-06-16 02:04:03,40,68,realjoemurray hopefully tomorrow day start mak...
23279,610704441872883712,"""@brentcfritz: Today is the day America become...",2015-06-16 02:04:19,55,99,brentcfritz today day america becomes great re...
23280,610710557285556224,""" @ insuraider: @ realDonaldTrump is going to ...",2015-06-16 02:28:37,32,59,insuraider realdonaldtrump going make hillarys...
23281,610710626269306880,""" @ DONJUBBER: @ realDonaldTrump Shock the wor...",2015-06-16 02:28:54,17,33,donjubber realdonaldtrump shock world trump4pr...
23282,610710697140441088,""" @ PianoBecca: @ realDonaldTrump No Amercian,...",2015-06-16 02:29:11,23,33,pianobecca realdonaldtrump amercian really imp...
...,...,...,...,...,...,...
43347,1273405198698975232,Joe Biden was a TOTAL FAILURE in Government. H...,2020-06-17 19:00:32,23402,116377,joe biden total failure government bungled eve...
43348,1273408026968457216,Will be interviewed on @ seanhannity tonight a...,2020-06-17 19:11:47,11810,56659,interviewed seanhannity tonight 900 pm enjoy
43349,1273442195161387008,pic.twitter.com/3lm1spbU8X,2020-06-17 21:27:33,4959,19344,pictwittercom3lm1spbu8x
43350,1273442469066276864,pic.twitter.com/vpCE5MadUz,2020-06-17 21:28:38,4627,17022,pictwittercomvpce5maduz


## Tokenize ##

In [316]:
df['tokens'] = df['clean'].apply(word_tokenize)

In [317]:
df

Unnamed: 0,id,content,date,retweets,favorites,clean,tokens
23278,610704372415229952,""" @ realJoeMurray: Hopefully tomorrow is the d...",2015-06-16 02:04:03,40,68,realjoemurray hopefully tomorrow day start mak...,"[realjoemurray, hopefully, tomorrow, day, star..."
23279,610704441872883712,"""@brentcfritz: Today is the day America become...",2015-06-16 02:04:19,55,99,brentcfritz today day america becomes great re...,"[brentcfritz, today, day, america, becomes, gr..."
23280,610710557285556224,""" @ insuraider: @ realDonaldTrump is going to ...",2015-06-16 02:28:37,32,59,insuraider realdonaldtrump going make hillarys...,"[insuraider, realdonaldtrump, going, make, hil..."
23281,610710626269306880,""" @ DONJUBBER: @ realDonaldTrump Shock the wor...",2015-06-16 02:28:54,17,33,donjubber realdonaldtrump shock world trump4pr...,"[donjubber, realdonaldtrump, shock, world, tru..."
23282,610710697140441088,""" @ PianoBecca: @ realDonaldTrump No Amercian,...",2015-06-16 02:29:11,23,33,pianobecca realdonaldtrump amercian really imp...,"[pianobecca, realdonaldtrump, amercian, really..."
...,...,...,...,...,...,...,...
43347,1273405198698975232,Joe Biden was a TOTAL FAILURE in Government. H...,2020-06-17 19:00:32,23402,116377,joe biden total failure government bungled eve...,"[joe, biden, total, failure, government, bungl..."
43348,1273408026968457216,Will be interviewed on @ seanhannity tonight a...,2020-06-17 19:11:47,11810,56659,interviewed seanhannity tonight 900 pm enjoy,"[interviewed, seanhannity, tonight, 900, pm, e..."
43349,1273442195161387008,pic.twitter.com/3lm1spbU8X,2020-06-17 21:27:33,4959,19344,pictwittercom3lm1spbu8x,[pictwittercom3lm1spbu8x]
43350,1273442469066276864,pic.twitter.com/vpCE5MadUz,2020-06-17 21:28:38,4627,17022,pictwittercomvpce5maduz,[pictwittercomvpce5maduz]


## EDA ##

In [318]:
p = Counter(" ".join(df['clean']).split()).most_common(10)
# rslt = pd.DataFrame(p, columns=['Word', 'Frequency'])
print(rslt)

  Word  Frequency
0  the      18191
1   to      11289
2    @      11027
3  and      10107
4   of       8349
5    a       7237
6   is       6615
7   in       6441
8  for       5172
9    I       4704


## vectorize ##

In [319]:
vectorizer = CountVectorizer(stop_words = 'english')
doc_word = vectorizer.fit_transform(df.clean)
doc_word

<20074x23924 sparse matrix of type '<class 'numpy.int64'>'
	with 229113 stored elements in Compressed Sparse Row format>

In [320]:
df_words = pd.DataFrame(data=doc_word)
features = vectorizer.get_feature_names()
df_words = pd.DataFrame(doc_word.toarray(), columns=features)
df_words

Unnamed: 0,00,007cigarjoe,007llisav,00patriot,03,05,08,09,0nonsense,10,...,वसन,सक,सच,सद,सन,सपन,सबस,सम,हम,हर
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20069,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
20070,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
20071,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
20072,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## topic modeling with NMF ##

In [321]:
nmf = NMF(n_components=10, random_state=42)

# fit the transfomed content with NMF
nmf.fit(doc_word)

# display the result
for index,topic in enumerate(nmf.components_):
    print(f"The top 20 words for topic # {index}")
    print([vectorizer.get_feature_names()[i] for i in topic.argsort()[-20:]])
    print("\n")



The top 20 words for topic # 0
['time', 'really', 'tonight', 'senator', 'wonderful', 'fantastic', 'military', 'book', 'country', 'governor', 'night', 'congratulations', 'day', 'today', 'honor', 'state', 'job', 'make', 'america', 'great']


The top 20 words for topic # 1
['second', 'loves', 'need', 'immigration', 'democrats', 'southern', 'borders', 'amendment', 'complete', 'vets', 'vote', 'total', 'endorsement', 'security', 'country', 'crime', 'military', 'strong', 'wall', 'border']


The top 20 words for topic # 2
['bush', 'election', 'lead', 'hes', 'carson', 'vote', 'true', 'rubio', 'cruz', 'administration', 'tower', 'cnn', '2016', 'gop', 'foxnews', 'campaign', 'mr', 'poll', 'donald', 'trump']


The top 20 words for topic # 3
['white', 'house', 'totally', 'dont', 'sources', 'stories', 'russia', 'dishonest', 'reporting', 'ratings', 'said', 'like', 'good', 'bad', 'corrupt', 'story', 'cnn', 'media', 'fake', 'news']


The top 20 words for topic # 4
['rating', 'approval', 'american', 'hono

## Topic Modeling w LDA ##

In [322]:
LDA = LatentDirichletAllocation(n_components = 10, n_jobs = -2, random_state = 42)

# fit the transfomed content with LDA
LDA.fit(doc_word)

# display the result
for index,topic in enumerate(LDA.components_):
    print(f"The top 20 words for topic # {index}")
    print([vectorizer.get_feature_names()[i] for i in topic.argsort()[-20:]])
    print("\n")

The top 20 words for topic # 0
['japan', 'replace', 'soon', 'whitehouse', 'rating', 'healthcare', 'approval', 'thank', 'looking', 'prime', 'meeting', 'american', 'minister', 'obamacare', 'party', 'look', 'today', 'republican', 'great', 'forward']


The top 20 words for topic # 1
['like', 'wow', 'american', 'time', 'ted', 'going', 'president', 'good', 'people', 'way', 'cruz', 'pelosi', 'nancy', 'white', 'trump2016', 'new', 'trump', 'realdonaldtrump', 'house', 'thank']


The top 20 words for topic # 2
['work', 'obama', 'good', 'illegal', 'united', 'iran', 'like', 'bad', 'states', 'dont', 'immigration', 'president', 'great', 'security', 'hillary', 'wall', 'country', 'democrats', 'border', 'people']


The top 20 words for topic # 3
['report', 'time', 'bad', 'story', 'fbi', 'crooked', 'clinton', 'russia', 'collusion', 'said', 'hunt', 'witch', 'people', 'hillary', 'democrats', 'president', 'trump', 'media', 'fake', 'news']


The top 20 words for topic # 4
['rubio', 'live', 'night', 'pm', 'in

## NMF seems to give more interesting AND intuitive results ##

In [None]:
## tf idf too