In [None]:
import gensim
import os
import pandas as pd
from gensim.corpora.dictionary import Dictionary 



In [None]:
# Fetch the tweet archive (a CSV hosted on GitHub) and discard every row
# that contains a missing value.
tweets = pd.read_csv(
    "https://raw.githubusercontent.com/sashaperigo/Trump-Tweets/master/data.csv"
).dropna()

# Preview the first rows of the loaded frame.
tweets.head()

Unnamed: 0,Text,Date,Favorites,Retweets,Tweet ID
0,Nielson Media Research final numbers on ACCEPT...,2016-07-30 23:32:40,13850,4130,759592590106849280
1,Thank you to all of the television viewers tha...,2016-07-30 19:00:07,27659,6842,759524001613918208
2,Can you imagine if I had the small crowds that...,2016-07-30 18:28:22,19968,6488,759516008272932864
3,NATO commander agrees members should pay up vi...,2016-07-30 18:24:40,11624,4668,759515080010719232
4,"Wow, NATO's top commander just announced that ...",2016-07-30 18:18:58,23922,7819,759513644258525184


In [None]:
# Tokenize tweets, while stoplisting, case-folding, and filtering
# NOTE: the tokenizer and the stopword list both depend on NLTK data packages
# being installed locally (see nltk.download) -- they are not bundled with nltk.
from nltk import word_tokenize
from nltk.corpus import stopwords
# Kept as a set: the stoplist is only probed for membership (once per token),
# and a set makes that lookup O(1) instead of a linear scan of the list.
stoplist = set(stopwords.words('english'))

def clean_tweet(tweet):
    """Tokenize a single tweet into a Series of cleaned terms.

    The text is lower-cased and tokenized with ``word_tokenize``; terms found
    in the module-level ``stoplist`` are dropped, and only purely alphabetic
    terms or hashtags are kept.

    Parameters
    ----------
    tweet : str
        Raw tweet text.

    Returns
    -------
    pandas.Series
        Object-dtype Series with one cleaned term per entry (hashtags keep
        their leading ``#``). Empty, but still object-dtype, when no term
        survives the filtering.
    """
    lower = tweet.lower()
    # Small hack to keep hashtags without modifying tokenizer:
    #   replace # with text, then replace back later
    terms = word_tokenize(lower.replace("#", "HASH_"))
    kept = [term for term in terms
            if term not in stoplist and (term.isalpha() or "HASH_" in term)]
    # Build the Series in one place with an explicit dtype: a bare pd.Series()
    # has a version-dependent default dtype, which made the empty result
    # inconsistent with the non-empty one. Plain str.replace also keeps the
    # placeholder substitution literal (no regex interpretation).
    return pd.Series([term.replace("HASH_", "#") for term in kept], dtype=object)

clean_tweet("This is #a test")

0      #a
1    test
dtype: object

In [None]:
# Build a 'long' dataframe of term counts: one row per (tweet, word) pair
tweet_words = tweets['Text'].str.lower().apply(clean_tweet)
tweet_words.index = tweets['Tweet ID']

# stack() yields one row per token. After reset_index the token column is
# named 0 and the token-position column 'level_1'; the position column is
# renamed to 'count' so that the groupby count ends up under that name.
word_counts = (
    tweet_words
    .stack()
    .to_frame()
    .reset_index()
    .rename(columns={'level_1': 'count', 0: 'word'})
    .groupby(['Tweet ID', 'word'], as_index=False)
    .count()
)

# Keep only words whose total count over all tweets is 5 or more
words_filtered = word_counts[word_counts.groupby('word')['count'].transform('sum') >= 5]

# Reshape to a 'wide' dataframe, i.e. a document-term matrix
# (rows: tweets, columns: words, missing combinations filled with 0)
trump_counts = words_filtered.pivot(
    index='Tweet ID',
    columns='word',
    values='count',
).fillna(0)
trump_counts.head()

word,#,#1,#2,#2016,#2a,#alsicebucketchallenge,#america,#americafirst,#apprentice,#autism,...,yrs,yuan,zero,zimmerman,zogby,zone,zones,zucker,zuckerman,zuker
Tweet ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1698308935,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1701461182,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1737479987,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1741160716,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1773561338,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


The size of our document-term matrix, `count(tweets) x count(unique_words)`:

In [None]:
# Tweet IDs of the documents in which the term "donald" is counted more than once
q = trump_counts.index[trump_counts["donald"] > 1].values
# Show the original text of (the first few of) those tweets
tweets.loc[tweets["Tweet ID"].isin(q), 'Text'].head()

901     "@NathanDWilsonFL: @MariaBartiromo you had a g...
2821    "@AniesiODaniels: #DemDebate Q: Who are you vo...
3646    "@TradingStreetCo:Donald Trump Is Ratings ‘Gol...
4359    "@moshe_mkmdca: @realDonaldTrump @007lLisav @C...
4981    "@jimlibertarian:  @SlwStdySque Donald has alr...
Name: Text, dtype: object

In [None]:
# Create a gensim dictionary in which each word's id is the position of its
# column in the document-term matrix
dictionary = Dictionary()
dictionary.token2id = {word: idx for idx, word in enumerate(trump_counts.columns)}

In [None]:
# Gensim has a way to read numpy arrays, but they use columns for documents - so rotate ('transpose') the DataFrame.
# The corpus is built outside the branch below so that `corpus` is defined on
# every run, including runs where the model is loaded from disk instead of trained.
corpus = gensim.matutils.Dense2Corpus(trump_counts.values.T)

# If I haven't already trained and saved a model, train it now
# NOTE: the saved model is reused as-is; delete 'trump-tweets.pickle' to force
# retraining after the data or the vocabulary changes.
if not os.path.exists('trump-tweets.pickle'):
    # Train a model
    lda = gensim.models.ldamodel.LdaModel(corpus=corpus, id2word=dictionary,
                                          num_topics=20, update_every=1, chunksize=1000, passes=6, alpha='auto')
    lda.save('trump-tweets.pickle')
else:
    # Load a model
    lda = gensim.models.ldamodel.LdaModel.load('trump-tweets.pickle')

In [None]:
# Print one line per topic: the topic id, a tab, then its six top-weighted words
print("\n".join("%d\t%s" % (topic_id, top_words)
                for topic_id, top_words in lda.show_topics(num_topics=20, num_words=6)))

0	0.060*"nice" + 0.054*"got" + 0.052*"wow" + 0.050*"say" + 0.038*"nothing" + 0.032*"wonderful"
1	0.070*"cnn" + 0.068*"poll" + 0.046*"think" + 0.041*"true" + 0.037*"day" + 0.036*"man"
2	0.279*"thank" + 0.090*"vote" + 0.087*"big" + 0.036*"crowd" + 0.035*"needs" + 0.018*"apprentice"
3	0.092*"clinton" + 0.043*"megynkelly" + 0.042*"ever" + 0.029*"presidential" + 0.024*"women" + 0.021*"truth"
4	0.098*"people" + 0.072*"get" + 0.048*"cruz" + 0.047*"many" + 0.040*"bad" + 0.036*"really"
5	0.181*"http" + 0.139*"trump" + 0.103*"donald" + 0.045*"via" + 0.022*"morning" + 0.020*"hampshire"
6	0.090*"make" + 0.087*"foxnews" + 0.063*"win" + 0.040*"gop" + 0.039*"interview" + 0.038*"foxandfriends"
7	0.043*"hope" + 0.038*"watching" + 0.032*"person" + 0.031*"far" + 0.028*"year" + 0.027*"party"
8	0.070*"see" + 0.050*"know" + 0.047*"tomorrow" + 0.045*"speech" + 0.037*"let" + 0.037*"years"
9	0.119*"#makeamericagreatagain" + 0.043*"support" + 0.040*"campaign" + 0.040*"jobs" + 0.035*"american" + 0.034*"join"
10	

In [None]:
# Shape of the lambda matrix held in the trained model's state. The first
# dimension equals the 20 topics requested at training time; the second is
# presumably the vocabulary size (number of document-term matrix columns) -- TODO confirm.
lda.state.get_lambda().shape

(20, 5445)

In [None]:
# get_document_topics expects bag-of-words input (a document as (token_id, count)
# pairs, or a corpus of such documents). Passing the dense matrix as nested lists
# made gensim treat it as one malformed document and fail while unpacking pairs.
# The corpus is rebuilt here so this cell also works when the model was loaded
# from disk rather than trained in this session.
corpus = gensim.matutils.Dense2Corpus(trump_counts.values.T)
# Materialise the per-tweet topic distributions: one list of
# (topic_id, probability) pairs per document, in corpus order.
a = list(lda.get_document_topics(corpus))

ValueError: too many values to unpack (expected 2)