In [1]:
import pandas as pd
import numpy as np

In [2]:
#import data (duplicate Tweets already removed using Twarc)

df = pd.read_csv("data/huckabee.csv")

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22370 entries, 0 to 22369
Data columns (total 37 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   id                            22370 non-null  int64  
 1   tweet_url                     22370 non-null  object 
 2   created_at                    22370 non-null  object 
 3   parsed_created_at             22370 non-null  object 
 4   user_screen_name              22370 non-null  object 
 5   text                          22370 non-null  object 
 6   tweet_type                    22370 non-null  object 
 7   coordinates                   0 non-null      float64
 8   hashtags                      851 non-null    object 
 9   media                         1193 non-null   object 
 10  urls                          2897 non-null   object 
 11  favorite_count                22370 non-null  int64  
 12  in_reply_to_screen_name       2661 non-null   object 
 13  i

In [4]:


# I'll be working only with Tweets in English
# drop tweets in all other languages
# now working with 38884 Tweets 

df = df[df.lang == 'en']
df.shape

# after non-English removed, just 21860 Tweets left 

(21860, 37)

In [5]:
# drop unnecessary columns 

df = df.drop(['tweet_url', 'created_at', 'media', 'urls','in_reply_to_screen_name',
       'in_reply_to_status_id', 'in_reply_to_user_id', 'retweet_or_quote_id',
       'retweet_or_quote_screen_name', 'retweet_or_quote_user_id', 'source',
       'user_created_at', 'user_name', 'user_verified', 'user_friends_count', 'user_listed_count',
       'user_statuses_count', 'user_default_profile_image', 'user_description',
       'user_favourites_count', 'user_followers_count', 'coordinates', 'lang', 'user_location', 'user_time_zone', 'user_urls', 'place'], axis=1)

In [6]:
# drop retweets

df = df[df.tweet_type != 'retweet']

In [7]:
df.info()
#after retweets removed, just 4752 Tweets left

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4752 entries, 8 to 22362
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   id                  4752 non-null   int64 
 1   parsed_created_at   4752 non-null   object
 2   user_screen_name    4752 non-null   object
 3   text                4752 non-null   object
 4   tweet_type          4752 non-null   object
 5   hashtags            277 non-null    object
 6   favorite_count      4752 non-null   int64 
 7   possibly_sensitive  1746 non-null   object
 8   retweet_count       4752 non-null   int64 
 9   user_id             4752 non-null   int64 
dtypes: int64(4), object(6)
memory usage: 408.4+ KB


In [8]:
# check end time & date of data

df.iloc[0]

# last Tweet downloaded 2021-01-25 03:06:31+00:00

id                                                  1353539721473961986
parsed_created_at                             2021-01-25 03:06:31+00:00
user_screen_name                                              ElectLucy
text                  🚨#BREAKING : Sarah Huckabee #Sanders  expected...
tweet_type                                                        quote
hashtags                                      BREAKING Sanders Arkansas
favorite_count                                                        0
possibly_sensitive                                                False
retweet_count                                                         0
user_id                                             1207718617707687943
Name: 8, dtype: object

In [9]:
# check start time & date of data 

df.iloc[-1]

# first Tweet on  2021-01-17 01:11:36+00:00

id                                                  1350611696675647490
parsed_created_at                             2021-01-17 01:11:36+00:00
user_screen_name                                                 vahb17
text                  Dershowitz on CBN w Huckabee just compared Tru...
tweet_type                                                     original
hashtags                                                            NaN
favorite_count                                                        1
possibly_sensitive                                                  NaN
retweet_count                                                         0
user_id                                             1155783466983862272
Name: 22362, dtype: object

In [10]:
# create variable for "text" column 

text = df['text'] 

In [11]:
# tokenize, remove stopwords, remove urls, lowercase, remove punctuation, remove numbers

# import necessary libraries: ntlk etc.

import string
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer 


stop = stopwords.words('english')

punc = list(set(string.punctuation))

def tokenizer(text):
    tokenizer = TweetTokenizer()
    tokens = tokenizer.tokenize(text)
    return tokens

def remove_url(text):
    url = re.compile(r"https?://\S+|www\.\S+")
    return url.sub(r'', text)

def process_text(text):
    text = remove_url(text)
    text = tokenizer(text)
    text = [word.lower() for word in text]
    text = [re.sub('[0-9]+', '', word) for word in text]
    text = [word for word in text if word not in punc]
    text = [word for word in text if word not in stop]
    text = [each for each in text if len(each) > 1]
    text = [word for word in text if ' ' not in word]
     
    return text

In [12]:
# apply text processing functions to text

df['processed_text'] = df['text'].apply(process_text)

In [13]:
# look at some of processed text

pd.set_option('display.max_colwidth', -1)
df['processed_text'][:20]

  This is separate from the ipykernel package so we can avoid doing imports until


8      [#breaking, sarah, huckabee, #sanders, expected, announce, run, governor, #arkansas]                                                                                                                                                            
11     [yes, sarah, huckabee, sanders, good, chance, becoming, next, arkansas, governor, take, look, many, states, run, republican, governors]                                                                                                         
16     [@palmerreport, sarah, huckabee, sanders, running, governor, arkansas]                                                                                                                                                                          
22     [headline, four, years, governor, arkansas, former, white, house, press, secretary, sarah, huckabee, sanders, becomes, first, elected, official, endorse, donald, trump, second, re-election, bid]                                              
26     [

In [14]:
# part-of-speech tagging 

ready_for_pos = df['processed_text']

def pos_tagging(text):
    pos_tag = [pos_tag(word) for word in ready_for_pos]

df['pos_tagged'] = df.processed_text.apply(lambda x: pos_tag(x))

In [15]:
# lemmatizing

pos_tagged = df['pos_tagged']

wordnet = WordNetLemmatizer() 

lemmatized = [[wordnet.lemmatize(word[0]) for word in words] for words in pos_tagged]

In [16]:
# look at lemmatized text

df['lemmatized'] = lemmatized
lemmatized[:20]

[['#breaking',
  'sarah',
  'huckabee',
  '#sanders',
  'expected',
  'announce',
  'run',
  'governor',
  '#arkansas'],
 ['yes',
  'sarah',
  'huckabee',
  'sander',
  'good',
  'chance',
  'becoming',
  'next',
  'arkansas',
  'governor',
  'take',
  'look',
  'many',
  'state',
  'run',
  'republican',
  'governor'],
 ['@palmerreport',
  'sarah',
  'huckabee',
  'sander',
  'running',
  'governor',
  'arkansas'],
 ['headline',
  'four',
  'year',
  'governor',
  'arkansas',
  'former',
  'white',
  'house',
  'press',
  'secretary',
  'sarah',
  'huckabee',
  'sander',
  'becomes',
  'first',
  'elected',
  'official',
  'endorse',
  'donald',
  'trump',
  'second',
  're-election',
  'bid'],
 ['nbc',
  'news',
  'sarah',
  'huckabee',
  'sander',
  'ex-trump',
  'press',
  'secretary',
  'run',
  'arkansas',
  'governor',
  'via',
  '@googlenews'],
 ['@acnewsitics',
  'arkansas',
  'beautiful',
  'state',
  'many',
  'good',
  'people',
  'sarah',
  'huckabee',
  'sander',
  'one']

In [17]:
# before vectorizing, cast lists of words back into strings

df['final_docs'] = df['lemmatized'].apply(lambda x: " ".join(x))
pd.set_option('display.max_colwidth', -1)
final_docs = df['final_docs']
final_docs[3000:3020]

  after removing the cwd from sys.path.


16245    watched full white house press briefing since huckabee day surreal civil one four year abuse lie boring sometimes that's actually good thing people work without worrying personally attacked                                
16246    @ianhowardmann looooooong month huckabee held presser                                                                                                                                                                        
16247    @danpfeiffer true difference huckabee mcenany ... like new tesla v high milage chevrolet malibu                                                                                                                              
16248    @radioguychris forgotten press briefing miss sarah huckabee                                                                                                                                                                  
16251    @highwayjill @bjornbecker @projectlincoln amen brother spicer hucka

In [18]:
#create document term matrix with TFIDF

#import vectorizing tool (usee TFIDF)
from sklearn.feature_extraction.text import TfidfVectorizer
# set max_features to 2000 (specifies the number of most frequently occurring words for which we want to create feature vectors)
# set min_df to 5 (word must occur in at least 5 documents)
# set max_df to 0.85 (word must not occur in more than 85 percent of the documents) 

tfidfconverter = TfidfVectorizer(max_features=2000, min_df=5, max_df=0.85, ngram_range=(1, 2), stop_words='english')  
doc_term_matrix_1 = tfidfconverter.fit_transform(df['final_docs'].values.astype('U'))

In [19]:
#run NMF model 

#import NMF tool 
from sklearn.decomposition import NMF

nmf_model = NMF(n_components=6)
nmf_Z = nmf_model.fit_transform(doc_term_matrix_1)

In [28]:
# run LDA model

#import LDA tool 
from sklearn.decomposition import LatentDirichletAllocation

lda_model = LatentDirichletAllocation(n_components = 5, max_iter=40, learning_method='online', learning_decay=.5)
lda_Z = lda_model.fit_transform(doc_term_matrix_1)

In [27]:
def print_topics(model, vectorizer, top_n=10):
    for idx, topic in enumerate(model.components_):
        print("Topic %d:" % (idx))
        print([(vectorizer.get_feature_names()[i], topic[i])
                        for i in topic.argsort()[:-top_n - 1:-1]])
 
print("LDA Model:")
print_topics(lda_model, tfidfconverter )
print("=" * 30)
 
print("NMF Model:")
print_topics(nmf_model, tfidfconverter )
print("=" * 30)

LDA Model:
Topic 0:
[('sarah', 52.381785338837524), ('sarah huckabee', 50.42687647198881), ('sander', 45.361562704555844), ('realtuckfrumper', 43.66322001992715), ('huckabee sander', 43.131185111906476), ('brandonstraka', 33.77145605604785), ('sarahhuckabee', 22.9894075748786), ('trump', 22.10249640282913), ('press', 21.669105104107075), ('forget', 20.6675240571816)]
Topic 1:
[('biden', 38.869903216945886), ('day', 36.768820142594464), ('press', 28.551704838205286), ('democrat', 25.712688425823128), ('mike', 25.416057686419336), ('mike huckabee', 25.168587815901798), ('disaster', 22.708071042548575), ('huckabee day', 22.455440520191548), ('president', 22.36929524365562), ('day biden', 22.126173856758292)]
Topic 2:
[('mike', 69.92748500249498), ('mike huckabee', 68.42508066231123), ('youtube', 37.82954945847945), ('huckabee youtube', 34.889133428879944), ('left', 33.93972988641999), ('travis_huckabee', 32.973692122716514), ('govmikehuckabee', 27.43458182987372), ('successfully', 25.0692

In [22]:
# cross-validation to find best parameters for LDA model

#import cross-validation tool
from sklearn.model_selection import GridSearchCV

# define search parameters
search_params = {'n_components': [5, 8, 10, 12], 'learning_decay': [.5, .7, .9]}

# initialize model for cross-validation
lda = LatentDirichletAllocation()

# initialize grid search class 
model = GridSearchCV(lda, param_grid=search_params)

# run grid search 
model.fit(doc_term_matrix_1)

GridSearchCV(estimator=LatentDirichletAllocation(),
             param_grid={'learning_decay': [0.5, 0.7, 0.9],
                         'n_components': [5, 8, 10, 12]})

In [23]:
# what is best model? 
best_lda_model = model.best_estimator_

# print parameters for best model
print("Parameters for best model: ", model.best_params_)

# print log likelihood score
print("Log likelihood score of best model: ", model.best_score_)

# print perplexity score
print("Perplexity score of best model: ", best_lda_model.perplexity(doc_term_matrix_1))

Parameters for best model:  {'learning_decay': 0.5, 'n_components': 5}
Log likelihood score of best model:  -27756.90461543678
Perplexity score of best model:  2173.1791414949594


In [29]:

topic_values = lda_model.transform(doc_term_matrix_1)
df['LDA_topic'] = topic_values.argmax(axis=1)

In [30]:
df['text'][df.LDA_topic ==0][1000:1050]

20252    @AnaCabrera Hogan Gidley is the Sarah Huckabee Sanders of 2021.                                                                                                                                                                                                                                     
20374    For me Sarah Huckabee is one of our most shining leaders in America First Populism.  Love your values, Sarah!  Run for Governor.                                                                                                                                                                    
20453    The REAL DONALD TRUMP The Media Won't Tell You About! | Sarah Huckabee S... https://t.co/eCXUwCNkn5 via @YouTube                                                                                                                                                                                    
20495    @GovMikeHuckabee @donmclean @HuckabeeOnTBN Mr. Huckabee, to see you compare Jeffry Da

In [33]:
df['text'][df.LDA_topic ==1][200:250]

13684    @kayleighmcenany Have you joined OANN yet?   I bet Chantal won't like it when that happens.  Or maybe you'll be the teleprompter on the Mike Huckabee show.                                                                                                                             
13708    @GovMikeHuckabee CNN:. Mike Huckabee, the former Arkansas governor, said it was because President Barack Obama "probably grew up hearing that the British were a bunch of imperialists who persecuted his grandfather."                                                                 
13759    @anches The breath of fresh air from Sarah Huckabee Sanders and the others is so welcome. She’s a pro.                                                                                                                                                                                  
13767    @DJJudd What? Would you prefer that bovine Huckabee?                                                                     

In [34]:
df['text'][df.LDA_topic ==2][200:250]

4199    @Dbwagner104 I live in Arkansas.\n\nPray for us, please. 😂\n\nHuckabee was horrible, Sanders probably even worse.                                                                                                                                                                                                           
4207    @HillaryClinton IS THERE ANYONE IN ARKANSAS WHO CAN RUN AGAINST SARAH HUCKABEE IN THE GOVERNOR'S RACE?  She should NEVER hold public office.                                                                                                                                                                                
4241    Arkansas going to have Huckabee Sanders as governor and Cotton as senator. About to challenge Florida for worst delegation.                                                                                                                                                                                                 
4260    @RickyForSenate N

In [35]:
df['text'][df.LDA_topic ==3][200:250]

11356    @jdawsey1 @maggieNYT Gawd, from Bill Clinton to Sarah Huckabee Sanders is watching the opposite of evolution in real time.                                                                                                                                                                                               
11435    @omwill3 @GovMattBevin What is with these southern governors pardoning people who go on to do awful shit? Huckabee pardoned a murderer in his state who came up to mine and killed 4 police officers in an ambush attack. What is going on down there?                                                                   
11446    @dassanir Not only is that the most IDIOTIC thing I’ve ever heard, but come the fuck on y’all. Kayleigh look like she just got done doing the walk of shame &amp; Huckabee..well I guess I’d look like that too if I also had britches full of shit while lying to the free world. https://t.co/CizgcYark9               
11449    @dbinkowski @CoreyRFor

In [36]:
df['text'][df.LDA_topic ==4][200:250]

13068    Mike Huckabee: It&amp;#8217;s Day 2 and Biden Is Already Becoming a Disaster for Democrats via @WestJournalism https://t.co/OFW4Ltanjx \nA Puppet, With Strings is Not a President, Biden is Just Obeying Orders From the Deep State, A Contiuation of the Obama Regime!!                           
13085    Huckabee: Democrats want opposing voices to 'disappear' because they're 'not that confident in their views' |  https://t.co/b22zxjqMDU                                                                                                                                                              
13089    @BrandonStraka When insecurity and low self esteem becomes the Trumper business model. Kayleigh’s no Sarah Huckabee though.                                                                                                                                                                         
13107    Joe Biden was sworn in as the 46th president of the United States. \n\nWhat has Biden