# Configuration Variables

In [1]:
ENABLE_COLAB=True
USE_GPU=True
GENERATE_PROFILE = False

#Set MAX_FILE_TO_LOAD between 1 and 13 only (no error checking outside of that)
MAX_FILE_TO_LOAD=13
TWEETS_FOR_ANALYSIS=60000

ENABLE_PLOTS=True

CONVERT_COLUMNS=True


#Data Extraction Config
ENABLE_MANUAL_SELECTION=True
DATA_START_YEAR=2017
DATA_END_YEAR=2017
MAX_TWEETS_PER_MONTH=3000

NUMBER_TOPICS=10
MAX_TWEET_ANALYZE_PER_TOPIC=4

#Data Analyzed Output
CSV_OUTPUT_ROOT="/content/gdrive/MyDrive/ML1000/Project_3/"
CSV_OUTPUT_FILENAME="tweetAnalysisOutputMike.csv"
CSV_INPUT_FILENAME="tweetAnalysisOutputMike_classified.csv"



In [2]:
if ENABLE_COLAB:
  DATA_FILE_ROOT_PATH='/content/gdrive/MyDrive/ML1000/Project_3/TrumpTweets/'
  PANDA_PROFILE_OUTPUT_ROOT='/content/gdrive/MyDrive/ML1000/Project_3/TrumpTweets/'

  SAVE_MODEL_ROOT='/content/gdrive/MyDrive/Colab Notebooks/ML1000_Project_3/'
else:
  DATA_FILE_ROOT_PATH='./data/'
  PANDA_PROFILE_OUTPUT_ROOT='./data/'

  SAVE_MODEL_ROOT='./model_saves/'


# Environment

In [3]:
if ENABLE_COLAB:
  !pip install pycaret -q
  !pip install https://github.com/pandas-profiling/pandas-profiling/archive/master.zip -q
  !pip install matplotlib -q
  !pip install wordcloud
  !pip install --upgrade gensim
else:
  display('Google Colab not enabled')

[K     |████████████████████████████████| 24.2 MB 111.5 MB/s 
Collecting gensim
  Using cached gensim-4.1.2-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (24.1 MB)
Installing collected packages: gensim
  Attempting uninstall: gensim
    Found existing installation: gensim 3.8.3
    Uninstalling gensim-3.8.3:
      Successfully uninstalled gensim-3.8.3
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
pycaret 2.3.4 requires gensim<4.0.0, but you have gensim 4.1.2 which is incompatible.[0m
Successfully installed gensim-4.1.2


In [5]:
import nltk
nltk.download('stopwords')

# spacy for lemmatization
import spacy

!python -m spacy download en_core_web_sm 
!python -m textblob.download_corpora
sp = spacy.load('en_core_web_sm')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
Collecting en_core_web_sm==2.2.5
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.2.5/en_core_web_sm-2.2.5.tar.gz (12.0 MB)
[K     |████████████████████████████████| 12.0 MB 5.1 MB/s 
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')
[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downl

In [6]:
import re
import numpy as np
import pandas as pd
from pprint import pprint

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel



# Plotting tools
import pyLDAvis
import pyLDAvis.gensim  # don't skip this
import matplotlib.pyplot as plt
%matplotlib inline

  from collections import Iterable
  from collections import Mapping


In [7]:
if ENABLE_COLAB:
  from pycaret.utils import enable_colab
  enable_colab()
else:
  display('Google Colab not enabled')

Colab mode enabled.


In [8]:
if ENABLE_COLAB: 
  from google.colab import drive
  drive.mount('/content/gdrive', force_remount=True)
else:
  display('Google Colab not enabled')

Mounted at /content/gdrive


# Data Loading and Display

In [9]:
FILE_TO_BE_READ = DATA_FILE_ROOT_PATH + "realdonaldtrump.csv"
data_orig = pd.read_csv(FILE_TO_BE_READ, low_memory=False)

print(data_orig.shape)
print(data_orig.info())

(43352, 8)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 43352 entries, 0 to 43351
Data columns (total 8 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   id         43352 non-null  int64 
 1   link       43352 non-null  object
 2   content    43352 non-null  object
 3   date       43352 non-null  object
 4   retweets   43352 non-null  int64 
 5   favorites  43352 non-null  int64 
 6   mentions   20386 non-null  object
 7   hashtags   5583 non-null   object
dtypes: int64(3), object(5)
memory usage: 2.6+ MB
None


In [10]:
data_orig.head(5)

Unnamed: 0,id,link,content,date,retweets,favorites,mentions,hashtags
0,1698308935,https://twitter.com/realDonaldTrump/status/169...,Be sure to tune in and watch Donald Trump on L...,2009-05-04 13:54:25,510,917,,
1,1701461182,https://twitter.com/realDonaldTrump/status/170...,Donald Trump will be appearing on The View tom...,2009-05-04 20:00:10,34,267,,
2,1737479987,https://twitter.com/realDonaldTrump/status/173...,Donald Trump reads Top Ten Financial Tips on L...,2009-05-08 08:38:08,13,19,,
3,1741160716,https://twitter.com/realDonaldTrump/status/174...,New Blog Post: Celebrity Apprentice Finale and...,2009-05-08 15:40:15,11,26,,
4,1773561338,https://twitter.com/realDonaldTrump/status/177...,"""My persona will never be that of a wallflower...",2009-05-12 09:07:28,1375,1945,,


In [11]:
 #convert date column to a real date
data_orig['date'] = pd.to_datetime(data_orig['date'])

# Data Profile Creation

In [12]:
if GENERATE_PROFILE:
  profiles = ProfileReport(data_orig, title="Trump Tweets", html={'style': {'full_width': True}})
  profiles.to_file(output_file= PANDA_PROFILE_OUTPUT_ROOT + "Trump_Tweets_Data_Profile.html")

else:
  display("Generate profile is off")

'Generate profile is off'

# Data Selection

In [13]:
data_clean = data_orig

In [14]:
#Extract X number of tweets per year/month combo

#Need a formatted data frame that is empty
data_sampled = data_clean.loc[[1], :]
data_sampled.drop(data_sampled.index, inplace=True)
display("Empty dataframe: " + str(data_sampled.shape))


if ENABLE_MANUAL_SELECTION:
  data_sampled = data_clean.loc[
                                (data_clean['date'].dt.year == 2016)  & 
                                (
                                  (data_clean['date'].dt.month == 6) |
                                  (data_clean['date'].dt.month == 7) |
                                  (data_clean['date'].dt.month == 8) |
                                  (data_clean['date'].dt.month == 9) |
                                  (data_clean['date'].dt.month == 10)
                                )
                              ]


else:
  indexYear = DATA_START_YEAR

  while indexYear <= DATA_END_YEAR:
    indexMonth = 1
    while indexMonth <= 12:
      display("Working on Year-Month: " + str(indexYear) + "-" + str(indexMonth))
      #get data from frame
      tempFrame = data_clean.loc[
                                (data_clean['date'].dt.year == indexYear)  &
                                (data_clean['date'].dt.month == indexMonth)
                              ]
      #sample amount per month
      if len(tempFrame.index > 0):
        tempFrame = tempFrame.sample(min(MAX_TWEETS_PER_MONTH,len(tempFrame.index)))

      #append data
      data_sampled = data_sampled.append(tempFrame)
      display("data_sampled size: " + str(data_sampled.shape))
      indexMonth += 1
    indexYear += 1
data_sampled.reset_index(inplace=True, drop=True)
data_sampled.info()

'Empty dataframe: (0, 8)'

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1653 entries, 0 to 1652
Data columns (total 8 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   id         1653 non-null   int64         
 1   link       1653 non-null   object        
 2   content    1653 non-null   object        
 3   date       1653 non-null   datetime64[ns]
 4   retweets   1653 non-null   int64         
 5   favorites  1653 non-null   int64         
 6   mentions   405 non-null    object        
 7   hashtags   551 non-null    object        
dtypes: datetime64[ns](1), int64(3), object(4)
memory usage: 103.4+ KB


# Data Cleaning and Prep

In [15]:
# NLTK Stop words
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
stop_words.extend(['www', 'twitter', 're', 'edu', 'use'])

In [16]:
#removes @usertag
data_sampled['content'] = data_sampled.content.replace(to_replace=r'(#|@).?(?=\s|$)',regex=True, value="")

#removes http://
data_sampled['content'] = data_sampled.content.replace(to_replace=r'(https|http)?:\/\/(\w|\.|\/|\?|\=|\&|\%)*\b',regex=True, value="")


# Convert to list
data = data_sampled.content.values.tolist()

# Remove Emails
data = [re.sub('\S*@\S*\s?', '', sent) for sent in data]

# Remove @usernames
data = [re.sub('@\S*\s?', '', sent) for sent in data]

# Remove new line characters
data = [re.sub('\s+', ' ', sent) for sent in data]

# Remove distracting single quotes
data = [re.sub("\'", "", sent) for sent in data]

pprint(data[:3])

['So I raised/gave $5,600,000 for the veterans and the media makes me look '
 'bad! They do anything to belittle - totally biased.',
 'Join me in San Jose, California- tomorrow evening at 7pm! '
 'MakeAmericaGreatAgain Trump2016 pic.twitter.com/Uds8OdH3A4',
 'Crooked Hillary Clinton is a fraud who has put the public and country at '
 'risk by her illegal and very stupid use of e-mails. Many missing!']


  data = [re.sub('\S*@\S*\s?', '', sent) for sent in data]
  data = [re.sub('@\S*\s?', '', sent) for sent in data]
  data = [re.sub('\s+', ' ', sent) for sent in data]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


In [17]:
def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))  # deacc=True removes punctuations

data_words = list(sent_to_words(data))

print(data_words[:3])

[['so', 'raised', 'gave', 'for', 'the', 'veterans', 'and', 'the', 'media', 'makes', 'me', 'look', 'bad', 'they', 'do', 'anything', 'to', 'belittle', 'totally', 'biased'], ['join', 'me', 'in', 'san', 'jose', 'california', 'tomorrow', 'evening', 'at', 'pm', 'trump', 'pic', 'twitter', 'com', 'uds', 'odh'], ['crooked', 'hillary', 'clinton', 'is', 'fraud', 'who', 'has', 'put', 'the', 'public', 'and', 'country', 'at', 'risk', 'by', 'her', 'illegal', 'and', 'very', 'stupid', 'use', 'of', 'mails', 'many', 'missing']]


In [18]:
# Build the bigram and trigram models
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)  

# Faster way to get a sentence clubbed as a trigram/bigram
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

# See trigram example
print(trigram_mod[bigram_mod[data_words[2]]])

['crooked', 'hillary', 'clinton', 'is', 'fraud', 'who', 'has', 'put', 'the', 'public', 'and', 'country', 'at', 'risk', 'by', 'her', 'illegal', 'and', 'very', 'stupid', 'use', 'of', 'mails', 'many', 'missing']


In [19]:
# Define functions for stopwords, bigrams, trigrams and lemmatization
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]

def make_trigrams(texts):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """https://spacy.io/api/annotation"""
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

In [20]:
# Remove Stop Words
data_words_nostops = remove_stopwords(data_words)

# Form Bigrams
data_words_bigrams = make_bigrams(data_words_nostops)

# Initialize spacy 'en' model, keeping only tagger component (for efficiency)
# python3 -m spacy download en
nlp = spacy.load('en', disable=['parser', 'ner'])

#Skip the lemmatization. keeps killing proper names
# Do lemmatization keeping only noun, adj, vb, adv
#data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

print(data_words_bigrams[:3])

  for entry_point in AVAILABLE_ENTRY_POINTS.get(self.entry_point_namespace, []):
  for entry_point in AVAILABLE_ENTRY_POINTS.get(self.entry_point_namespace, []):
  for entry_point in AVAILABLE_ENTRY_POINTS.get(self.entry_point_namespace, []):
  for entry_point in AVAILABLE_ENTRY_POINTS.get(self.entry_point_namespace, []):


[['raised', 'gave', 'veterans', 'media', 'makes', 'look', 'bad', 'anything', 'belittle', 'totally', 'biased'], ['join', 'san_jose', 'california', 'tomorrow', 'evening', 'pm', 'trump', 'pic', 'com', 'uds', 'odh'], ['crooked', 'hillary', 'clinton', 'fraud', 'put', 'public', 'country', 'risk', 'illegal', 'stupid', 'mails', 'many', 'missing']]


In [21]:
print(data_words_bigrams[:3])

[['raised', 'gave', 'veterans', 'media', 'makes', 'look', 'bad', 'anything', 'belittle', 'totally', 'biased'], ['join', 'san_jose', 'california', 'tomorrow', 'evening', 'pm', 'trump', 'pic', 'com', 'uds', 'odh'], ['crooked', 'hillary', 'clinton', 'fraud', 'put', 'public', 'country', 'risk', 'illegal', 'stupid', 'mails', 'many', 'missing']]


In [22]:
# Create Dictionary
id2word = corpora.Dictionary(data_words_bigrams)

# Create Corpus
texts = data_words_bigrams

# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]

# View
print(corpus[:1])

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1)]]


In [23]:
#what word does a given ID correspond to
id2word[3]

'biased'

In [24]:
# Human readable format of corpus (term-frequency)
[[(id2word[id], freq) for id, freq in cp] for cp in corpus[:3]]

[[('anything', 1),
  ('bad', 1),
  ('belittle', 1),
  ('biased', 1),
  ('gave', 1),
  ('look', 1),
  ('makes', 1),
  ('media', 1),
  ('raised', 1),
  ('totally', 1),
  ('veterans', 1)],
 [('california', 1),
  ('com', 1),
  ('evening', 1),
  ('join', 1),
  ('odh', 1),
  ('pic', 1),
  ('pm', 1),
  ('san_jose', 1),
  ('tomorrow', 1),
  ('trump', 1),
  ('uds', 1)],
 [('clinton', 1),
  ('country', 1),
  ('crooked', 1),
  ('fraud', 1),
  ('hillary', 1),
  ('illegal', 1),
  ('mails', 1),
  ('many', 1),
  ('missing', 1),
  ('public', 1),
  ('put', 1),
  ('risk', 1),
  ('stupid', 1)]]

# Data Setup

In [25]:
# Build LDA model
lda = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                      id2word=id2word,
                                      num_topics=40, 
                                      random_state=100,
                                      update_every=1,
                                      chunksize=300,
                                      passes=25,
                                      alpha='auto',
                                      per_word_topics=True)

untuned_lda = lda
print(lda)

LdaModel(num_terms=4612, num_topics=40, decay=0.5, chunksize=300)


In [26]:
# Compute Perplexity
print('\nPerplexity: ', lda.log_perplexity(corpus))  # a measure of how good the model is. lower the better.

# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda, texts=data_words_bigrams, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Perplexity:  -20.68664854485746

Coherence Score:  0.46278834169154426


In [27]:
# Visualize the topics
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda, corpus, id2word)
vis

In [28]:
from wordcloud import WordCloud

for t in range(lda.num_topics):
    plt.figure()
    plt.imshow(WordCloud().fit_words(dict(lda.show_topic(t, 200))))
    plt.axis("off")
    plt.title("Topic #" + str(t))
    plt.show()

Output hidden; open in https://colab.research.google.com to view.

# Scratchpad

In [29]:
def compute_coherence_values(dictionary, corpus, texts, limit, start=2, step=3):
    """
    Compute c_v coherence for various number of topics

    Parameters:
    ----------
    dictionary : Gensim dictionary
    corpus : Gensim corpus
    texts : List of input texts
    limit : Max num of topics

    Returns:
    -------
    model_list : List of LDA topic models
    coherence_values : Coherence values corresponding to the LDA model with respective number of topics
    """
    coherence_values = []
    model_list = []
    for num_topics in range(start, limit, step):
        model = gensim.models.wrappers.LdaMallet(mallet_path, corpus=corpus, num_topics=num_topics, id2word=id2word)
        model_list.append(model)
        coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')
        coherence_values.append(coherencemodel.get_coherence())

    return model_list, coherence_values

In [31]:
# Can take a long time to run.
model_list, coherence_values = compute_coherence_values(dictionary=id2word, corpus=corpus, texts=data_words_bigrams, start=2, limit=40, step=6)

AttributeError: ignored

In [65]:
def format_topics_sentences(ldamodel=lda, corpus=corpus, texts=data):
    # Init output
    sent_topics_df = pd.DataFrame()

    # Get main topic in each document
    for i, row in enumerate(ldamodel[corpus]):
        row = sorted(row, key=lambda x: (x[1]), reverse=True)
        # Get the Dominant topic, Perc Contribution and Keywords for each document
        for j, (topic_num, prop_topic) in enumerate(row):
            if j == 0:  # => dominant topic
                wp = ldamodel.show_topic(topic_num)
                topic_keywords = ", ".join([word for word, prop in wp])
                sent_topics_df = sent_topics_df.append(pd.Series([int(topic_num), round(prop_topic,4), topic_keywords]), ignore_index=True)
            else:
                break
    sent_topics_df.columns = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']

    # Add original text to the end of the output
    contents = pd.Series(texts)
    sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
    return(sent_topics_df)


df_topic_sents_keywords = format_topics_sentences(ldamodel=optimal_model, corpus=corpus, texts=data)

# Format
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text']

# Show
df_dominant_topic.head(10)

NameError: ignored

In [50]:
lda.top_topics

<bound method LdaModel.top_topics of <gensim.models.ldamodel.LdaModel object at 0x7f1f187883d0>>