# 2020 Elections: Topic Modelling Test with Gensim

## Useful Libraries

JupyterNotify allows us to know when a cell has finished running. All we need to do is inserting '%%notify' at the beginning of the cell.

In [62]:
!pip install jupyternotify

Collecting jupyternotify
  Downloading jupyternotify-0.1.15.tar.gz (7.2 kB)
  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: jupyternotify
  Building wheel for jupyternotify (setup.py) ... [?25ldone
[?25h  Created wheel for jupyternotify: filename=jupyternotify-0.1.15-py3-none-any.whl size=8725 sha256=ba3fe4fd50e9c2ee1d92da8993c443048a3b687eacbd0c6ae448c90658d39d9d
  Stored in directory: /Users/matteo-stelluti/Library/Caches/pip/wheels/db/f4/43/06c94fe0f5bacf0029ea8ebb8d080f372b97661740be7b3d74
Successfully built jupyternotify
Installing collected packages: jupyternotify
Successfully installed jupyternotify-0.1.15


In [2]:
%load_ext jupyternotify

<IPython.core.display.Javascript object>

## Data Load

In [3]:
import pandas as pd

In [4]:
dft = pd.read_csv(YOUR FILE PATH, lineterminator='\n')

In [5]:
dfb = pd.read_csv(YOUR FILE PATH, lineterminator='\n')

In [6]:
df = pd.concat([dft,dfb],ignore_index=True)

In [7]:
###TEST 

df = pd.concat([dft[:2000],dfb[:2000]],ignore_index=True)

# VERSION 1: TWEETS TOGETHER

## Pre-Processing

In [8]:
import nltk
import re

nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('omw-1.4')

[nltk_data] Downloading package punkt to /Users/matteo-
[nltk_data]     stelluti/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/matteo-
[nltk_data]     stelluti/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/matteo-
[nltk_data]     stelluti/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /Users/matteo-
[nltk_data]     stelluti/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

### Tokenization and Cleaning

In [10]:
def tweet_preprocess(tweet):
    # lowercase
    tweet = tweet.lower()
    
    #keep only alphabets
    tweet = re.sub(r'[^a-zA-Z]+', ' ', tweet)
    tweet = tweet.replace('\n', '')
    
    #tokenization
    word_list = nltk.word_tokenize(tweet)    
    stopwords_list = nltk.corpus.stopwords.words('english')
    stopwords_list.extend(['trump','realdonaldtrump','thank','trump','presid','america','american','fjv'])
    word_list = [word for word in word_list if word not in stopwords_list]
    
    #small words removal
    word_list = [word for word in word_list if len(word)>3]
    
    #stemmer and lemmatizer
    porter_stemmer = nltk.stem.PorterStemmer()
    lemmatizer = nltk.stem.WordNetLemmatizer()
    word_list = [porter_stemmer.stem(word) for word in word_list]
    word_list = [lemmatizer.lemmatize(word) for word in word_list]
    
    tweet = ' '.join(word_list)
    
    return tweet

In [11]:
import tqdm

In [12]:
tqdm.tqdm.pandas()
df['tweet_tokenized'] = df['tweet'].progress_apply(lambda x:tweet_preprocess(str(x)))

100%|█████████████████████████████████████| 4000/4000 [00:02<00:00, 1386.02it/s]


In [13]:
performance_metrics = pd.DataFrame(columns=['feature-extraction','clustering-algo','c_v','c_umass','topics'])

### TF-IDF

In [14]:
import gensim

In [15]:
documents = df['tweet_tokenized'].str.split()

In [16]:
print(documents[:5])

0    [eleccion, florida, joebiden, dice, donaldtrum...
1    [contro, facebook, twitter, coprono, biden, do...
2    [student, use, hear, year, year, heard, china,...
3    [hour, sinc, last, tweet, mayb, busi, tremend,...
4                        [ralli, iowa, http, jjaluumh]
Name: tweet_tokenized, dtype: object


In [17]:
dictionary = gensim.corpora.Dictionary(documents)
dictionary.filter_extremes(no_below=5, no_above=0.5, keep_n=20000)

tfidf = gensim.models.TfidfModel(dictionary=dictionary)

corpus = [dictionary.doc2bow(document) for document in documents]

corpus_tfidf = list(tfidf[corpus])

## Topics Modelling

### Define HyperParameters

In [20]:
EPOCHS = 205
TOPICS = 10
CHUNK_SIZE = 1000
WORKERS = 7
EVAL_PERIOD = 10
ALPHA = 0.01
BETA = 0.9

### Algo 1: LDA - Latent Dirichlet Allocation

In [19]:
%%notify

lda = gensim.models.ldamodel.LdaModel(
    corpus = corpus_tfidf,
    num_topics = 10,
    id2word = dictionary,
    chunksize=CHUNK_SIZE, passes=EPOCHS, 
    eval_every = EVAL_PERIOD, 
    per_word_topics=True
    )

<IPython.core.display.Javascript object>

In [21]:
topics_lda = lda.print_topics()

In [22]:
print(topics_lda)

[(0, '0.014*"town" + 0.014*"hall" + 0.013*"vote" + 0.012*"bidenharri" + 0.011*"joebiden" + 0.009*"maddow" + 0.009*"care" + 0.009*"democrat" + 0.009*"introduc" + 0.009*"unit"'), (1, '0.036*"covid" + 0.018*"coronaviru" + 0.014*"barron" + 0.014*"potu" + 0.012*"donald" + 0.012*"para" + 0.011*"never" + 0.011*"donaldtrump" + 0.010*"guilti" + 0.009*"youtub"'), (2, '0.016*"iowa" + 0.014*"elect" + 0.012*"decis" + 0.011*"presid" + 0.010*"maga" + 0.009*"ralli" + 0.009*"first" + 0.008*"joebiden" + 0.008*"reason" + 0.008*"show"'), (3, '0.055*"biden" + 0.029*"vote" + 0.019*"boycottnbc" + 0.018*"covid" + 0.018*"watch" + 0.017*"bidentownhal" + 0.016*"bidenharristosaveamerica" + 0.014*"votebluetosaveamerica" + 0.014*"rate" + 0.013*"covidiot"'), (4, '0.037*"burisma" + 0.012*"barackobama" + 0.012*"alreadi" + 0.011*"great" + 0.009*"bombshel" + 0.009*"brag" + 0.009*"realjameswood" + 0.008*"estado" + 0.008*"florida" + 0.008*"admit"'), (5, '0.034*"donaldtrump" + 0.020*"icecub" + 0.015*"need" + 0.014*"trumpco

In [23]:
%%notify

coherence_cv = gensim.models.CoherenceModel(model=lda, texts=documents, dictionary=dictionary, coherence='c_v').get_coherence()
coherence_cumass = gensim.models.CoherenceModel(model=lda, texts=documents, dictionary=dictionary, coherence='u_mass').get_coherence()

<IPython.core.display.Javascript object>

In [24]:
performance_metrics = performance_metrics.append({'feature-extraction':'tf-idf', 
                                                  'clustering-algo':'LDA',
                                                  'c_v':coherence_cv,
                                                  'c_umass':coherence_cumass,
                                                  'topics':topics_lda}, 
                                                   ignore_index=True)

  performance_metrics = performance_metrics.append({'feature-extraction':'tf-idf',


### Algo 2: NMF - NonNegative Matrix Factorization

In [25]:
%%notify

nmf = gensim.models.Nmf(corpus=corpus_tfidf, 
                        num_topics=TOPICS, 
                        id2word=dictionary, 
                        chunksize=CHUNK_SIZE, passes=EPOCHS, 
                        eval_every=EVAL_PERIOD, 
                        minimum_probability=0, 
                        kappa=1
                       )

In [26]:
topics_nmf = nmf.print_topics()

In [27]:
print(topics_nmf)

[(0, '0.173*"donaldtrump" + 0.031*"icecub" + 0.018*"year" + 0.009*"donald" + 0.009*"elect" + 0.008*"maga" + 0.008*"twitter" + 0.007*"black" + 0.006*"landslidevictori" + 0.006*"stupid"'), (1, '0.037*"rate" + 0.034*"townhal" + 0.032*"know" + 0.031*"boycottnbc" + 0.031*"time" + 0.030*"watch" + 0.029*"tonight" + 0.029*"boycotttrumptownhal" + 0.029*"bidenharrislandslid" + 0.029*"sponsor"'), (2, '0.053*"kamalaharri" + 0.039*"debat" + 0.038*"gavinnewsom" + 0.038*"kamalaharrisvp" + 0.038*"hunterbidden" + 0.038*"ilhanomar" + 0.038*"gretchenwhitm" + 0.038*"tedwheel" + 0.037*"mikep" + 0.037*"blacklivesmatt"'), (3, '0.107*"covid" + 0.057*"votebluedownballot" + 0.057*"covidiot" + 0.055*"votebluetosaveamerica" + 0.055*"trumpattacksblackwomen" + 0.055*"acbhear" + 0.054*"barrontrump" + 0.054*"gophypocrisi" + 0.052*"trumpisalaughingstock" + 0.047*"bidenharristosaveamerica"'), (4, '0.204*"vote" + 0.029*"liar" + 0.025*"chump" + 0.013*"biden" + 0.012*"corrupt" + 0.011*"kamalaharri" + 0.011*"decis" + 0.011

In [28]:
%%notify

coherence_cv = gensim.models.CoherenceModel(model=nmf, texts=documents, dictionary=dictionary, coherence='c_v').get_coherence()
coherence_cumass = gensim.models.CoherenceModel(model=nmf, texts=documents, dictionary=dictionary, coherence='u_mass').get_coherence()

In [29]:
performance_metrics = performance_metrics.append({'feature-extraction':'tf-idf', 
                                                  'clustering-algo':'NMF',
                                                  'c_v':coherence_cv,
                                                  'c_umass':coherence_cumass,
                                                  'topics':topics_nmf}, 
                                                   ignore_index=True)

  performance_metrics = performance_metrics.append({'feature-extraction':'tf-idf',


### Algo Evaluation

In [30]:
performance_metrics

Unnamed: 0,feature-extraction,clustering-algo,c_v,c_umass,topics
0,tf-idf,LDA,0.430409,-9.852635,"[(0, 0.014*""town"" + 0.014*""hall"" + 0.013*""vote..."
1,tf-idf,NMF,0.491805,-7.589611,"[(0, 0.173*""donaldtrump"" + 0.031*""icecub"" + 0...."


In [31]:
mean_perf = performance_metrics.groupby('clustering-algo')[['c_v','c_umass']].mean()

In [32]:
performance_metrics.to_csv('/Users/matteo-stelluti/Desktop/Assistant Research/2020 Elections/test_performance.csv')

In [33]:
print(mean_perf)

                      c_v   c_umass
clustering-algo                    
LDA              0.430409 -9.852635
NMF              0.491805 -7.589611


## LDA Visualization

__NOTE__: not available for NMF

In [34]:
import pyLDAvis
import pyLDAvis.gensim_models as gensimviz
pyLDAvis.enable_notebook() #This is only needed on Jupyter Notebook

  from imp import reload


### LDA

In [35]:
gensimviz.prepare(lda,corpus,dictionary)

  default_term_info = default_term_info.sort_values(
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload


# VERSION 2: BIDEN VS. TRUMP

## Pre-Processing

In [36]:
dft = dft[:2000]

In [37]:
dfb = dfb[:2000]

In [38]:
tqdm.tqdm.pandas()
dft['tweet_tokenized'] = dft['tweet'].progress_apply(lambda x:tweet_preprocess(str(x)))

100%|█████████████████████████████████████| 2000/2000 [00:01<00:00, 1911.86it/s]


In [39]:
tqdm.tqdm.pandas()
dfb['tweet_tokenized'] = dfb['tweet'].progress_apply(lambda x:tweet_preprocess(str(x)))

100%|█████████████████████████████████████| 2000/2000 [00:00<00:00, 2139.31it/s]


In [40]:
documents_t = dft['tweet_tokenized'].str.split()

In [41]:
documents_b = dfb['tweet_tokenized'].str.split()

In [42]:
dictionary_t = gensim.corpora.Dictionary(documents_t)
dictionary_t.filter_extremes(no_below=5, no_above=0.5, keep_n=20000)

tfidf_t = gensim.models.TfidfModel(dictionary=dictionary_t)

corpus_t = [dictionary_t.doc2bow(document) for document in documents_t]

corpus_tfidf_t = list(tfidf_t[corpus_t])

In [43]:
dictionary_b = gensim.corpora.Dictionary(documents_b)
dictionary_b.filter_extremes(no_below=5, no_above=0.5, keep_n=20000)

tfidf_b = gensim.models.TfidfModel(dictionary=dictionary_b)

corpus_b = [dictionary_b.doc2bow(document) for document in documents_b]

corpus_tfidf_b = list(tfidf_b[corpus_b])

## LDA Modelling

### TRUMP

In [44]:
%%notify

lda_t = gensim.models.ldamodel.LdaModel(
    corpus = corpus_tfidf_t,
    num_topics = 10,
    id2word = dictionary_t,
    chunksize=CHUNK_SIZE, passes=EPOCHS, 
    eval_every = EVAL_PERIOD, 
    per_word_topics=True
    )

<IPython.core.display.Javascript object>

In [None]:
topics_lda_t = lda_t.print_topics()

In [45]:
gensimviz.prepare(lda_t,corpus_t,dictionary_t)

  default_term_info = default_term_info.sort_values(
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload


### BIDEN

In [46]:
%%notify

lda_b = gensim.models.ldamodel.LdaModel(
    corpus = corpus_tfidf_b,
    num_topics = 10,
    id2word = dictionary_b,
    chunksize=CHUNK_SIZE, passes=EPOCHS, 
    eval_every = EVAL_PERIOD, 
    per_word_topics=True
    )

<IPython.core.display.Javascript object>

In [47]:
topics_lda_b = lda_b.print_topics()

In [48]:
gensimviz.prepare(lda_b,corpus_b,dictionary_b)

  default_term_info = default_term_info.sort_values(
  from imp import reload
