In [None]:
import pandas as pd
import numpy as np
import glob
import gensim
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import metrics
import tqdm
import nltk
import re
from operator import itemgetter
import time
from pprint import pprint
from bertopic import BERTopic

# Hyper-parameters shared by the topic-model training cells in this notebook.
EPOCHS = 205  # passed as `passes` to the LDA, Parallel LDA and NMF models
TOPICS = 5  # passed as `num_topics` to LDA, Parallel LDA, NMF and LSI (not to HDP)
CHUNK_SIZE = 1000  # passed as `chunksize` to every gensim model trained here
WORKERS = 7  # passed as `workers` to LdaMulticore only
EVAL_PERIOD = 10  # passed as `eval_every` to LDA, Parallel LDA and NMF
ALPHA = 0.01  # passed as `alpha` to LDA, Parallel LDA and HDP
BETA = 0.9  # passed as `eta` to LDA and Parallel LDA

In [None]:
# Fetch the NLTK resources used by preprocess_text: the tokenizer models, the
# English stop-word list and the WordNet data behind the lemmatizer.
# 'punkt_tab' is what recent NLTK releases load for word_tokenize in place of
# 'punkt'; both are requested so the notebook runs on either side of that change.
for resource in ('punkt', 'punkt_tab', 'wordnet', 'stopwords', 'omw-1.4'):
    nltk.download(resource)

In [None]:
# Corpus-specific tokens to discard in addition to NLTK's English stop-words.
# Some entries (e.g. 'presid') are Porter stems, so this set is applied both
# before and after stemming.
_CUSTOM_STOPWORDS = frozenset(['trump', 'realdonaldtrump', 'thank', 'presid', 'america', 'american', 'fjv'])

# Lazily-built NLP resources shared by every call to preprocess_text.
_PREPROCESS_RESOURCES = {}


def _get_preprocess_resources():
    """Build (on first use) and return the (stop-word set, stemmer, lemmatizer) triple."""
    if not _PREPROCESS_RESOURCES:
        english_stopwords = frozenset(nltk.corpus.stopwords.words('english'))
        _PREPROCESS_RESOURCES.update(
            stopwords=english_stopwords | _CUSTOM_STOPWORDS,
            stemmer=nltk.stem.PorterStemmer(),
            lemmatizer=nltk.stem.WordNetLemmatizer(),
        )
    return (
        _PREPROCESS_RESOURCES['stopwords'],
        _PREPROCESS_RESOURCES['stemmer'],
        _PREPROCESS_RESOURCES['lemmatizer'],
    )


def preprocess_text(sentence):
    """Normalise a raw tweet into a space-separated string of stemmed tokens.

    Steps, in order:
      1. lowercase and replace every run of non-letter characters with a space;
      2. tokenize with nltk.word_tokenize;
      3. drop English and custom stop-words;
      4. drop tokens of three characters or fewer;
      5. Porter-stem, then WordNet-lemmatize each remaining token;
      6. drop custom stop-words again, so tokens that only match after
         stemming (e.g. 'president' -> 'presid') are removed as well.

    Parameters
    ----------
    sentence : str
        Raw text. Callers are expected to pass a str (the notebook wraps each
        cell in str() before calling).

    Returns
    -------
    str
        The surviving tokens joined by single spaces; '' when nothing survives.
    """
    stopwords_set, porter_stemmer, lemmatizer = _get_preprocess_resources()

    # The regex already turns punctuation, digits and new-lines into spaces,
    # so no separate new-line handling is needed afterwards.
    sentence = re.sub(r'[^a-zA-Z]+', ' ', sentence.lower())

    # Tokenize, then remove stop-words and very short words (length <= 3).
    word_list = [
        word for word in nltk.word_tokenize(sentence)
        if word not in stopwords_set and len(word) > 3
    ]

    # Stem & lemmatize.
    word_list = [lemmatizer.lemmatize(porter_stemmer.stem(word)) for word in word_list]

    # Second pass for custom stop-words that only appear in stemmed form.
    word_list = [word for word in word_list if word not in _CUSTOM_STOPWORDS]

    return ' '.join(word_list)

## Load Data

In [None]:
def _load_tweet_csvs(pattern):
    """Read every CSV file matching `pattern` and stack them into one DataFrame.

    Files are read in sorted path order so the resulting row order does not
    depend on the order in which the file system lists them.

    Raises
    ------
    FileNotFoundError
        If no file matches `pattern`.
    """
    csv_paths = sorted(glob.glob(pattern))
    if not csv_paths:
        raise FileNotFoundError(f'No CSV files match {pattern!r}')
    return pd.concat([pd.read_csv(csv_path) for csv_path in csv_paths])


path = '../../data/twitter/raw/users/'
republicans_df = _load_tweet_csvs(path + 'republicans/required/*.csv')
democrats_df = _load_tweet_csvs(path + 'democrats/required/*.csv')

# Republican rows first, then Democrat rows, with a fresh 0..n-1 index.
df = pd.concat([republicans_df, democrats_df], ignore_index=True)

In [None]:
# Register tqdm's `progress_apply` on pandas objects before it is used below.
tqdm.tqdm.pandas()


def _clean_tweet(raw_tweet):
    """Coerce one cell of the `tweet` column to str and run it through preprocess_text."""
    return preprocess_text(str(raw_tweet))


df['tweet_tokenized'] = df['tweet'].progress_apply(_clean_tweet)

In [None]:
# Empty results table; each model cell adds one row per training run.
performance_metrics = pd.DataFrame(
    columns=[
        # what was trained
        'feature-extraction',
        'clustering-algo',
        # which repetition, and the seed associated with it
        'run#',
        'state',
        # coherence scores
        'c_v',
        'c_umass',
        # printed topics and wall-clock seconds
        'topics',
        'time',
    ]
)

## TF-IDF

In [None]:
# One token list per tweet (whitespace split of the pre-processed text).
documents = df['tweet_tokenized'].str.split()

# Vocabulary built from the token lists, then pruned via filter_extremes
# (document-frequency bounds no_below / no_above and a size cap keep_n).
dictionary = gensim.corpora.Dictionary(documents)
dictionary.filter_extremes(
    no_below=5,
    no_above=0.5,
    keep_n=20000,
)

# Bag-of-words vector for every document.
corpus = list(map(dictionary.doc2bow, documents))

# TF-IDF weighting derived from the pruned dictionary, applied to the whole corpus
# and materialised as a list so it can be iterated repeatedly by the models.
tfidf_model = gensim.models.TfidfModel(dictionary=dictionary)
corpus_tfidf = list(tfidf_model[corpus])

### LDA

In [None]:
# Train LDA on the TF-IDF corpus five times (runs 1-5 with seeds 2, 4, ..., 10),
# the same run/seed schedule used by the other model cells so that the
# per-algorithm averages are computed over comparable runs.
for run, state in zip(range(1, 6), range(2, 12, 2)):
    print('Run #', run)

    start = time.time()
    gensim_lda = gensim.models.ldamodel.LdaModel(
        corpus=corpus_tfidf, num_topics=TOPICS, id2word=dictionary, chunksize=CHUNK_SIZE, passes=EPOCHS,
        eval_every=EVAL_PERIOD, per_word_topics=True, random_state=state, alpha=ALPHA, eta=BETA)

    coherence_cv = gensim.models.CoherenceModel(
        model=gensim_lda, texts=documents, dictionary=dictionary, coherence='c_v').get_coherence()
    coherence_cumass = gensim.models.CoherenceModel(
        model=gensim_lda, texts=documents, dictionary=dictionary, coherence='u_mass').get_coherence()
    topics = gensim_lda.print_topics()

    # The timer stops here, so `time` covers training plus both coherence computations.
    stop = time.time()

    # DataFrame.append was removed in pandas 2.0; concatenate a one-row frame instead.
    # The row is added inside the loop so finished runs survive an interrupted cell.
    run_metrics = {'feature-extraction': 'tf-idf', 'clustering-algo': 'LDA', 'run#': run, 'state': state,
                   'c_v': coherence_cv, 'c_umass': coherence_cumass, 'topics': topics, 'time': (stop - start)}
    performance_metrics = pd.concat([performance_metrics, pd.DataFrame([run_metrics])], ignore_index=True)

### Parallel LDA

In [None]:
# Train multicore LDA on the TF-IDF corpus five times (runs 1-5 with seeds 2, 4, ..., 10)
# and record coherence, printed topics and elapsed time for each run.
for run, state in zip(range(1, 6), range(2, 12, 2)):
    print('Run #', run)

    start = time.time()
    gensim_plda = gensim.models.ldamulticore.LdaMulticore(
        corpus=corpus_tfidf, num_topics=TOPICS, id2word=dictionary, chunksize=CHUNK_SIZE, workers=WORKERS,
        passes=EPOCHS, eval_every=EVAL_PERIOD, per_word_topics=True, random_state=state, alpha=ALPHA, eta=BETA)

    coherence_cv = gensim.models.CoherenceModel(
        model=gensim_plda, texts=documents, dictionary=dictionary, coherence='c_v').get_coherence()
    coherence_cumass = gensim.models.CoherenceModel(
        model=gensim_plda, texts=documents, dictionary=dictionary, coherence='u_mass').get_coherence()
    topics = gensim_plda.print_topics()

    # The timer stops here, so `time` covers training plus both coherence computations.
    stop = time.time()

    # DataFrame.append was removed in pandas 2.0; concatenate a one-row frame instead.
    # The row is added inside the loop so finished runs survive an interrupted cell.
    run_metrics = {'feature-extraction': 'tf-idf', 'clustering-algo': 'Parallel LDA', 'run#': run, 'state': state,
                   'c_v': coherence_cv, 'c_umass': coherence_cumass, 'topics': topics, 'time': (stop - start)}
    performance_metrics = pd.concat([performance_metrics, pd.DataFrame([run_metrics])], ignore_index=True)
    

### NMF

In [None]:
# Train NMF on the TF-IDF corpus five times (runs 1-5 with seeds 2, 4, ..., 10)
# and record coherence, printed topics and elapsed time for each run.
for run, state in zip(range(1, 6), range(2, 12, 2)):
    print('Run #', run)

    start = time.time()
    gensim_nmf = gensim.models.Nmf(
        corpus=corpus_tfidf, num_topics=TOPICS, id2word=dictionary, chunksize=CHUNK_SIZE, passes=EPOCHS,
        eval_every=EVAL_PERIOD, minimum_probability=0, random_state=state, kappa=1)

    coherence_cv = gensim.models.CoherenceModel(
        model=gensim_nmf, texts=documents, dictionary=dictionary, coherence='c_v').get_coherence()
    coherence_cumass = gensim.models.CoherenceModel(
        model=gensim_nmf, texts=documents, dictionary=dictionary, coherence='u_mass').get_coherence()
    topics = gensim_nmf.print_topics()

    # The timer stops here, so `time` covers training plus both coherence computations.
    stop = time.time()

    # DataFrame.append was removed in pandas 2.0; concatenate a one-row frame instead.
    # The row is added inside the loop so finished runs survive an interrupted cell.
    run_metrics = {'feature-extraction': 'tf-idf', 'clustering-algo': 'NMF', 'run#': run, 'state': state,
                   'c_v': coherence_cv, 'c_umass': coherence_cumass, 'topics': topics, 'time': (stop - start)}
    performance_metrics = pd.concat([performance_metrics, pd.DataFrame([run_metrics])], ignore_index=True)

### LSI

In [None]:
# Train LSI on the TF-IDF corpus five times (runs 1-5) and record coherence,
# printed topics and elapsed time for each run.
# NOTE(review): `state` is stored in the results row but is not passed to LsiModel,
# so it does not seed these runs - confirm whether the repetitions are meant to differ.
for run, state in zip(range(1, 6), range(2, 12, 2)):
    print('Run #', run)

    start = time.time()
    gensim_lsi = gensim.models.LsiModel(
        corpus=corpus_tfidf, num_topics=TOPICS, id2word=dictionary, chunksize=CHUNK_SIZE)

    coherence_cv = gensim.models.CoherenceModel(
        model=gensim_lsi, texts=documents, dictionary=dictionary, coherence='c_v').get_coherence()
    coherence_cumass = gensim.models.CoherenceModel(
        model=gensim_lsi, texts=documents, dictionary=dictionary, coherence='u_mass').get_coherence()
    topics = gensim_lsi.print_topics()

    # The timer stops here, so `time` covers training plus both coherence computations.
    stop = time.time()

    # DataFrame.append was removed in pandas 2.0; concatenate a one-row frame instead.
    # The row is added inside the loop so finished runs survive an interrupted cell.
    run_metrics = {'feature-extraction': 'tf-idf', 'clustering-algo': 'LSI', 'run#': run, 'state': state,
                   'c_v': coherence_cv, 'c_umass': coherence_cumass, 'topics': topics, 'time': (stop - start)}
    performance_metrics = pd.concat([performance_metrics, pd.DataFrame([run_metrics])], ignore_index=True)

### BERTopic

In [None]:
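# NOTE(review): the BERTopic experiment below is disabled and kept for reference only.
# As written it hands the BERTopic model directly to gensim's CoherenceModel and never
# uses `state` when building the model; confirm both points before re-enabling it.
# for run, state in zip(range(1, 6, 1), range(2, 12, 2)):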
#     print('Run #', run)
    
#     start = time.time()
#     tfidf_embeddings = TfidfVectorizer(min_df=5).fit_transform(df['tweet_tokenized'].astype(str))
#     topic_model = BERTopic(verbose=True, nr_topics=5)
#     topics, probs = topic_model.fit_transform(df['tweet_tokenized'].astype(str), tfidf_embeddings)    
    
#     coherence_cv = gensim.models.CoherenceModel(model=topic_model, texts=documents, dictionary=dictionary, coherence='c_v').get_coherence()
#     coherence_cumass = gensim.models.CoherenceModel(model=topic_model, texts=documents, dictionary=dictionary, coherence='u_mass').get_coherence()
#     topics = (topic_model.get_topic_info()[1:]['Name']).tolist()
#     # print(topics)
#     stop = time.time()
    
#     performance_metrics = performance_metrics.append({'feature-extraction':'tf-idf', 'clustering-algo':'BERTopic', 'run#':run, 'state':state,'c_v':coherence_cv,'c_umass':coherence_cumass,
#                                                       'topics':topics,'time':(stop-start)}, ignore_index=True)

### HDP

In [None]:
# Train HDP on the TF-IDF corpus five times (runs 1-5 with seeds 2, 4, ..., 10)
# and record coherence, printed topics and elapsed time for each run.
# Unlike the other models, TOPICS is not passed to HdpModel.
for run, state in zip(range(1, 6), range(2, 12, 2)):
    print('Run #', run)

    start = time.time()
    gensim_hdp = gensim.models.hdpmodel.HdpModel(
        corpus=corpus_tfidf, id2word=dictionary, chunksize=CHUNK_SIZE, random_state=state, kappa=1, alpha=ALPHA)

    coherence_cv = gensim.models.CoherenceModel(
        model=gensim_hdp, texts=documents, dictionary=dictionary, coherence='c_v').get_coherence()
    coherence_cumass = gensim.models.CoherenceModel(
        model=gensim_hdp, texts=documents, dictionary=dictionary, coherence='u_mass').get_coherence()
    topics = gensim_hdp.print_topics()

    # The timer stops here, so `time` covers training plus both coherence computations.
    stop = time.time()

    # DataFrame.append was removed in pandas 2.0; concatenate a one-row frame instead.
    # The row is added inside the loop so finished runs survive an interrupted cell.
    run_metrics = {'feature-extraction': 'tf-idf', 'clustering-algo': 'HDP', 'run#': run, 'state': state,
                   'c_v': coherence_cv, 'c_umass': coherence_cumass, 'topics': topics, 'time': (stop - start)}
    performance_metrics = pd.concat([performance_metrics, pd.DataFrame([run_metrics])], ignore_index=True)

In [None]:
# Display the per-run results table (one row per algorithm and run).
performance_metrics

In [None]:
# Average the two coherence scores and the elapsed time over the runs of each algorithm.
runs_by_algo = performance_metrics.groupby('clustering-algo')
mean_perf = runs_by_algo[['c_v', 'c_umass', 'time']].mean()

In [None]:
# Persist the per-algorithm averages as CSV (the index, i.e. the algorithm name, is written too).
mean_perf.to_csv(
    path_or_buf='../../results/topic-modelling/mean-perf-topic-modelling.csv',
)