In [1]:
import os
import sys
# Put the notebook's parent directory on the import path so that packages living
# one level up (imported in the next cell) can be resolved.
module_path = os.path.abspath('..')
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

In [2]:
from topic_modelling.preprocessor_all import load_data
from topic_modelling.pipelines import basic_pipeline, spacy_pipeline
from topic_modelling.preprocessor_all import load_data
from topic_modelling.models import BasicModel, HierarchicalModel, EnsembleModel, NMFModel

import pyLDAvis.gensim_models as gensimvis
import pyLDAvis

import warnings
warnings.filterwarnings('ignore')

# Table of contents
* [1. Load and preprocess data](#1)
* [2. Model Selection](#2)
    * [2.1 LDA Basic Model](#2.1)
        * [2.1.1 Hyper-parameter Tuning](#2.1.1)
        * [2.1.2 Find best number of topics](#2.1.2)
        * [2.1.3 Grid Search](#2.1.3)
    * [2.2 NMF](#2.2)
    * [2.3 Ensemble](#2.3)
    

## 1. Load and preprocess data <a class="anchor" id="1"></a>

In [3]:
# Load the dataset and feed it directly into the spaCy preprocessing pipeline,
# which is pointed at the 'cleanBody' column.
df = spacy_pipeline.apply(load_data(), column='cleanBody')

Func:load_data took: 0.48 seconds
Func:reset_index took: 0.01 seconds


Pandas Apply:   0%|          | 0/42368 [00:00<?, ?it/s]

Func:demoji_preprocessor took: 19.85 seconds


Pandas Apply:   0%|          | 0/42368 [00:00<?, ?it/s]

Func:tweet_preprocessor took: 1.20 seconds
:: Spacy preprocessor -> cleaning, this might take 1-2 minutes....


Pandas Apply:   0%|          | 0/42368 [00:00<?, ?it/s]

Func:spacy_preprocessor took: 56.98 seconds


Pandas Apply:   0%|          | 0/42368 [00:00<?, ?it/s]

Func:predefined_denoiser took: 0.09 seconds
Func:drop_empty took: 0.02 seconds
Func:reset_index took: 0.00 seconds


Pandas Apply:   0%|          | 0/42360 [00:00<?, ?it/s]

Func:tokenizer_transformer took: 0.36 seconds


Pandas Apply:   0%|          | 0/42360 [00:00<?, ?it/s]

Func:ngrammer_2_3 took: 0.57 seconds


# 2. Model Selection<a class="anchor" id="2"></a>

## 2.1 LDA Basic Model<a class="anchor" id="2.1"></a>

In [4]:
# Baseline LDA: fit on the preprocessed 'cleanBody' column, train with TF-IDF
# weighting switched off, then print the resulting coherence score.
basic_model = BasicModel()
basic_model.fit(df, 'cleanBody')
basic_model.train(
    num_topics=6,
    passes=1,
    chunksize=100,
    eval_every=10,
    alpha=0.9,
    decay=0.3,
    tfidf=False,
)

print(basic_model.get_coherance())

:: Size of id2word: 25387
Func:createid2word_dictionary took: 0.21 seconds
Func:filter_extremes took: 0.02 seconds
Func:create_bow_coprpus took: 0.18 seconds
Func:create_tfidf_corpus took: 0.03 seconds
----> Training BasicModel <----
Func:train took: 3.77 seconds

Coherence Score:  0.630900517118667
Func:get_coherance took: 2.66 seconds
0.630900517118667


## 2.1 LDA Tfidf Model<a class="anchor" id="2.1-tfidf"></a>

In [25]:
# Same model class as the baseline, but trained with TF-IDF weighting switched on
# and a symmetric alpha prior; the coherence score is printed for comparison.
lda_tfidf_model = BasicModel()
lda_tfidf_model.fit(df, 'cleanBody')
lda_tfidf_model.train(
    num_topics=6,
    passes=1,
    chunksize=500,
    eval_every=10,
    alpha='symmetric',
    decay=0.1,
    tfidf=True,
)

print(lda_tfidf_model.get_coherance())

:: Size of id2word: 25387
Func:createid2word_dictionary took: 0.25 seconds
Func:filter_extremes took: 0.02 seconds
Func:create_bow_coprpus took: 0.10 seconds
Func:create_tfidf_corpus took: 0.03 seconds
----> Training BasicModel <----
Func:train took: 4.16 seconds

Coherence Score:  0.5160009101653033
Func:get_coherance took: 2.68 seconds
0.5160009101653033


### 2.1.1 Hyper-parameter Tuning<a class="anchor" id="2.1.1"></a>

### 2.1.2 Find best number of topics<a class="anchor" id="2.1.2"></a>

In [None]:
# Sweep candidate topic counts (start=6, limit=18, step=1) on the baseline model.
# The helper hands back the models and their coherence values; plot=True
# presumably also charts them -- confirm against BasicModel.
model_list, coherence_values = basic_model.compute_coherence_values(
    start=6,
    limit=18,
    step=1,
    plot=True,
)

In [None]:
# One line per candidate model: its topic count next to its coherence score.
for m, c in zip(model_list, coherence_values):
    print(f"Topics:{m.num_topics},Coherence score:{c}")

### 2.1.3 Grid Search<a class="anchor" id="2.1.3"></a>

In [21]:
import numpy as np
import tqdm
import gensim
import pandas as pd


def grid_search(df:pd.DataFrame,  k_start=5, k_end=12, k_step=1,
                alpha_start=0.2, alpha_end=1, alpha_step=0.2,
                decay_start=0.1, decay_end=0.5, decay_step=0.1,
                chunksize_range=(100, 300, 500), passes_range=(1,),
                model=None, tfidf=True,
                output_path='gridSearch_results2.csv'
               )-> pd.DataFrame:

    """
    Score one LDA model per combination of topic count, alpha, decay, chunksize and passes.

    Part of the code was retrieved from : https://towardsdatascience.com/evaluate-topic-model-in-python-latent-dirichlet-allocation-lda-7d57484bb5d0

    Parameters
    ----------
    df : unused. The corpus comes from the already-fitted ``model``; the parameter
        is kept only so existing calls keep working.
    k_start, k_end, k_step : bounds of ``range()`` for the number of topics
        (``k_end`` is exclusive).
    alpha_start, alpha_end, alpha_step : bounds of ``np.arange()`` for numeric alpha
        values (``alpha_end`` exclusive). ``'symmetric'`` and ``'asymmetric'`` are
        always appended to the numeric values.
    decay_start, decay_end, decay_step : bounds of ``np.arange()`` for decay
        (``decay_end`` exclusive).
    chunksize_range : iterable of chunk sizes to try.
    passes_range : iterable of pass counts to try.
    model : object exposing ``compute_coherence_for_topics_a_d(k, a, d, c, p, tfidf)``.
        When ``None`` the notebook-level ``basic_model`` is used.
    tfidf : forwarded unchanged to ``compute_coherence_for_topics_a_d``.
    output_path : CSV file the results are written to (without the index).
        Pass ``None`` to skip writing.

    Returns
    -------
    pd.DataFrame with columns Topics, Alpha, Decay, Chunksize, Passes, Coherence,
    one row per evaluated combination, in evaluation order.
    """
    from itertools import product

    if model is None:
        # Fall back to the model fitted earlier in the notebook.
        model = basic_model

    # Topics
    topics_range = range(k_start, k_end, k_step)
    # Alpha: np.arange accumulates float error (e.g. 0.8999999999999999), so the
    # values are rounded back to clean decimals before use.
    alpha_range = list(np.round(np.arange(alpha_start, alpha_end, alpha_step), 10))
    alpha_range.extend(['symmetric', 'asymmetric'])
    # Decay
    decay_range = list(np.round(np.arange(decay_start, decay_end, decay_step), 10))

    # product() iterates with the last factor varying fastest, i.e. the same order
    # as nested loops over topics > alpha > decay > chunksize > passes.
    combinations = list(product(topics_range, alpha_range, decay_range,
                                list(chunksize_range), list(passes_range)))
    total_combinations = len(combinations)
    print(f"Total Models to grid search: {total_combinations}")

    columns = ['Topics', 'Alpha', 'Decay', 'Chunksize', 'Passes', 'Coherence']
    rows = []

    with tqdm.tqdm(total=total_combinations) as pbar:
        for k, a, d, c, p in combinations:
            # get the coherence score for the given parameters
            cv = model.compute_coherence_for_topics_a_d(k=k, a=a, d=d, c=c, p=p, tfidf=tfidf)
            rows.append((k, a, d, c, p, cv))
            pbar.update(1)

    grid_search_results = pd.DataFrame(rows, columns=columns)

    # to_csv() returns None when given a path, so the frame must be kept separately
    # from the write call in order to be returned.
    if output_path is not None:
        grid_search_results.to_csv(output_path, index=False)

    return grid_search_results




### Run grid search 

In [23]:
# Run the grid search over a narrowed parameter range. The frame itself is passed,
# matching the `df: pd.DataFrame` annotation on grid_search; the function body does
# not read this argument, so converting the column to a list first was wasted work.
grid_search_results = grid_search(
    df,
    k_start=4, k_end=8, k_step=1,
    alpha_start=0.5, alpha_end=1, alpha_step=0.1,
    decay_start=0.1, decay_end=0.3, decay_step=0.1,
)

Total Models to grid search: 168


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 168/168 [18:12<00:00,  6.50s/it]


### Load Previous Grid Search results

### Bow LDA gridsearch

In [14]:
# Reload the earlier grid-search run from disk and show its five highest-coherence rows.
grid_search_results = pd.read_csv('gridSearch_results.csv')
grid_search_results.sort_values('Coherence', ascending=False).iloc[:5]

Unnamed: 0,Topics,Alpha,Decay,Chunksize,Passes,Coherence
900,9,0.61,0.3,1500,1,0.597776
675,8,0.91,0.3,1500,1,0.597586
1440,11,0.61,0.3,1500,1,0.597038
1170,10,0.61,0.3,1500,1,0.595427
945,9,0.91,0.3,1500,1,0.594083


### TFIDF LDA gridsearch

In [24]:
# Reload the results file written by grid_search and show its ten highest-coherence rows.
grid_search_results = pd.read_csv('gridSearch_results2.csv')
grid_search_results.sort_values('Coherence', ascending=False).iloc[:10]

Unnamed: 0,Topics,Alpha,Decay,Chunksize,Passes,Coherence
24,4,0.8999999999999999,0.1,100,1,0.586891
98,6,0.7,0.1,500,1,0.585723
14,4,0.7,0.1,500,1,0.582723
104,6,0.7999999999999999,0.1,500,1,0.582262
102,6,0.7999999999999999,0.1,100,1,0.581297
62,5,0.7999999999999999,0.1,500,1,0.573272
50,5,0.6,0.1,500,1,0.569391
68,5,0.8999999999999999,0.1,500,1,0.568744
108,6,0.8999999999999999,0.1,100,1,0.567316
110,6,0.8999999999999999,0.1,500,1,0.564067


### 2.1.4 Hierarchical Dirichlet Process<a class="anchor" id="2.1.4"></a>

In [None]:
# Hierarchical model: fit on the 'cleanBody' column, train with the default
# settings, then print the coherence score.
hdpModel = HierarchicalModel()
hdpModel.fit(df, 'cleanBody')
hdpModel.train()

print(hdpModel.get_coherance())

In [None]:
# Print up to 8 topics of the underlying hierarchical model as formatted strings.
print(hdpModel.model.show_topics(num_topics=8, formatted=True))

In [None]:
import pandas as pd

def topic_prob_extractor(gensim_hdp, num_topics=30, num_words=10):
    """
    Summarise each topic of a model by the summed weight of its top words.

    Parameters
    ----------
    gensim_hdp : object exposing ``show_topics(num_topics, num_words, formatted)``
        that returns ``(topic_id, [(word, weight), ...])`` pairs when
        ``formatted=False``.
    num_topics : maximum number of topics requested from ``show_topics``.
    num_words : number of top words requested per topic.

    Returns
    -------
    pd.DataFrame with one row per returned topic and the columns ``topic_id`` and
    ``weight`` (sum of the weights of that topic's returned words).
    """
    shown_topics = gensim_hdp.show_topics(num_topics=num_topics, num_words=num_words, formatted=False)

    # Read the id and the word list from the same tuple. Indexing the list by
    # topic id only works when ids happen to equal list positions.
    topic_ids = [topic_id for topic_id, _ in shown_topics]
    weights = [sum(weight for _, weight in terms) for _, terms in shown_topics]

    return pd.DataFrame({'topic_id' : topic_ids, 'weight' : weights})

# Display the per-topic summed word weights for the hierarchical model.
topic_prob_extractor(hdpModel.model)

In [None]:
import warnings
warnings.filterwarnings('ignore')

## 2.2 NMF<a class="anchor" id="2.2"></a>


In [None]:
# NMF topic model: fit on the 'cleanBody' column and train with 10 topics.
nmf_model = NMFModel()
nmf_model.fit(df, 'cleanBody')
nmf_model.train(num_topics=10)

# Topics are printed first, then the coherence score.
print(nmf_model.get_topics())
print(nmf_model.get_coherance())

## 2.3 Ensemble<a class="anchor" id="2.3"></a>
https://radimrehurek.com/gensim/models/ensemblelda.html

In [None]:
# Ensemble model: fit on the 'cleanBody' column and train with num_topics=10 and
# num_models=10 (see the gensim EnsembleLda link in the section header).
ensemble_model = EnsembleModel()
ensemble_model.fit(df, 'cleanBody')
ensemble_model.train(num_topics=10, num_models=10)

# Topics are printed first, then the coherence score.
print(ensemble_model.get_topics())
print(ensemble_model.get_coherance())

# 3. Topic Analysis<a class="anchor" id="3"></a>


## 3.1 LDA Model<a class="anchor" id="3.1"></a>


In [6]:
# Interactive pyLDAvis view of the baseline LDA model, built from the model's own
# corpus and dictionary. The last expression renders the panel in the cell output.
pyLDAvis.enable_notebook()
vis = gensimvis.prepare(
    topic_model=basic_model.model,
    corpus=basic_model.corpus,
    dictionary=basic_model.id2word,
)
pyLDAvis.display(vis)

## 3.2 LDA Tfidf Model

In [26]:
# Interactive pyLDAvis view of the TF-IDF LDA model. Model, corpus and dictionary
# all come from lda_tfidf_model, so the cell no longer depends on basic_model's
# dictionary matching the one this model was fitted with.
vis = gensimvis.prepare(
    topic_model=lda_tfidf_model.model,
    corpus=lda_tfidf_model.corpus_tfidf,
    dictionary=lda_tfidf_model.id2word,
)
pyLDAvis.enable_notebook()
pyLDAvis.display(vis)

  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload


# 4. Model Selection and Analysis
Based on the grid search, models with 6, 7, or 8 topics all yield high coherence. However, with 8 topics some clusters overlap, so I chose 6 topics to obtain distinct clusters.

