# Generate Topic Models
Generates the topic models of forum posts with LDA (Latent Dirichlet Allocation)

## Data Sources
- corpus (created with 3-Lemmatize_Text.ipynb)
- dictionary (created with 3-Lemmatize_Text.ipynb)
- lemmatized_text (created with 3-Lemmatize_Text.ipynb)

## Changes
- 2020-09-16: Created
- 2020-09-17: Found topic model with highest coherence and generated dominant topics

## TODO
- Tutorial
 - https://towardsdatascience.com/topic-modelling-in-python-with-nltk-and-gensim-4ef03213cd21
 - https://www.machinelearningplus.com/nlp/topic-modeling-gensim-python


## Imports

In [20]:
from gensim import corpora, models
import pickle
from pathlib import Path
from io import FileIO
import pyLDAvis.gensim
from gensim.models import CoherenceModel
import pandas as pd
import sqlite3

## Functions

In [17]:
def create_connection(db_file):
    """ create a database connection to the SQLite database
        specified by the db_file
    :param db_file: database file
    :return: Connection object or None
    """
    conn = None
    try:
        conn = sqlite3.connect(db_file)
    except Error as err:
        print(err)
    return conn

In [18]:
def format_topics_sentences(ldamodel, corpus):
    # Init output
    sent_topics_df = pd.DataFrame()

    # Get main topic in each document
    for i, row in enumerate(ldamodel[corpus]):
        row = sorted(row, key=lambda x: (x[1]), reverse=True)
        # Get the Dominant topic, Perc Contribution and Keywords for each document
        for j, (topic_num, prop_topic) in enumerate(row):
            if j == 0:  # => dominant topic
                wp = ldamodel.show_topic(topic_num)
                topic_keywords = ", ".join([word for word, prop in wp])
                sent_topics_df = sent_topics_df.append(pd.Series([int(topic_num), round(prop_topic,4), topic_keywords]), ignore_index=True)
            else:
                break
    sent_topics_df.columns = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']
    return(sent_topics_df)

## File Locations

In [3]:
p = Path.cwd()
path_parent = p.parents[0]
path_lemma_pkl = path_parent / "clean_data" / "lemmatized_text.pkl"
path_corpus_pkl = path_parent / "clean_data" / "corpus.pkl"
path_dictionary_gensim = path_parent / "clean_data" / "dictionary.gensim"
path_model = path_parent / "clean_data"
path_db = path_parent / "database" / "youbemomTables.db"
path_db = str(path_db)

## Load Data

In [4]:
lemmatized_text = pickle.load(open(path_lemma_pkl, 'rb'))

In [5]:
corpus = pickle.load(open(path_corpus_pkl, 'rb'))

In [6]:
dictionary = corpora.Dictionary.load(str(path_dictionary_gensim))

## Perform LDA
Identify the model with the highest coherence (model with 7 topics here)

In [7]:
lda_models = {}
NUM_WORDS = 7
n_topics = [3, 5, 7, 9, 11, 20]
for i in n_topics:
    n = "model_LDA_" + str(i)
    fn = n + ".gensim"
    ldamodel = models.ldamodel.LdaModel(corpus, num_topics = i, id2word=dictionary, passes=15)
    path_model_i = path_model / fn
    ldamodel.save(str(path_model_i))
    topics = ldamodel.print_topics(num_words=NUM_WORDS)
    perplexity = ldamodel.log_perplexity(corpus)
    coherence_model = CoherenceModel(model=ldamodel, texts=lemmatized_text, dictionary=dictionary, coherence='c_v')
    coherence = coherence_model.get_coherence()
    lda_models[n] = {'model' : ldamodel, 'coherence' : coherence, 'perplexity': perplexity}
    print("LDA with {} topics".format(i))
    for topic in topics:
        print(topic)
    print("Coherence: {}, Perplexity {}".format(coherence, perplexity))
    print("\n")

LDA with 3 topics
(0, '0.012*"like" + 0.011*"kid" + 0.011*"get" + 0.010*"ds" + 0.008*"help" + 0.008*"know" + 0.008*"try"')
(1, '0.022*"np" + 0.012*"yes" + 0.009*"op" + 0.008*"get" + 0.007*"child" + 0.006*"issue" + 0.006*"adhd"')
(2, '0.046*"school" + 0.020*"kid" + 0.013*"need" + 0.011*"get" + 0.010*"sn" + 0.009*"dc" + 0.009*"teacher"')
Coherence: 0.43449722431191273, Perplexity -7.673853372770869


LDA with 5 topics
(0, '0.063*"school" + 0.023*"kid" + 0.017*"need" + 0.012*"dc" + 0.011*"sn" + 0.011*"get" + 0.009*"private"')
(1, '0.016*"would" + 0.012*"therapist" + 0.011*"get" + 0.010*"years" + 0.009*"pay" + 0.008*"child" + 0.008*"take"')
(2, '0.017*"like" + 0.014*"sorry" + 0.013*"op" + 0.013*"thanks" + 0.013*"im" + 0.012*"np" + 0.012*"know"')
(3, '0.015*"help" + 0.015*"adhd" + 0.013*"ds" + 0.012*"get" + 0.011*"med" + 0.011*"kid" + 0.010*"really"')
(4, '0.014*"get" + 0.013*"go" + 0.013*"kid" + 0.011*"one" + 0.010*"time" + 0.010*"like" + 0.010*"say"')
Coherence: 0.47576671695597506, Perpl

In [8]:
n_topics = [6, 8]
for i in n_topics:
    n = "model_LDA_" + str(i)
    fn = n + ".gensim"
    ldamodel = models.ldamodel.LdaModel(corpus, num_topics = i, id2word=dictionary, passes=15)
    path_model_i = path_model / fn
    ldamodel.save(str(path_model_i))
    topics = ldamodel.print_topics(num_words=NUM_WORDS)
    perplexity = ldamodel.log_perplexity(corpus)
    coherence_model = CoherenceModel(model=ldamodel, texts=lemmatized_text, dictionary=dictionary, coherence='c_v')
    coherence = coherence_model.get_coherence()
    lda_models[n] = {'model' : ldamodel, 'coherence' : coherence, 'perplexity': perplexity}
    print("LDA with {} topics".format(i))
    for topic in topics:
        print(topic)
    print("Coherence: {}, Perplexity {}".format(coherence, perplexity))
    print("\n")

LDA with 6 topics
(0, '0.020*"time" + 0.017*"college" + 0.015*"get" + 0.014*"need" + 0.013*"test" + 0.013*"teacher" + 0.012*"grade"')
(1, '0.020*"kid" + 0.017*"know" + 0.017*"like" + 0.015*"get" + 0.014*"dont" + 0.014*"think" + 0.012*"say"')
(2, '0.019*"ds" + 0.017*"med" + 0.017*"he" + 0.012*"years" + 0.012*"help" + 0.012*"adhd" + 0.011*"take"')
(3, '0.081*"school" + 0.018*"kid" + 0.015*"need" + 0.015*"get" + 0.014*"dc" + 0.012*"would" + 0.012*"sn"')
(4, '0.022*"yes" + 0.018*"special" + 0.013*"live" + 0.010*"idea" + 0.009*"pay" + 0.009*"insurance" + 0.009*"ny"')
(5, '0.019*"old" + 0.016*"social" + 0.015*"adhd" + 0.013*"kid" + 0.013*"issue" + 0.011*"anxiety" + 0.011*"ds"')
Coherence: 0.4686205623229143, Perplexity -7.844831987355975


LDA with 8 topics
(0, '0.039*"adhd" + 0.036*"med" + 0.029*"anxiety" + 0.021*"help" + 0.020*"therapy" + 0.014*"try" + 0.013*"ds"')
(1, '0.040*"kid" + 0.024*"think" + 0.018*"dont" + 0.015*"know" + 0.014*"like" + 0.012*"need" + 0.010*"social"')
(2, '0.088*"sc

Visualize the topics. See: https://www.objectorientedsubject.net/2018/08/experiments-on-topic-modeling-pyldavis/

In [10]:
pyLDAvis.display(pyLDAvis.gensim.prepare(lda_models["model_LDA_7"]["model"], corpus, dictionary, sort_topics=False))

## What is the Dominant Topic in each Post?

In [14]:
df_topic_sents_keywords = format_topics_sentences(ldamodel=lda_models["model_LDA_7"]["model"], corpus=corpus)
df_topic_sents_keywords.info()
df_topic_sents_keywords.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27227 entries, 0 to 27226
Data columns (total 3 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Dominant_Topic     27227 non-null  float64
 1   Perc_Contribution  27227 non-null  float64
 2   Topic_Keywords     27227 non-null  object 
dtypes: float64(2), object(1)
memory usage: 638.3+ KB


Unnamed: 0,Dominant_Topic,Perc_Contribution,Topic_Keywords
0,2.0,0.8217,"like, adhd, med, ds, im, kid, get, really, mak..."
1,5.0,0.6284,"would, child, therapy, therapist, help, one, f..."
2,4.0,0.4306,"school, kid, dc, college, sn, go, grade, good,..."
3,2.0,0.6926,"like, adhd, med, ds, im, kid, get, really, mak..."
4,2.0,0.3559,"like, adhd, med, ds, im, kid, get, really, mak..."


## Save Model Topics and Keywords in New Database

In [22]:
conn = sqlite3.connect(path_db)
df_topic_sents_keywords.to_sql('topicmodel', conn, if_exists='replace', index=False)