# Generate Topic Models
Generates the topic models of forum posts with LDA (Latent Dirichlet Allocation)

## Data Sources
- corpus (created with 3.0-Topic_Models-Lemmatize_Text.ipynb)
- dictionary (created with 3.0-Topic_Models-Lemmatize_Text.ipynb)
- lemmatized_text (created with 3.0-Topic_Models-Lemmatize_Text.ipynb)

## Changes
- 2020-09-16: Created
- 2020-09-17: Found topic model with highest coherence and generated dominant topics
- 2020-12-19: Added new data

## TODO
- Tutorial
 - https://towardsdatascience.com/topic-modelling-in-python-with-nltk-and-gensim-4ef03213cd21
 - https://www.machinelearningplus.com/nlp/topic-modeling-gensim-python


## Imports

In [1]:
from gensim import corpora, models
import pickle
from pathlib import Path
from io import FileIO
import pyLDAvis.gensim
from gensim.models import CoherenceModel, LdaModel, LdaMulticore
import pandas as pd
from youbemom import create_connection
import csv
import os

In [2]:
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

  and should_run_async(code)


## Functions

Helper functions for formatting LDA output and appending tuning results to CSV files

In [3]:
def format_topics_sentences(ldamodel, corpus):
    """Build a table holding the dominant topic of each document.

    Args:
        ldamodel: Trained topic model. It must support ``ldamodel[corpus]``
            (yielding, per document, an iterable of
            ``(topic_id, probability)`` pairs) and
            ``ldamodel.show_topic(topic_id)`` (returning ``(word, weight)``
            pairs).
        corpus: Corpus passed to ``ldamodel[...]`` unchanged.

    Returns:
        pandas.DataFrame with the columns ``Dominant_Topic`` (integer topic
        id), ``Perc_Contribution`` (probability of that topic rounded to
        4 decimals) and ``Topic_Keywords`` (comma-separated topic words).
        There is one row per document that has at least one topic
        assignment; documents without any assignment are skipped. An empty
        corpus yields an empty frame that still has the three columns.
    """
    columns = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']

    # The keyword string depends only on the topic, so compute it once per
    # topic instead of once per document.
    keywords_by_topic = {}

    # Collect plain tuples and build the frame once at the end:
    # DataFrame.append copied the whole frame on every call and no longer
    # exists in pandas >= 2.0.
    rows = []
    for doc_topics in ldamodel[corpus]:
        # max() returns the first pair with the highest probability, which is
        # the same pair a stable descending sort would place first.
        dominant = max(doc_topics, key=lambda pair: pair[1], default=None)
        if dominant is None:
            continue
        topic_num, prop_topic = dominant
        if topic_num not in keywords_by_topic:
            keywords_by_topic[topic_num] = ", ".join(
                word for word, _ in ldamodel.show_topic(topic_num)
            )
        rows.append(
            (int(topic_num), round(prop_topic, 4), keywords_by_topic[topic_num])
        )

    return pd.DataFrame(rows, columns=columns)

In [4]:
def write_coherence(fn, n_workers, n_topics, coherence, perplexity):
    """Append one tuning result as a CSV row to the file ``fn``.

    Args:
        fn: Path of the CSV file; it is opened in append mode, so it is
            created if missing and existing rows are kept.
        n_workers: Value written to the first column.
        n_topics: Value written to the second column.
        coherence: Value written to the third column.
        perplexity: Value written to the fourth column.
    """
    # newline='' hands line-ending control to the csv module; without it,
    # platforms that translate "\n" emit "\r\r\n" and rows show up separated
    # by blank lines.
    with open(fn, 'a', newline='') as f:
        csv.writer(f).writerow([n_workers, n_topics, coherence, perplexity])

In [5]:
def write_sample(fn, n, coherence, perplexity):
    """Append one sample result as a CSV row to the file ``fn``.

    Args:
        fn: Path of the CSV file; it is opened in append mode, so it is
            created if missing and existing rows are kept.
        n: Value written to the first column.
        coherence: Value written to the second column.
        perplexity: Value written to the third column.
    """
    # newline='' hands line-ending control to the csv module; without it,
    # platforms that translate "\n" emit "\r\r\n" and rows show up separated
    # by blank lines.
    with open(fn, 'a', newline='') as f:
        csv.writer(f).writerow([n, coherence, perplexity])

In [6]:
def write_list(fn, results):
    """Append ``results`` as a single CSV row to the file ``fn``.

    Args:
        fn: Path of the CSV file; it is opened in append mode, so it is
            created if missing and existing rows are kept.
        results: Iterable of values; each element becomes one column.
    """
    # newline='' hands line-ending control to the csv module; without it,
    # platforms that translate "\n" emit "\r\r\n" and rows show up separated
    # by blank lines.
    with open(fn, 'a', newline='') as f:
        csv.writer(f).writerow(results)

In [7]:
def flatten_topics(topics):
    """Return the second element of every entry in ``topics`` as a list.

    Args:
        topics: Iterable of indexable entries (for example
            ``(topic_id, topic_string)`` pairs).

    Returns:
        List holding ``entry[1]`` for each entry, in the original order.
    """
    second_items = []
    for entry in topics:
        second_items.append(entry[1])
    return second_items

Load data

In [8]:
def load_data(forum="special-needs", group="parent"):
    """Load the lemmatized text, corpus and dictionary for one forum/group.

    The file locations come from the module-level templates
    ``path_lemma_pkl``, ``path_corpus_pkl`` and ``path_dictionary_gensim``,
    each formatted with ``forum`` and ``group``.

    Args:
        forum: Forum name substituted into the path templates.
        group: Group name substituted into the path templates.

    Returns:
        Tuple ``(lemmatized_text, corpus, dictionary)``: the two unpickled
        objects followed by the loaded gensim ``Dictionary``.
    """
    # NOTE: pickle can run arbitrary code while loading; only use this on
    # files generated by this project, never on untrusted input.
    # The with-blocks close each file handle as soon as it has been read.
    with open(path_lemma_pkl.format(forum, group), 'rb') as lemma_file:
        lemmatized_text = pickle.load(lemma_file)
    with open(path_corpus_pkl.format(forum, group), 'rb') as corpus_file:
        corpus = pickle.load(corpus_file)
    dictionary = corpora.Dictionary.load(path_dictionary_gensim.format(forum, group))
    return lemmatized_text, corpus, dictionary

## File Locations

In [9]:
# Current working directory of the running process.
p = Path.cwd()
# Its immediate parent directory; raises IndexError if the working directory
# has no parent (i.e. it is the filesystem root).
path_parent = p.parents[0]

In [10]:
# Path templates under <path_parent>/clean_data; the numbered placeholders
# are filled in later with str.format.

# input data
path_lemma_pkl = str(path_parent.joinpath("clean_data", "lemmatized_text_{0}_{1}.pkl"))
path_corpus_pkl = str(path_parent.joinpath("clean_data", "corpus_{0}_{1}.pkl"))
path_dictionary_gensim = str(path_parent.joinpath("clean_data", "dictionary_{0}_{1}.gensim"))
# saved models
path_tune_models = str(path_parent.joinpath("clean_data", "lda_tune_{0}_{1}_{2}.gensim"))
# tuning / resampling result logs
path_a_b_t_tune = str(path_parent.joinpath("clean_data", "a_b_t_tune_{0}_{1}_{2}_{3}.csv"))
path_resample = str(path_parent.joinpath("clean_data", "resample_10_topics_{0}_{1}.csv"))
# path_coherence = str(path_parent / "clean_data" / "coherence_{}.csv")

## Special Needs: Parent Posts

### Load Data

In [None]:
# Load the inputs for forum 'special-needs', group 'parent'.
lemmatized_text, corpus, dictionary = load_data('special-needs', 'parent')

### Perform LDA

Tune LDA with a grid search over the hyperparameters n_topics, alpha, and beta (eta). Every computed coherence score is saved, but only the best topic model (highest coherence) is saved for each number of topics.

In [None]:
# Grid search over (n_topics, alpha, eta) for the 'parent' group.
# For every combination a model is trained and scored; every topic of every
# model is appended to the tuning CSV, and for each n_topics only the model
# with the highest coherence seen so far is kept on disk.
forum = 'special-needs'
group = 'parent'
# `search` and `run` only end up in the name of the tuning CSV file.
search = 'grid'
run = "2"
NUM_WORDS = 10
# Passed to LdaMulticore as `passes`.
n_iterations = 50
w = 15 # 16 cores (1 main + 15 workers)
a_list = [.1, .2, .3, .4, .5, .6, .7, .8, .9]
b_list = [.1, .2, .3, .4, .5, .6, .7, .8, .9]
# Maps "topics_<n>" -> {"alpha", "beta", "coherence"} of the best model so far.
best_models = {}
# n_topics runs from 4 to 20 inclusive.
for n_topics in range(4, 21):
    model_name = "topics_{}".format(str(n_topics))
    for a in a_list:
        for b in b_list:
            print("topics: {}, alpha: {}, beta: {}".format(n_topics, a, b))
            # %time is an IPython magic: this cell only runs inside IPython/Jupyter.
            %time ldamodel = LdaMulticore(corpus, num_topics = n_topics, id2word=dictionary, passes=n_iterations, alpha=a, eta=b, workers=w)
            topics = ldamodel.print_topics(num_words=NUM_WORDS)
            perplexity = ldamodel.log_perplexity(corpus)
            coherence_model = CoherenceModel(model=ldamodel, texts=lemmatized_text, dictionary=dictionary, coherence='c_v')
            coherence = coherence_model.get_coherence()
            print(model_name)
            print(best_models.keys())
            if model_name not in best_models:
                # First model for this n_topics: it is the best by definition.
                best_models[model_name] = {"alpha":a, "beta":b, "coherence":coherence}
                ldamodel.save(path_tune_models.format(forum, group, str(n_topics)))
            else:
                print(best_models[model_name]["coherence"])
                print(coherence)
                # Strict comparison: on a tie the earlier model stays saved.
                if best_models[model_name]["coherence"] < coherence:
                    print("new best model")
                    best_models[model_name] = {"alpha":a, "beta":b, "coherence":coherence}
                    # Same file name per n_topics, so the previous best is overwritten.
                    ldamodel.save(path_tune_models.format(forum, group, str(n_topics)))
            # One CSV row per topic; the model-level scores are repeated on each row.
            for topic in topics:
                write_list(path_a_b_t_tune.format(forum, group, search, run), [coherence, perplexity, n_topics, a, b, topic[0], topic[1]])

In [None]:
# Retrain the same configuration (10 topics, alpha=0.7, eta=0.1) repeatedly
# and append each run's coherence and topics to the resampling CSV.
forum = 'special-needs'
group = 'parent'
n_topics = 10
a = 0.7
b = 0.1
NUM_WORDS = 10
# Passed to LdaMulticore as `passes`.
n_iterations = 50
# NOTE(review): n_samples is never read in this cell; the loop bounds below
# are hard-coded instead.
n_samples = 100
w = 15 # 16 cores (1 main + 15 workers)
# Sample ids 30..100 inclusive.
# NOTE(review): starting at 30 presumably resumes an interrupted run - confirm.
for s in range(30, 101):
    print("sample: {}".format(s))
    # No random_state is passed; presumably each run is expected to differ - TODO confirm.
    # %time is an IPython magic: this cell only runs inside IPython/Jupyter.
    %time ldamodel = LdaMulticore(corpus, num_topics = n_topics, id2word=dictionary, passes=n_iterations, alpha=a, eta=b, workers=w)
    topics = ldamodel.print_topics(num_words=NUM_WORDS)
    coherence_model = CoherenceModel(model=ldamodel, texts=lemmatized_text, dictionary=dictionary, coherence='c_v')
    coherence = coherence_model.get_coherence()
    # One CSV row per topic: sample id, run coherence, topic id, topic string.
    for topic in topics:
        write_list(path_resample.format(forum, group), [s, coherence, topic[0], topic[1]])

Visualize the topics. See: https://www.objectorientedsubject.net/2018/08/experiments-on-topic-modeling-pyldavis/

## Special Needs: All Posts

### Load data

In [11]:
# Load the inputs for forum 'special-needs', group 'all'; this rebinds the
# lemmatized_text, corpus and dictionary names.
lemmatized_text, corpus, dictionary = load_data('special-needs', 'all')

### Perform LDA

In [None]:
# Grid search over (n_topics, alpha, eta) for the 'all' group.
# For every combination a model is trained and scored; every topic of every
# model is appended to the tuning CSV, and for each n_topics only the model
# with the highest coherence seen so far is kept on disk.
forum = 'special-needs'
group = 'all'
# `search` and `run` only end up in the name of the tuning CSV file.
search = 'grid'
run = "1"
NUM_WORDS = 10
# Passed to LdaMulticore as `passes`.
n_iterations = 50
w = 15 # 16 cores (1 main + 15 workers)
a_list = [.1, .3, .5, .7, .9]
b_list = [.1, .3, .5, .7, .9]
# Maps "topics_<n>" -> {"alpha", "beta", "coherence"} of the best model so far.
best_models = {}
# n_topics runs from 7 to 17 inclusive.
for n_topics in range(7, 18):
    model_name = "topics_{}".format(str(n_topics))
    for a in a_list:
        for b in b_list:
            print("topics: {}, alpha: {}, beta: {}".format(n_topics, a, b))
            # %time is an IPython magic: this cell only runs inside IPython/Jupyter.
            %time ldamodel = LdaMulticore(corpus, num_topics = n_topics, id2word=dictionary, passes=n_iterations, alpha=a, eta=b, workers=w)
            topics = ldamodel.print_topics(num_words=NUM_WORDS)
            perplexity = ldamodel.log_perplexity(corpus)
            coherence_model = CoherenceModel(model=ldamodel, texts=lemmatized_text, dictionary=dictionary, coherence='c_v')
            coherence = coherence_model.get_coherence()
            print(model_name)
            print(best_models.keys())
            if model_name not in best_models:
                # First model for this n_topics: it is the best by definition.
                best_models[model_name] = {"alpha":a, "beta":b, "coherence":coherence}
                ldamodel.save(path_tune_models.format(forum, group, str(n_topics)))
            else:
                print(best_models[model_name]["coherence"])
                print(coherence)
                # Strict comparison: on a tie the earlier model stays saved.
                if best_models[model_name]["coherence"] < coherence:
                    print("new best model")
                    best_models[model_name] = {"alpha":a, "beta":b, "coherence":coherence}
                    # Same file name per n_topics, so the previous best is overwritten.
                    ldamodel.save(path_tune_models.format(forum, group, str(n_topics)))
            # One CSV row per topic; the model-level scores are repeated on each row.
            for topic in topics:
                write_list(path_a_b_t_tune.format(forum, group, search, run), [coherence, perplexity, n_topics, a, b, topic[0], topic[1]])

In [12]:
# Train a single model for the 'all' group with fixed hyperparameters
# (15 topics, alpha=0.7, eta=0.1), compute its c_v coherence and print the
# topic strings.
forum = 'special-needs'
group = 'all'
n_topics = 15
a = 0.7
b = 0.1
NUM_WORDS = 10
# Passed to LdaMulticore as `passes`.
n_iterations = 50
w = 15 # 16 cores (1 main + 15 workers)
# %time is an IPython magic: this cell only runs inside IPython/Jupyter.
%time ldamodel = LdaMulticore(corpus, num_topics = n_topics, id2word=dictionary, passes=n_iterations, alpha=a, eta=b, workers=w)
topics = ldamodel.print_topics(num_words=NUM_WORDS)
coherence_model = CoherenceModel(model=ldamodel, texts=lemmatized_text, dictionary=dictionary, coherence='c_v')
coherence = coherence_model.get_coherence()
# topic[1] is the formatted "weight*word + ..." string of each topic.
for topic in topics:
    print(topic[1])

CPU times: user 6min 6s, sys: 2min 19s, total: 8min 25s
Wall time: 7min 22s
0.160*"get" + 0.027*"one" + 0.027*"still" + 0.024*"therapy" + 0.020*"old" + 0.019*"do" + 0.018*"services" + 0.016*"test" + 0.016*"2" + 0.012*"ot"
0.198*"kid" + 0.097*"dont" + 0.096*"think" + 0.055*"issue" + 0.035*"many" + 0.033*"well" + 0.022*"lot" + 0.022*"behavior" + 0.018*"problem" + 0.015*"understand"
0.080*"say" + 0.064*"want" + 0.050*"he" + 0.048*"im" + 0.035*"doesnt" + 0.028*"cant" + 0.026*"something" + 0.025*"never" + 0.023*"always" + 0.017*"anything"
0.235*"school" + 0.102*"dc" + 0.075*"sn" + 0.032*"class" + 0.031*"year" + 0.026*"private" + 0.022*"public" + 0.018*"move" + 0.014*"would" + 0.011*"option"
0.063*"would" + 0.057*"op" + 0.048*"tell" + 0.046*"right" + 0.039*"thats" + 0.036*"may" + 0.033*"didnt" + 0.033*"call" + 0.031*"maybe" + 0.030*"thanks"
0.091*"child" + 0.035*"support" + 0.029*"also" + 0.028*"program" + 0.021*"college" + 0.021*"go" + 0.020*"ld" + 0.018*"high" + 0.016*"student" + 0.016*"ca

In [14]:
# Show the c_v coherence of the most recently scored model.
print(coherence)

0.5742621474732515


In [13]:
# Persist the current model. The file name depends only on forum, group and
# n_topics, so an existing model file for the same combination is overwritten.
ldamodel.save(path_tune_models.format(forum, group, str(n_topics)))

## What is the Dominant Topic in each Post?

In [None]:
# Compute the dominant topic (with its contribution and keywords) for the
# documents in the current corpus using the current model.
df_topic_sents_keywords = format_topics_sentences(ldamodel=ldamodel, corpus=corpus)
# Print column dtypes / non-null counts, then display the first rows
# (head() is the last expression, so the notebook renders it).
df_topic_sents_keywords.info()
df_topic_sents_keywords.head()

In [None]:
# Load the text of the special-needs posts whose parent_id is empty, joined
# with their rows in the sentiment table.
import sqlite3
path_db = str(path_parent / "database" / "youbemom-merged.db")
# String literals are single-quoted: SQLite parses double-quoted tokens as
# identifiers first and only falls back to treating them as strings through a
# legacy compatibility mode that can be disabled, in which case the
# double-quoted form fails with "no such column".
sql = """
    SELECT s.text_no_url AS text_no_url, s.text as text
    FROM sentiment AS s
    JOIN posts AS p
    ON s.message_id = p.message_id
    WHERE p.subforum='special-needs' AND p.parent_id=''
"""
conn = create_connection(path_db)
sn = pd.read_sql_query(sql, conn)
# Print column dtypes / non-null counts of the loaded frame.
sn.info()

## Save Model Topics and Keywords in a New Database Table

In [None]:
# Write the dominant-topic table into the SQLite database at path_db.
# NOTE(review): this connection is never closed in the visible code - confirm
# whether later cells rely on it before adding conn.close().
conn = sqlite3.connect(path_db)
# if_exists='replace' drops any existing 'topicmodel' table before writing;
# index=False leaves the DataFrame index out of the table.
df_topic_sents_keywords.to_sql('topicmodel', conn, if_exists='replace', index=False)