# Generate Topic Models
Generates the topic models of forum posts with LDA (Latent Dirichlet Allocation)

## Data Sources
- corpus (created with 3.0-Topic_Models-Lemmatize_Text.ipynb)
- dictionary (created with 3.0-Topic_Models-Lemmatize_Text.ipynb)
- lemmatized_text (created with 3.0-Topic_Models-Lemmatize_Text.ipynb)

## Changes
- 2020-09-16: Created
- 2020-09-17: Found topic model with highest coherence and generated dominant topics
- 2020-12-19: Added new data

## TODO
- Tutorial
 - https://towardsdatascience.com/topic-modelling-in-python-with-nltk-and-gensim-4ef03213cd21
 - https://www.machinelearningplus.com/nlp/topic-modeling-gensim-python


## Imports

In [4]:
from gensim import corpora, models
import pickle
from pathlib import Path
from io import FileIO
import pyLDAvis.gensim
from gensim.models import CoherenceModel, LdaModel, LdaMulticore
import pandas as pd
from youbemom import create_connection
import csv
import os

In [5]:
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

  and should_run_async(code)


## Functions

For formatting LDA

In [6]:
def format_topics_sentences(ldamodel, corpus):
    # Init output
    sent_topics_df = pd.DataFrame()

    # Get main topic in each document
    for i, row in enumerate(ldamodel[corpus]):
        row = sorted(row, key=lambda x: (x[1]), reverse=True)
        # Get the Dominant topic, Perc Contribution and Keywords for each document
        for j, (topic_num, prop_topic) in enumerate(row):
            if j == 0:  # => dominant topic
                wp = ldamodel.show_topic(topic_num)
                topic_keywords = ", ".join([word for word, prop in wp])
                sent_topics_df = sent_topics_df.append(pd.Series([int(topic_num), round(prop_topic,4), topic_keywords]), ignore_index=True)
            else:
                break
    sent_topics_df.columns = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']
    return(sent_topics_df)

In [7]:
def write_coherence(fn, n_workers, n_topics, coherence, perplexity):
    with open(fn, 'a') as f:
        writer = csv.writer(f) 
        writer.writerow([n_workers, n_topics, coherence, perplexity])

In [8]:
def write_sample(fn, n, coherence, perplexity):
    with open(fn, 'a') as f:
        writer = csv.writer(f) 
        writer.writerow([n, coherence, perplexity])

In [9]:
def write_a_b_t_tune(fn, coherence, perplexity, n_topics, a, b, i, topic):
    with open(fn, 'a') as f:
        writer = csv.writer(f) 
        writer.writerow([coherence, perplexity, n_topics, a, b, i, topic])

In [10]:
def flatten_topics(topics):
    return [t[1] for t in topics]

Load data

In [11]:
def load_data(forum="special-needs", group="parent"):
    lemmatized_text = pickle.load(open(path_lemma_pkl.format(forum, group), 'rb'))
    corpus = pickle.load(open(path_corpus_pkl.format(forum, group), 'rb'))
    dictionary = corpora.Dictionary.load(path_dictionary_gensim.format(forum, group))
    return lemmatized_text, corpus, dictionary

## File Locations

In [12]:
p = Path.cwd()
path_parent = p.parents[0]

In [13]:
path_lemma_pkl = str(path_parent / "clean_data" / "lemmatized_text_{0}_{1}.pkl")
path_corpus_pkl = str(path_parent / "clean_data" / "corpus_{0}_{1}.pkl")
path_dictionary_gensim = str(path_parent / "clean_data" / "dictionary_{0}_{1}.gensim")
# path_model = path_parent / "clean_data"
path_a_b_t_tune = str(path_parent / "clean_data" / "a_b_t_tune_{0}_{1}_{2}.csv")
# path_coherence = str(path_parent / "clean_data" / "coherence_{}.csv")

## Special Needs

### Load Data

In [14]:
lemmatized_text, corpus, dictionary = load_data('special-needs', 'parent')

### Perform LDA

Examine difference between a and b ranges

In [15]:
# forum = 'special-needs'
# group = 'parent'
# search = 'grid'
# NUM_WORDS = 10
# n_iterations = 50
# w = 9 # 10 cores (1 main + 9 workers)
# a_list = [.1, .2, .3, .4, .5, .6, .7, .8, .9]
# b_list = [.1, .2, .3, .4, .5, .6, .7, .8, .9]
# for n_topics in range(2, 21):
#     for a in a_list:
#         for b in b_list:
#             print("topics: {}, alpha: {}, beta: {}".format(n_topics, a, b))
#             %time ldamodel = LdaMulticore(corpus, num_topics = n_topics, id2word=dictionary, passes=n_iterations, alpha=a, eta=b, workers=w)
#             topics = ldamodel.print_topics(num_words=NUM_WORDS)
#             perplexity = ldamodel.log_perplexity(corpus)
#             coherence_model = CoherenceModel(model=ldamodel, texts=lemmatized_text, dictionary=dictionary, coherence='c_v')
#             coherence = coherence_model.get_coherence()
#             for topic in topics:
#                 write_a_b_t_tune(path_a_b_t_tune.format(forum, group, search), coherence, perplexity, n_topics, a, b, topic[0], topic[1])

In [17]:
forum = 'special-needs'
group = 'parent'
search = 'grid'
NUM_WORDS = 10
n_iterations = 50
w = 9 # 10 cores (1 main + 9 workers)
a_list = [.1, .2, .3, .4, .5, .6, .7, .8, .9]
b_list = [.1, .2, .3, .4, .5, .6, .7, .8, .9]
for n_topics in range(3, 21):
    for a in a_list:
        for b in b_list:
            print("topics: {}, alpha: {}, beta: {}".format(n_topics, a, b))
            %time ldamodel = LdaMulticore(corpus, num_topics = n_topics, id2word=dictionary, passes=n_iterations, alpha=a, eta=b, workers=w)
            topics = ldamodel.print_topics(num_words=NUM_WORDS)
            perplexity = ldamodel.log_perplexity(corpus)
            coherence_model = CoherenceModel(model=ldamodel, texts=lemmatized_text, dictionary=dictionary, coherence='c_v')
            coherence = coherence_model.get_coherence()
            for topic in topics:
                write_a_b_t_tune(path_a_b_t_tune.format(forum, group, search), coherence, perplexity, n_topics, a, b, topic[0], topic[1])

topics: 3, alpha: 0.1, beta: 0.1
CPU times: user 1min 55s, sys: 18.6 s, total: 2min 14s
Wall time: 1min 56s
topics: 3, alpha: 0.1, beta: 0.2
CPU times: user 1min 51s, sys: 19.5 s, total: 2min 11s
Wall time: 1min 52s
topics: 3, alpha: 0.1, beta: 0.3
CPU times: user 1min 57s, sys: 20.1 s, total: 2min 17s
Wall time: 1min 57s
topics: 3, alpha: 0.1, beta: 0.4
CPU times: user 1min 53s, sys: 17.1 s, total: 2min 10s
Wall time: 1min 52s
topics: 3, alpha: 0.1, beta: 0.5
CPU times: user 2min 1s, sys: 19.1 s, total: 2min 20s
Wall time: 2min 1s
topics: 3, alpha: 0.1, beta: 0.6
CPU times: user 1min 50s, sys: 17.3 s, total: 2min 7s
Wall time: 1min 50s
topics: 3, alpha: 0.1, beta: 0.7
CPU times: user 1min 47s, sys: 13.4 s, total: 2min 1s
Wall time: 1min 48s
topics: 3, alpha: 0.1, beta: 0.8
CPU times: user 1min 47s, sys: 17.4 s, total: 2min 5s
Wall time: 1min 48s
topics: 3, alpha: 0.1, beta: 0.9
CPU times: user 2min 8s, sys: 26.8 s, total: 2min 35s
Wall time: 2min 9s
topics: 3, alpha: 0.2, beta: 0.1
CP

topics: 3, alpha: 0.9, beta: 0.6
CPU times: user 2min 6s, sys: 26.7 s, total: 2min 33s
Wall time: 2min 7s
topics: 3, alpha: 0.9, beta: 0.7
CPU times: user 2min 10s, sys: 29.2 s, total: 2min 39s
Wall time: 2min 11s
topics: 3, alpha: 0.9, beta: 0.8
CPU times: user 2min 5s, sys: 27 s, total: 2min 32s
Wall time: 2min 6s
topics: 3, alpha: 0.9, beta: 0.9
CPU times: user 2min 19s, sys: 31.7 s, total: 2min 51s
Wall time: 2min 20s
topics: 4, alpha: 0.1, beta: 0.1
CPU times: user 2min 11s, sys: 25.6 s, total: 2min 37s
Wall time: 2min 12s
topics: 4, alpha: 0.1, beta: 0.2
CPU times: user 2min 6s, sys: 25.3 s, total: 2min 32s
Wall time: 2min 7s
topics: 4, alpha: 0.1, beta: 0.3
CPU times: user 2min 8s, sys: 25.8 s, total: 2min 34s
Wall time: 2min 9s
topics: 4, alpha: 0.1, beta: 0.4
CPU times: user 2min 2s, sys: 21.8 s, total: 2min 23s
Wall time: 2min 1s
topics: 4, alpha: 0.1, beta: 0.5
CPU times: user 2min 7s, sys: 24.9 s, total: 2min 32s
Wall time: 2min 8s
topics: 4, alpha: 0.1, beta: 0.6
CPU times

topics: 4, alpha: 0.9, beta: 0.2
CPU times: user 2min, sys: 21.6 s, total: 2min 21s
Wall time: 1min 59s
topics: 4, alpha: 0.9, beta: 0.3
CPU times: user 1min 57s, sys: 20 s, total: 2min 17s
Wall time: 1min 56s
topics: 4, alpha: 0.9, beta: 0.4
CPU times: user 2min 4s, sys: 23.5 s, total: 2min 28s
Wall time: 2min 4s
topics: 4, alpha: 0.9, beta: 0.5
CPU times: user 2min 2s, sys: 22.5 s, total: 2min 24s
Wall time: 2min 2s
topics: 4, alpha: 0.9, beta: 0.6
CPU times: user 2min 5s, sys: 24.9 s, total: 2min 29s
Wall time: 2min 5s
topics: 4, alpha: 0.9, beta: 0.7
CPU times: user 2min 7s, sys: 25.8 s, total: 2min 33s
Wall time: 2min 8s
topics: 4, alpha: 0.9, beta: 0.8
CPU times: user 2min 5s, sys: 25.1 s, total: 2min 30s
Wall time: 2min 6s
topics: 4, alpha: 0.9, beta: 0.9
CPU times: user 1min 56s, sys: 22.1 s, total: 2min 18s
Wall time: 1min 57s
topics: 5, alpha: 0.1, beta: 0.1
CPU times: user 2min 13s, sys: 27.1 s, total: 2min 40s
Wall time: 2min 14s
topics: 5, alpha: 0.1, beta: 0.2
CPU times: 

topics: 5, alpha: 0.8, beta: 0.7
CPU times: user 2min 10s, sys: 28 s, total: 2min 38s
Wall time: 2min 11s
topics: 5, alpha: 0.8, beta: 0.8
CPU times: user 2min 10s, sys: 29.9 s, total: 2min 40s
Wall time: 2min 11s
topics: 5, alpha: 0.8, beta: 0.9
CPU times: user 2min 16s, sys: 31.5 s, total: 2min 48s
Wall time: 2min 17s
topics: 5, alpha: 0.9, beta: 0.1
CPU times: user 2min 8s, sys: 28.1 s, total: 2min 36s
Wall time: 2min 9s
topics: 5, alpha: 0.9, beta: 0.2
CPU times: user 2min 5s, sys: 25.4 s, total: 2min 31s
Wall time: 2min 6s
topics: 5, alpha: 0.9, beta: 0.3
CPU times: user 2min 11s, sys: 30.3 s, total: 2min 41s
Wall time: 2min 12s
topics: 5, alpha: 0.9, beta: 0.4
CPU times: user 2min 17s, sys: 30.6 s, total: 2min 47s
Wall time: 2min 17s
topics: 5, alpha: 0.9, beta: 0.5
CPU times: user 2min 4s, sys: 25.8 s, total: 2min 30s
Wall time: 2min 4s
topics: 5, alpha: 0.9, beta: 0.6
CPU times: user 2min 7s, sys: 27.7 s, total: 2min 35s
Wall time: 2min 9s
topics: 5, alpha: 0.9, beta: 0.7
CPU t

topics: 6, alpha: 0.8, beta: 0.3
CPU times: user 2min 3s, sys: 25.8 s, total: 2min 29s
Wall time: 2min 4s
topics: 6, alpha: 0.8, beta: 0.4
CPU times: user 2min 6s, sys: 26.4 s, total: 2min 32s
Wall time: 2min 7s
topics: 6, alpha: 0.8, beta: 0.5
CPU times: user 2min 14s, sys: 29.1 s, total: 2min 43s
Wall time: 2min 15s
topics: 6, alpha: 0.8, beta: 0.6
CPU times: user 2min 15s, sys: 28.9 s, total: 2min 44s
Wall time: 2min 16s
topics: 6, alpha: 0.8, beta: 0.7
CPU times: user 2min 7s, sys: 28.2 s, total: 2min 36s
Wall time: 2min 8s
topics: 6, alpha: 0.8, beta: 0.8
CPU times: user 2min 21s, sys: 33.4 s, total: 2min 54s
Wall time: 2min 22s
topics: 6, alpha: 0.8, beta: 0.9
CPU times: user 2min 8s, sys: 28.8 s, total: 2min 37s
Wall time: 2min 9s
topics: 6, alpha: 0.9, beta: 0.1
CPU times: user 2min 6s, sys: 28.2 s, total: 2min 34s
Wall time: 2min 7s
topics: 6, alpha: 0.9, beta: 0.2
CPU times: user 2min 6s, sys: 28.7 s, total: 2min 35s
Wall time: 2min 8s
topics: 6, alpha: 0.9, beta: 0.3
CPU tim

topics: 7, alpha: 0.7, beta: 0.8
CPU times: user 2min 13s, sys: 31.1 s, total: 2min 44s
Wall time: 2min 14s
topics: 7, alpha: 0.7, beta: 0.9
CPU times: user 2min 9s, sys: 27.4 s, total: 2min 37s
Wall time: 2min 10s
topics: 7, alpha: 0.8, beta: 0.1
CPU times: user 2min 1s, sys: 23.9 s, total: 2min 25s
Wall time: 2min 2s
topics: 7, alpha: 0.8, beta: 0.2
CPU times: user 1min 56s, sys: 21.7 s, total: 2min 18s
Wall time: 1min 57s
topics: 7, alpha: 0.8, beta: 0.3
CPU times: user 1min 58s, sys: 21.8 s, total: 2min 20s
Wall time: 1min 58s
topics: 7, alpha: 0.8, beta: 0.4
CPU times: user 2min 4s, sys: 26.3 s, total: 2min 30s
Wall time: 2min 5s
topics: 7, alpha: 0.8, beta: 0.5
CPU times: user 2min 2s, sys: 26.5 s, total: 2min 29s
Wall time: 2min 3s
topics: 7, alpha: 0.8, beta: 0.6
CPU times: user 2min 14s, sys: 29.2 s, total: 2min 44s
Wall time: 2min 15s
topics: 7, alpha: 0.8, beta: 0.7
CPU times: user 2min 12s, sys: 30.5 s, total: 2min 43s
Wall time: 2min 13s
topics: 7, alpha: 0.8, beta: 0.8
CP

topics: 8, alpha: 0.7, beta: 0.4
CPU times: user 1min 57s, sys: 21.7 s, total: 2min 19s
Wall time: 1min 58s
topics: 8, alpha: 0.7, beta: 0.5
CPU times: user 2min 6s, sys: 23.9 s, total: 2min 30s
Wall time: 2min 6s
topics: 8, alpha: 0.7, beta: 0.6
CPU times: user 2min 26s, sys: 35.4 s, total: 3min 1s
Wall time: 2min 27s
topics: 8, alpha: 0.7, beta: 0.7
CPU times: user 2min 7s, sys: 27.3 s, total: 2min 34s
Wall time: 2min 8s
topics: 8, alpha: 0.7, beta: 0.8
CPU times: user 2min 5s, sys: 25.4 s, total: 2min 31s
Wall time: 2min 6s
topics: 8, alpha: 0.7, beta: 0.9
CPU times: user 2min 8s, sys: 27.1 s, total: 2min 35s
Wall time: 2min 8s
topics: 8, alpha: 0.8, beta: 0.1
CPU times: user 2min 2s, sys: 26.2 s, total: 2min 29s
Wall time: 2min 3s
topics: 8, alpha: 0.8, beta: 0.2
CPU times: user 2min 17s, sys: 30.8 s, total: 2min 48s
Wall time: 2min 17s
topics: 8, alpha: 0.8, beta: 0.3
CPU times: user 2min 1s, sys: 24.1 s, total: 2min 25s
Wall time: 2min 2s
topics: 8, alpha: 0.8, beta: 0.4
CPU time

topics: 9, alpha: 0.6, beta: 0.9
CPU times: user 2min 27s, sys: 34.7 s, total: 3min 2s
Wall time: 2min 28s
topics: 9, alpha: 0.7, beta: 0.1
CPU times: user 2min 5s, sys: 26.7 s, total: 2min 31s
Wall time: 2min 6s
topics: 9, alpha: 0.7, beta: 0.2
CPU times: user 2min 4s, sys: 24.2 s, total: 2min 28s
Wall time: 2min 5s
topics: 9, alpha: 0.7, beta: 0.3
CPU times: user 2min 8s, sys: 26.3 s, total: 2min 35s
Wall time: 2min 9s
topics: 9, alpha: 0.7, beta: 0.4
CPU times: user 2min 14s, sys: 28.3 s, total: 2min 43s
Wall time: 2min 15s
topics: 9, alpha: 0.7, beta: 0.5
CPU times: user 2min 4s, sys: 26.4 s, total: 2min 31s
Wall time: 2min 6s
topics: 9, alpha: 0.7, beta: 0.6
CPU times: user 2min 7s, sys: 24.9 s, total: 2min 32s
Wall time: 2min 8s
topics: 9, alpha: 0.7, beta: 0.7
CPU times: user 2min 14s, sys: 30.1 s, total: 2min 44s
Wall time: 2min 15s
topics: 9, alpha: 0.7, beta: 0.8
CPU times: user 1min 56s, sys: 22 s, total: 2min 18s
Wall time: 1min 57s
topics: 9, alpha: 0.7, beta: 0.9
CPU time

CPU times: user 2min 11s, sys: 28.5 s, total: 2min 40s
Wall time: 2min 13s
topics: 10, alpha: 0.6, beta: 0.5
CPU times: user 2min 12s, sys: 26.1 s, total: 2min 38s
Wall time: 2min 13s
topics: 10, alpha: 0.6, beta: 0.6
CPU times: user 2min 4s, sys: 25.3 s, total: 2min 29s
Wall time: 2min 5s
topics: 10, alpha: 0.6, beta: 0.7
CPU times: user 2min 11s, sys: 27.7 s, total: 2min 38s
Wall time: 2min 12s
topics: 10, alpha: 0.6, beta: 0.8
CPU times: user 2min 7s, sys: 25.8 s, total: 2min 33s
Wall time: 2min 8s
topics: 10, alpha: 0.6, beta: 0.9
CPU times: user 2min 14s, sys: 28.6 s, total: 2min 43s
Wall time: 2min 15s
topics: 10, alpha: 0.7, beta: 0.1
CPU times: user 2min 1s, sys: 24.1 s, total: 2min 26s
Wall time: 2min 3s
topics: 10, alpha: 0.7, beta: 0.2
CPU times: user 2min 2s, sys: 23.5 s, total: 2min 26s
Wall time: 2min 3s
topics: 10, alpha: 0.7, beta: 0.3
CPU times: user 2min 10s, sys: 28.4 s, total: 2min 39s
Wall time: 2min 11s
topics: 10, alpha: 0.7, beta: 0.4
CPU times: user 1min 58s, s

CPU times: user 2min 17s, sys: 26.4 s, total: 2min 44s
Wall time: 2min 18s
topics: 11, alpha: 0.5, beta: 0.9
CPU times: user 2min 8s, sys: 24.9 s, total: 2min 33s
Wall time: 2min 8s
topics: 11, alpha: 0.6, beta: 0.1
CPU times: user 2min 14s, sys: 29.2 s, total: 2min 43s
Wall time: 2min 15s
topics: 11, alpha: 0.6, beta: 0.2
CPU times: user 2min 12s, sys: 27.1 s, total: 2min 39s
Wall time: 2min 13s
topics: 11, alpha: 0.6, beta: 0.3
CPU times: user 2min 5s, sys: 24.8 s, total: 2min 30s
Wall time: 2min 6s
topics: 11, alpha: 0.6, beta: 0.4
CPU times: user 2min 3s, sys: 22.5 s, total: 2min 25s
Wall time: 2min 4s
topics: 11, alpha: 0.6, beta: 0.5
CPU times: user 2min 15s, sys: 29.4 s, total: 2min 45s
Wall time: 2min 16s
topics: 11, alpha: 0.6, beta: 0.6
CPU times: user 2min 7s, sys: 25.6 s, total: 2min 32s
Wall time: 2min 7s
topics: 11, alpha: 0.6, beta: 0.7
CPU times: user 2min 15s, sys: 29.2 s, total: 2min 44s
Wall time: 2min 16s
topics: 11, alpha: 0.6, beta: 0.8
CPU times: user 2min 13s, s

CPU times: user 2min 29s, sys: 33.4 s, total: 3min 2s
Wall time: 2min 29s
topics: 12, alpha: 0.5, beta: 0.4
CPU times: user 2min 28s, sys: 31.1 s, total: 2min 59s
Wall time: 2min 29s
topics: 12, alpha: 0.5, beta: 0.5
CPU times: user 2min 16s, sys: 27.7 s, total: 2min 43s
Wall time: 2min 16s
topics: 12, alpha: 0.5, beta: 0.6
CPU times: user 2min 8s, sys: 27.6 s, total: 2min 36s
Wall time: 2min 9s
topics: 12, alpha: 0.5, beta: 0.7
CPU times: user 2min 28s, sys: 31.6 s, total: 3min
Wall time: 2min 30s
topics: 12, alpha: 0.5, beta: 0.8
CPU times: user 2min 5s, sys: 23.8 s, total: 2min 29s
Wall time: 2min 6s
topics: 12, alpha: 0.5, beta: 0.9
CPU times: user 2min 24s, sys: 30.4 s, total: 2min 54s
Wall time: 2min 25s
topics: 12, alpha: 0.6, beta: 0.1
CPU times: user 2min 1s, sys: 22.8 s, total: 2min 24s
Wall time: 2min 2s
topics: 12, alpha: 0.6, beta: 0.2
CPU times: user 1min 55s, sys: 18.8 s, total: 2min 14s
Wall time: 1min 56s
topics: 12, alpha: 0.6, beta: 0.3
CPU times: user 2min 20s, sys:

CPU times: user 2min 31s, sys: 33 s, total: 3min 4s
Wall time: 2min 31s
topics: 13, alpha: 0.4, beta: 0.8
CPU times: user 2min 7s, sys: 25.7 s, total: 2min 33s
Wall time: 2min 8s
topics: 13, alpha: 0.4, beta: 0.9
CPU times: user 2min 11s, sys: 23.9 s, total: 2min 35s
Wall time: 2min 11s
topics: 13, alpha: 0.5, beta: 0.1
CPU times: user 2min 5s, sys: 22.8 s, total: 2min 27s
Wall time: 2min 6s
topics: 13, alpha: 0.5, beta: 0.2
CPU times: user 2min 11s, sys: 25.5 s, total: 2min 37s
Wall time: 2min 12s
topics: 13, alpha: 0.5, beta: 0.3
CPU times: user 2min 7s, sys: 22.9 s, total: 2min 30s
Wall time: 2min 7s
topics: 13, alpha: 0.5, beta: 0.4
CPU times: user 2min 21s, sys: 27.9 s, total: 2min 49s
Wall time: 2min 21s
topics: 13, alpha: 0.5, beta: 0.5
CPU times: user 2min 29s, sys: 32.3 s, total: 3min 1s
Wall time: 2min 30s
topics: 13, alpha: 0.5, beta: 0.6
CPU times: user 2min 11s, sys: 22.7 s, total: 2min 33s
Wall time: 2min 11s
topics: 13, alpha: 0.5, beta: 0.7
CPU times: user 2min 22s, sys

CPU times: user 2min 30s, sys: 29.5 s, total: 3min
Wall time: 2min 31s
topics: 14, alpha: 0.4, beta: 0.3
CPU times: user 2min 32s, sys: 31 s, total: 3min 3s
Wall time: 2min 33s
topics: 14, alpha: 0.4, beta: 0.4
CPU times: user 2min 34s, sys: 32.2 s, total: 3min 6s
Wall time: 2min 36s
topics: 14, alpha: 0.4, beta: 0.5
CPU times: user 2min 30s, sys: 33.5 s, total: 3min 4s
Wall time: 2min 32s
topics: 14, alpha: 0.4, beta: 0.6
CPU times: user 2min 22s, sys: 27.2 s, total: 2min 49s
Wall time: 2min 22s
topics: 14, alpha: 0.4, beta: 0.7
CPU times: user 2min 18s, sys: 27.5 s, total: 2min 45s
Wall time: 2min 19s
topics: 14, alpha: 0.4, beta: 0.8
CPU times: user 2min 25s, sys: 33.7 s, total: 2min 59s
Wall time: 2min 26s
topics: 14, alpha: 0.4, beta: 0.9
CPU times: user 2min 11s, sys: 23.8 s, total: 2min 35s
Wall time: 2min 11s
topics: 14, alpha: 0.5, beta: 0.1
CPU times: user 2min 17s, sys: 27.7 s, total: 2min 45s
Wall time: 2min 18s
topics: 14, alpha: 0.5, beta: 0.2
CPU times: user 2min 2s, sys

CPU times: user 2min 15s, sys: 26.8 s, total: 2min 42s
Wall time: 2min 16s
topics: 15, alpha: 0.3, beta: 0.7
CPU times: user 2min 22s, sys: 27.2 s, total: 2min 49s
Wall time: 2min 23s
topics: 15, alpha: 0.3, beta: 0.8
CPU times: user 2min 20s, sys: 26.7 s, total: 2min 47s
Wall time: 2min 21s
topics: 15, alpha: 0.3, beta: 0.9
CPU times: user 1min 57s, sys: 25.2 s, total: 2min 23s
Wall time: 1min 58s
topics: 15, alpha: 0.4, beta: 0.1
CPU times: user 2min 17s, sys: 22.6 s, total: 2min 40s
Wall time: 2min 18s
topics: 15, alpha: 0.4, beta: 0.2
CPU times: user 2min 19s, sys: 26.4 s, total: 2min 46s
Wall time: 2min 20s
topics: 15, alpha: 0.4, beta: 0.3
CPU times: user 2min 26s, sys: 28.3 s, total: 2min 54s
Wall time: 2min 27s
topics: 15, alpha: 0.4, beta: 0.4
CPU times: user 2min 18s, sys: 26.2 s, total: 2min 44s
Wall time: 2min 19s
topics: 15, alpha: 0.4, beta: 0.5
CPU times: user 2min 18s, sys: 27.4 s, total: 2min 45s
Wall time: 2min 19s
topics: 15, alpha: 0.4, beta: 0.6
CPU times: user 2mi

CPU times: user 2min 41s, sys: 36.6 s, total: 3min 18s
Wall time: 2min 44s
topics: 16, alpha: 0.3, beta: 0.2
CPU times: user 2min 17s, sys: 25.7 s, total: 2min 43s
Wall time: 2min 20s
topics: 16, alpha: 0.3, beta: 0.3
CPU times: user 2min 11s, sys: 25.3 s, total: 2min 36s
Wall time: 2min 14s
topics: 16, alpha: 0.3, beta: 0.4
CPU times: user 2min 15s, sys: 25.4 s, total: 2min 41s
Wall time: 2min 18s
topics: 16, alpha: 0.3, beta: 0.5
CPU times: user 2min 19s, sys: 28.8 s, total: 2min 48s
Wall time: 2min 22s
topics: 16, alpha: 0.3, beta: 0.6
CPU times: user 2min 23s, sys: 28.8 s, total: 2min 51s
Wall time: 2min 25s
topics: 16, alpha: 0.3, beta: 0.7
CPU times: user 2min 25s, sys: 32.1 s, total: 2min 57s
Wall time: 2min 29s
topics: 16, alpha: 0.3, beta: 0.8
CPU times: user 2min 13s, sys: 30.5 s, total: 2min 43s
Wall time: 2min 16s
topics: 16, alpha: 0.3, beta: 0.9
CPU times: user 2min 26s, sys: 30 s, total: 2min 56s
Wall time: 2min 29s
topics: 16, alpha: 0.4, beta: 0.1
CPU times: user 2min 

CPU times: user 2min 14s, sys: 31.5 s, total: 2min 45s
Wall time: 2min 17s
topics: 17, alpha: 0.2, beta: 0.6
CPU times: user 2min 12s, sys: 28.2 s, total: 2min 40s
Wall time: 2min 15s
topics: 17, alpha: 0.2, beta: 0.7
CPU times: user 2min 14s, sys: 27.7 s, total: 2min 41s
Wall time: 2min 17s
topics: 17, alpha: 0.2, beta: 0.8
CPU times: user 1min 59s, sys: 25.7 s, total: 2min 25s
Wall time: 2min 2s
topics: 17, alpha: 0.2, beta: 0.9
CPU times: user 2min 1s, sys: 25.4 s, total: 2min 26s
Wall time: 2min 4s
topics: 17, alpha: 0.3, beta: 0.1
CPU times: user 2min 28s, sys: 30.8 s, total: 2min 59s
Wall time: 2min 31s
topics: 17, alpha: 0.3, beta: 0.2
CPU times: user 2min 37s, sys: 34.6 s, total: 3min 11s
Wall time: 2min 40s
topics: 17, alpha: 0.3, beta: 0.3
CPU times: user 2min 24s, sys: 28.9 s, total: 2min 53s
Wall time: 2min 27s
topics: 17, alpha: 0.3, beta: 0.4
CPU times: user 2min 14s, sys: 28.5 s, total: 2min 42s
Wall time: 2min 17s
topics: 17, alpha: 0.3, beta: 0.5
CPU times: user 2min 1

CPU times: user 2min 6s, sys: 27.5 s, total: 2min 33s
Wall time: 2min 9s
topics: 18, alpha: 0.2, beta: 0.1
CPU times: user 2min 16s, sys: 27.1 s, total: 2min 43s
Wall time: 2min 19s
topics: 18, alpha: 0.2, beta: 0.2
CPU times: user 2min 23s, sys: 30.6 s, total: 2min 53s
Wall time: 2min 26s
topics: 18, alpha: 0.2, beta: 0.3
CPU times: user 2min 14s, sys: 26.2 s, total: 2min 40s
Wall time: 2min 17s
topics: 18, alpha: 0.2, beta: 0.4
CPU times: user 2min 10s, sys: 26.1 s, total: 2min 36s
Wall time: 2min 13s
topics: 18, alpha: 0.2, beta: 0.5
CPU times: user 2min 13s, sys: 27.1 s, total: 2min 40s
Wall time: 2min 16s
topics: 18, alpha: 0.2, beta: 0.6
CPU times: user 2min 15s, sys: 27.8 s, total: 2min 42s
Wall time: 2min 17s
topics: 18, alpha: 0.2, beta: 0.7
CPU times: user 2min 8s, sys: 26.5 s, total: 2min 34s
Wall time: 2min 11s
topics: 18, alpha: 0.2, beta: 0.8
CPU times: user 2min 14s, sys: 29.1 s, total: 2min 43s
Wall time: 2min 17s
topics: 18, alpha: 0.2, beta: 0.9
CPU times: user 1min 4

CPU times: user 1min 58s, sys: 23 s, total: 2min 21s
Wall time: 2min 2s
topics: 19, alpha: 0.1, beta: 0.5
CPU times: user 2min 10s, sys: 28.6 s, total: 2min 38s
Wall time: 2min 13s
topics: 19, alpha: 0.1, beta: 0.6
CPU times: user 2min, sys: 24.8 s, total: 2min 25s
Wall time: 2min 3s
topics: 19, alpha: 0.1, beta: 0.7
CPU times: user 2min 6s, sys: 27.1 s, total: 2min 33s
Wall time: 2min 9s
topics: 19, alpha: 0.1, beta: 0.8
CPU times: user 1min 55s, sys: 24 s, total: 2min 19s
Wall time: 1min 58s
topics: 19, alpha: 0.1, beta: 0.9
CPU times: user 2min 8s, sys: 28.4 s, total: 2min 37s
Wall time: 2min 12s
topics: 19, alpha: 0.2, beta: 0.1
CPU times: user 2min 16s, sys: 27.3 s, total: 2min 43s
Wall time: 2min 19s
topics: 19, alpha: 0.2, beta: 0.2
CPU times: user 2min 15s, sys: 28.6 s, total: 2min 43s
Wall time: 2min 18s
topics: 19, alpha: 0.2, beta: 0.3
CPU times: user 2min 16s, sys: 28.3 s, total: 2min 44s
Wall time: 2min 18s
topics: 19, alpha: 0.2, beta: 0.4
CPU times: user 2min 16s, sys: 2

CPU times: user 1min 52s, sys: 25.5 s, total: 2min 18s
Wall time: 1min 55s
topics: 19, alpha: 0.9, beta: 0.9
CPU times: user 1min 48s, sys: 23.5 s, total: 2min 11s
Wall time: 1min 51s
topics: 20, alpha: 0.1, beta: 0.1
CPU times: user 2min 24s, sys: 31.9 s, total: 2min 56s
Wall time: 2min 27s
topics: 20, alpha: 0.1, beta: 0.2
CPU times: user 2min 1s, sys: 22.1 s, total: 2min 23s
Wall time: 2min 5s
topics: 20, alpha: 0.1, beta: 0.3
CPU times: user 2min 14s, sys: 29.4 s, total: 2min 43s
Wall time: 2min 17s
topics: 20, alpha: 0.1, beta: 0.4
CPU times: user 2min 10s, sys: 27.5 s, total: 2min 38s
Wall time: 2min 14s
topics: 20, alpha: 0.1, beta: 0.5
CPU times: user 2min 26s, sys: 33.6 s, total: 3min
Wall time: 2min 29s
topics: 20, alpha: 0.1, beta: 0.6
CPU times: user 2min 6s, sys: 26.4 s, total: 2min 32s
Wall time: 2min 10s
topics: 20, alpha: 0.1, beta: 0.7
CPU times: user 2min 5s, sys: 27.2 s, total: 2min 32s
Wall time: 2min 8s
topics: 20, alpha: 0.1, beta: 0.8
CPU times: user 2min 7s, sys

CPU times: user 2min 4s, sys: 30.2 s, total: 2min 34s
Wall time: 2min 7s
topics: 20, alpha: 0.9, beta: 0.4
CPU times: user 2min 1s, sys: 29.8 s, total: 2min 31s
Wall time: 2min 4s
topics: 20, alpha: 0.9, beta: 0.5
CPU times: user 1min 50s, sys: 24.2 s, total: 2min 14s
Wall time: 1min 54s
topics: 20, alpha: 0.9, beta: 0.6
CPU times: user 1min 56s, sys: 25.8 s, total: 2min 22s
Wall time: 1min 59s
topics: 20, alpha: 0.9, beta: 0.7
CPU times: user 1min 44s, sys: 23.1 s, total: 2min 8s
Wall time: 1min 48s
topics: 20, alpha: 0.9, beta: 0.8
CPU times: user 2min 4s, sys: 30 s, total: 2min 34s
Wall time: 2min 7s
topics: 20, alpha: 0.9, beta: 0.9
CPU times: user 1min 56s, sys: 27.3 s, total: 2min 24s
Wall time: 1min 59s


In [20]:
w = 9
n_topics = 7
a = 0.4
b = 0.9
NUM_WORDS = 10
n_iterations = 50
ldamodel = LdaMulticore(corpus, num_topics = n_topics, id2word=dictionary, passes=n_iterations, alpha=a, eta=b, workers=w)
topics = ldamodel.print_topics(num_words=NUM_WORDS)
coherence_model = CoherenceModel(model=ldamodel, texts=lemmatized_text, dictionary=dictionary, coherence='c_v')
coherence = coherence_model.get_coherence()

In [21]:
print(coherence)
for topic in topics:
    print(topic)

0.5472676905421523
(0, '0.030*"med" + 0.026*"anyone" + 0.020*"dc" + 0.017*"adhd" + 0.012*"anxiety" + 0.009*"take" + 0.009*"try" + 0.008*"dr" + 0.007*"medication" + 0.007*"psychiatrist"')
(1, '0.032*"school" + 0.014*"anyone" + 0.010*"meeting" + 0.010*"get" + 0.009*"iep" + 0.008*"sn" + 0.008*"child" + 0.008*"doe" + 0.008*"dc" + 0.008*"camp"')
(2, '0.015*"ds" + 0.013*"get" + 0.012*"kid" + 0.010*"like" + 0.009*"say" + 0.008*"want" + 0.007*"im" + 0.007*"he" + 0.007*"go" + 0.007*"time"')
(3, '0.029*"school" + 0.012*"kid" + 0.010*"need" + 0.010*"dc" + 0.009*"sn" + 0.009*"child" + 0.009*"anyone" + 0.009*"would" + 0.008*"issue" + 0.008*"grade"')
(4, '0.006*"love" + 0.003*"vashikaran" + 0.003*"spell" + 0.002*"problem" + 0.002*"black" + 0.002*"trump" + 0.002*"specialist" + 0.002*"buy" + 0.002*"visit" + 0.002*"magic"')
(5, '0.004*"article" + 0.002*"nick" + 0.002*"seivert" + 0.001*"nyt" + 0.001*"fm" + 0.001*"david" + 0.001*"village" + 0.001*"bklyn" + 0.001*"audiologist" + 0.001*"runner"')
(6, '0.00

In [24]:
pyLDAvis.display(pyLDAvis.gensim.prepare(ldamodel, corpus, dictionary, sort_topics=False))

In [None]:
NUM_WORDS = 7
n_workers = [7,6,5,4,3,2,1,0]
n_topics = [2,3,4,5,6,7,8,9]
for t in n_topics:
    for w in n_workers:
        n = "model_LDA_{}_{}".format(str(t), "special-needs")
        fn = n + ".gensim"
        if w == 0:
            %time ldamodel = models.ldamodel.LdaModel(corpus, num_topics = t, id2word=dictionary, passes=15)
        else:
            %time ldamodel = models.LdaMulticore(corpus, num_topics = t, id2word=dictionary, passes=15, workers=w)
        topics = ldamodel.print_topics(num_words=NUM_WORDS)
        perplexity = ldamodel.log_perplexity(corpus)
        coherence_model = CoherenceModel(model=ldamodel, texts=lemmatized_text, dictionary=dictionary, coherence='c_v')
        coherence = coherence_model.get_coherence()
        print("LDA with {} topics and {} workers".format(t, w))
        for topic in topics:
            print(topic)
        print("Coherence: {}, Perplexity {}".format(coherence, perplexity))
        write_coherence(path_coherence.format("special-needs"), w, t, coherence, perplexity)

In [None]:
NUM_WORDS = 7
n_workers = [9,8]
n_topics = [2,3,4,5,6,7,8,9]
for t in n_topics:
    for w in n_workers:
        if w == 0:
            %time ldamodel = models.ldamodel.LdaModel(corpus, num_topics = t, id2word=dictionary, passes=15)
        else:
            %time ldamodel = models.LdaMulticore(corpus, num_topics = t, id2word=dictionary, passes=15, workers=w)
        topics = ldamodel.print_topics(num_words=NUM_WORDS)
        perplexity = ldamodel.log_perplexity(corpus)
        coherence_model = CoherenceModel(model=ldamodel, texts=lemmatized_text, dictionary=dictionary, coherence='c_v')
        coherence = coherence_model.get_coherence()
        print("LDA with {} topics and {} workers".format(t, w))
        for topic in topics:
            print(topic)
        print("Coherence: {}, Perplexity {}".format(coherence, perplexity))
        write_coherence(path_coherence.format("special-needs"), w, t, coherence, perplexity)

In [None]:
NUM_WORDS = 7
NUM_WORKERS = 9
for t in range(2,31):
    for s in range(20):
        %time ldamodel = models.LdaMulticore(corpus, num_topics=t, id2word=dictionary, passes=15, workers=NUM_WORKERS)
        topics = ldamodel.print_topics(num_words=NUM_WORDS)
        perplexity = ldamodel.log_perplexity(corpus)
        coherence_model = CoherenceModel(model=ldamodel, texts=lemmatized_text, dictionary=dictionary, coherence='c_v')
        coherence = coherence_model.get_coherence()
        print("LDA with {} topics, sample {}".format(t, s))
        for topic in topics:
            print(topic)
        print("Coherence: {}, Perplexity {}".format(coherence, perplexity))
        write_coherence(path_coherence.format("special-needs_parents"), s, t, coherence, perplexity)

In [None]:
print([1,2,3]+[4,5,6])

Compare coherence and topics, run 100 times

In [None]:
NUM_WORKERS = 9
NUM_TOPICS = 7
lda_models = {}
for i in range(100):
    fn = "model_{}.gensim".format(str(i))
    ldamodel = models.LdaMulticore(corpus, num_topics=NUM_TOPICS, id2word=dictionary, passes=15, workers=NUM_WORKERS)
    path_sample_i = path_sample / fn
    ldamodel.save(str(path_sample_i))
    perplexity = ldamodel.log_perplexity(corpus)
    coherence_model = CoherenceModel(model=ldamodel, texts=lemmatized_text, dictionary=dictionary, coherence='c_v')
    coherence = coherence_model.get_coherence()
    write_sample(path_coherence.format("sample"), i, coherence, perplexity)
    print("Coherence: {}, Perplexity {}".format(coherence, perplexity))

In [None]:
n_topics = [6, 8]
for i in n_topics:
    n = "model_LDA_" + str(i)
    fn = n + ".gensim"
    ldamodel = models.ldamodel.LdaModel(corpus, num_topics = i, id2word=dictionary, passes=15)
    path_model_i = path_model / fn
    ldamodel.save(str(path_model_i))
    topics = ldamodel.print_topics(num_words=NUM_WORDS)
    perplexity = ldamodel.log_perplexity(corpus)
    coherence_model = CoherenceModel(model=ldamodel, texts=lemmatized_text, dictionary=dictionary, coherence='c_v')
    coherence = coherence_model.get_coherence()
    lda_models[n] = {'model' : ldamodel, 'coherence' : coherence, 'perplexity': perplexity}
    print("LDA with {} topics".format(i))
    for topic in topics:
        print(topic)
    print("Coherence: {}, Perplexity {}".format(coherence, perplexity))
    print("\n")

Visualize the topics. See: https://www.objectorientedsubject.net/2018/08/experiments-on-topic-modeling-pyldavis/

In [None]:
pyLDAvis.display(pyLDAvis.gensim.prepare(lda_models["model_LDA_7"]["model"], corpus, dictionary, sort_topics=False))

## What is the Dominant Topic in each Post?

In [25]:
df_topic_sents_keywords = format_topics_sentences(ldamodel=ldamodel, corpus=corpus)
df_topic_sents_keywords.info()
df_topic_sents_keywords.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25978 entries, 0 to 25977
Data columns (total 3 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Dominant_Topic     25978 non-null  float64
 1   Perc_Contribution  25978 non-null  float64
 2   Topic_Keywords     25978 non-null  object 
dtypes: float64(2), object(1)
memory usage: 609.0+ KB


Unnamed: 0,Dominant_Topic,Perc_Contribution,Topic_Keywords
0,3.0,0.5733,"school, kid, need, dc, sn, child, anyone, woul..."
1,3.0,0.4866,"school, kid, need, dc, sn, child, anyone, woul..."
2,3.0,0.8619,"school, kid, need, dc, sn, child, anyone, woul..."
3,2.0,0.6092,"ds, get, kid, like, say, want, im, he, go, time"
4,2.0,0.6514,"ds, get, kid, like, say, want, im, he, go, time"


In [87]:
import sqlite3
path_db = str(path_parent / "database" / "youbemom-merged.db")
sql = '''
    SELECT s.text_no_url AS text_no_url, s.text as text
    FROM sentiment AS s
    JOIN posts AS p
    ON s.message_id = p.message_id
    WHERE p.subforum="special-needs" AND p.parent_id=""
'''
conn = create_connection(path_db)
sn = pd.read_sql_query(sql, conn)
sn.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25978 entries, 0 to 25977
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   text_no_url  25978 non-null  object
 1   text         25978 non-null  object
dtypes: object(2)
memory usage: 406.0+ KB


In [88]:
sn['dom_topic'] = df_topic_sents_keywords['Dominant_Topic']

In [89]:
sn['dom_topic'].value_counts()

2.0    10843
3.0     8336
1.0     3579
0.0     2659
4.0      314
5.0      144
6.0      103
Name: dom_topic, dtype: int64

In [39]:
for i in range(len(sn.index)):
    if sn['dom_topic'].iloc[i]==4.0:
        print("topic 4: " + sn['text_no_url'].iloc[i])
    if sn['dom_topic'].iloc[i]==6.0:
        print("topic 6: " + sn['text_no_url'].iloc[i])

topic 4: Any Corlears parents on? 
topic 4: I’m going to send my DC’s teachers and principal flowers. A bouquet of poison ivy blossoms, hemlock flowers, and deadly nightshade. That is all.
topic 4: Glutten free diet did anyone tried glutten free diet or change their autistic son diet, did it work?
topic 4: Pediatric ophthalmologist help needed: SUNY/Dr. Schulman, Jeffrey Cooper, Steven Larson at University Optometric, Giordano or Sangani??? Please help. 
topic 6: Your resume what does CV mean? 
topic 4: This is fantastic 
topic 6: Buy Feroza Stone - Gem Selections Turquoise or feroza stone is quite popular as Bollywood superstar Salman Khan wears it. Feroza Stone stimulates perception and increases focus in person. To buy feroza stone you can get in touch with our experts at 9999136878 or drop your query at care@khannagems.com.
topic 4: BEX CREAM FOR HIPS AND BUMS ENLARGEMENT 0835121053 YODI PILLS AND BOTCHO CREAMS FOR HIPS AND BUMS ENLARGEMENTS…+27835121053 New on the market Yodi Crea

In [33]:
sn['text_no_url'].loc[sn['dom_topic']==5.0].head(10)

129                         The Looming Tower is so good 
549     Anyone see the NYT article on Hans Aspergers? ...
598     Is greek yogurt, a banana and raspberries in a...
657                                                 Kate 
993     How actively involved is Micaela Bracamonte at...
1253    Tragic article about the Stoneman Douglas shoo...
2285    "If the US has someone who historians will loo...
2367    Prices & Uses of hager werken (+27640518120) e...
2421    How can a neropsych possible be $5500? It’s in...
2487    Aspie 8th grader got attacked for being racist...
Name: text_no_url, dtype: object

In [113]:
def has_url(df):
    pattern = r'(http|ftp|https):\/\/[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)'
    regex_pat = re.compile(pattern, flags=re.IGNORECASE)
    df['has_url'] = df['text'].str.contains(regex_pat)
    return df

In [118]:
path_spam = str(path_parent / "clean_data" / "spam_words.csv")

In [119]:
def has_word(df, word):
    regex_pat = re.compile(word.lower(), flags=re.IGNORECASE)
    df[word] = df['text'].str.contains(regex_pat)
    return df

In [129]:
spam = pd.read_csv(path_spam)
spam = spam['words'].tolist()

In [134]:
for s in spam:
    sn = has_word(sn, s)
    print(s)
    print(sn[s].value_counts())

black magic
False    25948
True        30
Name: black magic, dtype: int64
918728849451
False    25946
True        32
Name: 918728849451, dtype: int64
vashikaran
False    25921
True        57
Name: vashikaran, dtype: int64
enlargement
False    25974
True         4
Name: enlargement, dtype: int64
pills
False    25940
True        38
Name: pills, dtype: int64
creams
False    25946
True        32
Name: creams, dtype: int64
27835121053
False    25974
True         4
Name: 27835121053, dtype: int64
bollywood
False    25977
True         1
Name: bollywood, dtype: int64
9999136878
False    25977
True         1
Name: 9999136878, dtype: int64
botcho cream
False    25974
True         4
Name: botcho cream, dtype: int64
yodi pills
False    25974
True         4
Name: yodi pills, dtype: int64
enlarge
False    25971
True         7
Name: enlarge, dtype: int64
penis
False    25968
True        10
Name: penis, dtype: int64
stamina
False    25960
True        18
Name: stamina, dtype: int64
semen
False    25790

In [133]:
sn['testosterone'].value_counts()

False    25975
True         3
Name: testosterone, dtype: int64

In [121]:
sn.head()

Unnamed: 0,text_no_url,text,dom_topic,has_url,magic
0,Son has mild Sensory Issuesany privates in Man...,Son has mild Sensory Issuesany privates in Man...,3.0,False,False
1,Moms of older dc. School question. Our ds's be...,Moms of older dc. School question. Our ds's be...,3.0,False,False
2,The asd diagnostic criteria no longer fits my ...,The asd diagnostic criteria no longer fits my ...,3.0,False,False
3,15 yo has very explosive behavior at home only...,15 yo has very explosive behavior at home only...,2.0,False,False
4,Random question but do you know of friends or ...,Random question but do you know of friends or ...,2.0,False,False


  return func(self, *args, **kwargs)


Unnamed: 0,text_no_url,text,dom_topic,has_url
0,Son has mild Sensory Issuesany privates in Man...,Son has mild Sensory Issuesany privates in Man...,3.0,False
1,Moms of older dc. School question. Our ds's be...,Moms of older dc. School question. Our ds's be...,3.0,False
2,The asd diagnostic criteria no longer fits my ...,The asd diagnostic criteria no longer fits my ...,3.0,False
3,15 yo has very explosive behavior at home only...,15 yo has very explosive behavior at home only...,2.0,False
4,Random question but do you know of friends or ...,Random question but do you know of friends or ...,2.0,False


In [123]:
sn['magic'].value_counts()

False    25901
True        77
Name: magic, dtype: int64

In [122]:
print(sn['text'].loc[sn['magic']])

489      SN school wants us to put DC on meds for behav...
575      REading Comp Help for HS student - My high sch...
1156     I have an ADHD/ODD son that is medicated (Zolo...
2505     Urgent love spell caster to bring back your Ex...
3306     Has anyone found a magic potion for ADHD med w...
                               ...                        
23432    DS7yo 1st grader has b een overly physical wit...
24438    Moms of kids with anxiety-adhd DX, please tell...
25662    Seeking Quillavant Substitute Please Thanks to...
25761    Anyone who has college age or older kids with ...
25779    My pubescent children attend the same SN schoo...
Name: text, Length: 77, dtype: object


In [117]:
print(sn['text'].loc[39])

Rich Disabled Kids Get the City to Send Them to Private School. Poor Disabled Kids Get Screwed. https://www.youtube.com/watch?time_continue=7&v=Jx3Q64Exh74


## Save Model Topics and Keywords in New Database

In [None]:
conn = sqlite3.connect(path_db)
df_topic_sents_keywords.to_sql('topicmodel', conn, if_exists='replace', index=False)