In [None]:
import re
import numpy as np
import pandas as pd
from pprint import pprint

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

from ast import literal_eval

# Plotting tools
import pyLDAvis
import pyLDAvis.gensim  # don't skip this
import matplotlib.pyplot as plt
%matplotlib inline

# Enable logging for gensim - optional
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

In [None]:
# Theme ids that keep their own label; the theme-remapping cell further down
# replaces every id that is not listed here with 0.
THEMES = [
    5, 6, 26, 33, 139, 163, 232,
    313, 339, 350, 406, 409, 555, 589,
    597, 634, 660, 695, 729, 766, 773,
    793, 800, 810, 852, 895, 951, 975,
]

# Relative paths of the CSV files holding the three data splits.
TRAIN_DATA_PATH = '../train.csv'
TEST_DATA_PATH = '../test.csv'
VALIDATION_DATA_PATH = '../validation.csv'

In [None]:
def get_data(path, preds=None, key=None):
    """Load one CSV split and return one row per (process_id, themes) group.

    Steps, in order:
      1. read the CSV at ``path`` and rename the ``pages`` column to ``page``;
      2. strip the characters ``{``, ``}`` and ``"`` from both ends of every
         ``body`` string;
      3. merge the rows with ``groupby_process``;
      4. parse each ``themes`` string into a Python object with
         ``ast.literal_eval``.

    ``preds`` and ``key`` are accepted but not used by this function.
    """
    pages = pd.read_csv(path).rename(columns={'pages': 'page'})
    pages['body'] = pages.body.str.strip('{}"')

    merged = groupby_process(pages)
    # Parsing happens after the merge: ``themes`` is still the raw string
    # while it serves as a grouping key inside ``groupby_process``.
    merged['themes'] = merged.themes.apply(literal_eval)
    return merged

In [None]:
def groupby_process(df):
    """Merge the rows of ``df`` into one row per (process_id, themes) pair.

    Rows are first ordered by ``process_id`` and ``page``; within each
    (process_id, themes) group the ``body`` strings are then concatenated
    with a single space between them.

    Returns a new frame with the columns ``process_id``, ``themes`` and
    ``body``; its index labels are converted to strings.
    """
    def _join_bodies(group):
        # One space-separated string per group.
        return group.body.str.cat(sep=' ')

    ordered = df.sort_values(['process_id', 'page'])
    grouped = ordered.groupby(['process_id', 'themes'], group_keys=False)
    # apply() yields an unnamed Series, so reset_index() labels the
    # concatenated text column 0; it is renamed to "body" below.
    merged = grouped.apply(_join_bodies).reset_index()
    return merged.rename(index=str, columns={0: "body"})

In [None]:
# Read the three splits through the same loader (train, then test, then validation).
train_data, test_data, validation_data = (
    get_data(csv_path)
    for csv_path in (TRAIN_DATA_PATH, TEST_DATA_PATH, VALIDATION_DATA_PATH)
)

In [None]:
def remap_themes(themes, allowed=frozenset(THEMES)):
    """Collapse theme ids outside ``allowed`` to 0 and return the unique ids, sorted.

    Parameters
    ----------
    themes : iterable
        Theme ids attached to one document.
    allowed : collection, optional
        Ids that keep their own value; defaults to the ids in ``THEMES``.

    Returns
    -------
    list
        Ascending list of the distinct remapped ids.
    """
    # De-duplicate first and sort last: calling set() on an already sorted
    # list discards the ordering again.
    return sorted({theme if theme in allowed else 0 for theme in themes})


# Apply the same remapping to every split.
for split_df in (train_data, test_data, validation_data):
    split_df["themes"] = split_df["themes"].apply(remap_themes)

In [None]:
# Row counts of the train, validation and test frames (in that order).
len(train_data), len(validation_data), len(test_data)

In [None]:
def sent_to_words(sentences):
    """Lazily tokenize each string in ``sentences``.

    Every string is split on the single-space character, and only pieces
    longer than one character are kept, so the empty strings produced by
    repeated spaces and all one-character tokens are dropped.

    Yields one list of tokens per input string.
    """
    for text in sentences:
        tokens = [piece for piece in text.split(" ") if len(piece) > 1]
        yield tokens

# Tokenize every training body; list() materializes the generator so the
# token lists can be reused by later cells.
train_words = list(sent_to_words(train_data.body.tolist()))


# Peek at the first tokenized document only.
print(train_words[:1])

In [None]:
# Number of tokenized training documents (one token list per body in train_data).
len(train_words)

In [None]:
# Create Dictionary: gensim's mapping between tokens and integer ids,
# built from the tokenized training documents.
id2word = corpora.Dictionary(train_words)

In [None]:
def save_dic(dic, filename="dic"):
    """Serialize ``dic`` to the file ``filename``.

    The file is opened in binary write mode and the open file object is
    handed to ``dic.save``, so ``dic`` must provide a ``save`` method that
    accepts a file handle.
    """
    with open(filename, "wb") as out_file:
        dic.save(out_file)

In [None]:
# Write the dictionary to the file "big_dict" (a relative path).
save_dic(id2word, "big_dict")

In [None]:
# Reload the dictionary from "big_dict", the file this notebook writes with
# save_dic(id2word, "big_dict"), so a fresh Restart & Run All round-trips
# instead of depending on a copy placed in another directory by hand.
id2word = corpora.Dictionary.load("big_dict")

In [None]:
# Number of entries in the dictionary's document-frequency table, followed by
# the table itself (token id -> number of documents containing that token).
# NOTE(review): the second element displays the whole mapping, which can be
# very long for a large vocabulary — consider showing only a sample.
len(id2word.dfs), id2word.dfs

In [None]:
# (token, document frequency) pairs, ordered from most to least frequent.
[
    (id2word[token_id], doc_freq)
    for token_id, doc_freq in sorted(
        id2word.dfs.items(), key=lambda item: item[1], reverse=True
    )
]

In [None]:
# Convert every tokenized training document to its bag-of-words
# representation using the dictionary.
train_corpus = [id2word.doc2bow(text) for text in train_words]

In [None]:
# Build LDA model
lda_model = gensim.models.ldamodel.LdaModel(
    corpus=train_corpus,
    id2word=id2word,
    num_topics=50,         # number of latent topics to fit
    random_state=42,       # fixed seed so repeated runs are comparable
    update_every=1,
    chunksize=100,         # documents per training chunk
    passes=10,             # full passes over the training corpus
    alpha='auto',          # let gensim learn the document-topic prior
    per_word_topics=True,
)

In [None]:
# Pretty-print the learned topics with their highest-weighted words.
# NOTE(review): print_topics() is called with its default topic count, which
# is presumably smaller than the 50 topics trained above — pass num_topics
# explicitly if every topic should be listed; confirm against the gensim docs.
pprint(lda_model.print_topics())

In [None]:
# Persist the trained model under the relative path "lda_big_30".
# NOTE(review): the file name says "30" while the model above is built with
# num_topics=50 — confirm the name is intentional.
lda_model.save("lda_big_30")

In [None]:
# Visualize the topics
# NOTE(review): newer pyLDAvis releases appear to expose this helper module as
# `pyLDAvis.gensim_models` instead of `pyLDAvis.gensim` — confirm against the
# installed version before upgrading.
pyLDAvis.enable_notebook()
# Prepare the interactive view from the model, the training corpus and the dictionary.
vis = pyLDAvis.gensim.prepare(lda_model, train_corpus, id2word)
# Bare last expression so the notebook renders the visualization.
vis