In [None]:
import sys
sys.path.append('../src')

In [None]:
import os
import codecs
import spacy
from gensim.models import Phrases
from gensim.corpora import Dictionary, MmCorpus
from gensim.models.ldamulticore import LdaMulticore
from gensim.models.word2vec import LineSentence
from utils import lemmatized_sentence_corpus, line_review, punct_space
from nltk.corpus import stopwords
nlp = spacy.load('de')

In [None]:
# Every artifact this notebook caches is stored under one directory.
intermediate = '../intermediate'


def _intermediate_path(filename):
    """Return the location of `filename` inside the `intermediate` directory."""
    return os.path.join(intermediate, filename)


# Sentence files and the phrase models saved alongside them.
unigram_sentences_filepath = _intermediate_path('unigram_sentences_all.txt')
bigram_model_filepath = _intermediate_path('bigram_model_all')
bigram_sentences_filepath = _intermediate_path('bigram_sentences_all.txt')
trigram_model_filepath = _intermediate_path('trigram')
trigram_sentences_filepath = _intermediate_path('trigram_sentences_all.txt')

# Dictionary, bag-of-words corpus and LDA model.
trigram_dictionary_filepath = _intermediate_path('trigram_dict_all_diags.dict')
trigram_bow_filepath = _intermediate_path('trigram_bow_corpus_all_diags.mm')
lda_model_filepath = _intermediate_path('lda_model_all_diags')

In [None]:
# Load the raw input into memory as a list of lines (line endings are kept).
# The encoding is given explicitly so decoding does not depend on the
# platform's default locale; the files this notebook writes are all UTF-8.
# NOTE(review): assumes ../src/output.txt is itself UTF-8 encoded -- confirm.
with open('../src/output.txt', 'r', encoding='utf-8') as f:
    data_file = f.readlines()

In [None]:
def unigram(infile, outfile):
    """
    Write the lemmatized sentences of `infile` to `outfile`, one per line.

    infile  -- passed unchanged to `lemmatized_sentence_corpus`
    outfile -- path of the UTF-8 text file to create (or overwrite)
    """
    with codecs.open(outfile, 'w', encoding='utf_8') as out:
        sentences = lemmatized_sentence_corpus(infile)
        for lemmatized in sentences:
            line = lemmatized + '\n'
            out.write(line)

In [None]:
%%time

# Switch to True to regenerate the unigram sentence file from `data_file`.
# It is off so that running this cell leaves the file on disk untouched.
REBUILD_UNIGRAM_SENTENCES = False

if REBUILD_UNIGRAM_SENTENCES:
    unigram(data_file, unigram_sentences_filepath)

In [None]:
import itertools as it

# Iterable over the sentences stored in the unigram file.
unigram_sentences = LineSentence(unigram_sentences_filepath)

# Spot-check ten sentences (positions 230 to 239), each followed by a blank line.
sample_sentences = it.islice(unigram_sentences, 230, 240)
for unigram_sentence in sample_sentences:
    joined = ' '.join(unigram_sentence)
    print(joined,  '\n')

In [None]:
%%time

# Training is switched off; set the flag to True to refit the first-order
# phrase model on the unigram sentences and store it on disk.
RETRAIN_BIGRAM_MODEL = False

if RETRAIN_BIGRAM_MODEL:
    bigram_model = Phrases(unigram_sentences)
    bigram_model.save(bigram_model_filepath)

# In either case, continue with the model as stored on disk.
bigram_model = Phrases.load(bigram_model_filepath)

In [None]:
%%time

# Set to True to re-apply the bigram model to every unigram sentence and
# rewrite the bigram sentence file (one space-joined sentence per line).
REWRITE_BIGRAM_SENTENCES = False

if REWRITE_BIGRAM_SENTENCES:
    with codecs.open(bigram_sentences_filepath, 'w', encoding='utf8') as out:
        for unigram_sentence in unigram_sentences:
            bigram_sentence = u' '.join(bigram_model[unigram_sentence])
            out.write(bigram_sentence + '\n')

# Iterable over the sentences stored in the bigram file.
bigram_sentences = LineSentence(bigram_sentences_filepath)

In [None]:
%%time
# Training is switched off; set the flag to True to refit the second-order
# phrase model on the bigram sentences and store it on disk.
RETRAIN_TRIGRAM_MODEL = False

if RETRAIN_TRIGRAM_MODEL:
    trigram_model = Phrases(bigram_sentences)
    trigram_model.save(trigram_model_filepath)

# In either case, continue with the model as stored on disk.
trigram_model = Phrases.load(trigram_model_filepath)

In [None]:
# Set to True to rewrite the trigram sentence file: every bigram sentence is
# passed through the second-order phrase model and written as one
# space-joined line.
REWRITE_TRIGRAM_SENTENCES = False

if REWRITE_TRIGRAM_SENTENCES:
    with codecs.open(trigram_sentences_filepath, 'w', encoding='utf8') as f:
        for bigram_sentence in bigram_sentences:
            # The trigram model (trained on the bigram sentences) must be
            # applied here; re-applying the bigram model would leave the
            # second-order phrases unmerged.
            trigram_sentence = u' '.join(trigram_model[bigram_sentence])
            f.write(trigram_sentence + '\n')

# Iterable over the sentences stored in the trigram file.
trigram_sentences = LineSentence(trigram_sentences_filepath)

In [None]:
# NLTK's German stop-word list, plus a few corpus-specific additions.
de_stops = stopwords.words('german')
de_stops.extend(["jedoch","sowie","datum"])

# Terms are later compared to this list by exact match, so add the
# title-cased form of every entry as well.
de_stops.extend([word.title() for word in de_stops])

In [None]:
%%time
# Set to True to run the full normalisation pass over `data_file`:
# lemmatize, apply both phrase models, drop stop words, write one record per line.
# NOTE(review): this writes to trigram_sentences_filepath, the same file the
# sentence-level trigram cell writes -- enabling it overwrites that output.
REBUILD_TRIGRAM_REVIEWS = False

if REBUILD_TRIGRAM_REVIEWS:
    # runs 4h

    # The stop-word list is consulted once per term; a set built once up
    # front gives the same answers without scanning the list every time.
    stop_terms = frozenset(de_stops)

    with codecs.open(trigram_sentences_filepath, 'w', encoding='utf_8') as f:
        for parsed_record in nlp.pipe(line_review(data_file), batch_size=10000, n_threads=2):

            # lemmatize the text, removing punctuation and whitespace
            unigram_review = [token.lemma_ for token in parsed_record if not punct_space(token)]

            # apply the first-order and second-order phrase models
            bigram_review = bigram_model[unigram_review]
            trigram_review = trigram_model[bigram_review]

            # remove any remaining stopwords
            # list is from nltk
            trigram_review = [term for term in trigram_review if term not in stop_terms]

            # write the transformed review as a line in the new file
            trigram_review = u' '.join(trigram_review)
            f.write(trigram_review + '\n')


In [None]:
%%time

# Set to True to rebuild the dictionary from the trigram file and save it.
REBUILD_TRIGRAM_DICTIONARY = False

if REBUILD_TRIGRAM_DICTIONARY:
    trigram_reviews = LineSentence(trigram_sentences_filepath)

    # One pass over all lines of the file collects the vocabulary.
    trigram_dictionary = Dictionary(trigram_reviews)

    # Drop very rare and very common tokens (filter_extremes), then
    # renumber the remaining token ids (compactify).
    trigram_dictionary.filter_extremes(no_below=10, no_above=0.4)
    trigram_dictionary.compactify()

    trigram_dictionary.save(trigram_dictionary_filepath)

# In either case, continue with the dictionary as stored on disk.
trigram_dictionary = Dictionary.load(trigram_dictionary_filepath)

In [None]:
%%time

def trigram_bow_generator(filepath):
    """
    Yield one bag-of-words vector per line of the file at `filepath`.

    Each line is read through `LineSentence` and converted with the
    module-level `trigram_dictionary`.
    """
    reviews = LineSentence(filepath)
    for tokens in reviews:
        bag_of_words = trigram_dictionary.doc2bow(tokens)
        yield bag_of_words

# Currently on: every run regenerates the bag-of-words matrix file.
# Set to False to reuse the file already on disk.
REBUILD_BOW_CORPUS = True

if REBUILD_BOW_CORPUS:
    # Convert every line of the trigram file to bag-of-words and
    # store the result as a matrix file.
    bow_vectors = trigram_bow_generator(trigram_sentences_filepath)
    MmCorpus.serialize(trigram_bow_filepath, bow_vectors)

# In either case, continue with the corpus as stored on disk.
trigram_bow_corpus = MmCorpus(trigram_bow_filepath)

In [None]:
%%time

# Currently on: every run retrains the topic model and overwrites the saved copy.
# Set to False to reuse the model already on disk.
RETRAIN_LDA = True

if RETRAIN_LDA:
    # `workers` controls the parallelism; the recommendation is the
    # number of physical cores minus one.
    lda = LdaMulticore(
        trigram_bow_corpus,
        num_topics=50,
        id2word=trigram_dictionary,
        workers=3,
    )
    lda.save(lda_model_filepath)

# In either case, continue with the model as stored on disk.
lda = LdaMulticore.load(lda_model_filepath)

In [None]:
def explore_topic(topic_number, topn=10):
    """
    Print a two-column table of the top terms of one LDA topic.

    Parameters
    ----------
    topic_number : int
        Id of the topic to display; forwarded to ``lda.show_topic``.
    topn : int, optional
        Number of terms to list (default 10).

    Reads the module-level ``lda`` model and returns nothing.
    """

    print(u'{:20} {}'.format(u'term', u'frequency') + u'\n')

    # Pass the caller's `topn` through rather than a fixed count, so the
    # parameter actually controls how many terms are shown.
    for term, frequency in lda.show_topic(topic_number, topn=topn):
        print(u'{:20} {:.3f}'.format(term, round(frequency, 3)))


# Topic ids are zero-based, so iterate over every id the model holds
# (0 .. num_topics - 1); starting at 1 would leave out the first topic.
for i in range(lda.num_topics):
    print("Topic: " + str(i))
    explore_topic(topic_number=i)
    print("\n")
