In [24]:
import math
import pandas as pd
from collections import Counter
import operator
import nltk
nltk.download('wordnet')
from nltk.corpus import wordnet as wn
from nltk.stem.wordnet import WordNetLemmatizer
nltk.download('stopwords')
en_stop = set(nltk.corpus.stopwords.words('english'))
import gensim
from gensim import corpora


def read_file(file_path, max_amount=math.inf):
    """
    Reads up to ``max_amount`` lines from a file and returns them as a list.

    Reading stops at end of file, so the result never contains padding
    entries and the default (no limit) terminates once the file is exhausted.
    Lines keep their trailing newline, exactly as stored in the file.

    :param file_path: path of the text file to read
    :param max_amount: maximum number of lines to return (default: no limit)
    :return: list[str]
    """
    line_list = []

    with open(file=file_path) as f:
        for line in f:
            # Check the limit before appending so max_amount=0 yields [].
            if len(line_list) >= max_amount:
                break
            line_list.append(line)

    return line_list


def process_word(word):
    """
    Normalises a single token.

    Only lower-casing is active at the moment; the further steps listed in
    the comments below are switched off.
    :param word: str token
    :return: str lower-cased token
    """
    # Switched-off steps, kept for reference:
    #
    #   Remove stop
    #     if word in en_stop:
    #         return None
    #
    #   Remove symbols
    #     if len(word) < 3:
    #         return None
    #
    #   Get morph
    #     t_word = wn.morphy(word)
    #     if t_word is not None:
    #         word = t_word
    #
    #   Get lemma
    #     word = WordNetLemmatizer().lemmatize(word)
    return word.lower()


def process_bpe(line_list, seq="@@", rm_words=("\n",)):
    """
    Splits every line into whitespace-separated tokens and glues BPE pieces
    back together: a token ending with ``seq`` is a word fragment that is
    continued by the following token.

    Each resulting word is dropped if it appears in ``rm_words``, otherwise it
    is passed through ``process_word``; a ``None`` result is dropped as well.

    A line that ends on a fragment (last token still carries ``seq``) yields
    the fragments collected so far instead of raising ``IndexError``.
    An empty ``seq`` disables merging.

    :param line_list: list[str] lines, e.g. as returned by read_file
    :param seq: continuation marker at the end of a BPE fragment
    :param rm_words: collection of words to discard
    :return: list[list[str=words]], one inner list per input line
    """
    full_word_list = []
    for line in line_list:
        tokens = line.split()
        last_position = len(tokens) - 1
        full_words = []
        pending = []  # fragments of the word currently being assembled

        for position, token in enumerate(tokens):
            if seq and token.endswith(seq):
                pending.append(token[:-len(seq)])
                if position != last_position:
                    continue  # the word goes on with the next token
                # Dangling fragment at end of line: flush what we have.
            else:
                pending.append(token)

            merged = "".join(pending)
            pending = []

            # An empty word can only come from a dangling bare marker.
            if not merged or merged in rm_words:
                continue
            w = process_word(merged)
            if w is not None:
                full_words.append(w)

        full_word_list.append(full_words)
    return full_word_list


def get_word_count(word_list):
    """
    Counts how often every word occurs over all lines.
    :param word_list: list[list[str=words]]
    :return: List[(str=word, int=count)] in ascending order of count
    """
    totals = {}
    for words in word_list:
        # Walk the per-line counts from most to least frequent so that words
        # enter ``totals`` in that order; the stable sort below keeps it for ties.
        for token, occurrences in Counter(words).most_common():
            totals[token] = totals.get(token, 0) + occurrences

    ranked = list(totals.items())
    ranked.sort(key=lambda pair: pair[1])
    return ranked


def get_topics_lda(word_list, num_topics=10, passes=15, random_state=None):
    """
    Fits a gensim LDA model on the tokenised lines and prints its topics.

    :param word_list: list[list[str=words]], one inner list per document
    :param num_topics: number of topics requested from the model
    :param passes: value forwarded as ``passes`` to LdaModel
    :param random_state: value forwarded as ``random_state`` to LdaModel;
        pass a fixed seed to make the fit repeatable (None keeps gensim's default)
    :return: (topics, ldamodel, corpus, dictionary) where ``topics`` is the
        result of ``ldamodel.print_topics(num_words=10)``
    """
    dictionary = corpora.Dictionary(word_list)
    corpus = [dictionary.doc2bow(text) for text in word_list]

    ldamodel = gensim.models.ldamodel.LdaModel(
        corpus,
        num_topics=num_topics,
        id2word=dictionary,
        passes=passes,
        random_state=random_state,
    )
    topics = ldamodel.print_topics(num_words=10)
    for topic in topics:
        print(topic)

    return topics, ldamodel, corpus, dictionary

        
# Load at most 1000 lines of the corpus and merge their BPE pieces into words.
l = read_file('./trg.shuf', 1000)
wl = process_bpe(line_list=l)
# Prints the complete (word, count) list, ordered by ascending count.
print(get_word_count(word_list=wl))
# Fit the LDA model; the corpus returned in third position is discarded (`_`).
t, ldamodel,_, dictionary = get_topics_lda(word_list=wl)




[nltk_data] Downloading package wordnet to /home/nikita/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /home/nikita/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


[('resurrection', 1), ('christ', 1), ('assures', 1), ('sinless', 1), ('satan', 1), ('atoning', 1), ('sacrifice', 1), ('accepted', 1), ('eternity', 1), ('polish', 1), ('tatra', 1), ('mountains', 1), ('chapel', 1), ('jaszczurówka', 1), ('breathtaking', 1), ('panorama', 1), ('tatras', 1), ('admired', 1), ('gubałówka', 1), ('handcraft', 1), ('houses', 1), ('chochołów', 1), ('hospitality', 1), ('górale', 1), ('yourselves', 1), ('inn', 1), ('exceed', 1), ('elixir', 1), ('combined', 1), ('multiple', 1), ('passionfruit', 1), ('honey', 1), ('ginger', 1), ('healing', 1), ('preventive', 1), ('powerhouses', 1), ('towns', 1), ('andratx', 1), ('mar', 1), ('paguera', 1), ('sant', 1), ('elmo', 1), ('ponsa', 1), ('porto', 1), ('portals', 1), ('bendinat', 1), ('came', 1), ('stimulating', 1), ('workshops', 1), ('urgent', 1), ('facing', 1), ('planet', 1), ('crucially', 1), ('sahel', 1), ('frick', 1), ('representative', 1), ('un', 1), ('session', 1), ('engage', 1), ('dialogues', 1), ('careful', 1), ('talki

(0, '0.039*"the" + 0.027*"." + 0.019*"," + 0.012*"and" + 0.012*"of" + 0.011*"to" + 0.010*":" + 0.010*"a" + 0.010*"on" + 0.009*"is"')
(1, '0.044*"-" + 0.040*"/" + 0.032*"," + 0.024*"€" + 0.017*"7" + 0.016*":" + 0.015*"nights" + 0.011*"sat" + 0.011*"16" + 0.010*"%"')
(2, '0.047*"," + 0.036*"the" + 0.028*"." + 0.020*"a" + 0.017*"and" + 0.017*"to" + 0.017*"of" + 0.010*"is" + 0.009*"in" + 0.008*"for"')
(3, '0.077*"the" + 0.034*"," + 0.032*"of" + 0.032*"." + 0.018*"in" + 0.016*"and" + 0.012*"to" + 0.011*")" + 0.011*"(" + 0.010*"by"')
(4, '0.117*":" + 0.051*"," + 0.026*"—" + 0.012*"11" + 0.012*"am" + 0.009*"10" + 0.009*"15" + 0.009*"on" + 0.008*"12" + 0.008*"."')
(5, '0.087*"/" + 0.022*":" + 0.021*"." + 0.021*"and" + 0.014*"http" + 0.012*"," + 0.011*"you" + 0.009*"that" + 0.007*"is" + 0.007*"the"')
(6, '0.046*"the" + 0.045*"," + 0.043*"." + 0.028*"to" + 0.021*"of" + 0.020*"in" + 0.019*"and" + 0.018*"a" + 0.014*"is" + 0.009*"with"')
(7, '0.023*""" + 0.015*"." + 0.011*"is" + 0.008*"we" + 0.008*

In [25]:
# Render the fitted LDA model with pyLDAvis.
import pyLDAvis.gensim
from gensim import corpora
# Build a dictionary and bag-of-words corpus from `wl` with the same two
# steps get_topics_lda uses; this rebinds the global `dictionary`.
dictionary = corpora.Dictionary(wl)
corpus = [dictionary.doc2bow(text) for text in wl]
# NOTE(review): sort_topics=False presumably keeps the displayed topic order
# aligned with the model's topic ids - confirm against the pyLDAvis docs.
lda_display = pyLDAvis.gensim.prepare(ldamodel, corpus, dictionary, sort_topics=False)
pyLDAvis.display(lda_display)

In [26]:
def get_representatives_lda(word_list, lda_model, dictionary, num_topics=10):
    """
    Finds, for every topic, the line that the model assigns to it with the
    highest probability.

    Each line is converted with ``dictionary.doc2bow`` and scored with
    ``lda_model.get_document_topics``; the line counts for its single most
    probable topic only. Lines for which no topics are reported are skipped.

    :param word_list: list[list[str=words]]
    :param lda_model: model object providing ``get_document_topics(bow)``
    :param dictionary: object providing ``doc2bow(line)``
    :param num_topics: number of topic slots in the result; a topic id outside
        ``range(num_topics)`` raises IndexError
    :return: (max_line, max_class) - per topic the best line (``[]`` if none)
        and its probability (``0`` if none)
    """
    # Independent lists per slot (``[[]] * n`` would alias a single list).
    max_line = [[] for _ in range(num_topics)]
    max_class = [0] * num_topics

    for line in word_list:
        new_doc_bow = dictionary.doc2bow(line)
        # Use the model that was passed in, not a same-named notebook global.
        topic_probs = list(lda_model.get_document_topics(new_doc_bow))
        if not topic_probs:
            continue

        # Scan from the back so that, among equal probabilities, the entry
        # listed last is chosen.
        t_max_class, t_max_perc = max(reversed(topic_probs), key=lambda pair: pair[1])
        if max_class[t_max_class] < t_max_perc:
            max_class[t_max_class] = t_max_perc
            max_line[t_max_class] = line

    return max_line, max_class

# For each topic slot: the best probability found and the line that reached it.
max_l, max_c = get_representatives_lda(wl, ldamodel, dictionary)

for idx in range(len(max_l)):
    # print("\n") writes the "\n" plus print's own line ending, so these two
    # calls produce four newlines between entries.
    print("\n")
    print("\n")
    print(max_c[idx])
    print(max_l[idx])
    





0.9790669
['fischer', 'himself', 'is', 'so', 'enthusiastic', 'and', 'committed', 'to', 'the', 'hitachi', 'tool', 'production50', '®', 'concept', 'that', 'he', 'says', ',', '“', 'if', 'it', 'didn', '’', 't', 'sound', 'so', 'much', 'like', 'an', 'advertising', 'promotion', ',', 'we', 'could', 'promise', 'each', 'customer', 'a', 'money-back', 'guarantee', '”', '.']




0.9608688
['900', '€', '4d', '110', 'm', '&', 'sup2', '-', 'alquiler', 'piso', 'en', 'zona', 'penyaroja', 'valencia', 'refe.11068', 'alquiler', 'de', 'pisos', 'y', 'apartamentos', 'nuevo', '!']




0.9849983
['could', 'bergdahl', 'be', 'a', 'symbol', 'of', 'national', 'angst', 'that', 'time', 'is', 'on', 'the', 'taliban', "'s", 'side', 'and', 'that', 'the', 'now-scheduled', 'u.s.', 'departure', 'from', 'afghanistan', '--', 'which', 'indirectly', 'hastened', 'bergdahl', "'s", 'release', '--', 'makes', 'the', 'sacrifices', 'in', 'lives', ',', 'casualties', ',', 'treasure', ',', 'and', 'spirit', 'america', 'devoted', 'to',