In [None]:
import os
import pandas as pd
import json
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm

# Lists of column-0 values (lemmas, per the naming in this notebook) gathered
# from every annotation*.tsv file under the four test sub-folders.
# NOTE(review): os.listdir returns entries in arbitrary order, so the order of
# `all_lemmas_test` may differ between machines — confirm it matches whatever
# these lists are later indexed against.
# NOTE(review): with pandas' default NA parsing, literal tokens such as "NA" or
# "null" are read as NaN as well and therefore also act as separators below.
all_lemmas_test = []

for subfolder in tqdm(['1', '2', '3', '4']):
    subfolder_path = os.path.join('assets/annotated_corpus/test/', subfolder)

    for entry in os.listdir(subfolder_path):
        # Skip everything that is not an annotation*.tsv file.
        if not (entry.startswith('annotation') and entry.endswith('.tsv')):
            continue

        annotation = pd.read_csv(os.path.join(subfolder_path, entry), sep='\t', header=None)

        current_group = []
        for value in annotation[0].tolist():
            if str(value) == 'nan':
                # A missing value in column 0 closes the current group
                # (possibly empty) and starts a new one.
                all_lemmas_test.append(current_group)
                current_group = []
            else:
                current_group.append(value)

        # Flush the trailing group when the file does not end with a separator.
        if current_group:
            all_lemmas_test.append(current_group)

In [None]:
import pickle
# Restore the two pickled inputs from the working directory.
# NOTE: pickle.load can execute arbitrary code — only load files you produced
# yourself or otherwise trust.
with open("term_document_matrix", "rb") as matrix_file:
    term_document_matrix = pickle.load(matrix_file)

with open("terms", "rb") as terms_file:
    terms = pickle.load(terms_file)

In [None]:
def create_annotation(doc_topic_dist, n_comps, output_dir="assets/annotated_corpus/test"):
    """Write per-document topic probabilities to a tab-separated file.

    One line is written per document: the 0-based document index followed by
    every probability of that document. Each field, including the last one, is
    followed by a tab, so every line ends with a tab before the newline; this
    keeps the file byte-compatible with earlier outputs of this function.

    Parameters
    ----------
    doc_topic_dist : iterable of iterables
        One sequence of probabilities per document. Any iterable works; it
        does not need to support ``len``.
    n_comps : int
        Number of topics; only used to build the file name
        ``probs_topics_{n_comps}.tsv``.
    output_dir : str, optional
        Directory the file is written to. Defaults to the directory that was
        previously hard-coded. It must already exist.

    Returns
    -------
    str
        Path of the file that was written.
    """
    output_filename = os.path.join(output_dir, f"probs_topics_{n_comps}.tsv")

    with open(output_filename, "w", encoding="utf-8") as f:
        for index, probs in enumerate(doc_topic_dist):
            row = "".join(f"{prob}\t" for prob in probs)
            f.write(f"{index}\t{row}\n")

    return output_filename

In [None]:
import matplotlib.pyplot as plt
import numpy as np
from tqdm import tqdm, trange
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.datasets import make_multilabel_classification

def lda_for_comps(lda_comp, term_document_matrix, max_iter, n_top_words=10):
    """Fit an LDA model, dump its topics and document annotations, return perplexity.

    Side effects:
      * writes ``lab4/top_words_{lda_comp}_maxiter_{max_iter}`` with, for every
        topic, its top words and the text of the document that has the highest
        probability for that topic;
      * calls ``create_annotation`` to store the document-topic distribution.
        NOTE(review): that file name depends only on ``lda_comp``, so runs with
        a different ``max_iter`` overwrite each other's annotation file.

    Relies on the notebook-level names ``terms`` (column index -> term) and
    ``all_lemmas_test`` (document index -> list of lemmas).
    NOTE(review): assumes row i of ``term_document_matrix`` corresponds to
    ``all_lemmas_test[i]`` — confirm against the code that built the matrix.

    Parameters
    ----------
    lda_comp : int
        Number of topics (``n_components``).
    term_document_matrix : array-like
        Input passed to ``fit``, ``transform`` and ``perplexity``.
    max_iter : int
        ``max_iter`` of ``LatentDirichletAllocation``.
    n_top_words : int, optional
        How many highest-weighted words to list per topic (default 10).

    Returns
    -------
    float
        ``lda.perplexity(term_document_matrix)``.
    """
    lda = LatentDirichletAllocation(n_components=lda_comp, random_state=0, max_iter=max_iter)
    lda.fit(term_document_matrix)

    doc_topic_dist = lda.transform(term_document_matrix)

    # Highest-weighted terms of every topic, best first.
    topic_words = {}
    for topic, comp in enumerate(lda.components_):
        word_idx = np.argsort(comp)[::-1][:n_top_words]
        topic_words[topic] = [terms[i] for i in word_idx]

    # For each topic, the row index of the document with the highest probability.
    top_docs = np.argmax(doc_topic_dist, axis=0)

    # Write the report once, after all topics are collected. It used to be
    # rewritten from scratch on every iteration of the topic loop.
    os.makedirs('lab4', exist_ok=True)
    with open(f'lab4/top_words_{lda_comp}_maxiter_{max_iter}', "w", encoding="utf-8") as f:
        for topic, words in topic_words.items():
            f.write('Topic: %d \n' % topic)
            f.write('Top words: %s' % ', '.join(words))
            f.write('\n')
            f.write('Top text:\n' + ' '.join(all_lemmas_test[top_docs[topic]]) + '\n\n')

    create_annotation(doc_topic_dist, lda_comp)

    return lda.perplexity(term_document_matrix)


In [None]:
# Candidate topic counts; each value is passed to lda_for_comps as `lda_comp`
# (the LDA n_components) and used as the x-axis of the perplexity plots.
lda_comps = [2, 4, 5, 8, 10, 15, 20, 25, 30, 35, 40]

In [None]:
# Baseline run: one LDA fit per candidate topic count with max_iter=10.
# Each call also writes the topic report and the annotation file as a side effect.
perplexity = [
    lda_for_comps(n_topics, term_document_matrix, max_iter=10)
    for n_topics in tqdm(lda_comps)
]

In [None]:
# Show the perplexity values, one per entry of `lda_comps`.
perplexity

In [None]:
# Perplexity against the number of topics for the max_iter=10 run, together
# with a degree-5 polynomial trend line.
fig, ax = plt.subplots(dpi=280, figsize=(8, 4))

ax.plot(lda_comps, perplexity, color='red', marker='o', label='perplexity')

pol = np.poly1d(np.polyfit(lda_comps, perplexity, 5))

# Evaluate the fit on a dense grid: sampling it only at the measured points
# draws straight segments between them and hides the polynomial's real shape.
fit_x = np.linspace(min(lda_comps), max(lda_comps), 200)
ax.plot(fit_x, pol(fit_x), color='blue', label='degree-5 polynomial fit')

ax.set_xlabel("n_comps")
ax.set_ylabel("perplexity")
ax.set_title("LDA perplexity vs. number of topics (max_iter=10)")
ax.legend()

plt.show()

#### В два раза меньше итераций

In [None]:
# Same sweep with half the iterations (max_iter=5); overwrites `perplexity`.
# Each call also rewrites the annotation file for its topic count.
perplexity = [
    lda_for_comps(n_topics, term_document_matrix, max_iter=5)
    for n_topics in tqdm(lda_comps)
]

In [None]:
# Perplexity against the number of topics for the max_iter=5 run, together
# with a degree-5 polynomial trend line.
fig, ax = plt.subplots(dpi=280, figsize=(8, 4))

ax.plot(lda_comps, perplexity, color='red', marker='o', label='perplexity')

pol = np.poly1d(np.polyfit(lda_comps, perplexity, 5))

# Evaluate the fit on a dense grid: sampling it only at the measured points
# draws straight segments between them and hides the polynomial's real shape.
fit_x = np.linspace(min(lda_comps), max(lda_comps), 200)
ax.plot(fit_x, pol(fit_x), color='blue', label='degree-5 polynomial fit')

ax.set_xlabel("n_comps")
ax.set_ylabel("perplexity")
ax.set_title("LDA perplexity vs. number of topics (max_iter=5)")
ax.legend()

plt.show()

#### В два раза больше итераций

In [None]:
# Same sweep with twice the iterations (max_iter=20); overwrites `perplexity`.
# Each call also rewrites the annotation file for its topic count.
perplexity = [
    lda_for_comps(n_topics, term_document_matrix, max_iter=20)
    for n_topics in tqdm(lda_comps)
]

In [None]:
# Perplexity against the number of topics for the max_iter=20 run, together
# with a degree-5 polynomial trend line.
fig, ax = plt.subplots(dpi=280, figsize=(8, 4))

ax.plot(lda_comps, perplexity, color='red', marker='o', label='perplexity')

pol = np.poly1d(np.polyfit(lda_comps, perplexity, 5))

# Evaluate the fit on a dense grid: sampling it only at the measured points
# draws straight segments between them and hides the polynomial's real shape.
fit_x = np.linspace(min(lda_comps), max(lda_comps), 200)
ax.plot(fit_x, pol(fit_x), color='blue', label='degree-5 polynomial fit')

ax.set_xlabel("n_comps")
ax.set_ylabel("perplexity")
ax.set_title("LDA perplexity vs. number of topics (max_iter=20)")
ax.legend()

plt.show()