Load the documents and split each book into 1000-word chunks (a list of chunk strings plus a parallel list of metadata dicts)

In [None]:
import math
# Notebook configuration: corpus location, corpus selection and model size.
path = "../../data/lem"
year = "2010_2019"
category = "N_A"
num_topics = 20

# Derived names: the corpus file is <path>/<year>_<category>_no_names_beletrie.txt
file_name = f"{year}_{category}_no_names_beletrie"
file_path = f"{path}/{file_name}.txt"

def load_books_1000_blocks_from_document(file_path, block_size=1000):
    """Read a corpus file and split every book into fixed-size word blocks.

    The file is expected to contain records of the form
    ``<doc title="..." key="value" ...>text</doc>``. For each record the
    attributes of the opening tag are parsed into a dict and the text is
    split on single spaces into blocks of at most ``block_size`` words.

    Parameters
    ----------
    file_path : str
        Path of the UTF-8 encoded corpus file.
    block_size : int, optional
        Maximum number of space-separated words per block (default 1000).

    Returns
    -------
    books : list of str
        The blocks, in file order; the words of a block are joined by spaces.
    books_info : list of dict
        One entry per block, parallel to ``books``. All blocks of the same
        book share the same attribute dict object.

    Raises
    ------
    ValueError
        If ``block_size`` is smaller than 1.
    """
    if block_size < 1:
        raise ValueError("block_size must be a positive integer")

    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()

    books = []
    books_info = []
    start_tag = '<doc title="'
    end_tag = '</doc>'
    start_index = 0

    while True:
        book_start = content.find(start_tag, start_index)
        if book_start == -1:
            break

        book_end = content.find(end_tag, book_start)
        if book_end == -1:
            break

        book_text = content[book_start:book_end + len(end_tag)]
        header_end = book_text.index('>')

        # Attribute part of the opening tag, without the leading '<doc' and
        # without the closing '>' (keeping the '>' would leave '">' glued to
        # the last attribute value).
        header = book_text[len('<doc'):header_end].strip()

        book_info_dict = {}
        for item in header.split('" '):
            # Split on the first '=' only, so values containing '=' survive.
            key, separator, value = item.partition('=')
            if not separator:
                # Not a key="value" pair; nothing to record.
                continue
            book_info_dict[key.strip()] = value.strip().strip('"')

        book_content = book_text[header_end + 1:-len(end_tag)].strip()
        # An empty book yields no blocks instead of a single empty string.
        words = book_content.split(' ') if book_content else []

        # Slicing clamps at the end of the list, so the last block simply
        # holds the remaining words (including the very last one).
        for block_start in range(0, len(words), block_size):
            books.append(" ".join(words[block_start:block_start + block_size]))
            books_info.append(book_info_dict)

        start_index = book_end + len(end_tag)

    return books, books_info

# Load the corpus once: `books` holds the text chunks and `books_info` the
# metadata dict belonging to each chunk (same order, same length).
books, books_info = load_books_1000_blocks_from_document(file_path)

Preprocess the data

In [None]:
from gensim import corpora
import nltk
from nltk.tokenize import word_tokenize



# Split every chunk into tokens with NLTK's word tokenizer
tokenized_data = list(map(word_tokenize, books))

# Build the token <-> integer id mapping from the tokenized chunks
dictionary = corpora.Dictionary(tokenized_data)

# Bag-of-words representation of every chunk (the document-term matrix)
doc_term_matrix = list(map(dictionary.doc2bow, tokenized_data))


Perform LDA

In [None]:
from gensim.models import LdaModel
from gensim.test.utils import datapath
passes = 10          # number of full passes over the corpus during training
alpha = 'auto'       # learn an asymmetric document-topic prior from the corpus
eta = 'auto'         # learn the topic-word prior from the corpus
random_state = 42    # fixed seed so that re-running the notebook reproduces the topics

# Train the LDA model
# LDA models every document as a mixture of topics,
# fitted by an iterative Bayesian process.
lda_model = LdaModel(
    corpus=doc_term_matrix,
    id2word=dictionary,
    num_topics=num_topics,  # Number of topics
    passes=passes,
    alpha=alpha,
    eta=eta,
    random_state=random_state,
)

# Save under the name that the later cells load
# ("lda_<num_topics>_topics<file_name>"); saving as "lda_<file_name>" would
# leave those cells loading a missing or stale model.
# NOTE(review): gensim.test.utils.datapath() prefixes gensim's own test-data
# directory, so this relative path is resolved from there and not from the
# notebook's working directory - confirm that this is intended.
save_path = "../../data/models/1000 chunk/lda/lda_{num_topics}_topics".format(num_topics = num_topics) + file_name
temp_file = datapath(save_path)

lda_model.save(temp_file)


Visualize the LDA model

In [None]:
import pyLDAvis.gensim
from gensim import  models
from gensim.test.utils import datapath

# Reload the trained model from disk.
save_path = "../../data/models/1000 chunk/lda/lda_{num_topics}_topics".format(num_topics = num_topics) + file_name
temp_file = datapath(save_path)


lda_model = models.ldamodel.LdaModel.load(temp_file)

# Top 20 words of every topic; the topic count follows the notebook
# configuration instead of a hard-coded 20.
topic_words = lda_model.print_topics(num_topics = num_topics, num_words = 20)

# One "(topic_id, 'weight*"word" + ...')" tuple per line.
with open("../../data/lda_{num_topics}_topics_{year}.txt".format(num_topics = num_topics, year = year), "w", encoding = 'utf8') as output:
    for row in topic_words:
        output.write(str(row) + '\n')
print(topic_words)
# Visualize the LDA model: save a standalone HTML file and show it inline
vis = pyLDAvis.gensim.prepare(lda_model, doc_term_matrix, dictionary)
file_plot = '../../plots/Python/LDA/lda'+ '_' + year + '_' + category + '_'  + str(num_topics) + '_topics' +  '.html'
pyLDAvis.save_html(vis, file_plot)
pyLDAvis.display(vis)


In [None]:
import pandas as pd
from gensim import  models
from gensim.test.utils import datapath

# Reload the trained model from disk.
save_path = "../../data/models/1000 chunk/lda/lda_{num_topics}_topics".format(num_topics = num_topics) + file_name
temp_file = datapath(save_path)


lda_model = models.ldamodel.LdaModel.load(temp_file)

# Topic distribution of every chunk, as (topic_id, probability) pairs
topic_doc_matrix = lda_model.get_document_topics(doc_term_matrix)

# Build one record per chunk: the book metadata plus a dense list with one
# probability per topic (topics missing from the pairs stay at 0).
data = []
for i, doc_topics in enumerate(topic_doc_matrix):
    topic_contributions = [0] * lda_model.num_topics
    for topic_id, topic_prob in doc_topics:
        topic_contributions[topic_id] = topic_prob
    data.append({
        'title': books_info[i]['title'],
        'author': books_info[i]['author'],
        'publisher': books_info[i]['publisher'],
        'first_published': books_info[i]['first_published'],
        'authsex': books_info[i]['authsex'],
        'topic_contributions': topic_contributions
        })

df = pd.DataFrame.from_dict(data, orient = 'columns')

# Collapse consecutive chunks with the same title into one row per book:
# metadata followed by the mean topic share of its chunks, in percent.
td_dict = {}
document_name = None
info = []
topic_distribution = []
count = 0
for _, row in df.iterrows():
    if row['title'] != document_name:
        # A new book starts: store the finished one (if any) first.
        if count:
            td_dict[document_name] = info + [(total/count)*100 for total in topic_distribution]
        document_name = row['title']
        info = [row['author'], row['publisher'], row['first_published'], row['authsex']]
        topic_distribution = list(row['topic_contributions'])
        count = 1
    else:
        topic_distribution = [
            running + current
            for running, current in zip(topic_distribution, row['topic_contributions'])
        ]
        count += 1
# The last book gets the same mean-percentage treatment as every other book
# (storing the raw sums here would make its row incomparable with the rest).
if count:
    td_dict[document_name] = info + [(total/count)*100 for total in topic_distribution]

df_LDA = pd.DataFrame(data = td_dict)
df_LDA = df_LDA.T
df_LDA.to_excel("../../data/topics/lda/lda_{num_topics}_{year}topics.xlsx".format(num_topics = num_topics, year = year) )
print(td_dict)


