In [None]:
import os
import re

FOLDER_PATH = '/xxx/preprocessing_data'

def open_txt_file(filename):
    """Read a text file located in ``FOLDER_PATH`` and return its full content.

    Args:
        filename: Name of the file, relative to ``FOLDER_PATH``.

    Returns:
        The whole file content as one string.
    """
    file_path = os.path.join(FOLDER_PATH, filename)
    # Decode explicitly as UTF-8 instead of relying on the platform default,
    # which is how the later cells of this notebook open their text files.
    with open(file_path, 'r', encoding='utf-8') as f:
        return f.read()

def clean_and_save(text, filename, cleaners):
    """Run ``text`` through every cleaner in order and write the result to disk.

    The output is written to ``FOLDER_PATH`` under the input file's base name
    with ``Clean.txt`` appended (``RevAz1.txt`` -> ``RevAz1Clean.txt``).

    Args:
        text: Raw text to clean.
        filename: Name of the source file; only used to derive the output name.
        cleaners: Iterable of callables, each taking a string and returning a
            string. They are applied in iteration order.

    Returns:
        The cleaned text.
    """
    cleaned_text = text
    for cleaner in cleaners:
        cleaned_text = cleaner(cleaned_text)

    # splitext removes the actual extension, whatever its length, instead of
    # blindly cutting the last four characters of the name.
    base_name = os.path.splitext(filename)[0]
    new_filepath = os.path.join(FOLDER_PATH, f'{base_name}Clean.txt')

    # Explicit UTF-8 so accented characters are written identically on every
    # platform.
    with open(new_filepath, 'w', encoding='utf-8') as f:
        f.write(cleaned_text)

    print(f"Cleaned text saved to {new_filepath}!")
    return cleaned_text

def remove_hyphen(text):
    """Rejoin words split across lines by deleting each hyphen that is immediately followed by a newline."""
    hyphen_at_line_end = re.compile(r'-\n')
    return hyphen_at_line_end.sub('', text)

def remove_revista_azul(text):
    """Collapse a "revista azul" running title plus its page-number line into one newline.

    Two layouts are recognised, each delimited by newlines: a line of digits
    followed by the title line, or the title line followed by a line of digits.
    The title is matched in any mix of upper and lower case.
    """
    title = r'[Rr][Ee][Vv][Ii][Ss][Tt][Aa] [Aa][Zz][Uu][Ll]'
    running_head = re.compile(r'(\n\d+\n' + title + r'|\n' + title + r'\n\d+)\n')
    return running_head.sub('\n', text)

def remove_page_headers(text):
    """Strip ``## p. N (#M) ##...`` page-header markers from ``text``.

    Args:
        text: Text possibly containing page-header markers.

    Returns:
        The text with every complete page header removed.
    """
    # First pass: headers wrapped in their full layout (two newlines before,
    # two newlines and a '---' rule after) are removed together with that
    # surrounding markup.
    text = re.sub(r'\n\n## p\. \d+ \(#\d+\) ##+\n\n---', '', text)
    # Second pass: any header left over without that layout.
    # The original ended with a third substitution on an empty pattern, which
    # returns the text unchanged, so it has been dropped.
    return re.sub(r'## p\. \d+ \(#\d+\) ##+', '', text)

def remove_incomplete_page_headers(text):
    """Delete page-header markers that lack the page number, i.e. ``## p. (#M) ##...``."""
    numberless_header = re.compile(r'## p\. \(#\d+\) ##+')
    return numberless_header.sub('', text)

def remove_excessive_linebreaks(text):
    """Replace every run of two or more newlines with a single newline."""
    newline_run = re.compile(r'\n{2,}')
    return newline_run.sub('\n', text)

def remove_double_linebreaks(text):
    """Replace each non-overlapping pair of newlines, scanning left to right, with one newline."""
    return text.replace('\n\n', '\n')

def remove_linebreaks(text):
    """Turn every newline into a single space, putting the whole text on one line."""
    return text.replace('\n', ' ')

def remove_all_caps_revista_azul(text):
    """Delete every remaining "revista azul" occurrence.

    Despite the function name, the match is case-insensitive: any mix of upper
    and lower case is removed, not only the all-caps form.
    """
    title = re.compile(r'[Rr][Ee][Vv][Ii][Ss][Tt][Aa] [Aa][Zz][Uu][Ll]', flags=re.I)
    return title.sub('', text)


# Raw input files, looked up inside FOLDER_PATH by open_txt_file.
filenames = ['RevAz1.txt', 'RevAz2.txt']

# Cleaning steps, applied by clean_and_save in exactly this order.
# The order matters: the first four steps match on newlines, so they must run
# before the line-break steps rewrite them.
cleaners = [
    remove_hyphen,
    remove_revista_azul,
    remove_page_headers,
    remove_incomplete_page_headers,
    remove_excessive_linebreaks,
    # No effect at this position: the previous step already left no two
    # consecutive newlines.
    remove_double_linebreaks,
    remove_linebreaks,
    remove_all_caps_revista_azul,
]

# Clean each file and write its "<name>Clean.txt" sibling; the returned text
# is not kept.
for filename in filenames:
    text = open_txt_file(filename)
    clean_and_save(text, filename, cleaners)

In [None]:
import openai

import os

with open('/xxx', 'r', encoding='utf-8') as file:
    input_text = file.read()

# Upper bound on the size of one request. Despite the name, this is a number
# of CHARACTERS (it is used to slice the string), not a number of model tokens.
max_tokens = 8000


def _split_at_whitespace(text, limit):
    """Split ``text`` into consecutive pieces of at most ``limit`` characters.

    Each cut is moved back to just after the last whitespace character inside
    the window, so that neither a word nor the ``***`` separator is split
    between two requests. A window without any whitespace is cut at ``limit``.
    Concatenating the pieces gives back ``text`` exactly.
    """
    pieces = []
    start = 0
    total = len(text)
    while start < total:
        end = min(start + limit, total)
        if end < total:
            cut = end
            while cut > start and not text[cut - 1].isspace():
                cut -= 1
            # cut == start means the window holds no whitespace: keep the hard cut.
            if cut > start:
                end = cut
        pieces.append(text[start:end])
        start = end
    return pieces


chunks = _split_at_whitespace(input_text, max_tokens)

# Read the key from the environment so the secret is never stored in the
# notebook; raises KeyError when OPENAI_API_KEY is not set.
openai.api_key = os.environ['OPENAI_API_KEY']

instruction = "You are a language model assistant. You are asked to correct the spelling mistakes in the text. You are not allowed to add anything new to the text. You're only allowed to correct the spelling mistakes. It's a XIXth century magazine from Mexico. It's mainly in Spanish, but I sometimes has some words in other languages like French and English. You will correct the spelling mistakes in Spanish and leave the words in other languages as they are. If you find the series of caracters '***', don't change it."

for i, chunk in enumerate(chunks):
    # One worked example (user/assistant pair) precedes the real chunk so the
    # model sees the expected kind of correction.
    messages = [
        {"role": "system", "content": instruction},
        {"role": "user", "content": "N un volante azul que me envía el regente de la imprenta leo estas palabras escritas con lápiz: falta el programa. Calle! Es verdad. Ni mi amigo ni yo pensamos nunca en el programa. Deberoia ir á mi cass por él? ***"},
        {"role": "assistant", "content": "En un volante azul que me envía el regente de la imprenta leo estas palabras escritas con lápiz: falta el programa. ¡Calle! Es verdad. Ni mi amigo ni yo pensamos nunca en el programa. ¿Debería ir a mi casa por él? ***"},
        {"role": "user", "content": chunk},
    ]

    # The client signals failure by raising, never by returning None, so the
    # error is reported with the chunk index and re-raised: continuing would
    # silently leave a hole in the output file.
    try:
        response = openai.ChatCompletion.create(model='gpt-3.5-turbo-16k', messages=messages, temperature=0.1)
    except openai.error.OpenAIError as exc:
        print(f'Error: Failed to get response for chunk {i}: {exc}')
        raise

    completion = response['choices'][0]['message']['content'] + " "
    # Append mode: results accumulate across chunks, and also across re-runs of
    # this cell, so running it twice on the same input duplicates the text.
    with open('/xxx/preprocessing_data/RevAzSpellChecked.txt', 'a', encoding='utf-8') as file:
        file.write(completion)

In [None]:
import spacy
from spacy.lang.es.stop_words import STOP_WORDS

def open_txt_file(filename):
    """Read a UTF-8 text file from the preprocessing data folder.

    This redefinition replaces the ``open_txt_file`` declared in the first
    cell. It reads from the same ``FOLDER_PATH`` constant instead of repeating
    the directory literal, so the location is configured in one place.

    Args:
        filename: Name of the file, relative to ``FOLDER_PATH``.

    Returns:
        The whole file content as one string.
    """
    file_path = os.path.join(FOLDER_PATH, filename)
    with open(file_path, 'r', encoding='utf-8') as f:
        return f.read()

def split_into_texts(text):
    """Cut the corpus string into individual texts at every ``***`` separator."""
    separator = '***'
    pieces = text.split(separator)
    return pieces

def sentencize_texts(texts, nlp):
    """Split every text into sentences.

    Args:
        texts: Iterable of strings.
        nlp: Callable that turns a string into an object exposing ``.sents``,
            an iterable of spans with a ``.text`` attribute.

    Returns:
        One list of sentence strings per input text, in input order.
    """
    sentencized = []
    for text in texts:
        doc = nlp(text)
        sentencized.append([span.text for span in doc.sents])
    return sentencized

def remove_punctuation(sentencized_texts):
    """Drop every character that is neither a word character nor whitespace.

    Args:
        sentencized_texts: List of texts, each a list of sentence strings.

    Returns:
        The same nested structure with the stripped sentences.
    """
    not_word_or_space = re.compile(r'[^\w\s]')
    stripped = []
    for text in sentencized_texts:
        stripped.append([not_word_or_space.sub('', sentence) for sentence in text])
    return stripped

def lemmatize_texts(cleaned_texts, nlp):
    """Replace every sentence by the space-joined lemmas of its tokens.

    Args:
        cleaned_texts: List of texts, each a list of sentence strings.
        nlp: Callable that turns a sentence into an iterable of tokens
            exposing a ``.lemma_`` attribute.

    Returns:
        The same nested structure with lemmatised sentence strings.
    """
    lemmatized = []
    for text in cleaned_texts:
        sentences = []
        for sentence in text:
            lemmas = [token.lemma_ for token in nlp(sentence)]
            sentences.append(' '.join(lemmas))
        lemmatized.append(sentences)
    return lemmatized

def remove_short_and_stop_words(lemmatized_texts):
    """Drop words of one or two characters and Spanish stop words.

    Args:
        lemmatized_texts: List of texts, each a list of whitespace-separated
            sentence strings.

    Returns:
        The same nested structure; each sentence keeps only the surviving
        words, joined by single spaces, with their original casing.
    """
    filtered = []
    for text in lemmatized_texts:
        kept_sentences = []
        for sentence in text:
            kept_words = [
                word
                for word in sentence.split()
                # This step runs before the pipeline lowercases the words, so
                # compare the lowercased form: otherwise a capitalised stop
                # word such as "Los" slips through the filter.
                if len(word) > 2 and word.lower() not in STOP_WORDS
            ]
            kept_sentences.append(' '.join(kept_words))
        filtered.append(kept_sentences)
    return filtered

def sentence_to_words(final_texts):
    """Turn each sentence string into its list of whitespace-separated words."""
    tokenized = []
    for text in final_texts:
        tokenized.append([sentence.split() for sentence in text])
    return tokenized

def convert_to_lowercase(texts_words):
    """Lowercase every word in a texts -> sentences -> words nested list."""
    lowered = []
    for text in texts_words:
        lowered_text = []
        for sentence in text:
            lowered_text.append([word.lower() for word in sentence])
        lowered.append(lowered_text)
    return lowered

def preprocess_texts(filename):
    """Run the whole preprocessing chain on one file.

    Steps, in order: read the file, split it into texts, split the texts into
    sentences, strip punctuation, lemmatise, drop short and stop words,
    tokenise on whitespace, lowercase.

    Args:
        filename: File name handed to ``open_txt_file``.

    Returns:
        A texts -> sentences -> words nested list of lowercase strings.
    """
    # Two separate spaCy pipelines: one is used only to find sentence
    # boundaries, the other only to lemmatise.
    sentence_splitter = spacy.load('xx_sent_ud_sm')
    lemma_pipeline = spacy.load('es_dep_news_trf')

    raw_text = open_txt_file(filename)
    documents = split_into_texts(raw_text)
    documents = sentencize_texts(documents, sentence_splitter)
    documents = remove_punctuation(documents)
    documents = lemmatize_texts(documents, lemma_pipeline)
    documents = remove_short_and_stop_words(documents)
    documents = sentence_to_words(documents)
    documents = convert_to_lowercase(documents)
    return documents

def prepare_for_training(lower_texts_words):
    """Remove the per-text level, returning one flat list of sentences (each a list of words)."""
    sentences = []
    for text in lower_texts_words:
        sentences.extend(text)
    return sentences

# Texts -> sentences -> lowercase words for the spell-checked corpus; later
# cells read this variable for the Doc2Vec and LDA steps.
lower_texts_words = preprocess_texts('RevAzSpellChecked.txt')

# Same words with the per-text level removed: one flat list of sentences,
# which is what the Word2Vec cell is given.
flattened_texts_words = prepare_for_training(lower_texts_words)


In [None]:
from gensim.models import Word2Vec

# def train_word2vec(flattened_texts_words, min_count=1):
#     model = Word2Vec(flattened_texts_words, min_count=1)
#     model.save("models/word2vec.model")

# train_word2vec(flattened_texts_words, min_count=1)

def train_word2vec(flattened_texts_words, vector_size=100, window=5, min_count=1, sg=1):
    """Train a gensim Word2Vec model and save it to ``models/word2vec.model``.

    Args:
        flattened_texts_words: Iterable of sentences, each a list of words.
        vector_size: Forwarded to ``Word2Vec`` as ``vector_size``.
        window: Forwarded to ``Word2Vec`` as ``window``.
        min_count: Forwarded to ``Word2Vec`` as ``min_count``.
        sg: Forwarded to ``Word2Vec`` as ``sg``, the training-algorithm
            selector (per gensim: 1 for skip-gram, otherwise CBOW).

    Returns:
        None. The trained model is only written to disk.
    """
    # Pass the caller's ``sg`` through; it used to be hard-coded to 1 here,
    # which silently ignored the parameter.
    model = Word2Vec(flattened_texts_words, vector_size=vector_size, window=window, min_count=min_count, sg=sg)
    model.save("models/word2vec.model")

# Train and save the Word2Vec model on the flat list of sentences produced by
# the preprocessing cell.
train_word2vec(flattened_texts_words, vector_size=100, window=5, min_count=5, sg=1)

# from gensim.models import Word2Vec

# def train_word2vec(flattened_texts_words, vector_size=10, window=3, min_count=5, workers=4, sg=1, alpha=0.03, min_alpha=0.0007, negative=20):
#     model = Word2Vec(flattened_texts_words, vector_size=vector_size, window=window, min_count=min_count, workers=workers, sg=sg, alpha=alpha, min_alpha=min_alpha, negative=negative)
#     model.save("models/word2vec.model")

# train_word2vec(flattened_texts_words, vector_size=500, window=2, min_count=5, workers=8, sg=1, alpha=0.03, min_alpha=0.0007, negative=20)


In [None]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import pickle

def documents_to_tagged_documents(lower_texts_words):
    """Wrap every text as a ``TaggedDocument`` tagged with its position.

    Args:
        lower_texts_words: Texts -> sentences -> words nested list.

    Returns:
        One ``TaggedDocument`` per text; ``words`` holds all the words of the
        text in order (sentence boundaries are dropped) and ``tags`` is a
        one-element list with the text's index.
    """
    tagged = []
    for index, text in enumerate(lower_texts_words):
        all_words = []
        for sentence in text:
            all_words.extend(sentence)
        tagged.append(TaggedDocument(words=all_words, tags=[index]))
    return tagged

# One TaggedDocument per text of the corpus; later cells reuse this variable.
tagged_documents = documents_to_tagged_documents(lower_texts_words)

def train_doc2vec(tagged_documents, vector_size=150, window=10, min_count=2, workers=8, epochs=20, dm=0, alpha=0.03, min_alpha=0.0007, sample=1e-3):
    """Train a gensim Doc2Vec model, save it, and pickle the training documents.

    The model is written to ``models/doc2vec.model`` and the documents to
    ``preprocessing_data/tagged_documents.pkl``.

    Args:
        tagged_documents: The documents to train on.
        vector_size, window, min_count, workers, epochs, dm, alpha, min_alpha,
        sample: Forwarded unchanged to the ``Doc2Vec`` constructor.

    Returns:
        None.
    """
    hyperparameters = dict(
        vector_size=vector_size,
        window=window,
        min_count=min_count,
        workers=workers,
        epochs=epochs,
        dm=dm,
        alpha=alpha,
        min_alpha=min_alpha,
        sample=sample,
    )
    # The model is built empty, so vocabulary and training are explicit steps.
    model = Doc2Vec(**hyperparameters)
    model.build_vocab(tagged_documents)
    model.train(tagged_documents, total_examples=model.corpus_count, epochs=model.epochs)
    model.save("models/doc2vec.model")

    with open('preprocessing_data/tagged_documents.pkl', 'wb') as pickle_file:
        pickle.dump(tagged_documents, pickle_file)

# Train with the function's default hyperparameters; the results are written
# to disk, nothing is returned.
train_doc2vec(tagged_documents)

In [None]:
import json

def save_tagged_documents(tagged_documents, filename='preprocessing_data/tagged_documents.json'):
    """Write the tagged documents to ``filename`` as a JSON list.

    Args:
        tagged_documents: Iterable of objects exposing ``.tags`` and ``.words``.
        filename: Destination path of the JSON file.

    Returns:
        None. Each document becomes a ``{"tags": ..., "words": ...}`` object.
    """
    records = []
    for doc in tagged_documents:
        records.append({"tags": doc.tags, "words": doc.words})

    with open(filename, 'w') as json_file:
        json.dump(records, json_file)

# Write the tagged documents to the function's default JSON path.
save_tagged_documents(tagged_documents)

In [None]:
import numpy as np

# Train two Doc2Vec models on the same documents. The hyperparameters are
# identical except for the `dm` flag (0 for model_dbow, 1 for model_dm).
model_dbow = Doc2Vec(
    tagged_documents,
    dm=0,
    vector_size=75,
    window=10,
    min_count=2,
    workers=8,
    epochs=20,
    alpha=0.03,
    min_alpha=0.0007,
    sample=1e-3,
)
model_dm = Doc2Vec(
    tagged_documents,
    dm=1,
    vector_size=75,
    window=10,
    min_count=2,
    workers=8,
    epochs=20,
    alpha=0.03,
    min_alpha=0.0007,
    sample=1e-3,
)

# Document vectors of each model.
vectors_dbow = model_dbow.dv.vectors
vectors_dm = model_dm.dv.vectors

# Place the two vector sets side by side and save the combined array.
vectors_combined = np.hstack((vectors_dbow, vectors_dm))
np.save('models/combined_vectors.npy', vectors_combined)

In [None]:
from gensim.corpora import Dictionary
from gensim.models import LdaModel
from gensim.corpora import MmCorpus

def flatten_texts(texts_words):
    """Merge the sentences of each text, giving one flat list of words per text."""
    flattened = []
    for text in texts_words:
        words = []
        for sentence in text:
            words.extend(sentence)
        flattened.append(words)
    return flattened

def create_dictionary_corpus(flattened_texts_words):
    """Build a gensim ``Dictionary`` and the matching bag-of-words corpus.

    Args:
        flattened_texts_words: List of documents, each a flat list of words.
            It is iterated twice, so it must not be a one-shot iterator.

    Returns:
        ``(dictionary, corpus)``, where ``corpus`` holds one
        ``dictionary.doc2bow`` result per document, in input order.
    """
    dictionary = Dictionary(flattened_texts_words)
    # Prune the vocabulary before encoding; the thresholds are passed to
    # gensim's filter_extremes as-is.
    dictionary.filter_extremes(no_below=20, no_above=0.7)

    corpus = []
    for document in flattened_texts_words:
        corpus.append(dictionary.doc2bow(document))
    return dictionary, corpus

def lda_topic_modeling(dictionary, corpus, num_topics=15, passes=25, random_state=None):
    """Fit a gensim LDA model and return its topics.

    Args:
        dictionary: Mapping passed to ``LdaModel`` as ``id2word``.
        corpus: Bag-of-words corpus to train on.
        num_topics: Forwarded to ``LdaModel`` as ``num_topics``.
        passes: Forwarded to ``LdaModel`` as ``passes``.
        random_state: Forwarded to ``LdaModel`` as ``random_state``. Pass a
            value to make runs repeatable; the default ``None`` keeps gensim's
            own default, as before this parameter existed.

    Returns:
        ``(topics, lda)``: the result of ``lda.print_topics(num_words=20)``
        and the fitted model.
    """
    lda = LdaModel(
        corpus=corpus,
        id2word=dictionary,
        num_topics=num_topics,
        passes=passes,
        random_state=random_state,
    )
    topics = lda.print_topics(num_words=20)
    return topics, lda

def topic_modeling_pipeline(texts_words):
    """Run LDA on the corpus, print the topics, and save the artefacts.

    Saves the model to ``models/lda_model``, the dictionary to
    ``models/dictionary`` and the corpus to ``models/corpus.mm``.

    Args:
        texts_words: Texts -> sentences -> words nested list.

    Returns:
        None.
    """
    documents = flatten_texts(texts_words)
    dictionary, corpus = create_dictionary_corpus(documents)

    # Fit the model with the default number of topics and passes.
    topics, lda = lda_topic_modeling(dictionary, corpus)
    for topic in topics:
        print(topic)

    # Save everything needed to reload the model later.
    lda.save("models/lda_model")
    dictionary.save("models/dictionary")
    MmCorpus.serialize("models/corpus.mm", corpus)

# Run topic modelling on the preprocessed corpus; topics are printed and the
# model, dictionary and corpus are written under models/.
topic_modeling_pipeline(lower_texts_words)