# Generate Embeddings

In [None]:
# Load the whole corpus (the parsed JSON content of corpus.json) into memory
import json

print('Load corpus.json')
# JSON text is UTF-8 by specification; pin the encoding so the load does not
# depend on the platform's locale default (e.g. cp1252 on Windows).
with open('../../data/corpus.json/corpus.json', 'r', encoding='utf-8') as f:
    documents = json.load(f)


In [None]:
# OPTIONAL: restrict the corpus to documents whose 'lang' field is 'en'
import gc

# Rebinding `documents` drops this cell's reference to the unfiltered list;
# the collection pass afterwards mirrors the explicit cleanup used elsewhere.
documents = [entry for entry in documents if entry['lang'] == 'en']
gc.collect()

In [None]:
# OPTIONAL: only select a random subset of documents from corpus
import random
import gc

random.seed(42)  # fixed seed so the same subset is drawn on every run
num_documents_to_select = 50000

# random.sample raises ValueError when asked for more items than the
# population holds, so cap the request at the number of documents available.
sample_size = min(num_documents_to_select, len(documents))
if sample_size < num_documents_to_select:
    print(f"Only {len(documents)} documents available; selecting all of them")

selected_documents = random.sample(documents, sample_size)

# Print some examples to check
for i, doc in enumerate(selected_documents[:5]):
    print(f"Document {i+1}: {doc['docid']} - {doc['text'][:100]}...")

# Extract docids from the selected documents
selected_docids = [doc['docid'] for doc in selected_documents]

# Write the selected docids to a file so the subset can be identified later
docids_file_path = 'selected_docids.json'
with open(docids_file_path, 'w') as f:
    json.dump(selected_docids, f)
print(f"Selected docids written to {docids_file_path}")

# Replace documents variable and clean memory
del documents
documents = selected_documents
del selected_documents
del selected_docids
gc.collect()

In [None]:
# Map each document's position in `documents` to its docid and save the mapping
import gc
docids = [record['docid'] for record in documents]

# EN ! -- the output file name is tied to the English subset.
# JSON object keys are always strings, so the integer positions are written
# as strings in the file.
with open('doc_index_to_docid_en.json', 'w') as f:
    json.dump(dict(enumerate(docids)), f)

gc.collect()

In [None]:
# Run clean_text over every document's text, then dump the cleaned texts to disk
from preprocessing import clean_text
print('Extract text from docs')
texts = [clean_text(record['text']) for record in documents]  # PREPROCESSING APPLIED

# The document records are not needed by this cell beyond this point.
del documents
gc.collect()

# The cleaned texts go to a regular named file (not a tempfile), so the file
# stays on disk after the run. Each text is written followed by a newline.
temp_file_name = 'preprocessed_texts_en.txt'
print('Save docs texts in a text file')
with open(temp_file_name, 'w', encoding='utf-8') as file:
    file.writelines(text + '\n' for text in texts)

print('Delete texts variable')
del texts
gc.collect()


In [None]:
import fasttext
import os
import gc
from multiprocessing import cpu_count

# Create the output directory up front: save_model cannot write into a missing
# directory, and that failure would otherwise only surface after training.
os.makedirs('models', exist_ok=True)

print('Start fasttext model training')
# model='skipgram' and dim=100 are fastText's defaults; they are spelled out
# so the training settings stay visibly in sync with model_name below.
model = fasttext.train_unsupervised(
    temp_file_name,
    model='skipgram',
    dim=100,
    thread=cpu_count(),
)

# The training text file at temp_file_name is left on disk; nothing here removes it.

# Save the trained model
model_name = "fasttext_unsupervised_skipgram_dim100_en"
print(f'Save the trained model to models/{model_name}.bin')
model.save_model(f"models/{model_name}.bin")


print('Delete model variable')
del model
gc.collect()