# Generate Embeddings

In [1]:
# Load the full corpus from disk into `documents`.
# Later cells treat `documents` as a list of records with 'docid' and 'text' keys.
import json

print('Load corpus.json')
# JSON text is UTF-8; pass the encoding explicitly so that reading the (multilingual)
# corpus does not depend on the platform's default locale encoding.
with open('actual_data/corpus.json/corpus.json', 'r', encoding='utf-8') as f:
    documents = json.load(f)

Load corpus.json


In [2]:
# OPTIONAL: only select a random subset of documents from corpus.
# Replaces `documents` with the sampled subset and records the sampled docids on disk.
import random
import gc

random.seed(42)  # fixed seed so the same subset is drawn on every run
num_documents_to_select = 50000

# random.sample raises ValueError when asked for more items than the population
# contains, so cap the request at the size of the loaded corpus.
num_documents_to_select = min(num_documents_to_select, len(documents))

selected_documents = random.sample(documents, num_documents_to_select)

# Print some examples to check
for i, doc in enumerate(selected_documents[:5]):
    print(f"Document {i+1}: {doc['docid']} - {doc['text'][:100]}...")

# Extract docids from the selected documents
selected_docids = [doc['docid'] for doc in selected_documents]

# Write the selected docids to a file
docids_file_path = 'selected_docids.json'
with open(docids_file_path, 'w') as f:
    json.dump(selected_docids, f)

print(f"Selected docids written to {docids_file_path}")

# Replace documents variable and clean memory.
# Rebinding `documents` drops the reference to the full corpus; the helper names are
# then deleted so only `documents` keeps the subset alive.
documents = selected_documents
del selected_documents
del selected_docids
gc.collect()

Document 1: doc-en-602905 - The Romance of the Forest is a Gothic novel by Ann Radcliffe that was first published in 1791. It co...
Document 2: doc-en-776092 - David Lee Roth (born October 10, 1954) is an American retired rock musician, singer, songwriter and ...
Document 3: doc-en-742232 - Daniel Dewan Sewell (born March 16, 1981), known professionally as Danny Brown, is an American rappe...
Document 4: doc-en-140998 - Electoral district no. 7 () is one of the 12 multi-member electoral districts of the Riigikogu, the ...
Document 5: doc-en-307726 - EcosimPro is a simulation tool developed by Empresarios Agrupados A.I.E for modelling simple and com...
Selected docids written to selected_docids.json


32

In [3]:
# Pull the raw text out of every document record, then spill the texts to a
# temporary file whose path is kept in `temp_file_name`.
# NOTE(review): clean_text is imported below but never called in this cell - confirm
# whether the texts are meant to be written uncleaned.
from preprocessing import clean_text
import tempfile

print('Extract text from docs')
texts = [record['text'] for record in documents]

# The document records are not needed once their text has been extracted.
del documents
gc.collect()

# Write each text followed by a newline, UTF-8 encoded.
# delete=False keeps the file on disk after the handle is closed.
print('Save docs in temp file')
with tempfile.NamedTemporaryFile(delete=False) as temp_file:
    temp_file_name = temp_file.name
    for text in texts:
        encoded_line = (text + '\n').encode('utf-8')
        temp_file.write(encoded_line)

# The texts now live in the temporary file; release the in-memory copy.
print('Delete texts variable')
del texts
gc.collect()


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/vincentfiszbin/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/vincentfiszbin/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


어떤 방식으로 췌몽상과 비상천에서 빗자루를 타고 돌진하는 형태로 사용되는 것인가요
Extract text from docs
Save docs in temp file
Delete texts variable


0

In [4]:
# Train an unsupervised fastText CBOW model on the temporary text file, remove the
# temporary file, and save the trained model under models/.
import fasttext
import os
import gc
from multiprocessing import cpu_count

# Create the output directory up front: a missing directory would otherwise only be
# discovered by save_model, after the whole training run has already been paid for.
os.makedirs("models", exist_ok=True)

print('Start fasttext model training')
try:
    model = fasttext.train_unsupervised(temp_file_name, model = 'cbow', thread=cpu_count())
finally:
    # Remove the temporary file whether or not training succeeded, so a failed run
    # does not leave the dumped corpus behind. The file handle itself was already
    # closed when its `with` block exited, so only the file on disk needs removing.
    # The existence check keeps a re-run of this cell from masking the training error.
    if os.path.exists(temp_file_name):
        os.remove(temp_file_name)

# Save the trained model
# NOTE(review): no `dim` is passed to train_unsupervised, so the "dim100" in the name
# relies on the library default - confirm.
model_name = "fasttext_unsupervised_cbow_dim100_mini"
print(f'Save the trained model to models/{model_name}.bin')
model.save_model(f"models/{model_name}.bin")


print('Delete model variable')
del model
gc.collect()

Start fasttext model training


Read 100M words
Number of words:  657722
Number of labels: 0
Progress: 100.0% words/sec/thread:  112554 lr:  0.000000 avg.loss:  0.777781 ETA:   0h 0m 0s 41.1% words/sec/thread:  113381 lr:  0.029464 avg.loss:  1.012606 ETA:   0h 5m26s 112807 lr:  0.019040 avg.loss:  0.930815 ETA:   0h 3m32s 69.9% words/sec/thread:  112706 lr:  0.015071 avg.loss:  0.905647 ETA:   0h 2m48s


Save the trained model to models/fasttext_unsupervised_cbow_dim100_mini.bin
Delete model variable


0

# MLflow logging

In [None]:
import mlflow

# Use the name set by the training cell when it exists; otherwise fall back to a
# default so this cell can also be run on its own.
if "model_name" not in globals():
    model_name = "fasttext_unsupervised_cbow_dim100"

# Point MLflow at the tracking server and select the experiment to log under.
mlflow.set_tracking_uri(uri="http://127.0.0.1:5000")
mlflow.set_experiment("word_embedding")

# Descriptive parameters of the training setup, logged in this order.
run_params = (
    ("framework", "fasttext"),
    ("method", "unsupervised"),
    ("model", "cbow"),
    ("input", "50000docs"),
)

# One MLflow run per model, named after the model.
with mlflow.start_run(run_name=model_name):
    for param_name, param_value in run_params:
        mlflow.log_param(param_name, param_value)

    # Both values below are hard-coded literals; nothing in this cell computes them.
    mlflow.log_metric("dimension", 100)
    mlflow.log_metric("recall_at10_dev", 0.0967)