In [None]:
from tqdm import tqdm
import glob
import os
import sys
import argparse
import numpy as np
from sentence_transformers import SentenceTransformer

from utils import (
    SBERT_NAME,
    count_nb_files,
    preprocess,
    existing_dir_path,
    create_dir,
    load_embeddings,
    DEFAULT_SAVE_SIZE,
    EMB_DIMENSION
)


# Number of documents handed to the encoder per call.
batch_size = 10


class Args:
    """Plain container for the run parameters used by this notebook.

    Attributes:
        input_path: Location of the input data, stored as given.
        output_folder: Folder that receives the output files, stored as given.
        save_size: Number of embedding rows written per chunk file.

    Raises:
        ValueError: If ``save_size`` is not strictly positive.
    """

    def __init__(self, input_path, output_folder, save_size=100):
        # A non-positive chunk size would break the chunking arithmetic
        # (modulo / integer division by save_size) downstream.
        if save_size <= 0:
            raise ValueError("save_size must be a positive integer")
        self.input_path = input_path
        self.output_folder = output_folder
        self.save_size = save_size

In [None]:
# Input data location and output folder for this run.
args = Args(
    input_path="medialab_data/tweets_from_deputesXVI_220617-230717",
    output_folder="data_prod/embeddings/deputes",
)

In [None]:
# Materialise every item yielded by preprocess() into a single NumPy array.
docs = np.array(
    list(
        preprocess(
            args.input_path,
            count_nb_files(args.input_path),
            apply_unidecode=True,
        )
    )
)

In [None]:
# Index of the first document to encode; 0 means start from the beginning.
max_index = 0
# One zero-initialised row per document, EMB_DIMENSION columns; rows are
# filled in batch by batch by the encoding loop.
embeddings = np.zeros(shape=(len(docs), EMB_DIMENSION))

In [None]:
# Base output file; the per-chunk file names are derived from this path.
SAVE_PATH = os.path.join(args.output_folder, "tweets_sentence-camembert-large.npz")

# np.savez_compressed does not create missing parent directories, so make sure
# the output folder exists up front instead of failing at the first save.
if args.output_folder:
    os.makedirs(args.output_folder, exist_ok=True)

# Sentence encoder identified by SBERT_NAME.
embedding_model = SentenceTransformer(SBERT_NAME)

In [None]:
def format_output(size):
    """Return SAVE_PATH with ``_<size>`` inserted just before its extension.

    Args:
        size: Value appended to the file stem; converted with ``str``.

    Returns:
        The derived path as a string, e.g. ``dir/name.npz`` -> ``dir/name_100.npz``.
    """
    # Split off only the trailing extension so that a ".npz" appearing
    # elsewhere in the path (e.g. in a directory name) is left untouched.
    stem, extension = os.path.splitext(SAVE_PATH)
    return stem + "_" + str(size) + extension

In [None]:
# Encode the documents batch by batch and write the embeddings to disk in
# chunks of args.save_size rows. Each chunk file is named after the index of
# the row just past its end (see format_output).
total_docs = len(docs)
chunk_size = args.save_size

# End index of the next complete chunk to write: the first multiple of
# chunk_size that is >= max_index (and at least one full chunk).
next_chunk_end = max(chunk_size, -(-max_index // chunk_size) * chunk_size)

for start in tqdm(
    range(max_index, total_docs, batch_size),
    desc="Encode sentences using CamemBERT large",
):
    stop = min(total_docs, start + batch_size)
    embeddings[start:stop] = embedding_model.encode(docs[start:stop])

    # Flush every complete chunk that is now fully encoded. Tracking the
    # boundary explicitly (instead of testing `start % chunk_size == 0`) keeps
    # this correct even when chunk_size is not a multiple of batch_size.
    # The chunk ending exactly at total_docs is left to the final save below.
    while next_chunk_end <= stop and next_chunk_end < total_docs:
        np.savez_compressed(
            format_output(next_chunk_end),
            embeddings=embeddings[next_chunk_end - chunk_size : next_chunk_end],
        )
        next_chunk_end += chunk_size

# Final chunk: everything after the last complete chunk boundary that lies
# strictly before total_docs. When total_docs is an exact multiple of
# chunk_size this is a full chunk (previously an empty array was written and
# the last chunk_size rows were never saved).
last_chunk_start = ((total_docs - 1) // chunk_size) * chunk_size if total_docs else 0
np.savez_compressed(
    format_output(total_docs),
    embeddings=embeddings[last_chunk_start:],
)