In [1]:
import pandas as pd
import random
import numpy as np
import torch
import transformers
import sentence_transformers
from transformers import AutoModel, AutoTokenizer
from transformer_helper import get_token_counts, get_word_counts, get_embeddings
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [2]:
def create_sample_dataframe(n_samples, id_qis, sentences_per_id_qi):
    # Ensure the inputs are valid
    if not isinstance(n_samples, int) or n_samples <= 0:
        raise ValueError("n_samples must be a positive integer.")
    if not isinstance(id_qis, list) or not id_qis:
        raise ValueError("id_qis must be a non-empty list.")
    if not isinstance(sentences_per_id_qi, dict) or not sentences_per_id_qi:
        raise ValueError("sentences_per_id_qi must be a non-empty dictionary.")
    if not all(k in sentences_per_id_qi for k in id_qis):
        raise ValueError("All id_qis must have corresponding sentences in sentences_per_id_qi.")
    sampled_id_qis = random.choices(id_qis, k=n_samples)
    sampled_sentences = [
        random.choice(sentences_per_id_qi[id_qi]) for id_qi in sampled_id_qis
    ]
    df = pd.DataFrame({
        'StoryBody': sampled_sentences,
        'id_qis': sampled_id_qis
    })
    return df

def select_first_n_occurences(df: pd.DataFrame, col='StoryBody', n=7):
    df["rpr"] = np.arange(len(df))
    result_df = df.groupby(col).head(n)
    return result_df.drop('rpr', axis=1), result_df["rpr"]

In [3]:
np.random.seed(30)
random.seed(30)
col = 'StoryBody'
id_qis = ["id1", "id2", "id3"]
sentences_per_id_qi = {
    "id1": ["This stock is overrated.", "The CEO is completely crazy.", "What a surprising deal."],
    "id2": ["I would like to buy their items.", "I love this firm.", "This is quite rare."],
    "id3": ["This has so much potential.", "I doubt they will make it.", "This new journey seems interesting."]
}
n_samples = 20
df = create_sample_dataframe(n_samples, id_qis, sentences_per_id_qi)
print(f"{df.shape[0]} rows")
display(df)
df, _ = select_first_n_occurences(df, n=2)
print(f"{df.shape[0]} rows")
display(df)

20 rows


Unnamed: 0,StoryBody,id_qis
0,This is quite rare.,id2
1,This stock is overrated.,id1
2,The CEO is completely crazy.,id1
3,I would like to buy their items.,id2
4,This stock is overrated.,id1
5,What a surprising deal.,id1
6,This is quite rare.,id2
7,I love this firm.,id2
8,I doubt they will make it.,id3
9,This is quite rare.,id2


16 rows


Unnamed: 0,StoryBody,id_qis
0,This is quite rare.,id2
1,This stock is overrated.,id1
2,The CEO is completely crazy.,id1
3,I would like to buy their items.,id2
4,This stock is overrated.,id1
5,What a surprising deal.,id1
6,This is quite rare.,id2
7,I love this firm.,id2
8,I doubt they will make it.,id3
10,This has so much potential.,id3


In [4]:
model_path = 'Alibaba-NLP/gte-base-en-v1.5'
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModel.from_pretrained(model_path, trust_remote_code=True).to('cuda')

In [18]:
token_counts = {}
embeddings = {}
cctnd_contents = {}
for id_qi in df["id_qis"].unique():
    df_id_qi = df[df["id_qis"]==id_qi]
    cctnd_content = 1000 * " ".join(df_id_qi[col].to_list())
    token_counts[id_qi] = get_token_counts([cctnd_content], tokenizer)
    embeddings[id_qi] = get_embeddings([cctnd_content], model, tokenizer, device).cpu()
    print(f"{token_counts[id_qi]} {cctnd_content}")
    cctnd_contents[id_qi] = cctnd_content

[31002] This is quite rare. I would like to buy their items. This is quite rare. I love this firm. I would like to buy their items.This is quite rare. I would like to buy their items. This is quite rare. I love this firm. I would like to buy their items.This is quite rare. I would like to buy their items. This is quite rare. I love this firm. I would like to buy their items.This is quite rare. I would like to buy their items. This is quite rare. I love this firm. I would like to buy their items.This is quite rare. I would like to buy their items. This is quite rare. I love this firm. I would like to buy their items.This is quite rare. I would like to buy their items. This is quite rare. I love this firm. I would like to buy their items.This is quite rare. I would like to buy their items. This is quite rare. I love this firm. I would like to buy their items.This is quite rare. I would like to buy their items. This is quite rare. I love this firm. I would like to buy their items.This is 

In [13]:
cctnd_contents_li = list(cctnd_contents.values())
e_li = get_embeddings(cctnd_contents_li, model, tokenizer, device)

In [19]:
np.vstack(list(embeddings.values()))
e_li.shape

torch.Size([3, 768])

In [20]:
np.vstack(list(embeddings.values()))

array([[-0.0147725 , -0.04573196,  0.03839938, ..., -0.00955039,
         0.0063269 ,  0.00166839],
       [-0.00395129,  0.00766447, -0.00290058, ..., -0.03306467,
        -0.0170429 ,  0.00706779],
       [-0.01432395, -0.04885207, -0.04992025, ...,  0.00134706,
         0.00494507,  0.02857619]], dtype=float32)

In [24]:
(np.array(e_li.cpu()) == np.vstack(list(embeddings.values()))).all()

True