## Step 1. Load the dataset and generate embeddings

In [None]:
!pip install datasets


In [None]:
from datasets import load_dataset
import pandas as pd
import ast

# Fetch the recipe corpus; download_mode="force_redownload" asks the loader
# to download again on every run instead of reusing a cached copy.
dataset = load_dataset(
    "rogozinushka/povarenok-recipes",
    download_mode="force_redownload",
)
# Materialize the 'train' split as a pandas DataFrame.
df = pd.DataFrame(data=dataset['train'])
def extract_keys(dict_str):
    """Return the lower-cased keys of a dict given as a Python-literal string.

    Args:
        dict_str: Text holding a dict literal, e.g. "{'Salt': '1 tsp'}".

    Returns:
        list: The dict's keys, lower-cased, in the order they appear in the
        literal. An empty list when the text cannot be parsed, does not
        evaluate to a dict, or has keys without a ``lower`` method.
    """
    try:
        ingredients = ast.literal_eval(dict_str)
        return [key.lower() for key in ingredients.keys()]
    except Exception:
        # Deliberately best-effort: a malformed row yields no ingredients.
        # Catching Exception instead of a bare `except:` lets
        # KeyboardInterrupt / SystemExit propagate, so a long `.apply` over
        # the whole dataset can still be interrupted.
        return []
# Swap the raw 'ingredients' column for the parsed list of ingredient names.
ingredient_names = df['ingredients'].apply(extract_keys)
df['ingredient_names'] = ingredient_names
df = df.drop(columns=['ingredients'])

In [None]:
from transformers import AutoTokenizer, AutoModel
import torch

# Load the pre-trained RuBERT tokenizer and encoder from the same checkpoint.
RUBERT_CHECKPOINT = "DeepPavlov/rubert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(RUBERT_CHECKPOINT)
model = AutoModel.from_pretrained(RUBERT_CHECKPOINT)

def get_rubert_embedding(word_list, max_length=128):
    """Embed a list of words as one vector taken from the encoder output.

    The words are passed to the tokenizer as a single pre-split sequence
    (``is_split_into_words=True``), truncated to ``max_length`` tokens, and
    the hidden state at sequence position 0 of the last layer is returned.

    Args:
        word_list: List of words forming one input sequence.
        max_length: Maximum number of tokens kept by the tokenizer.

    Returns:
        numpy.ndarray: The position-0 hidden state with singleton dimensions
        squeezed out.
    """
    inputs = tokenizer(
        word_list,
        return_tensors='pt',
        is_split_into_words=True,
        padding=True,
        truncation=True,
        max_length=max_length,
    )
    # Inference only: disable autograd so no computation graph is built and
    # held for every row of the dataset.
    with torch.no_grad():
        outputs = model(**inputs)
    cls_embedding = outputs.last_hidden_state[:, 0, :].squeeze().detach().numpy()
    return cls_embedding

# `progress_apply` only exists on pandas objects after tqdm registers it;
# without this call a fresh kernel raises AttributeError on the next line.
from tqdm.auto import tqdm
tqdm.pandas()

df['ingredient_embedding'] = df['ingredient_names'].progress_apply(get_rubert_embedding)

# The array column is written to CSV as text; convert_embedding in Step 2
# parses that text form back into arrays.
df.to_csv('recipes_with_embeddings.csv', index=False)

## Step 2. Slice a 50k sample from the original dataset

In [None]:
# Check how the embedding under index label 0 is stored (string vs. array).
first_embedding = df['ingredient_embedding'][0]
print(type(first_embedding))

Convert embeddings to arrays

In [None]:
import numpy as np
import re
def convert_embedding(embedding_str):
    """Turn a stored embedding into a 1-D float NumPy array.

    Args:
        embedding_str: Either the text form of an array (numbers separated
            by whitespace, optionally wrapped in square brackets and spread
            over several lines), or an already-numeric sequence / ndarray.

    Returns:
        numpy.ndarray: 1-D float64 array holding the values.
    """
    if not isinstance(embedding_str, str):
        # Already numeric, e.g. the column was never round-tripped through
        # CSV. Calling str methods on it would raise AttributeError.
        return np.asarray(embedding_str, dtype=float).ravel()
    # Drop the brackets and collapse any whitespace run (including the line
    # breaks of a multi-line array repr) to single spaces before parsing.
    cleaned_str = re.sub(r'\s+', ' ', embedding_str.replace('[', '').replace(']', '').strip())
    return np.fromstring(cleaned_str, sep=' ')

# Parse every stored embedding into a NumPy array, then preview the column.
parsed_embeddings = df['ingredient_embedding'].apply(convert_embedding)
df['ingredient_embedding'] = parsed_embeddings
print(df['ingredient_embedding'].head())

Use stratified sampling to preserve variance

In [None]:
# Bin rows by the mean of their embedding (up to 50 quantile bins) so the
# sample can be stratified on that value.
df['embedding_mean'] = df['ingredient_embedding'].apply(np.mean)
df['embedding_bin'] = pd.qcut(df['embedding_mean'], q=50, duplicates='drop')

# Target about 50k rows. Clamp to 1.0 because `sample(frac=...)` without
# replacement raises ValueError for frac > 1 (dataset smaller than target).
sampling_fraction = min(1.0, 50000 / len(df))

# Same seed inside every bin keeps the draw reproducible. observed=True
# groups only on bins that actually occur and avoids the pandas
# FutureWarning about the default for categorical groupers.
sample = df.groupby('embedding_bin', group_keys=False, observed=True).apply(
    lambda x: x.sample(frac=sampling_fraction, random_state=42)
)

print(sample.head())
print(sample.shape)

In [None]:
# Stack the per-row vectors into matrices and compare the average
# per-dimension variance of the full data against the sample.
full_matrix = np.vstack(df['ingredient_embedding'])
sample_matrix = np.vstack(sample['ingredient_embedding'])

original_variances = full_matrix.var(axis=0)
sampled_variances = sample_matrix.var(axis=0)
mean_original_variances = original_variances.mean()
mean_sampled_variances = sampled_variances.mean()

print("Mean variance of the original dataset:", mean_original_variances)
print("Mean variance of the sampled dataset:", mean_sampled_variances)

# The helper columns were only needed for stratification; drop them before saving.
sample = sample.drop(columns=['embedding_mean', 'embedding_bin'])
sample.to_csv('sampled_df.csv', index=False)

## Step 3. Shrink the embeddings

In [None]:
# Length of the first sampled embedding vector.
first_sample_vector = sample['ingredient_embedding'].iloc[0]
embedding_size = len(first_sample_vector)
print(embedding_size)

Run PCA to find the optimal number of dimensions

In [None]:
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
# One row per sampled recipe, one column per embedding dimension.
pooled_embeddings = np.stack(sample['ingredient_embedding'].to_numpy())

# n_components=None leaves the number of components up to scikit-learn.
pca = PCA(n_components=None).fit(pooled_embeddings)
eigen_vals = pca.explained_variance_

# Scree plot: explained variance of each component, in component order.
plt.style.use("ggplot")
plt.plot(eigen_vals, marker='o', color='blue')
plt.title("Scree Plot")
plt.xlabel("Eigenvalue number")
plt.ylabel("Eigenvalue size")
plt.show()

In [None]:
# Per-component and cumulative explained variance, in percent.
exp_var = pca.explained_variance_ratio_ * 100
cum_exp_var = np.cumsum(exp_var)

# Derive the x-axis from the fitted PCA instead of hard-coding 769, so the
# plot still works if the number of components changes.
a = len(exp_var) + 1
component_idx = range(1, a)

plt.bar(component_idx, exp_var, align='center',
        label='Individual explained variance')

plt.step(component_idx, cum_exp_var, where='mid',
         label='Cumulative explained variance', color='red')

plt.ylabel('Explained variance percentage')
plt.xlabel('Principal component index')
# One tick per component is unreadable with hundreds of components;
# label roughly ten evenly spaced ones instead.
tick_step = max(1, len(exp_var) // 10)
plt.xticks(ticks=list(range(1, a, tick_step)))
plt.legend(loc='best')
plt.tight_layout()
plt.show()

# np.argmax on the boolean mask gives the first index at which the
# cumulative share reaches 85 %; +1 converts the index to a count.
nc = np.argmax(cum_exp_var >= 85) + 1
print(f"Dimensions to keep 85% variance: {nc}")

Shrink embeddings to 70 dimensions per vector

In [None]:
import joblib
# Fit a 70-component PCA on the sampled embeddings and project them.
pca = PCA(n_components=70)
reduced_embeddings = pca.fit_transform(pooled_embeddings)

# Persist the fitted PCA to disk so it can be loaded again later.
joblib_file = "pca_model.pkl"
joblib.dump(pca, joblib_file)

# Same rows as `sample`, with the full-size vectors replaced by reduced ones.
df_reduced = sample.drop(columns=['ingredient_embedding']).copy()
df_reduced['ingredient_embedding'] = [vector for vector in reduced_embeddings]

df_reduced.to_csv('reduced_sample.csv', index=False)

In [None]:
# Length of the first reduced embedding vector.
first_reduced_vector = df_reduced['ingredient_embedding'].iloc[0]
embedding_size = len(first_reduced_vector)
print(embedding_size)

## Step 4. Upload to Qdrant database

In [None]:
from qdrant_client import QdrantClient
from qdrant_client.http import models

import os

# Read the Qdrant credentials from the environment so real secrets never have
# to be written into the notebook; the 'hidden' placeholders stay as fallback.
api_key = os.environ.get('QDRANT_API_KEY', 'hidden')
url = os.environ.get('QDRANT_URL', 'hidden')
client = QdrantClient(api_key=api_key, url=url)

In [None]:
collection_name = "recipes_reduced"

# Collection of 70-dimensional vectors compared by cosine distance.
vector_params = models.VectorParams(size=70, distance=models.Distance.COSINE)
client.create_collection(
    collection_name=collection_name,
    vectors_config=vector_params,
)

# One vector and one payload record (url, name, ingredient names) per recipe.
payload_columns = ['url', 'name', 'ingredient_names']
vectors = df_reduced['ingredient_embedding'].tolist()
payload = df_reduced[payload_columns].to_dict('records')

client.upload_collection(
    collection_name=collection_name,
    vectors=vectors,
    payload=payload,
    ids=None,
)

## Step 5. Test search process

From the previous parts we have two lists of words, which we want to embed and use as search vectors against the database.

In [None]:
import torch
from sklearn.decomposition import PCA
import numpy as np
from sklearn.decomposition import TruncatedSVD
import joblib

# Tokenizer and encoder come from the same RuBERT checkpoint.
RUBERT_CHECKPOINT = "DeepPavlov/rubert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(RUBERT_CHECKPOINT)
model = AutoModel.from_pretrained(RUBERT_CHECKPOINT)


# Two word lists used as search queries below.
raw_prediction = ['лимон', 'куриное филе', 'кефир']
raw_synonyms = ['апельсин', 'гусь', 'йогурт']

def get_rubert_embedding(word_list, max_length=128):
    """Embed a list of words as one vector taken from the encoder output.

    The words are joined with single spaces into one string, tokenized
    (truncated to ``max_length`` tokens), and the hidden state at sequence
    position 0 of the last layer is returned.

    Args:
        word_list: List of words forming one query.
        max_length: Maximum number of tokens kept by the tokenizer.

    Returns:
        numpy.ndarray: The position-0 hidden state with singleton dimensions
        squeezed out.
    """
    # NOTE(review): the Step 1 definition of this function passes the list
    # with is_split_into_words=True instead of joining it. Confirm both
    # paths tokenize identically, otherwise query vectors and indexed
    # vectors are not comparable.
    inputs = tokenizer(
        ' '.join(word_list),
        return_tensors='pt',
        padding=True,
        truncation=True,
        max_length=max_length,
    )
    # Inference only: disable autograd so no computation graph is built.
    with torch.no_grad():
        outputs = model(**inputs)
    cls_embedding = outputs.last_hidden_state[:, 0, :].squeeze().detach().numpy()
    return cls_embedding

# Embed both word lists and reshape each vector to a single-row 2-D array,
# the form passed to pca.transform below.
prediction_embedding = get_rubert_embedding(raw_prediction).reshape(1, -1)
synonyms_embedding = get_rubert_embedding(raw_synonyms).reshape(1, -1)

# Load the PCA saved as pca_model.pkl.
# NOTE(review): joblib.load unpickles the file; only load files you trust.
pca = joblib.load("pca_model.pkl")

# Project to 70 dimensions and flatten back to 1-D query vectors.
prediction_embedding_70d = pca.transform(prediction_embedding).reshape(70)
synonyms_embedding_70d = pca.transform(synonyms_embedding).reshape(70)

Find and display the top-5 closest matches

In [None]:
def perform_search(collection_name, query_vector, limit=5):
    """Run a vector search against a Qdrant collection via the global client.

    Args:
        collection_name: Name of the collection to query.
        query_vector: Vector to search with.
        limit: Maximum number of hits requested (default 5).

    Returns:
        Whatever ``client.search`` returns for the request; payloads are
        requested with ``with_payload=True``.
    """
    return client.search(
        collection_name=collection_name,
        query_vector=query_vector,
        limit=limit,
        with_payload=True,
    )

collection_name_70 = "recipes_reduced"

# Each entry: (text printed before the word list, results heading,
#              query words, reduced query vector).
search_queries = [
    ("INITIAL", "\nSearch results for INITIAL :",
     raw_prediction, prediction_embedding_70d),
    ("\n\nSYNONYMS", "\nSearch results for SYNONYMS :",
     raw_synonyms, synonyms_embedding_70d),
]

for header, results_heading, words, query_vector in search_queries:
    print(header, words)
    search_results_70 = perform_search(collection_name_70, query_vector)
    print(results_heading)
    for result in search_results_70:
        print(result.payload)
