In [None]:
!pip install -U weaviate-client

In [None]:
import pandas as pd
import numpy as np
import json

# Read the raw recipe dataset from the Kaggle input mount and preview it.
recipes_csv_path = "/kaggle/input/recipe-dataset-over-2m/recipes_data.csv"
df = pd.read_csv(recipes_csv_path)
df

# Cleaning

The **link**, **site**, **source**, and **NER** columns are not needed for our analysis or recommendations, so we drop them.

In [None]:
def _parse_json_list(value):
    """Decode one JSON-encoded list cell.

    Strings are parsed with ``json.loads``. Values that are already a list or
    tuple are returned unchanged, so re-running this cell is harmless. Anything
    else (e.g. a missing value) becomes an empty list, which the later
    "no ingredients" / "no steps" filter removes.
    """
    if isinstance(value, str):
        return json.loads(value)
    if isinstance(value, (list, tuple)):
        return value
    return []


# errors='ignore' keeps the cell re-runnable once the columns are already gone.
df = df.drop(columns=['link', 'site', 'source', 'NER'], errors='ignore')
df['title'] = df['title'].str.title()
df['ingredients'] = df['ingredients'].map(_parse_json_list)
df['directions'] = df['directions'].map(_parse_json_list)
df

In [None]:
def _is_non_empty(items):
    """Return True when the list-like cell holds at least one element."""
    return len(items) > 0


print("Before dropping", len(df))

# A recipe without a title is unusable, so discard those rows first.
df = df.dropna(subset=['title'])

# Then keep only recipes that have at least one ingredient and at least one step.
for list_column in ('ingredients', 'directions'):
    keep_mask = df[list_column].apply(_is_non_empty)
    df = df[keep_mask]

print("After dropping", len(df))

In [None]:

# Lists are unhashable, so turn the list columns into tuples before
# de-duplicating on them. assign() builds a new frame instead of writing into
# the filtered slice, which avoids pandas' SettingWithCopyWarning.
df = df.assign(
    ingredients=df['ingredients'].apply(tuple),
    directions=df['directions'].apply(tuple),
)

# remove all duplicate rows based on title
df = df.drop_duplicates(subset=['title'])

# remove all duplicate rows based on (ingredients, directions) together
df = df.drop_duplicates(subset=['ingredients', 'directions'])
df.nunique()

The direction steps are joined with newlines into a single `directions_text` field, which is the property that will be vectorized.

In [None]:
# Join each recipe's steps into one newline-separated string. assign() returns
# a new frame rather than writing into the result of drop_duplicates, which
# avoids pandas' SettingWithCopyWarning.
df = df.assign(directions_text=df['directions'].apply(lambda item: '\n'.join(item)))

Because of computational constraints, only 1,000 randomly sampled rows are uploaded to Weaviate for this demonstration.

In [None]:
# Size of the demo subset and the seed that makes the draw repeatable.
SAMPLE_SIZE = 1000
SAMPLE_SEED = 42

sampled_df = df.sample(n=SAMPLE_SIZE, random_state=SAMPLE_SEED)

sampled_df

# Connect to Weaviate Cluster

In [None]:
import weaviate
import os

# kaggle's secret managing module
from kaggle_secrets import UserSecretsClient
import weaviate.classes as wvc

user_secrets = UserSecretsClient()


def _open_cloud_client(secrets):
    """Build a Weaviate Cloud client from credentials held in Kaggle secrets.

    Reads the cluster URL, the Weaviate API key and the Cohere API key from
    ``secrets`` and passes them to ``weaviate.connect_to_wcs``.
    """
    return weaviate.connect_to_wcs(
        cluster_url=secrets.get_secret("WCS_URL"),
        auth_credentials=weaviate.auth.AuthApiKey(secrets.get_secret("WCS_API_KEY")),
        headers={"X-Cohere-Api-Key": secrets.get_secret("COHERE_APIKEY")},
    )


# The context manager closes the connection when the block exits; the later
# cells call client.connect() before using the client again.
with _open_cloud_client(user_secrets) as client:
    print(client.is_ready())

# Creating the Recipe Collection
We configure the Cohere text2vec vectorizer to embed the `directions_text` property of each recipe.

In [None]:
from weaviate.classes.config import Configure

client.connect()

# Creating a collection that already exists raises, so reuse the existing
# "Recipe" collection when this cell is re-run.
if client.collections.exists("Recipe"):
    recipes = client.collections.get("Recipe")
else:
    recipes = client.collections.create(
        "Recipe",
        vectorizer_config=[
            # One named vector, built by Cohere from the joined directions text.
            Configure.NamedVectors.text2vec_cohere(
                name="directions_vector",
                source_properties=["directions_text"],
            )
        ],
    )

# Preparing data to upload to Weaviate

In [None]:
import weaviate.classes as wvc

# Build one DataObject per sampled recipe. "ingredients" and "directions" were
# converted to tuples for de-duplication; turn them back into lists, the Python
# type used for Weaviate array properties.
recipe_objs = [
    wvc.data.DataObject(
        properties={
            "title": row["title"],
            "ingredients": list(row["ingredients"]),
            "directions": list(row["directions"]),
            "directions_text": row["directions_text"],
        },
    )
    for _, row in sampled_df.iterrows()
]

In [None]:
client.connect()
Recipe = client.collections.get("Recipe")

# Keep the batch result so per-object failures are surfaced rather than
# silently ignored.
insert_result = Recipe.data.insert_many(recipe_objs)
if insert_result.has_errors:
    print(f"{len(insert_result.errors)} of {len(recipe_objs)} objects failed to insert")
    # Show only the first few failures to keep the cell output readable.
    for position, error in list(insert_result.errors.items())[:5]:
        print(f"  object {position}: {error.message}")
insert_result