In [None]:
import pandas as pd
from datasets import load_dataset
from torch import torch
from transformers import AutoTokenizer, AutoModel
from tqdm import tqdm
import numpy as np
import google.generativeai as genai
from tqdm import tqdm
from IPython.display import display, Markdown

# Silence all four categories of NumPy floating-point warnings for this session
# (division by zero, overflow, underflow and invalid operations).
np.seterr(divide="ignore", over="ignore", under="ignore", invalid="ignore")

In [None]:
from datasets import load_dataset

# Load only the first 100 rows of the training split (the "train[:100]" slice)
# to keep the notebook fast; widen the slice to index more recipes.
recipes_ds = load_dataset("corbt/all-recipes", split="train[:100]")

# Convert the Hugging Face dataset to a pandas DataFrame.
df = recipes_ds.to_pandas()

# Preview the first 5 rows
print(df.head())

# The recipe title is whatever precedes the "Ingredients:" marker in "input"
# (assumes every row follows the "<title> Ingredients: ... Directions: ..."
# layout that preprocess_recipe_to_qa also relies on). Partitioning on the
# full marker instead of the fragment "Ing" keeps titles that themselves
# contain "Ing" intact; surrounding whitespace is stripped.
df["titles"] = df["input"].str.partition("Ingredients:")[0].str.strip()

# View the extracted titles next to the raw recipe text
print(df[["titles", "input"]].head())

In [110]:
def preprocess_recipe_to_qa(recipe_text):
    """Turn one raw recipe string into three (question, answer) tuples.

    The text is expected to look like
    ``<name> Ingredients: <ingredients> Directions: <directions>``.

    Args:
        recipe_text: Raw recipe string.

    Returns:
        A list of three ``(question, answer)`` tuples asking for the recipe
        name, its ingredients and its directions, in that order. A section
        whose marker is missing yields an empty-string answer; if
        "Ingredients:" is missing, the text before "Directions:" is used as
        the name.
    """
    # Text before the first "Directions:" holds the name and the ingredients;
    # everything after it (even if the marker appears again) is the directions.
    before_directions, _, directions = recipe_text.partition("Directions:")

    # Split the name from the ingredient list so the ingredients answer no
    # longer starts with the recipe name.
    recipe_name, _, ingredients = before_directions.partition("Ingredients:")

    recipe_name = recipe_name.strip()
    ingredients = ingredients.strip()
    directions = directions.strip()

    # Create a list of Q&A pairs
    qa_pairs = [
        ("What is the name of the recipe?", recipe_name),
        (f"What are the ingredients for {recipe_name} ?", ingredients),
        (f"What are the directions for {recipe_name} ?", directions),
    ]

    return qa_pairs

In [111]:
# Lift pandas' display limits so printed frames show complete cell contents
# and every row instead of being truncated.
for option_name in ("display.max_colwidth", "display.max_rows"):
    pd.set_option(option_name, None)

In [None]:
# Convert every raw recipe string into its list of (question, answer) tuples
# and store the result next to the original text.
df["qa_pairs"] = df["input"].apply(func=preprocess_recipe_to_qa)

# Show only the first row as a quick sanity check of the new column.
print(df.iloc[:1])

In [113]:
# Pick the compute device first: CUDA when PyTorch can see a GPU, otherwise the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Fetch the pretrained BERT tokenizer and encoder, placing the encoder on that device.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModel.from_pretrained("bert-base-uncased").to(device)


def get_embeddings(text):
    """Embed text with the notebook-level BERT ``model`` via mean pooling.

    Args:
        text: A string, or a list of strings, to embed. Inputs are tokenized
            with padding and truncation enabled.

    Returns:
        A NumPy array holding one pooled vector per input along the first
        axis; a single string therefore still yields a leading axis of
        length 1 (callers in this notebook index ``[0]``).
    """
    # Tokenize and move the tensors to the selected device (GPU or CPU).
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True).to(
        device
    )
    with torch.no_grad():
        outputs = model(**inputs)

    token_states = outputs.last_hidden_state
    # Weight each token by its attention mask so padding positions (present
    # when several texts of different length are batched) do not dilute the
    # average. For a single string the mask is all ones, so this equals the
    # plain mean over tokens.
    mask = inputs["attention_mask"].unsqueeze(-1).to(token_states.dtype)
    summed = (token_states * mask).sum(dim=1)
    token_counts = mask.sum(dim=1).clamp(min=1)
    embeddings = (summed / token_counts).cpu().numpy()
    return embeddings

In [None]:
# Embed every recipe's Q&A pairs: one inner list per recipe, one embedding per pair.
# Each pair is rendered as a single "Question: ... Answer: ..." string before embedding.
qa_embeddings = [
    [
        get_embeddings(f"Question: {question} Answer: {answer}")
        for question, answer in recipe_pairs
    ]
    for recipe_pairs in tqdm(df["qa_pairs"], desc="Generating Q&A Embeddings")
]

In [None]:
# Requires df with its "titles" and "qa_pairs" columns from the earlier cells.
# Embed each recipe title, one get_embeddings call per title, with a progress bar.
titles_list = df["titles"].tolist()
embeddings = list(
    map(get_embeddings, tqdm(titles_list, desc="Generating Title Embeddings"))
)

# Assemble the lookup table that is passed to get_relevant_docs later on.
# NOTE(review): "input" is filled from df["qa_pairs"] (not df["input"]), so it
# duplicates the "qa_pairs" column -- confirm this is intended.
embeddings_df = pd.DataFrame(
    data={
        "input": df["qa_pairs"],
        "titles": titles_list,
        "qa_pairs": df["qa_pairs"],
        "embeddings": embeddings,
    }
)

# Preview the first rows of the assembled table
print(embeddings_df.head())

In [116]:
import os

# Read the Gemini API key from the GOOGLE_API_KEY environment variable so the
# secret never has to be written into (and committed with) the notebook. The
# original placeholder is kept as a fallback for backward compatibility.
genai.configure(api_key=os.environ.get("GOOGLE_API_KEY") or "YOUR_API_KEY")


def get_relevant_docs(user_query, dataframe, top_n=3):
    """Return the ``input`` entries of the rows most similar to ``user_query``.

    The query is embedded with ``get_embeddings`` and compared, by cosine
    similarity, against each row's stored embedding.

    Args:
        user_query: Query text to embed.
        dataframe: Frame with an ``"embeddings"`` column (one vector per row,
            optionally wrapped in a leading axis of length 1) and an
            ``"input"`` column.
        top_n: Number of best-matching rows to return.

    Returns:
        A list with the ``"input"`` values of the ``top_n`` most similar
        rows, best match first. An empty frame yields ``[]``.

    Side effects:
        Writes a ``"similarity"`` column onto ``dataframe`` and prints the
        selected documents.
    """
    if dataframe.empty:
        return []

    # get_embeddings returns a batch-shaped array; flatten it to a 1-D vector
    # so the dot product below is a true scalar rather than a 1-element array.
    query_vector = np.asarray(get_embeddings(user_query)).reshape(-1)
    query_norm = np.linalg.norm(query_vector)

    def cosine_similarity(embedding):
        vector = np.asarray(embedding)
        if vector.ndim > 1:
            # Stored embeddings carry a leading batch axis; use the first row.
            vector = vector[0]
        denominator = query_norm * np.linalg.norm(vector)
        if denominator == 0:
            # Cosine similarity is undefined for a zero vector; rank it as unrelated.
            return 0.0
        return float(np.dot(query_vector, vector) / denominator)

    # Score the frame that was passed in (not a notebook-level global).
    dataframe["similarity"] = dataframe["embeddings"].apply(cosine_similarity)

    relevant_docs = dataframe.nlargest(top_n, "similarity")["input"].tolist()
    print(relevant_docs)

    return relevant_docs

In [124]:
def make_rag_prompt(query, relevant_passage):
    """Build the full prompt sent to the language model.

    Args:
        query: The user's question, inserted verbatim between single quotes.
        relevant_passage: Iterable of passages; each item is converted with
            ``str`` and the results are joined with single spaces.

    Returns:
        The instruction preamble followed by the QUESTION, PASSAGE and
        ANSWER sections as one string.
    """
    passage_text = " ".join(map(str, relevant_passage))

    # NOTE(review): the instruction text contains typos ("anwer", "Ingrediens")
    # that are reproduced verbatim here so the prompt stays unchanged.
    instructions = "".join(
        (
            "You are a helpful and informative recipe chatbot that answers questions using text from the reference passage included below.\n\n ",
            "Add some extra information to make your response more helpful and engaging. \n\n",
            "only anwer the questions with the topic of the recipes,ingredients, directions and cooking methods.\n\n ",
            "Maintain a friendly and conversational tone. If the passage is irrelevant, feel free to ignore it.\n\n",
            "Give the answer in a markdown format.\n\n",
            "If the answer contains Ingrediens, give them in a unordered list with a title format.\n\n",
        )
    )
    question_and_context = f"QUESTION: '{query}'\nPASSAGE: '{passage_text}'\n\n"
    return instructions + question_and_context + "ANSWER:"


def generate_response(user_prompt, model_name="gemini-1.5-flash"):
    """Send ``user_prompt`` to a Gemini model and return the reply text.

    Args:
        user_prompt: Complete prompt string to submit.
        model_name: Name of the generative model to use. Defaults to the
            model this notebook originally hard-coded.

    Returns:
        The ``text`` attribute of the generation response.
        NOTE(review): accessing ``.text`` may raise when the response holds
        no usable text (e.g. a blocked reply) -- confirm against the SDK docs.
    """
    # Named `llm` rather than `model` so it is not confused with the
    # notebook-level BERT `model` used for embeddings.
    llm = genai.GenerativeModel(model_name)
    answer = llm.generate_content(user_prompt)
    return answer.text


def generate_answer(query):
    """Answer ``query`` by retrieving passages and prompting the language model.

    Looks up the most relevant entries in the notebook-level ``embeddings_df``,
    builds the prompt from them and returns the model's reply text.
    """
    passages = get_relevant_docs(query, embeddings_df)
    return generate_response(make_rag_prompt(query, relevant_passage=passages))

In [None]:
# Run one question through the whole pipeline and render the reply as Markdown.
user_question = "Can you tell me how to buy a car?"
answer = generate_answer(user_question)
display(Markdown(answer))