In [7]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm

In [3]:
# Reference corpus: every row pairs an 'Instruction' prompt with its 'Response' text.
df = pd.read_csv(filepath_or_buffer='../data/processed/inst-resp.csv')

In [24]:
def keyword_search(user_input, df, threshold=0.7):
    """
    Return the response of the first recipe whose match score reaches the threshold.

    Rows are scanned in DataFrame order. For every row whose 'Instruction' and
    'Response' are both non-null, the match score is

        calculate_similarity(user_input, instruction)
        + calculate_similarity(user_input, response)

    The score is the SUM of two similarity values, not their average, and
    ``threshold`` is compared against that sum. The scan stops at the first
    row that reaches the threshold, which is not necessarily the
    highest-scoring row in the DataFrame.

    Args:
        user_input (str): The user's input.
        df (DataFrame): The DataFrame containing recipes. Must have
            'Instruction' and 'Response' columns.
        threshold (float): The minimum match score required. Default is 0.7.

    Returns:
        str or None: The response text of the first matching recipe, or None
        if no row matches or if ``user_input`` is missing (None/NaN),
        not a string, or blank.
    """
    # A missing or blank query has nothing to compare against; return "no match"
    # up front instead of failing inside the vectorizer or scanning every row.
    if not isinstance(user_input, str) or not user_input.strip():
        return None

    # Walk the two columns in lockstep; this avoids the per-row Series that
    # iterrows() builds while visiting rows in the same order.
    for instruction, response in zip(df['Instruction'], df['Response']):
        # Skip rows with a missing instruction or response
        if pd.isnull(instruction) or pd.isnull(response):
            continue

        # Combined similarity of the query to both text fields of the row
        match_score = (
            calculate_similarity(user_input, instruction)
            + calculate_similarity(user_input, response)
        )

        # First row that clears the threshold wins
        if match_score >= threshold:
            return response

    return None  # No row reached the threshold

def calculate_similarity(input_text, target_text):
    """
    Calculate the cosine similarity between two text strings using TF-IDF.

    A fresh TfidfVectorizer is fitted on just these two texts, so the
    vocabulary and IDF weights come from this pair alone.

    Args:
        input_text (str): The first text string.
        target_text (str): The second text string.

    Returns:
        float: The cosine similarity score between the two text strings.
        0.0 is returned when neither text produces a token (for example both
        are empty or contain only punctuation), since there is nothing in
        common to measure.

    Raises:
        ValueError: Re-raised from the vectorizer for any failure other than
            the empty-vocabulary case handled above.
    """
    # Initialize TF-IDF vectorizer
    tfidf_vectorizer = TfidfVectorizer()

    # Fit and transform the vectorizer on the input and target text
    try:
        tfidf_matrix = tfidf_vectorizer.fit_transform([input_text, target_text])
    except ValueError as err:
        # scikit-learn raises "empty vocabulary; ..." when no token survives
        # tokenisation in either document. Treat that as "no similarity"
        # rather than aborting the caller; anything else is a real error.
        if "empty vocabulary" in str(err):
            return 0.0
        raise

    # Calculate cosine similarity between the two document vectors
    similarity_matrix = cosine_similarity(tfidf_matrix[0], tfidf_matrix[1])

    # The result is a 1x1 matrix; unwrap it to a plain float
    return float(similarity_matrix[0][0])

In [25]:
# Held-out prompts to run retrieval on; preview the first two rows.
test_df = pd.read_csv(filepath_or_buffer='../data/processed/test_data.csv')
test_df.head(n=2)

Unnamed: 0,Instruction,Response
0,"Tags: ['60-minutes-or-less', 'time-to-make', '...",Name: french toast with a crunchy topping Minu...
1,"Tags: ['weeknight', 'time-to-make', 'course', ...",Name: almost authentic cincinnati chili Minute...


In [28]:

# Retrieve one response per test prompt; prompts without a match contribute "".
generated_texts_tfidf = []
for prompt in tqdm(test_df['Instruction'].to_list(), desc="Generating Recipes"):
    relevant_recipes = keyword_search(prompt, df)
    if not relevant_recipes:
        print("No relevant recipes found.")
    # Keep the list aligned with test_df rows: always append exactly one entry
    generated_texts_tfidf.append(relevant_recipes or "")

Generating Recipes:   0%|          | 0/100 [00:00<?, ?it/s]

Generating Recipes: 100%|██████████| 100/100 [00:50<00:00,  2.00it/s]


In [29]:
# Store the retrieved responses (one entry per test prompt, "" where nothing matched) as a new column.
test_df['TF_IDF_GeneratedRecipe'] = generated_texts_tfidf

In [44]:
# Spot-check the first two rows now that the retrieval column has been added.
test_df.head(n=2)

Unnamed: 0,Instruction,Response,TF_IDF_GeneratedRecipe
0,"Tags: ['60-minutes-or-less', 'time-to-make', '...",Name: french toast with a crunchy topping Minu...,Name: apple a day milk shake Minutes: 0 Ingre...
1,"Tags: ['weeknight', 'time-to-make', 'course', ...",Name: almost authentic cincinnati chili Minute...,Name: forgotten minestrone Minutes: 495 Ingre...


In [45]:
# Write the augmented test set back to disk. index=False keeps the row index out of
# the file: this notebook reloads the same CSV with a plain pd.read_csv(...), so a
# written index would come back as an extra 'Unnamed: 0'-style column on every run.
test_df.to_csv('../data/processed/test_data.csv', index=False)

The test_data CSV has been updated with the TF-IDF retrieved recipes so that it can be used for inference.