<a href="https://colab.research.google.com/github/milanimcgraw/Nutrition-Facts-Chat-Assistant/blob/main/nutritionfactschatassistant_csv.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Nutrition Facts Chat Assistant (CSV csv content)**

>The Nutrition Facts Chat Assistant is an advanced, AI-powered application that leverages Retrieval-Augmented Generation (RAG) to answer questions and provide accurate and contextual nutritional information. This project demonstrates the practical application of cutting-edge natural language processing techniques in the field of nutrition and dietary information. The USDA FoodData Central dataset was used for comprehensive nutritional information.

## Install dependencies

In [None]:
!pip install pandas numpy sentence-transformers faiss-cpu openai scikit-learn requests plotly ipywidgets streamlit



## Load libraries and setup

In [None]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
import faiss
import openai
from sklearn.metrics.pairwise import cosine_similarity
import ipywidgets as widgets
from IPython.display import display, clear_output
import os
import requests
from io import StringIO
import plotly.express as px
from datetime import datetime

# Load the OpenAI API key from the environment variables
openai.api_key = os.environ.get('OPENAI_API_KEY')

#Data Ingestion

In [None]:
# Base URL for the GitHub repository where CSV files are hosted
base_url = 'https://raw.githubusercontent.com/milanimcgraw/Nutrition-Facts-Chat-Assistant/main/FoodData_Central_foundation_food_csv_2024-04-18'

# List of all CSV file names in the GitHub folder
csv_files = [
    'acquisition_samples.csv',
    'agricultural_samples.csv',
    'food_attribute_type.csv',
    'food_attribute.csv',
    'food_calorie_conversion_factor.csv',
    'food_category.csv',
    'food_component.csv',
    'food_nutrient_conversion_factor.csv',
    'food_nutrient.csv',
    'food_portion.csv',
    'food_protein_conversion_factor.csv',
    'food_update_log_entry.csv',
    'food.csv',
    'foundation_food.csv',
    'input_food.csv',
    'lab_method_code.csv',
    'lab_method_nutrient.csv',
    'lab_method.csv',
    'market_acquisition.csv',
    'measure_unit.csv',
    'nutrient.csv',
    'sample_food.csv',
    'sub_sample_food.csv',
    'sub_sample_result.csv'
]

# Initialize a dictionary to hold DataFrames
dataframes = {}

# Load each CSV file into a DataFrame
for file in csv_files:
    file_url = f"{base_url}/{file}"  # Use the raw content URL
    response = requests.get(file_url)
    if response.status_code == 200:
        csv_content = StringIO(response.text)
        # Load CSV data into DataFrame
        df = pd.read_csv(csv_content)
        # Add a 'source_file' column to keep track of the source file
        df['source_file'] = file
        dataframes[file] = df
    else:
        print(f"Failed to fetch {file_url}")

# Now you can check the columns of each DataFrame
for file_name, df in dataframes.items():
    print(f"Columns in {file_name}: {df.columns.tolist()}")

# Example of filling missing values and creating a combined text column
combined_df = pd.DataFrame()  # Initialize an empty DataFrame

for file_name, df in dataframes.items():
    df.fillna('Unknown', inplace=True)  # Fill missing values

    # Example of constructing a 'text' column dynamically
    df['text'] = df.apply(lambda row: ' '.join([
        f"{col}: {row[col]}" for col in df.columns if pd.notna(row[col])
    ]), axis=1)

    # Concatenate DataFrames
    combined_df = pd.concat([combined_df, df], ignore_index=True)

# Save the preprocessed data
combined_df.to_csv("preprocessed_nutrition_data.csv", index=False)

print("Combined DataFrame shape:", combined_df.shape)
print("Sample data from combined DataFrame:")
print(combined_df.head())

  df = pd.read_csv(csv_content)


Columns in acquisition_samples.csv: ['fdc_id_of_sample_food', 'fdc_id_of_acquisition_food', 'source_file']
Columns in agricultural_samples.csv: ['fdc_id', 'acquisition_date', 'market_class', 'treatment', 'state', 'source_file']
Columns in food_attribute_type.csv: ['id', 'name', 'description', 'source_file']
Columns in food_attribute.csv: ['id', 'fdc_id', 'seq_num', 'food_attribute_type_id', 'name', 'value', 'source_file']
Columns in food_calorie_conversion_factor.csv: ['food_nutrient_conversion_factor_id', 'protein_value', 'fat_value', 'carbohydrate_value', 'source_file']
Columns in food_category.csv: ['id', 'code', 'description', 'source_file']
Columns in food_component.csv: ['id', 'fdc_id', 'name', 'pct_weight', 'is_refuse', 'gram_weight', 'data_points', 'min_year_acqured', 'source_file']
Columns in food_nutrient_conversion_factor.csv: ['id', 'fdc_id', 'source_file']
Columns in food_nutrient.csv: ['id', 'fdc_id', 'nutrient_id', 'amount', 'data_points', 'derivation_id', 'min', 'max', 

  df.fillna('Unknown', inplace=True)  # Fill missing values
  df.fillna('Unknown', inplace=True)  # Fill missing values
  df.fillna('Unknown', inplace=True)  # Fill missing values
  df.fillna('Unknown', inplace=True)  # Fill missing values
  df.fillna('Unknown', inplace=True)  # Fill missing values
  df.fillna('Unknown', inplace=True)  # Fill missing values
  df.fillna('Unknown', inplace=True)  # Fill missing values
  df.fillna('Unknown', inplace=True)  # Fill missing values


Combined DataFrame shape: (494100, 67)
Sample data from combined DataFrame:
   fdc_id_of_sample_food  fdc_id_of_acquisition_food              source_file  \
0               319874.0                    319876.0  acquisition_samples.csv   
1               319874.0                    319875.0  acquisition_samples.csv   
2               319879.0                    319881.0  acquisition_samples.csv   
3               319879.0                    319880.0  acquisition_samples.csv   
4               319885.0                    319890.0  acquisition_samples.csv   

                                                text fdc_id acquisition_date  \
0  fdc_id_of_sample_food: 319874 fdc_id_of_acquis...    NaN              NaN   
1  fdc_id_of_sample_food: 319874 fdc_id_of_acquis...    NaN              NaN   
2  fdc_id_of_sample_food: 319879 fdc_id_of_acquis...    NaN              NaN   
3  fdc_id_of_sample_food: 319879 fdc_id_of_acquis...    NaN              NaN   
4  fdc_id_of_sample_food: 319885 fdc_

## Prcoessing & Sampling

In [None]:
# Inspection: Data from each source file
print("\nData from each source file:")
for file in combined_df['source_file'].unique():
    print(f"\nSample from {file}:")
    print(combined_df[combined_df['source_file'] == file].head())


Data from each source file:

Sample from acquisition_samples.csv:
   fdc_id_of_sample_food  fdc_id_of_acquisition_food              source_file  \
0               319874.0                    319876.0  acquisition_samples.csv   
1               319874.0                    319875.0  acquisition_samples.csv   
2               319879.0                    319881.0  acquisition_samples.csv   
3               319879.0                    319880.0  acquisition_samples.csv   
4               319885.0                    319890.0  acquisition_samples.csv   

                                                text fdc_id acquisition_date  \
0  fdc_id_of_sample_food: 319874 fdc_id_of_acquis...    NaN              NaN   
1  fdc_id_of_sample_food: 319874 fdc_id_of_acquis...    NaN              NaN   
2  fdc_id_of_sample_food: 319879 fdc_id_of_acquis...    NaN              NaN   
3  fdc_id_of_sample_food: 319879 fdc_id_of_acquis...    NaN              NaN   
4  fdc_id_of_sample_food: 319885 fdc_id_of_acq

In [None]:
# Combined DataFrame information
print("Combined DataFrame Info:")
print(combined_df.info())
print(f"\nShape of DataFrame: {combined_df.shape}")
print(f"\nColumns in DataFrame: {combined_df.columns.tolist()}")

Combined DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 494100 entries, 0 to 494099
Data columns (total 67 columns):
 #   Column                              Non-Null Count   Dtype  
---  ------                              --------------   -----  
 0   fdc_id_of_sample_food               60280 non-null   float64
 1   fdc_id_of_acquisition_food          7593 non-null    float64
 2   source_file                         494100 non-null  object 
 3   text                                494100 non-null  object 
 4   fdc_id                              308372 non-null  object 
 5   acquisition_date                    7619 non-null    object 
 6   market_class                        810 non-null     object 
 7   treatment                           810 non-null     object 
 8   state                               810 non-null     object 
 9   id                                  245380 non-null  float64
 10  name                                9295 non-null    object 
 11  d

In [None]:
# Row count from each source file
print("\nRow count from each source file:")
print(combined_df['source_file'].value_counts())


Row count from each source file:
source_file
food_nutrient.csv                      144484
sub_sample_result.csv                  111899
food_update_log_entry.csv               63975
food.csv                                63975
sub_sample_food.csv                     52687
food_nutrient_conversion_factor.csv     11370
food_portion.csv                        10678
acquisition_samples.csv                  7593
market_acquisition.csv                   6809
food_attribute.csv                       5625
input_food.csv                           5270
sample_food.csv                          3311
food_component.csv                       3066
agricultural_samples.csv                  810
lab_method_nutrient.csv                   560
nutrient.csv                              477
food_calorie_conversion_factor.csv        312
food_protein_conversion_factor.csv        288
foundation_food.csv                       287
lab_method.csv                            280
lab_method_code.csv               

In [None]:
# Handling missing values (newly revised code)
missing_values = combined_df.isnull().sum()
print(f"Missing values in each column:\n{missing_values}")

Missing values in each column:
fdc_id_of_sample_food         433820
fdc_id_of_acquisition_food    486507
source_file                        0
text                               0
fdc_id                        185728
                               ...  
nutrient_nbr                  493623
rank                          493623
food_nutrient_id              382201
adjusted_amount               382201
nutrient_name                 382201
Length: 67, dtype: int64


In [None]:
# Display the shape and a sample of the combined DataFrame
print(f"Combined DataFrame Shape: {combined_df.shape}")
combined_df.sample(5)

Combined DataFrame Shape: (494100, 67)


Unnamed: 0,fdc_id_of_sample_food,fdc_id_of_acquisition_food,source_file,text,fdc_id,acquisition_date,market_class,treatment,state,id,...,store_city,store_name,store_state,upc_code,unit_name,nutrient_nbr,rank,food_nutrient_id,adjusted_amount,nutrient_name
96636,,,food_nutrient.csv,id: 2285173 fdc_id: 335905 nutrient_id: 1009 a...,335905.0,,,,,2285173.0,...,,,,,,,,,,
390376,,,sub_sample_result.csv,food_nutrient_id: 2226264 adjusted_amount: 0.1...,,,,,,,...,,,,,,,,2226264.0,0.16,Tyrosine
356573,2257056.0,,sub_sample_food.csv,fdc_id: 2257796 fdc_id_of_sample_food: 2257056...,2257796.0,,,,,,...,,,,,,,,,,
251676,,,food.csv,fdc_id: 324149 data_type: sub_sample_food desc...,324149.0,,,,,,...,,,,,,,,,,
473467,,,sub_sample_result.csv,food_nutrient_id: 27794949 adjusted_amount: 0....,,,,,,,...,,,,,,,,27794949.0,0.28,Methionine


In [None]:
# Fill missing values
combined_df.fillna('Unknown', inplace=True)

  combined_df.fillna('Unknown', inplace=True)


In [None]:
# Sample of the cleaned data
combined_df.sample(5)

Unnamed: 0,fdc_id_of_sample_food,fdc_id_of_acquisition_food,source_file,text,fdc_id,acquisition_date,market_class,treatment,state,id,...,store_city,store_name,store_state,upc_code,unit_name,nutrient_nbr,rank,food_nutrient_id,adjusted_amount,nutrient_name
238825,Unknown,Unknown,food_update_log_entry.csv,id: 2516358 description: sunflower seed kernel...,Unknown,Unknown,Unknown,Unknown,Unknown,2516358.0,...,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown
277190,Unknown,Unknown,food.csv,fdc_id: 2001022 data_type: sub_sample_food des...,2001022,Unknown,Unknown,Unknown,Unknown,Unknown,...,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown
324371,Unknown,Unknown,market_acquisition.csv,fdc_id: 2003633 brand_description: Unknown exp...,2003633,2021-03-15,Unknown,Unknown,Unknown,Unknown,...,(Region),Unknown,West,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown
137074,Unknown,Unknown,food_nutrient.csv,id: 24466415 fdc_id: 2001642 nutrient_id: 2009...,2001642,Unknown,Unknown,Unknown,Unknown,24466415.0,...,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown
360578,2261470.0,Unknown,sub_sample_food.csv,fdc_id: 2262048 fdc_id_of_sample_food: 2261470...,2262048,Unknown,Unknown,Unknown,Unknown,Unknown,...,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown,Unknown


##Preprocessing

In [None]:
# Save the preprocessed data
combined_df.to_csv("preprocessed_nutrition_data.csv", index=False)

In [None]:
# Load the preprocessed data from the CSV file
preprocessed_df = pd.read_csv("preprocessed_nutrition_data.csv")

# Display DataFrame information and a sample of preprocessed data
print("\nPreprocessed Data Info:")
print(preprocessed_df.info())

print("\nSample of preprocessed data:")
print(preprocessed_df.head())


  preprocessed_df = pd.read_csv("preprocessed_nutrition_data.csv")



Preprocessed Data Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 494100 entries, 0 to 494099
Data columns (total 67 columns):
 #   Column                              Non-Null Count   Dtype 
---  ------                              --------------   ----- 
 0   fdc_id_of_sample_food               494100 non-null  object
 1   fdc_id_of_acquisition_food          494100 non-null  object
 2   source_file                         494100 non-null  object
 3   text                                494100 non-null  object
 4   fdc_id                              494100 non-null  object
 5   acquisition_date                    494100 non-null  object
 6   market_class                        494100 non-null  object
 7   treatment                           494100 non-null  object
 8   state                               494100 non-null  object
 9   id                                  494100 non-null  object
 10  name                                494100 non-null  object
 11  description   

#Vectorization (Embeddings)

In [None]:
from sentence_transformers import SentenceTransformer

# Load a pre-trained model for vectorization
model = SentenceTransformer('all-MiniLM-L6-v2')

# Generate embeddings using the constructed 'text' column from the combined DataFrame
embeddings = model.encode(combined_df['text'].tolist())

# Save the embeddings
import numpy as np
np.save('nutrition_embeddings.npy', embeddings)

print(f"Embeddings Shape: {embeddings.shape}")



Embeddings Shape: (494100, 384)


##Create a FAISS index (vector db)

In [None]:
dimension = embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(embeddings.astype('float32'))

# Save FAISS index
faiss.write_index(index, 'nutrition_faiss.index')

print("FAISS index created and saved. Data processing complete.")

FAISS index created and saved. Data processing complete.


In [None]:
print(f"Embeddings Shape: {embeddings.shape}")

Embeddings Shape: (494100, 384)


#Hybrid Search: (OpenAI Text Search & FAISS Vector Search)

In [None]:
import os
import openai

# Retrieve API key from environment variable or use a placeholder
api_key = os.getenv("OPENAI_API_KEY")

if api_key is None:
    api_key = "YOUR_API_KEY"  # Replace with your actual API key or set the environment variable
    print("Warning: Using placeholder API key. Please set the OPENAI_API_KEY environment variable.")

# Initialize the OpenAI client
client = OpenAI(api_key=api_key)

def search_with_openai(query):
    try:
        response = client.Completion.create( # Use client instead of openai
            model="text-davinci-003",
            prompt=f"Search for: {query}",
            max_tokens=100
        )
        return response['choices'][0]['text']
    except Exception as e:
        print(f"An error occurred: {e}")
        return None

# Example text search query
text_search_result = search_with_openai("apple")

# Check if the search result is valid before printing
if text_search_result:
    print(f"Text Search Result:\n{text_search_result}")
else:
    print("No result returned from the search.")


An error occurred: 'OpenAI' object has no attribute 'Completion'
No result returned from the search.


In [None]:
# Hybrid search combining text and vector search
def hybrid_search(query):
    # Step 1: Text search using OpenAI
    print("\n--- Text Search with OpenAI ---")
    text_search_result = search_with_openai(query)
    print(f"Text Search Result: {text_search_result}")

    # Step 2: Vector search using FAISS
    print("\n--- Vector Search with FAISS ---")
    query_embedding = model.encode([query])
    D, I = index.search(np.array(query_embedding, dtype=np.float32), k=5)
    print("Vector Search Results:")
    print(combined_df.iloc[I[0]])

# Example hybrid search with the query 'apple'
hybrid_search("apple")



--- Text Search with OpenAI ---
An error occurred: 'OpenAI' object has no attribute 'Completion'
Text Search Result: None

--- Vector Search with FAISS ---
Vector Search Results:
       fdc_id_of_sample_food fdc_id_of_acquisition_food  \
207391               Unknown                    Unknown   
204799               Unknown                    Unknown   
207414               Unknown                    Unknown   
207339               Unknown                    Unknown   
207349               Unknown                    Unknown   

                      source_file  \
207391  food_update_log_entry.csv   
204799  food_update_log_entry.csv   
207414  food_update_log_entry.csv   
207339  food_update_log_entry.csv   
207349  food_update_log_entry.csv   

                                                     text   fdc_id  \
207391  id: 1752929 description: APPLES, FUJI, WITH SK...  Unknown   
204799  id: 1105897 description: Apples, fuji, with sk...  Unknown   
207414  id: 1752952 description:

#RAG Pipeline (Prompt Building and Response Generation)

## FAISS-based Retrieval and OpenAI Completion

In [None]:
import faiss
from sentence_transformers import SentenceTransformer
import openai
import pandas as pd

# Load OpenAI API Key securely from environment variable
openai.api_key = os.getenv('OPENAI_API_KEY')

# Class to handle RAG pipeline
class NutritionRAG:
    def __init__(self, df_nutrients):
        self.df_nutrients = df_nutrients
        self.model = SentenceTransformer('all-MiniLM-L6-v2')
        self.index = self.build_index()  # Build FAISS index using embeddings

    def build_index(self):
        # Ensure 'description' column exists
        texts = self.df_nutrients['description'].tolist()  # Ensure this column exists
        embeddings = self.model.encode(texts)
        index = faiss.IndexFlatL2(embeddings.shape[1])
        index.add(embeddings.astype('float32'))
        return index

    def retrieve(self, query, k=5):
        # Generate the query embedding
        query_vector = self.model.encode([query])
        # Perform the FAISS vector search
        _, indices = self.index.search(query_vector.astype('float32'), k)
        # Retrieve and return the top k documents
        return self.df_nutrients.iloc[indices[0]]

    def generate_response(self, query):
        # Retrieve relevant documents using the vector search
        relevant_docs = self.retrieve(query)
        # Build the context using the 'description' field
        context = "\n".join(relevant_docs['description'].tolist())

        # Construct the prompt for the GPT model
        prompt = f"""Based on the following nutritional information:\n\n{context}\n\nAnswer the following question: {query}\nProvide a concise and informative answer based on the given nutritional information."""

        # Query OpenAI API to generate the response
    def llm(prompt, model='gpt-4o'):
        response = client.chat.completions.create(
        model=model,
        messages=[
                {"role": "system", "content": "You are a helpful nutrition assistant."},
                {"role": "user", "content": prompt}
            ]
        )

        return response.choices[0].message.content



In [None]:
# Example usage
if __name__ == "__main__":
    # Load the preprocessed data from CSV
    df_food = pd.read_csv("preprocessed_nutrition_data.csv")

    # Initialize the RAG system with the data
    rag = NutritionRAG(df_food)

    # Example query
    query = "How many different types of milk are there?"
    response = rag.generate_response(query)
    print(f"Query: {query}")
    print(f"Response: {response}")

  df_food = pd.read_csv("preprocessed_nutrition_data.csv")


Query: How many different types of milk are there?
Response: None


# Evaluation

Types of Evaluations on your Retrieval-Augmented Generation (RAG) system:

- Ground Truth Evaluation: Compares the generated output with a known set of correct answers (ground truth).
- Text-Based Evaluation: Evaluates the accuracy or relevance of the text returned by the OpenAI model (text-based retrieval).
- Vector-Based Evaluation: Evaluates the quality of document retrieval based on vector similarity.
- Offline RAG Evaluation: Involves comparing cosine similarity between query embeddings and document embeddings and potentially using a Large Language Model (LLM) as a judge.

##Eval 1: Ground Truth Evaluation

In [None]:
# Ground Truth Evaluation
import csv

def evaluate_retrieval(rag, queries, k=5):
    model = rag.model
    total_similarity = 0
    results = []

    for query in queries:
        retrieved = rag.retrieve(query, k)
        query_vector = model.encode([query])[0]
        retrieved_vectors = model.encode(retrieved['text'].tolist())
        similarities = cosine_similarity([query_vector], retrieved_vectors)[0]
        total_similarity += np.mean(similarities)
        results.append({"query": query, "similarity": np.mean(similarities)})

    avg_similarity = total_similarity / len(queries)

    # Save results to a CSV file
    with open('ground_truth_eval_results.csv', mode='w', newline='') as file:
        writer = csv.DictWriter(file, fieldnames=["query", "similarity"])
        writer.writeheader()
        writer.writerows(results)

    return avg_similarity

test_queries = [
    "What's the protein content in chicken breast?",
    "How many calories are in an avocado?",
    "What's the nutritional value of spinach?",
    "Compare the fat content of whole milk and skim milk.",
    "What are the macronutrients in a banana?"
]

retrieval_score = evaluate_retrieval(rag, test_queries)
print(f"Average retrieval similarity score: {retrieval_score}")


print("\nEvaluating RAG responses:")
for query in test_queries:
    response = rag.generate_response(query)
    print(f"\nQuery: {query}")
    print(f"Response: {response}")
    rating = input("Rate this response (1-5): ")
    print("\n")

Average retrieval similarity score: 0.42214701771736146

Evaluating RAG responses:

Query: What's the protein content in chicken breast?
Response: None
Rate this response (1-5): 1



Query: How many calories are in an avocado?
Response: None
Rate this response (1-5): 1



Query: What's the nutritional value of spinach?
Response: None
Rate this response (1-5): 1



Query: Compare the fat content of whole milk and skim milk.
Response: None
Rate this response (1-5): 1



Query: What are the macronutrients in a banana?
Response: None
Rate this response (1-5): 1




## Eval 2: Text-Based Evaluation

In [None]:
# Text-Based Evaluation
import csv
from sklearn.metrics import jaccard_score
import re

def clean_text(text):
    return re.sub(r'\W+', ' ', text).lower()

def text_eval(rag_system, queries):
    """
    Evaluate the textual relevance of RAG responses based on retrieved context.

    rag_system: the RAG pipeline instance
    queries: list of queries to test
    """
    results = []
    for query in queries:
        response = rag_system.generate_response(query)
        retrieved_docs = rag_system.retrieve(query)
        context = "\n".join(retrieved_docs['Description'].tolist())
        similarity = jaccard_score(clean_text(response), clean_text(context))
        results.append({
            "query": query,
            "response": response,
            "context": context,
            "similarity": similarity
        })
        print(f"Query: {query}\nResponse: {response}\nContext: {context}")
        print(f"Jaccard Similarity: {similarity}")
        print("-" * 50)

    # Calculate average similarity
    avg_similarity = sum([res["similarity"] for res in results]) / len(results)
    print(f"Average Jaccard Similarity: {avg_similarity:.2f}")

    # Save results to a CSV file
    with open('text_based_eval_results.csv', mode='w', newline='') as file:
        writer = csv.DictWriter(file, fieldnames=["query", "response", "context", "similarity"])
        writer.writeheader()
        writer.writerows(results)

    return avg_similarity

## Eval 3: Vector-Based Evaluation

In [None]:
# Vector-Based Evaluation
import csv
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def vector_eval(rag_system, queries, k=5):
    """
    Evaluate the vector similarity between query and retrieved documents using cosine similarity.

    rag_system: the RAG pipeline instance
    queries: list of queries to test
    k: number of top documents to retrieve
    """
    total_similarity = 0
    model = rag_system.model  # SentenceTransformer model
    results = []

    for query in queries:
        query_embedding = model.encode([query])
        retrieved_docs = rag_system.retrieve(query, k=k)
        retrieved_embeddings = model.encode(retrieved_docs['Description'].tolist())

        similarity = cosine_similarity(query_embedding, retrieved_embeddings).mean()
        total_similarity += similarity

        # Save each result
        results.append({
            "query": query,
            "similarity": similarity
        })

        print(f"Query: {query}")
        print(f"Average Cosine Similarity for retrieved docs: {similarity:.2f}")
        print("-" * 50)

    avg_similarity = total_similarity / len(queries)
    print(f"Average Cosine Similarity across all queries: {avg_similarity:.2f}")

    # Save results to a CSV file
    with open('vector_based_eval_results.csv', mode='w', newline='') as file:
        writer = csv.DictWriter(file, fieldnames=["query", "similarity"])
        writer.writeheader()
        writer.writerows(results)

    return avg_similarity

##Eval 4: Offline RAG Evaluation (Using Cosine Similarity & LLM as a Judge)






In [None]:
import csv
import os
import numpy as np
import openai

# Load OpenAI API Key securely from environment variable
openai.api_key = os.getenv('OPENAI_API_KEY')

# Load the processed data from the CSV file
preprocessed_data = pd.read_csv('preprocessed_nutrition_data.csv')

# Cosine Similarity and LLM-based Evaluation
def offline_rag_eval(nutrition_rag, queries, ground_truth, model='gpt-4o'):
    """
    Evaluate RAG-generated responses using cosine similarity and LLM as a judge.

    nutrition_rag: the RAG pipeline instance
    queries: list of queries to test
    ground_truth: list of correct answers (from dataset)
    model: LLM model to use for evaluation (default is 'gpt-4o')
    """
    total_cosine_similarity = 0
    evaluations = []

    for i, query in enumerate(queries):
        response = nutrition_rag.generate_response(query)  # Call the instance method
        original_text = ground_truth[i]

        # Calculate Cosine Similarity between response and original text
        v_response = nutrition_rag.model.encode(response)  # Use the nutrition_rag instance
        v_original = nutrition_rag.model.encode(original_text)  # Use the nutrition_rag instance
        cosine_sim = np.dot(v_response, v_original) / (np.linalg.norm(v_response) * np.linalg.norm(v_original))

        # LLM-based evaluation using OpenAI API
        prompt = f"""
        You're evaluating the relevance of a generated response.
        Original Text: "{original_text}"
        Generated Response: "{response}"
        Please evaluate the relevance between the two.
        """
        llm_response = client.Completion.create(model=model, prompt=prompt, max_tokens=100)
        eval_result = llm_response['choices'][0]['text'].strip()

        evaluations.append({
            'query': query,
            'response': response,
            'original_text': original_text,
            'cosine_similarity': cosine_sim,
            'LLM_evaluation': eval_result
        })
        total_cosine_similarity += cosine_sim

        # Display the results for the current query
        print(f"Query: {query}")
        print(f"Cosine Similarity: {cosine_sim:.2f}")
        print(f"LLM Evaluation: {eval_result}")
        print("-" * 50)

    avg_cosine_similarity = total_cosine_similarity / len(queries)

    # Display the average cosine similarity for the entire evaluation
    print(f"Average Cosine Similarity: {avg_cosine_similarity:.2f}")

    # Save results to a CSV file
    with open('offline_rag_eval_results.csv', mode='w', newline='') as file:
        writer = csv.DictWriter(file, fieldnames=["query", "response", "original_text", "cosine_similarity", "LLM_evaluation"])
        writer.writeheader()
        writer.writerows(evaluations)

    return evaluations, avg_cosine_similarity

# Call the evaluation function
evaluations, avg_cosine = offline_rag_eval(nutrition_rag, queries, ground_truth)


# Monitoring

## Interactive Monitoring

In [None]:
import csv

class MonitoredRAG:
    def __init__(self, rag_system):
        self.rag_system = rag_system  # Your RAG system with FAISS and OpenAI
        self.query_count = 0
        self.response_times = []
        self.user_ratings = []

    def generate_response(self, query):
        start_time = datetime.now()

        # Generate response using FAISS and OpenAI
        response = self.rag_system.generate_response(query)  # Assumes rag_system has 'generate_response'

        end_time = datetime.now()
        # Track response time
        response_time = (end_time - start_time).total_seconds()

        # Update counters
        self.query_count += 1
        self.response_times.append(response_time)

        # Save the response time to a CSV file
        with open('monitoring_response_times.csv', mode='a', newline='') as file:
            writer = csv.writer(file)
            writer.writerow([query, response_time])

        return response

    def add_rating(self, rating):
        self.user_ratings.append(rating)

        # Save the user rating to a CSV file
        with open('monitoring_user_ratings.csv', mode='a', newline='') as file:
            writer = csv.writer(file)
            writer.writerow([self.query_count, rating])

# Initialize the monitoring system with your RAG system (FAISS + OpenAI)
monitored_rag = MonitoredRAG(rag_system)

# User Input Interface
query_input = widgets.Text(description="Query:", style={'description_width': 'initial'})
submit_button = widgets.Button(description="Submit")
rating_slider = widgets.IntSlider(value=3, min=1, max=5, step=1, description="Rate (1-5):")
output = widgets.Output()

# Handle submission
def on_submit_button_clicked(b):
    with output:
        clear_output()
        print(f"Query: {query_input.value}")
        response = monitored_rag.generate_response(query_input.value)
        print(f"Response: {response}")

# Handle rating updates
def on_rating_change(change):
    if change['type'] == 'change' and change['name'] == 'value':
        monitored_rag.add_rating(change['new'])

submit_button.on_click(on_submit_button_clicked)
rating_slider.observe(on_rating_change)

# Display the monitoring interface
display(widgets.VBox([query_input, submit_button, rating_slider, output]))

## Real Time Dashboard

In [None]:
dashboard_output = widgets.Output()

def update_dashboard():
    with dashboard_output:
        clear_output()
        print(f"Total Queries: {monitored_rag.query_count}")

        # Save query count and response times to CSV
        with open('monitoring_dashboard.csv', mode='a', newline='') as file:
            writer = csv.writer(file)
            writer.writerow([monitored_rag.query_count, monitored_rag.response_times])

        if monitored_rag.response_times:
            fig_times = px.line(x=range(len(monitored_rag.response_times)), y=monitored_rag.response_times, title="Response Times")
            fig_times.show()
        if monitored_rag.user_ratings:
            avg_rating = sum(monitored_rag.user_ratings) / len(monitored_rag.user_ratings)
            fig_ratings = px.histogram(monitored_rag.user_ratings, title="User Ratings Distribution")
            print(f"Average User Rating: {avg_rating:.2f}")
            fig_ratings.show()

# Create dashboard update button
update_dashboard_button = widgets.Button(description="Update Dashboard")
update_dashboard_button.on_click(update_dashboard)

# Display the dashboard interface
display(widgets.VBox([update_dashboard_button, dashboard_output]))

##Download files

In [None]:
from google.colab import files

# List of all files to download (with /content/ path added)
file_paths = [
    '/content/csv_file_columns.csv',
    '/content/preprocessed_nutrition_data.csv',
    '/content/nutrition_embeddings.npy',
    '/content/nutrition_faiss.index',
    '/content/ground_truth_eval_results.csv',
    '/content/text_based_eval_results.csv',
    '/content/vector_based_eval_results.csv',
    '/content/offline_rag_eval_results.csv',
    '/content/monitoring_response_times.csv',
    '/content/monitoring_user_ratings.csv',
    '/content/monitoring_dashboard.csv'
]

# Loop through the list and download each file
for file_path in file_paths:
    try:
        # Download each file one by one
        files.download(file_path)
        print(f"Downloaded: {file_path}")
    except Exception as e:
        print(f"Failed to download {file_path}: {e}")