### NutriChat - Data Preprocessing and Parsing
This notebook handles the initial data processing for our nutrition facts dataset:
1. Loading and cleaning raw nutrition data
2. Converting data types and handling trace amounts
3. Structuring data for RAG system use
4. Creating processed JSON output for downstream tasks

#### Imports

In [1]:
import io
import json
import os
from io import StringIO

import pandas as pd
import requests

#### Data Cleaning Functions

In [9]:
def clean_nutrition_data(df):
    """Clean and format nutrition data.

    Works on a copy, so the caller's frame is left untouched.

    Steps:
      1. Cells equal to the trace marker ``'t'`` are relabelled ``'trace'``.
      2. The nutrient columns are parsed as numbers. Values that cannot be
         parsed (including trace markers) and missing values become ``0.0``,
         and every nutrient column is stored as float.
      3. Missing values in the remaining columns are filled with ``0``.

    Args:
        df: DataFrame containing at least the columns 'Grams', 'Calories',
            'Protein', 'Fat', 'SatFat', 'Fiber' and 'Carbs'.

    Returns:
        A new, cleaned DataFrame that contains no NaN values.

    Raises:
        KeyError: if one of the nutrient columns is missing from ``df``.
    """
    df = df.copy()

    # Handle trace amounts
    df = df.replace('t', 'trace')

    # Ensure numeric columns are float
    numeric_columns = ['Grams', 'Calories', 'Protein', 'Fat',
                      'SatFat', 'Fiber', 'Carbs']
    for col in numeric_columns:
        # errors='coerce' turns unparseable text (e.g. 'trace') into NaN, so the
        # fill has to happen AFTER the conversion; otherwise NaN leaks into the
        # documents and ends up as a non-standard NaN token in the JSON export.
        df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0.0).astype(float)

    # Handle null values in the remaining (non-numeric) columns
    df = df.fillna(0)

    return df

def create_nutrition_documents(df):
    """Turn every row of ``df`` into a plain dict document.

    Each document carries the ten nutrition fields listed below, in that
    order. Columns of ``df`` that are not listed are ignored.

    Args:
        df: DataFrame that has all of the listed fields as columns.

    Returns:
        A dict with the single key "nutritionfacts" mapping to the list of
        per-row documents, in row order.
    """
    fields = (
        "Food", "Measure", "Grams", "Calories", "Protein",
        "Fat", "SatFat", "Fiber", "Carbs", "Category",
    )

    records = [
        {field: record[field] for field in fields}
        for _, record in df.iterrows()
    ]

    return {"nutritionfacts": records}

#### Loading & Processing

In [10]:
# Load data: download the raw CSV and parse it into a DataFrame
csv_url = 'https://raw.githubusercontent.com/milanimcgraw/NutriChat-End-to-End-RAG-/refs/heads/main/Data/nutrients_csvfile.csv'
# Bound the wait so a stalled connection cannot hang the kernel indefinitely
response = requests.get(csv_url, timeout=30)
# Fail loudly on an HTTP error instead of parsing an error page as CSV
response.raise_for_status()
df = pd.read_csv(StringIO(response.text))

In [11]:
# Inspect the column names of the frame parsed from the CSV
print(df.columns)

Index(['Food', 'Measure', 'Grams', 'Calories', 'Protein', 'Fat', 'SatFat',
       'Fiber', 'Carbs', 'Category'],
      dtype='object')


In [12]:
# Main processing: clean the raw frame, then convert each cleaned row into a document dict
df_clean = clean_nutrition_data(df)
nutrition_docs = create_nutrition_documents(df_clean)

In [13]:
# Summary statistics: the item count comes from the processed documents,
# while the category list is read from the raw frame `df`
print("\nDataset statistics:")
print(f"Total number of food items: {len(nutrition_docs['nutritionfacts'])}")
print(f"Categories found: {df['Category'].unique()}")


Dataset statistics:
Total number of food items: 335
Categories found: ['Dairy products' 'Fats, Oils, Shortenings' 'Meat, Poultry'
 'Fish, Seafood' 'Vegetables A-E' 'Vegetables F-P' 'Vegetables R-Z'
 'Fruits A-F' 'Fruits G-P' 'Fruits R-Z' 'Breads, cereals, fastfood,grains'
 'Soups' 'Desserts, sweets' 'Jams, Jellies' 'Seeds and Nuts'
 'Drinks,Alcohol, Beverages']


In [19]:
# Save processed data as indented JSON; the relative path resolves against the
# current working directory and the file is overwritten on every run
with open('nutrichat-nutritionfacts.json', 'w') as out_file:
    json.dump(nutrition_docs, out_file, indent=2)

#### Verification

In [16]:
# Pretty-print the first document as a quick sanity check of the output format
first_document = nutrition_docs["nutritionfacts"][0]
print(json.dumps(first_document, indent=2))

{
  "Food": "Cows' milk",
  "Measure": "1 qt.",
  "Grams": 976.0,
  "Calories": 660.0,
  "Protein": 32.0,
  "Fat": 40.0,
  "SatFat": 36.0,
  "Fiber": 0.0,
  "Carbs": 48.0,
  "Category": "Dairy products"
}


In [17]:
# Show the kernel's working directory, which is where relative paths such as
# the JSON output file resolve (`os` is imported in the imports cell at the top)
print(os.getcwd())

/workspaces/NutriChat-End-to-End-RAG-/Notebooks
