### NutriChat - Data Preprocessing and Parsing
This notebook handles the initial data processing for our nutrition facts dataset:
1. Loading and cleaning raw nutrition data
2. Converting data types and handling trace amounts
3. Structuring data for RAG system use
4. Creating processed JSON output for downstream tasks

#### Imports

In [None]:
import io
import json
import requests
import pandas as pd  
from io import StringIO  

#### Data Cleaning Functions

In [24]:
# Read and clean nutrition data
def clean_nutrition_data(df):
    """Clean and format nutrition data.

    Works on a copy of ``df``: cells equal to ``'t'`` are replaced with
    ``'trace'``, missing values are replaced with 0, and the nutrient
    columns are converted to floats.

    Within the numeric columns, thousands separators (e.g. ``"1,419"``)
    are removed before parsing, and anything that still cannot be parsed
    as a number (including the ``'trace'`` marker) becomes ``0.0``. This
    keeps NaN, which has no representation in standard JSON, out of the
    cleaned data.

    Args:
        df: DataFrame containing at least the columns 'Grams', 'Calories',
            'Protein', 'Fat', 'Sat.Fat', 'Fiber' and 'Carbs'.

    Returns:
        A new, cleaned DataFrame; the input DataFrame is not modified.

    Raises:
        KeyError: If one of the numeric columns is missing from ``df``.
    """
    df = df.copy()

    # Handle trace amounts
    df = df.replace('t', 'trace')

    # Handle null values
    df = df.fillna(0)

    # Ensure numeric columns are float
    numeric_columns = ['Grams', 'Calories', 'Protein', 'Fat',
                       'Sat.Fat', 'Fiber', 'Carbs']
    for col in numeric_columns:
        # Drop thousands separators so values such as "1,419" parse as numbers
        without_separators = df[col].map(
            lambda value: value.replace(',', '') if isinstance(value, str) else value
        )
        # Coercion turns 'trace' and other non-numeric text into NaN, so the
        # zero fill has to happen after the conversion, not only before it.
        df[col] = (
            pd.to_numeric(without_separators, errors='coerce')
            .fillna(0)
            .astype(float)
        )

    return df

# Convert to desired format
def create_nutrition_documents(df):
    """Build the document payload from a nutrition DataFrame.

    Each row of ``df`` becomes one dict holding the food name, measure,
    nutrient values and category, in that fixed key order. Columns of
    ``df`` other than the listed fields are ignored.

    Returns:
        A dict with a single key, "nutritionfacts", mapping to the list
        of per-row documents in row order.
    """
    fields = (
        "Food",
        "Measure",
        "Grams",
        "Calories",
        "Protein",
        "Fat",
        "Sat.Fat",
        "Fiber",
        "Carbs",
        "Category",
    )

    records = [
        {field: record[field] for field in fields}
        for _, record in df.iterrows()
    ]

    return {"nutritionfacts": records}

#### Loading & Processing

In [None]:
# Load data: download the raw nutrition CSV and parse it into a DataFrame
csv_url = 'https://raw.githubusercontent.com/milanimcgraw/NutriChat/refs/heads/main/nutrients_csvfile.csv'
# A timeout keeps the cell from hanging forever on a stalled connection
response = requests.get(csv_url, timeout=30)
# Fail loudly on HTTP errors so an error page is never parsed as CSV
response.raise_for_status()
df = pd.read_csv(StringIO(response.text))

# Main processing: clean the raw table, then convert it to document format
df_clean = clean_nutrition_data(df)
nutrition_docs = create_nutrition_documents(df_clean)

In [None]:
# Summarize the loaded dataset: document count and the raw category values
print(
    "\nDataset statistics:",
    f"Total number of food items: {len(nutrition_docs['nutritionfacts'])}",
    f"Categories found: {df['Category'].unique()}",
    sep="\n",
)

In [32]:
# Write the processed documents to disk as indented JSON
with open('nutritionfacts.json', 'w') as f:
    serialized = json.dumps(nutrition_docs, indent=2)
    f.write(serialized)

#### Verification

In [None]:
# Display sample to verify: print a heading for the sample document
print("Sample document:")

In [None]:
# Pretty-print the first processed document as indented JSON
print(json.dumps(nutrition_docs["nutritionfacts"][0], indent=2))