# Creating Master Dataframe for Modeling

In [36]:
import pandas as pd
import re

### Load Data

In [37]:
# Load each CSV file
menu_df = pd.read_csv('./data/menu_df.csv')
diabetic_food_df = pd.read_csv('./data/diabetic_friendly_foods.csv')
gi_df = pd.read_csv('./data/glycemic_index.csv')
patient_df = pd.read_csv('./data/patient.csv')
dexcom_cgm_df = pd.read_csv('./data/dexcom_cgm.csv')

# Inspect each DataFrame to determine the best way to combine them
print("Menu Data:", menu_df.shape)
print("Diabetic Food Data:", diabetic_food_df.shape)
print("Glycemic Index Data:", gi_df.shape)
print("Patient Data:", patient_df.shape)
print("Dexcom CGM Data:", dexcom_cgm_df.shape)

Menu Data: (100, 14)
Diabetic Food Data: (365, 11)
Glycemic Index Data: (54, 2)
Patient Data: (33568, 22)
Dexcom CGM Data: (17118, 2)


## Merging Food Data

### Inspect Columns and Food Names

In [38]:
print(diabetic_food_df.columns)
print(gi_df.columns)
print(menu_df.columns)
print(patient_df.columns)
print(dexcom_cgm_df.columns)

Index(['food_name', 'category', 'description', 'brand', 'food_category',
       'calories', 'carbohydrates', 'fiber', 'sugars', 'fats', 'proteins'],
      dtype='object')
Index(['food_name', 'glycemic_index'], dtype='object')
Index(['restaurant_name', 'food_name', 'serving_size', 'serving_unit',
       'calories', 'carbohydrates', 'sugars', 'fats', 'saturated_fats',
       'cholesterol', 'sodium', 'fiber', 'potassium', 'proteins'],
      dtype='object')
Index(['Diabetes_012', 'HighBP', 'HighChol', 'CholCheck', 'BMI', 'Smoker',
       'Stroke', 'HeartDiseaseorAttack', 'PhysActivity', 'Fruits', 'Veggies',
       'HvyAlcoholConsump', 'AnyHealthcare', 'NoDocbcCost', 'GenHlth',
       'MentHlth', 'PhysHlth', 'DiffWalk', 'Sex', 'Age', 'Education',
       'Income'],
      dtype='object')
Index(['displayTime', 'Glucose Value'], dtype='object')


In [39]:
# Inspect unique food names in each dataset
print("Unique food names in Menu Data:")
print(menu_df['food_name'].unique())

print("\nUnique food names in Diabetic Food Data:")
print(diabetic_food_df['food_name'].unique())

print("\nUnique food names in Glycemic Index Data:")
print(gi_df['food_name'].unique())

Unique food names in Menu Data:
["Egg BLT McMuffin with Shredded Lettuce (McDonald's in Walmart)"
 'Cheeseburger' 'Hamburger' 'Honey' 'Hotcakes' 'McChicken' 'McCrispy'
 'McDouble' 'Americano, Large' 'Americano, Medium' 'Americano, Small'
 'Apple Slices' 'Big Breakfast' 'Big Mac' 'Cappuccino, Large'
 'Cappuccino, Medium' 'Cappuccino, Small' 'Dasani Water' 'Deluxe McCrispy'
 'Double Cheeseburger' 'Bacon King' 'Rodeo Burger' 'Whopper'
 'Bacon Cheeseburger' 'Bacon Melt' 'Big Fish' 'Chicken Jr.'
 'Chocolate Shake' 'Classic Melt' 'Double Whopper' 'Impossible Whopper'
 'Large Sprite' 'MOTTS Applesauce' 'Medium Sprite' 'Pancake Platter'
 'Philly Melt' 'Rodeo Cheeseburger'
 'Taco Shells, Bell Essentials, Crunchy' 'Crunchy Taco'
 'Nachos BellGrande - Beef' 'Nachos BellGrande - Chicken'
 'Nachos BellGrande - Steak' 'Crunchy Taco Supreme' 'Double Decker Taco'
 'Double Stacked Taco' 'Soft Taco - Beef' 'Soft Taco - Chicken'
 'Fries Bell Grande' 'Nachos Bell Grande' 'Seasoning Mix, Original Taco'
 'M

### Separating Grams from Food Names in Glycemic Index

In [40]:
# Extract weight (grams) from food_name
def extract_weight(name):
    match = re.search(r"\((.*?)\)", name)
    return match.group(1) if match else None

# Apply to glycemic index dataset
gi_df['serving_weight'] = gi_df['food_name'].apply(extract_weight)

# Remove the weight from food_name
gi_df['food_name'] = gi_df['food_name'].str.replace(r"\(.*?\)", "", regex=True).str.strip()

# Verify results
print(gi_df.head())

         food_name  glycemic_index serving_weight
0           Apples            40.0           120g
1      Apple juice            39.0           250g
2  Apricots, dried            32.0            60g
3          Bananas            47.0           120g
4   Fruit cocktail            55.0           120g


### Standardize Food Names Across Datasets

In [41]:
# Rename 'protein' in menu_df to 'proteins' for consistency with diabetic_food_df
menu_df.rename(columns={'protein': 'proteins'}, inplace=True)

# Standardize food_name in all datasets
for df in [gi_df, menu_df, diabetic_food_df]:
    df['food_name'] = df['food_name'].str.lower().str.strip()

# Verify unique food names after standardization
print("Unique food names in Glycemic Index:", gi_df['food_name'].unique())
print("Unique food names in Menu Data:", menu_df['food_name'].unique())
print("Unique food names in Diabetic Food Data:", diabetic_food_df['food_name'].unique())

Unique food names in Glycemic Index: ['apples' 'apple juice' 'apricots, dried' 'bananas' 'fruit cocktail'
 'grapefruit' 'grapes' 'mangoes' 'oranges, raw'
 'peaches, canned in light syrup' 'pineapple' 'plums' 'strawberries'
 'carrot juice' 'carrots, raw' 'corn, sweet' 'lima beans, baby, frozen'
 'parsnips, peeled boiled' 'potato, white, boiled' 'tomato soup' 'barley'
 'basmati rice' 'bran cereal' 'brown rice, steamed'
 'bulgur wheat, whole, cooked' 'chickpeas' 'instant noodles'
 'instant oatmeal' 'mixed grain bread' 'oat bran bread' 'rye kernel bread'
 'rye flour bread, 50%\xa0 rye flour, 50% wheat flour'
 'water crackers, whole grain, sesame seeds' 'white rice, boiled'
 'skim milk' 'soy milk' 'black beans' 'butter beans' 'cashews'
 'kidney beans' 'kidney beans, canned' 'lentils, canned'
 'split peas, yellow, boiled' 'blueberry muffin' 'cake, pound'
 'corn chips' 'hummus' 'ice cream, full-fat, french vanilla'
 'ice cream, low-fat, vanilla, “light”' 'oatmeal cookies' 'snickers'
 'sponge 

### Align Columns Across Datasets

In [42]:
# Ensure all datasets have the same columns
all_columns = set(gi_df.columns) | set(menu_df.columns) | set(diabetic_food_df.columns)
for df in [gi_df, menu_df, diabetic_food_df]:
    for col in all_columns:
        if col not in df.columns:
            df[col] = pd.NA

# Verify alignment
print("Glycemic Index Columns:", gi_df.columns)
print("Menu Data Columns:", menu_df.columns)
print("Diabetic Food Data Columns:", diabetic_food_df.columns)



Glycemic Index Columns: Index(['food_name', 'glycemic_index', 'serving_weight', 'serving_size',
       'sugars', 'proteins', 'restaurant_name', 'serving_unit', 'calories',
       'category', 'sodium', 'carbohydrates', 'brand', 'fats', 'potassium',
       'food_category', 'saturated_fats', 'description', 'fiber',
       'cholesterol'],
      dtype='object')
Menu Data Columns: Index(['restaurant_name', 'food_name', 'serving_size', 'serving_unit',
       'calories', 'carbohydrates', 'sugars', 'fats', 'saturated_fats',
       'cholesterol', 'sodium', 'fiber', 'potassium', 'proteins',
       'serving_weight', 'category', 'brand', 'food_category', 'description',
       'glycemic_index'],
      dtype='object')
Diabetic Food Data Columns: Index(['food_name', 'category', 'description', 'brand', 'food_category',
       'calories', 'carbohydrates', 'fiber', 'sugars', 'fats', 'proteins',
       'serving_size', 'serving_weight', 'restaurant_name', 'serving_unit',
       'sodium', 'potassium', 'satu

### Merge Data

In [43]:
# Merge Glycemic Index and Diabetic Food Data
food_gi_combined = pd.merge(diabetic_food_df, gi_df, on='food_name', how='outer')

# Add Menu Data
combined_food_data = pd.merge(food_gi_combined, menu_df, on='food_name', how='outer')

# Inspect the merged dataset
print("Combined Food Data:")
print(combined_food_data.head())

Combined Food Data:
  food_name               category_x description_x            brand_x  \
0   almonds  Diabetic-Friendly Foods       ALMONDS      HARVEST FRESH   
1   almonds  Diabetic-Friendly Foods       ALMONDS  TREASURED HARVEST   
2   almonds  Diabetic-Friendly Foods       ALMONDS          EILLIEN'S   
3   almonds  Diabetic-Friendly Foods       ALMONDS         O ORGANICS   
4   almonds  Diabetic-Friendly Foods       ALMONDS             MEIJER   

                            food_category_x  calories_x  carbohydrates_x  \
0  Popcorn, Peanuts, Seeds & Related Snacks       633.0             13.3   
1  Popcorn, Peanuts, Seeds & Related Snacks       603.0             20.0   
2  Popcorn, Peanuts, Seeds & Related Snacks       583.0             16.7   
3  Popcorn, Peanuts, Seeds & Related Snacks       567.0             20.0   
4  Popcorn, Peanuts, Seeds & Related Snacks       588.0             20.6   

   fiber_x  sugars_x  fats_x  ...  sodium fiber potassium proteins  \
0     10.0    

### Handle Duplicates

In [44]:
# Check for duplicates
duplicates = combined_food_data[combined_food_data.duplicated(subset='food_name', keep=False)]
print("Duplicate food names:")
print(duplicates)

Duplicate food names:
    food_name               category_x  \
0     almonds  Diabetic-Friendly Foods   
1     almonds  Diabetic-Friendly Foods   
2     almonds  Diabetic-Friendly Foods   
3     almonds  Diabetic-Friendly Foods   
4     almonds  Diabetic-Friendly Foods   
..        ...                      ...   
505  zucchini  Diabetic-Friendly Foods   
506  zucchini  Diabetic-Friendly Foods   
507  zucchini  Diabetic-Friendly Foods   
508  zucchini  Diabetic-Friendly Foods   
509  zucchini  Diabetic-Friendly Foods   

                                         description_x              brand_x  \
0                                              ALMONDS        HARVEST FRESH   
1                                              ALMONDS    TREASURED HARVEST   
2                                              ALMONDS            EILLIEN'S   
3                                              ALMONDS           O ORGANICS   
4                                              ALMONDS               MEIJER   

### Aggregate Numeric Columns

In [45]:
# Defining aggregation rules
numeric_aggregation = {
    'calories_x': 'mean',
    'carbohydrates_x': 'mean',
    'fiber_x': 'mean',
    'fats_x': 'mean',
    'glycemic_index': 'mean',
}
non_numeric_aggregation = {col: 'first' for col in combined_food_data.columns if col not in numeric_aggregation and col != 'food_name'}
aggregation_rules = {**numeric_aggregation, **non_numeric_aggregation}

# Aggregate data
aggregated_data = combined_food_data.groupby('food_name').agg(aggregation_rules).reset_index()

# Inspect aggregated data
print("Aggregated DataFrame with All Columns:")
print(aggregated_data.head())



Aggregated DataFrame with All Columns:
           food_name  calories_x  carbohydrates_x  fiber_x  fats_x  \
0            almonds       594.8            18.12     11.9    51.9   
1   americano, large         NaN              NaN      NaN     NaN   
2  americano, medium         NaN              NaN      NaN     NaN   
3   americano, small         NaN              NaN      NaN     NaN   
4        apple bites         NaN              NaN      NaN     NaN   

  glycemic_index               category_x description_x        brand_x  \
0            NaN  Diabetic-Friendly Foods       ALMONDS  HARVEST FRESH   
1            NaN                     None          None           None   
2            NaN                     None          None           None   
3            NaN                     None          None           None   
4            NaN                     None          None           None   

                            food_category_x  ...  cholesterol  sodium fiber  \
0  Popcorn, Pean

### Normalize Nutritional Values

In [46]:
# Calculate per-100g values where serving_weight is available
aggregated_data['serving_weight'] = pd.to_numeric(aggregated_data['serving_weight'], errors='coerce')
valid_weight = aggregated_data['serving_weight'] > 0

aggregated_data.loc[valid_weight, 'calories_per_100g'] = (
    aggregated_data.loc[valid_weight, 'calories_x'] / aggregated_data.loc[valid_weight, 'serving_weight'] * 100
)
aggregated_data.loc[valid_weight, 'carbs_per_100g'] = (
    aggregated_data.loc[valid_weight, 'carbohydrates_x'] / aggregated_data.loc[valid_weight, 'serving_weight'] * 100
)
aggregated_data.loc[valid_weight, 'fats_per_100g'] = (
    aggregated_data.loc[valid_weight, 'fats_x'] / aggregated_data.loc[valid_weight, 'serving_weight'] * 100
)


Categorize Glycemic Index

In [47]:
# Categorize based on glycemic index
def categorize_gi(value):
    if pd.isna(value):
        return 'Unknown'
    elif value <= 55:
        return 'Low'
    elif value <= 70:
        return 'Medium'
    else:
        return 'High'

aggregated_data['gi_category'] = aggregated_data['glycemic_index'].apply(categorize_gi)


### Final Review

In [48]:
# Review final dataset
print("Final Dataset Shape:", aggregated_data.shape)
print(aggregated_data.head())

Final Dataset Shape: (213, 62)
           food_name  calories_x  carbohydrates_x  fiber_x  fats_x  \
0            almonds       594.8            18.12     11.9    51.9   
1   americano, large         NaN              NaN      NaN     NaN   
2  americano, medium         NaN              NaN      NaN     NaN   
3   americano, small         NaN              NaN      NaN     NaN   
4        apple bites         NaN              NaN      NaN     NaN   

  glycemic_index               category_x description_x        brand_x  \
0            NaN  Diabetic-Friendly Foods       ALMONDS  HARVEST FRESH   
1            NaN                     None          None           None   
2            NaN                     None          None           None   
3            NaN                     None          None           None   
4            NaN                     None          None           None   

                            food_category_x  ...  proteins  serving_weight  \
0  Popcorn, Peanuts, Seed

### Export Merged data to CSV

In [50]:
# Export the final aggregated data to a CSV file
output_path = './data/combined_food_data.csv'
aggregated_data.to_csv(output_path, index=False)

print(f"Final combined data exported to: {output_path}")

Final combined data exported to: ./data/combined_food_data.csv


## Merging Patient Data

In [None]:
# # Patient-Based Keys have standardized formatting
# patient_df['Age'] = patient_df['Age'].astype(int)  # integer format
# patient_df['Sex'] = patient_df['Sex'].astype(int)  # Standardize 'Sex' column