# Creating Master Dataframe for Modeling

In [1]:
import pandas as pd

**Load Each CSV File**

In [6]:
# Load each CSV file
menu_df = pd.read_csv('./data/menu_df.csv')
diabetic_food_df = pd.read_csv('./data/diabetic_friendly_foods.csv')
gi_df = pd.read_csv('./data/glycemic_index.csv')
patient_df = pd.read_csv('./data/patient.csv')
dexcom_cgm_df = pd.read_csv('./data/dexcom_cgm.csv')

In [7]:
# Inspect each DataFrame to determine the best way to combine them
print("Menu Data:", menu_df.shape)
print("Diabetic Food Data:", diabetic_food_df.shape)
print("Glycemic Index Data:", gi_df.shape)
print("Patient Data:", patient_df.shape)
print("Dexcom CGM Data:", dexcom_cgm_df.shape)

Menu Data: (100, 14)
Diabetic Food Data: (365, 11)
Glycemic Index Data: (54, 2)
Patient Data: (33568, 22)
Dexcom CGM Data: (17118, 2)


In [13]:
print(diabetic_food_df.columns)
print(gi_df.columns)
print(menu_df.columns)
print(patient_df.columns)
print(dexcom_cgm_df.columns)

Index(['food_name', 'category', 'description', 'brand', 'food_category',
       'calories', 'carbohydrates', 'fiber', 'sugars', 'fats', 'proteins'],
      dtype='object')
Index(['food_name', 'glycemic_index'], dtype='object')
Index(['restaurant_name', 'food_name', 'serving_size', 'serving_unit',
       'calories', 'carbohydrates', 'sugars', 'fats', 'saturated_fats',
       'cholesterol', 'sodium', 'fiber', 'potassium', 'proteins', 'category',
       'description', 'brand', 'food_category', 'glycemic_index'],
      dtype='object')
Index(['Diabetes_012', 'HighBP', 'HighChol', 'CholCheck', 'BMI', 'Smoker',
       'Stroke', 'HeartDiseaseorAttack', 'PhysActivity', 'Fruits', 'Veggies',
       'HvyAlcoholConsump', 'AnyHealthcare', 'NoDocbcCost', 'GenHlth',
       'MentHlth', 'PhysHlth', 'DiffWalk', 'Sex', 'Age', 'Education',
       'Income'],
      dtype='object')
Index(['displayTime', 'Glucose Value'], dtype='object')


Initial Standardization and Cleaning to ensure data can be merged

In [None]:
# Rename 'protein' in menu_df to 'proteins' for consistency with diabetic_food_df
menu_df.rename(columns={'protein': 'proteins'}, inplace=True)
# Convert relevant columns to numeric, setting errors='coerce' to handle any non-numeric data
numeric_columns = ['calories', 'carbohydrates', 'sugars', 'fats', 'saturated_fats', 'cholesterol', 'sodium', 'fiber', 'potassium', 'proteins']
menu_df[numeric_columns] = menu_df[numeric_columns].apply(pd.to_numeric, errors='coerce')

Create keys to merge each dataset properly

In [20]:
# Nutrient-Based Key for food_name in menu_df and diabetic_food_df
# Standardize by lowercasing, strip whitespace, etc.
menu_df['food_name'] = menu_df['food_name'].str.lower().str.strip()
diabetic_food_df['food_name'] = diabetic_food_df['food_name'].str.lower().str.strip()
gi_df['food_name'] = gi_df['food_name'].str.lower().str.strip()

# Patient-Based Keys have standardized formatting
patient_df['Age'] = patient_df['Age'].astype(int)  # integer format
patient_df['Sex'] = patient_df['Sex'].astype(int)  # Standardize 'Sex' column

**Merge Data**

In [22]:
# Merge menu_df and diabetic_food_df on 'food_name'
food_data = pd.merge(menu_df, diabetic_food_df, on='food_name', how='outer', suffixes=('_menu', '_diabetic'))

# Merge glycemic index data with food_data based on 'food_name'
master_food_data = pd.merge(food_data, gi_df[['food_name', 'glycemic_index']], on='food_name', how='left')

# To merge the data for multiple patients, patient data is replicated for each food record
# Repeat patient data across all food records
expanded_patient_df = patient_df.loc[patient_df.index.repeat(len(master_food_data))].reset_index(drop=True)

# Now, expanded_patient_df contains each patient's data repeated for every food record
# Merge food data with the expanded patient data
merged_data = pd.concat([master_food_data, expanded_patient_df], axis=1)

In [23]:
# Validate data before adding CGM data
merged_data.head()

Unnamed: 0,restaurant_name,food_name,serving_size,serving_unit,calories_menu,carbohydrates_menu,sugars_menu,fats_menu,saturated_fats,cholesterol,...,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income
0,,almonds,,,,,,,,,...,1,0.0,2.0,0.0,0.0,0.0,0,11,4.0,3.0
1,,almonds,,,,,,,,,...,1,0.0,2.0,0.0,0.0,0.0,0,11,4.0,3.0
2,,almonds,,,,,,,,,...,1,0.0,2.0,0.0,0.0,0.0,0,11,4.0,3.0
3,,almonds,,,,,,,,,...,1,0.0,2.0,0.0,0.0,0.0,0,11,4.0,3.0
4,,almonds,,,,,,,,,...,1,0.0,2.0,0.0,0.0,0.0,0,11,4.0,3.0


In [24]:
merged_data.shape

(15609120, 52)