# Modeling

### Imports

In [46]:
import pandas as pd
import numpy as np
import zipfile
import io

#### Read in Preprocessed Data

In [4]:
# Load the preprocessed restaurant menu items and preview the first rows
menu_df = pd.read_csv(filepath_or_buffer='./preprocessed_data/menu_df.csv')
menu_df.head(5)

Unnamed: 0,restaurant_name,food_name,serving_size,serving_unit,calories,carbohydrates,sugars,fats,saturated_fats,cholesterol,sodium,fiber,potassium,proteins,carb_percent,fat_percent,protein_percent,score
0,McDonald's Canada,Egg BLT McMuffin with Shredded Lettuce (McDona...,1.0,Serving,7.99,1.55,0.56,0.14,0.02,0.0,3.76,0.99,116.09,0.58,77.596996,15.769712,29.036295,0.308
1,McDonald's,Cheeseburger,1.0,Serving,535.31,39.24,7.16,28.66,14.0,95.52,1176.09,2.39,443.77,30.27,29.321328,48.185164,22.61867,0.519
2,McDonald's,Hamburger,1.0,Serving,540.14,40.27,7.16,26.56,10.52,122.04,791.0,0.03,569.52,34.28,29.821898,44.255193,25.386011,0.498
3,McDonald's,Honey,1.0,Serving,63.84,17.3,17.25,0.0,0.0,0.0,0.84,0.04,10.92,0.06,108.39599,0.0,0.37594,0.166
4,McDonald's,Hotcakes,1.0,Serving,90.8,11.32,7.16,3.88,0.85,23.6,175.6,0.03,52.8,2.56,49.867841,38.45815,11.277533,0.135


In [6]:
# Load the preprocessed individual (non-restaurant) foods and preview the first rows
individual_foods_df = pd.read_csv(
    filepath_or_buffer='./preprocessed_data/individual_foods_df.csv'
)
individual_foods_df.head(5)

Unnamed: 0,food_name,category,description,food_category,calories,carbohydrates,fiber,sugars,fats,proteins,score
0,Almond Butter,Individual Foods,ALMOND BUTTER,Nut & Seed Butters,587.8,22.5,11.0,6.992,50.98,21.32,0.4612
1,Almond Flour,Individual Foods,ALMOND FLOUR,Flours & Corn Meal,604.8,19.64,9.22,6.348,52.08,21.18,0.4304
2,Almonds,Individual Foods,ALMONDS,"Popcorn, Peanuts, Seeds & Related Snacks",594.8,18.12,11.9,3.142,51.9,20.66,0.5462
3,Apples,Individual Foods,APPLES,Pre-Packaged Fruit & Vegetables,52.8,14.18,2.76,10.48,0.0,0.554,0.1746
4,Asparagus,Individual Foods,ASPARAGUS,Pre-Packaged Fruit & Vegetables,21.0,3.92,2.36,2.076,0.0,2.076,0.296


In [8]:
# Load the preprocessed patient records and preview the first rows
patient_df = pd.read_csv(filepath_or_buffer='./preprocessed_data/patient_df.csv')
patient_df.head(5)

Unnamed: 0,ID,Diabetes_012,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,GenHlth,MentHlth,PhysHlth,Sex,Age,Glucose Value,Time Checked,GlucoseRank,Cluster
0,0,2.0,1,1.0,0,30.0,1.0,0.0,1.0,0,5.0,19.0,16.0,1,0.333333,96,2024-09-19T18:28:15,Norm,2
1,1,2.0,1,1.0,1,26.0,1.0,0.0,0.0,1,4.0,0.0,15.0,1,1.0,138,2024-10-30T12:38:16,High,2
2,2,2.0,0,1.0,0,38.0,1.0,0.0,0.0,0,4.0,30.0,30.0,1,0.833333,148,2024-09-23T18:23:26,High,1
3,3,2.0,0,0.0,0,24.0,0.0,0.0,0.0,1,2.0,0.0,10.0,1,0.833333,99,2024-10-31T04:58:18,Norm,2
4,4,2.0,0,1.0,1,21.0,1.0,0.0,1.0,1,3.0,0.0,0.0,0,0.75,161,2024-09-25T15:08:29,High,2


### Compute Scores for Recommendations

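While we already have a general score that captures the suitability of a meal/food item based on general diabetes nutrition recommendations, the patient-specific score computed here re-scores each item's nutrients (carbohydrates, sugars, fiber, proteins, sodium, and fats) against goals and weights that are adjusted by personalized factors (glucose rank, blood pressure, cholesterol, smoking, stroke/heart history, and BMI) to suggest the best meal options for each patient.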

In [18]:
# Patient-Specific Score Function
def patient_suitability_score(patient_data, food_data):
    """Score how well a single food item suits a single patient.

    Six nutrient sub-scores (carbohydrates, sugars, fiber, proteins, sodium,
    fats) are each computed against a nutrient goal and clipped to be
    non-negative, combined into a weighted average, and finally reduced by a
    BMI-based penalty. The goals and weights are adjusted according to the
    patient's glucose rank and health flags.

    Parameters
    ----------
    patient_data : mapping (e.g. dict or pandas Series)
        Must provide 'GlucoseRank', 'HighBP', 'HighChol', 'Smoker', 'Stroke',
        'HeartDiseaseorAttack' and 'BMI'. The flags are evaluated for
        truthiness; 'GlucoseRank' is compared against 'High' and 'Low'.
    food_data : mapping (e.g. dict or pandas Series)
        Must provide 'carbohydrates', 'sugars', 'fiber', 'proteins' and
        'fats'. 'sodium' is optional: when it is absent, None or NaN the
        sodium sub-score is treated as perfect (1).

    Returns
    -------
    float
        Suitability score rounded to 3 decimals, between 0 and 1.
    """
    # Patient Info
    glucose_rank = patient_data['GlucoseRank']
    high_bp = patient_data['HighBP']
    high_chol = patient_data['HighChol']
    smoker = patient_data['Smoker']
    stroke = patient_data['Stroke']
    heart = patient_data['HeartDiseaseorAttack']
    bmi = patient_data['BMI']

    # Food Info
    carbohydrates = food_data['carbohydrates']
    sugars = food_data['sugars']
    fiber = food_data['fiber']
    proteins = food_data['proteins']
    fat = food_data['fats']
    # Sodium isn't available for individual foods. A NaN value is treated the
    # same as a missing one; otherwise NaN would propagate through the
    # weighted sum and silently force the final score to 0.
    sodium = food_data.get('sodium', None)
    sodium_missing = sodium is None or pd.isna(sodium)

    # All arrays below share the same nutrient order:
    # [carbohydrates, sugars, fiber, proteins, sodium, fats]
    base_weight = np.array([0.45, 0.2, 0.15, 0.1, 0.05, 0.05])
    # Base nutrition goals
    goals = np.array([52.5, 7.5, 7.5, 20, 765, 25])

    # Glucose Level Adjustments (sugar goal)
    if glucose_rank == 'High':
        goals[1] = 2.5
    elif glucose_rank == 'Low':
        goals[1] = 1.5

    # Adjustments for high blood pressure
    # NOTE(review): this overwrites the 1.5 sugar goal set for 'Low' glucose
    # above -- confirm that is intended.
    if high_bp:
        goals[1] = 2.5  # Adjust sugar goal
        goals[4] = 500  # Adjust sodium goal
        base_weight += np.array([0.3, 0.25, 0.15, 0.05, 0.2, 0.05])

    # Adjustments for high cholesterol
    if high_chol:
        goals[2] = 10   # Adjust fiber goal
        goals[1] = 2.5  # Adjust sugar goal
        goals[5] = 15   # Adjust fat goal
        base_weight += np.array([0.3, 0.25, 0.25, 0.1, 0.05, 0.05])

    # Adjustments for smokers
    if smoker:
        base_weight += np.array([0.25, 0.25, 0.25, 0.1, 0.1, 0.05])

    # Adjustments for stroke or heart attacks
    if stroke or heart:
        base_weight += np.array([0.25, 0.15, 0.25, 0.1, 0.15, 0.1])

    # Normalize the accumulated weights so they sum to 1
    avg_weights = base_weight / base_weight.sum()

    # Sub-scores: carbs are scored by distance from the goal, fiber/protein
    # by how much of the goal is reached (capped at 1), and sugar/sodium/fat
    # by how far they stay below the goal.
    scores = np.array([
        1 - abs(carbohydrates - goals[0]) / goals[0],      # Carb Score
        1 - sugars / goals[1],                             # Sugar Score
        min(fiber / goals[2], 1),                          # Fiber Score
        min(proteins / goals[3], 1),                       # Protein Score
        1 if sodium_missing else 1 - (sodium / goals[4]),  # Sodium Score
        1 - fat / goals[5]                                 # Fat Score
    ])

    # Exceeding a goal can never contribute a negative amount
    scores = np.maximum(scores, 0)

    # Weighted Score
    final_score = float((avg_weights * scores).sum())

    # BMI Penalty: 0.025 per BMI point above 25. It is clamped at zero so a
    # BMI below 25 is not turned into a bonus that pushes the score above 1.
    bmi_penalty = 0.025 * max(0.0, bmi - 25)
    final_score = max(0.0, final_score - bmi_penalty)

    return round(float(final_score), 3)

#### Menu Data Patient Scores

In [26]:
# Accumulates one record (dict) per patient/menu-item pair; turned into a
# DataFrame at the end of the cell
menu_recs = []

# Output column name -> source column in patient_df
patient_columns = {
    'Patient_ID': 'ID',
    'HighBP': 'HighBP',
    'HighChol': 'HighChol',
    'CholCheck': 'CholCheck',
    'BMI': 'BMI',
    'Smoker': 'Smoker',
    'Stroke': 'Stroke',
    'HeartDiseaseorAttack': 'HeartDiseaseorAttack',
    'PhysActivity': 'PhysActivity',
    'GenHlth': 'GenHlth',
    'MentHlth': 'MentHlth',
    'PhysHlth': 'PhysHlth',
    'Sex': 'Sex',
    'Age': 'Age',
    'Glucose Value': 'Glucose Value',
    'Time Checked': 'Time Checked',
    'GlucoseRank': 'GlucoseRank',
    'Cluster': 'Cluster',
}

# Output column name -> source column in menu_df
menu_food_columns = {
    'Restaurant': 'restaurant_name',
    'Food_Name': 'food_name',
    'Calories': 'calories',
    'Carbohydrates': 'carbohydrates',
    'Sugars': 'sugars',
    'Fats': 'fats',
    'Saturated_Fats': 'saturated_fats',
    'Cholesterol': 'cholesterol',
    'Sodium': 'sodium',
    'Fiber': 'fiber',
    'Potassium': 'potassium',
    'Proteins': 'proteins',
    'General_Score': 'score',
}

# Score every patient against every menu item
for _, patient in patient_df.iterrows():
    # The patient fields are the same for every food, so read them once
    patient_fields = {
        out_col: patient[src_col] for out_col, src_col in patient_columns.items()
    }

    for _, food in menu_df.iterrows():
        score = patient_suitability_score(patient, food)

        # Patient columns first, then food columns, then the patient score
        record = dict(patient_fields)
        record.update(
            (out_col, food[src_col]) for out_col, src_col in menu_food_columns.items()
        )
        record['Patient_Score'] = score
        menu_recs.append(record)

# Convert to pandas DataFrame
menu_recs_df = pd.DataFrame(menu_recs)

In [28]:
# Preview the first 5 patient/menu-item records
menu_recs_df.head(5)

Unnamed: 0,Patient_ID,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,GenHlth,...,Sugars,Fats,Saturated_Fats,Cholesterol,Sodium,Fiber,Potassium,Proteins,General_Score,Patient_Score
0,0,1,1.0,0,30.0,1.0,0.0,1.0,0,5.0,...,0.56,0.14,0.02,0.0,3.76,0.99,116.09,0.58,0.308,0.247
1,0,1,1.0,0,30.0,1.0,0.0,1.0,0,5.0,...,7.16,28.66,14.0,95.52,1176.09,2.39,443.77,30.27,0.519,0.247
2,0,1,1.0,0,30.0,1.0,0.0,1.0,0,5.0,...,7.16,26.56,10.52,122.04,791.0,0.03,569.52,34.28,0.498,0.203
3,0,1,1.0,0,30.0,1.0,0.0,1.0,0,5.0,...,17.25,0.0,0.0,0.0,0.84,0.04,10.92,0.06,0.166,0.148
4,0,1,1.0,0,30.0,1.0,0.0,1.0,0,5.0,...,7.16,3.88,0.85,23.6,175.6,0.03,52.8,2.56,0.135,0.07


#### Individual Food Patient Scores

In [32]:
# Accumulates one record (dict) per patient/individual-food pair; turned into
# a DataFrame at the end of the cell
individual_food_recs = []

# Output column name -> source column in patient_df
patient_columns = {
    'Patient_ID': 'ID',
    'HighBP': 'HighBP',
    'HighChol': 'HighChol',
    'CholCheck': 'CholCheck',
    'BMI': 'BMI',
    'Smoker': 'Smoker',
    'Stroke': 'Stroke',
    'HeartDiseaseorAttack': 'HeartDiseaseorAttack',
    'PhysActivity': 'PhysActivity',
    'GenHlth': 'GenHlth',
    'MentHlth': 'MentHlth',
    'PhysHlth': 'PhysHlth',
    'Sex': 'Sex',
    'Age': 'Age',
    'Glucose Value': 'Glucose Value',
    'Time Checked': 'Time Checked',
    'GlucoseRank': 'GlucoseRank',
    'Cluster': 'Cluster',
}

# Output column name -> source column in individual_foods_df.
# A source of None marks a column that exists only for menu items; it is
# filled with None so both recommendation tables share the same columns.
individual_food_columns = {
    'Restaurant': None,
    'Food_Name': 'food_name',
    'Calories': 'calories',
    'Carbohydrates': 'carbohydrates',
    'Sugars': 'sugars',
    'Fats': 'fats',
    'Saturated_Fats': None,
    'Cholesterol': None,
    'Sodium': None,
    'Fiber': 'fiber',
    'Potassium': None,
    'Proteins': 'proteins',
    'General_Score': 'score',
}

# Score every patient against every individual food
for _, patient in patient_df.iterrows():
    # The patient fields are the same for every food, so read them once
    patient_fields = {
        out_col: patient[src_col] for out_col, src_col in patient_columns.items()
    }

    for _, food in individual_foods_df.iterrows():
        score = patient_suitability_score(patient, food)

        # Patient columns first, then food columns, then the patient score
        record = dict(patient_fields)
        record.update(
            (out_col, None if src_col is None else food[src_col])
            for out_col, src_col in individual_food_columns.items()
        )
        record['Patient_Score'] = score
        individual_food_recs.append(record)


# Convert to pandas DataFrame
individual_food_recs_df = pd.DataFrame(individual_food_recs)

In [34]:
# Preview the first 5 patient/individual-food records
individual_food_recs_df.head(5)

Unnamed: 0,Patient_ID,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,GenHlth,...,Sugars,Fats,Saturated_Fats,Cholesterol,Sodium,Fiber,Potassium,Proteins,General_Score,Patient_Score
0,0,1,1.0,0,30.0,1.0,0.0,1.0,0,5.0,...,6.992,50.98,,,,11.0,,21.32,0.4612,0.418
1,0,1,1.0,0,30.0,1.0,0.0,1.0,0,5.0,...,6.348,52.08,,,,9.22,,21.18,0.4304,0.385
2,0,1,1.0,0,30.0,1.0,0.0,1.0,0,5.0,...,3.142,51.9,,,,11.9,,20.66,0.5462,0.392
3,0,1,1.0,0,30.0,1.0,0.0,1.0,0,5.0,...,10.48,0.0,,,,2.76,,0.554,0.1746,0.189
4,0,1,1.0,0,30.0,1.0,0.0,1.0,0,5.0,...,2.076,0.0,,,,2.36,,2.076,0.296,0.164


#### Recommend Top 5 Meals and Top 5 Foods per Patient

In [36]:
# Top 5 menu items per patient, ranked by Patient_Score (highest first).
# The row labels of each patient's 5 largest scores are looked up first and
# the full rows are then selected by label. This avoids DataFrameGroupBy.apply
# on a frame that still contains the grouping column, which recent pandas
# versions deprecate, so Patient_ID is always kept in the result.
# Relies on menu_recs_df having unique row labels (it is built with the
# default RangeIndex).
top_menu_idx = (
    menu_recs_df.groupby('Patient_ID')['Patient_Score']
    .nlargest(5)
    .index.get_level_values(-1)  # last level holds the original row labels
)
top_menu_recs = menu_recs_df.loc[top_menu_idx].reset_index(drop=True)

# Display the result
top_menu_recs.head()

Unnamed: 0,Patient_ID,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,GenHlth,...,Sugars,Fats,Saturated_Fats,Cholesterol,Sodium,Fiber,Potassium,Proteins,General_Score,Patient_Score
0,0,1,1.0,0,30.0,1.0,0.0,1.0,0,5.0,...,0.55,0.93,0.24,0.0,1.72,14.96,610.6,15.24,0.843,0.732
1,0,1,1.0,0,30.0,1.0,0.0,1.0,0,5.0,...,0.55,0.93,0.24,0.0,1.72,14.96,610.6,15.24,0.843,0.732
2,0,1,1.0,0,30.0,1.0,0.0,1.0,0,5.0,...,0.55,0.93,0.24,0.0,1.72,14.96,610.6,15.24,0.843,0.732
3,0,1,1.0,0,30.0,1.0,0.0,1.0,0,5.0,...,0.98,1.23,0.32,0.0,1632.96,29.34,1309.77,25.64,0.791,0.568
4,0,1,1.0,0,30.0,1.0,0.0,1.0,0,5.0,...,3.56,0.75,0.1,0.0,3.96,15.64,730.62,17.86,0.727,0.567


In [38]:
# Top 5 individual foods per patient, ranked by Patient_Score (highest first).
# The row labels of each patient's 5 largest scores are looked up first and
# the full rows are then selected by label. This avoids DataFrameGroupBy.apply
# on a frame that still contains the grouping column, which recent pandas
# versions deprecate, so Patient_ID is always kept in the result.
# Relies on individual_food_recs_df having unique row labels (it is built
# with the default RangeIndex).
top_foods_idx = (
    individual_food_recs_df.groupby('Patient_ID')['Patient_Score']
    .nlargest(5)
    .index.get_level_values(-1)  # last level holds the original row labels
)
top_foods_recs = individual_food_recs_df.loc[top_foods_idx].reset_index(drop=True)

# Display the result
top_foods_recs.head()

Unnamed: 0,Patient_ID,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,GenHlth,...,Sugars,Fats,Saturated_Fats,Cholesterol,Sodium,Fiber,Potassium,Proteins,General_Score,Patient_Score
0,0,1,1.0,0,30.0,1.0,0.0,1.0,0,5.0,...,0.726,0.702,,,,15.4,,19.804,0.7916,0.794
1,0,1,1.0,0,30.0,1.0,0.0,1.0,0,5.0,...,0.0,30.5,,,,33.3,,16.7,0.894,0.745
2,0,1,1.0,0,30.0,1.0,0.0,1.0,0,5.0,...,0.684,1.158,,,,9.12,,6.678,0.5796,0.719
3,0,1,1.0,0,30.0,1.0,0.0,1.0,0,5.0,...,0.44,2.296,,,,5.12,,10.208,0.7128,0.673
4,0,1,1.0,0,30.0,1.0,0.0,1.0,0,5.0,...,0.0,40.54,,,,27.44,,17.66,0.77,0.669


In [40]:
# Stack the menu and individual-food recommendation tables into a single
# frame with a fresh 0..n-1 index.
# NOTE(review): this combines every scored patient/food pair, not the
# top_menu_recs / top_foods_recs tables -- confirm that is intended.
all_rec_frames = [menu_recs_df, individual_food_recs_df]
final_recs = pd.concat(all_rec_frames, ignore_index=True)

# Display the result
final_recs.head(5)

Unnamed: 0,Patient_ID,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,GenHlth,...,Sugars,Fats,Saturated_Fats,Cholesterol,Sodium,Fiber,Potassium,Proteins,General_Score,Patient_Score
0,0,1,1.0,0,30.0,1.0,0.0,1.0,0,5.0,...,0.56,0.14,0.02,0.0,3.76,0.99,116.09,0.58,0.308,0.247
1,0,1,1.0,0,30.0,1.0,0.0,1.0,0,5.0,...,7.16,28.66,14.0,95.52,1176.09,2.39,443.77,30.27,0.519,0.247
2,0,1,1.0,0,30.0,1.0,0.0,1.0,0,5.0,...,7.16,26.56,10.52,122.04,791.0,0.03,569.52,34.28,0.498,0.203
3,0,1,1.0,0,30.0,1.0,0.0,1.0,0,5.0,...,17.25,0.0,0.0,0.0,0.84,0.04,10.92,0.06,0.166,0.148
4,0,1,1.0,0,30.0,1.0,0.0,1.0,0,5.0,...,7.16,3.88,0.85,23.6,175.6,0.03,52.8,2.56,0.135,0.07


In [48]:
# Save the recommendations DataFrame as a single CSV entry inside a
# deflate-compressed zip archive in the repo's preprocessed data folder
data_dir = './preprocessed_data/'
zip_file_path = data_dir + 'final_recs.zip'

# Serialize the DataFrame to CSV text in memory (to_csv returns the CSV as a
# string when no target is given); the row index is not written
csv_text = final_recs.to_csv(index=False)

# Store the CSV text in the archive under the name 'final_recs.csv'
with zipfile.ZipFile(zip_file_path, mode='w', compression=zipfile.ZIP_DEFLATED) as archive:
    archive.writestr('final_recs.csv', csv_text)