# Scoring and Preparation for Modeling

### Imports

In [12]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

#### Read in Preprocessed Data

In [3]:
# Restaurant menu data: load the preprocessed CSV and preview the first rows
menu_csv_path = './Data Library/preprocessed_data/menu_df.csv'
menu_df = pd.read_csv(menu_csv_path)
menu_df.head(5)

Unnamed: 0,restaurant_name,food_name,serving_size,serving_unit,calories,carbohydrates,sugars,fats,saturated_fats,cholesterol,sodium,fiber,potassium,proteins,carb_percent,fat_percent,protein_percent,score
0,McDonald's Canada,Egg BLT McMuffin with Shredded Lettuce (McDona...,1.0,Serving,7.99,1.55,0.56,0.14,0.02,0.0,3.76,0.99,116.09,0.58,77.596996,15.769712,29.036295,0.308
1,McDonald's,Cheeseburger,1.0,Serving,535.31,39.24,7.16,28.66,14.0,95.52,1176.09,2.39,443.77,30.27,29.321328,48.185164,22.61867,0.519
2,McDonald's,Hamburger,1.0,Serving,540.14,40.27,7.16,26.56,10.52,122.04,791.0,0.03,569.52,34.28,29.821898,44.255193,25.386011,0.498
3,McDonald's,Honey,1.0,Serving,63.84,17.3,17.25,0.0,0.0,0.0,0.84,0.04,10.92,0.06,108.39599,0.0,0.37594,0.166
4,McDonald's,Hotcakes,1.0,Serving,90.8,11.32,7.16,3.88,0.85,23.6,175.6,0.03,52.8,2.56,49.867841,38.45815,11.277533,0.135


In [13]:
# Individual food data: load the preprocessed CSV and preview the first rows
individual_foods_csv_path = './Data Library/preprocessed_data/individual_foods_df.csv'
individual_foods_df = pd.read_csv(individual_foods_csv_path)

individual_foods_df.head(5)

Unnamed: 0,food_name,category,description,food_category,calories,carbohydrates,fiber,sugars,fats,proteins,score
0,Almond Butter,Individual Foods,ALMOND BUTTER,Nut & Seed Butters,587.8,22.5,11.0,6.992,50.98,21.32,0.4612
1,Almond Flour,Individual Foods,ALMOND FLOUR,Flours & Corn Meal,604.8,19.64,9.22,6.348,52.08,21.18,0.4304
2,Almonds,Individual Foods,ALMONDS,"Popcorn, Peanuts, Seeds & Related Snacks",594.8,18.12,11.9,3.142,51.9,20.66,0.5462
3,Apples,Individual Foods,APPLES,Pre-Packaged Fruit & Vegetables,52.8,14.18,2.76,10.48,0.0,0.554,0.1746
4,Asparagus,Individual Foods,ASPARAGUS,Pre-Packaged Fruit & Vegetables,21.0,3.92,2.36,2.076,0.0,2.076,0.296


In [14]:
# Patient data: load the preprocessed CSV and preview the first rows
patient_csv_path = './Data Library/preprocessed_data/patient_df.csv'
patient_df = pd.read_csv(patient_csv_path)
patient_df.head(5)

Unnamed: 0,ID,Diabetes_012,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,GenHlth,MentHlth,PhysHlth,Sex,Age,Glucose Value,Time Checked,GlucoseRank,Cluster
0,0,2.0,1,1.0,0,30.0,1.0,0.0,1.0,0,5.0,19.0,16.0,1,0.333333,96,2024-09-19T18:28:15,Norm,2
1,1,2.0,1,1.0,1,26.0,1.0,0.0,0.0,1,4.0,0.0,15.0,1,1.0,138,2024-10-30T12:38:16,High,2
2,2,2.0,0,1.0,0,38.0,1.0,0.0,0.0,0,4.0,30.0,30.0,1,0.833333,148,2024-09-23T18:23:26,High,1
3,3,2.0,0,0.0,0,24.0,0.0,0.0,0.0,1,2.0,0.0,10.0,1,0.833333,99,2024-10-31T04:58:18,Norm,2
4,4,2.0,0,1.0,1,21.0,1.0,0.0,1.0,1,3.0,0.0,0.0,0,0.75,161,2024-09-25T15:08:29,High,2


### Compute Scores for Recommendations

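While we already have a general score that captures the suitability of a meal/food item based on general diabetes nutrition recommendations, the patient-specific score computed here re-evaluates each item's nutrients (carbohydrates, sugars, fiber, protein, sodium, and fat) against goals and weights that are adjusted for the patient's personal health factors, so that the best meal options can be suggested for each person.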

In [15]:
# Patient-Specific Score Function
def patient_suitability_score(patient_data, food_data):
    """Score how suitable one food item is for one patient.

    Six nutrient sub-scores (carbohydrates, sugars, fiber, proteins, sodium,
    fats) are computed against nutrition goals, combined using weights that
    depend on the patient's conditions, and finally shifted by a BMI term.

    Args:
        patient_data: Mapping-like row (dict or pandas Series) providing
            'GlucoseRank', 'HighBP', 'HighChol', 'Smoker', 'Stroke',
            'HeartDiseaseorAttack' and 'BMI'.
        food_data: Mapping-like row supporting ``[]`` and ``.get`` that
            provides 'carbohydrates', 'sugars', 'fiber', 'proteins' and
            'fats', and optionally 'sodium'. A missing, ``None`` or NaN
            sodium value gets full credit for the sodium sub-score.

    Returns:
        The score rounded to 3 decimals. It is never below 0; because the
        BMI term is negative for BMI below 25, it can exceed 1.
    """
    # Patient Info
    glucose_rank = patient_data['GlucoseRank']
    high_bp = patient_data['HighBP']
    high_chol = patient_data['HighChol']
    smoker = patient_data['Smoker']
    stroke = patient_data['Stroke']
    heart = patient_data['HeartDiseaseorAttack']
    bmi = patient_data['BMI']

    # Food Info
    carbohydrates = food_data['carbohydrates']
    sugars = food_data['sugars']
    fiber = food_data['fiber']
    proteins = food_data['proteins']
    fat = food_data['fats']
    # Sodium isn't available for individual foods. A NaN cell is treated the
    # same as an absent one: otherwise the NaN would propagate through the
    # weighted sum and the final max(0, nan) would silently return 0.
    sodium = food_data.get('sodium', None)
    sodium_missing = sodium is None or np.isnan(sodium)

    # Base weights (carbs, sugar, fiber, protein, sodium, fat)
    base_weight = np.array([0.45, 0.2, 0.15, 0.1, 0.05, 0.05])
    # Base nutrition goals (carbs, sugar, fiber, protein, sodium, fat)
    goals = np.array([52.5, 7.5, 7.5, 20, 765, 25])

    # Glucose Level Adjustments
    # NOTE(review): the blood-pressure / cholesterol branches below overwrite
    # the sugar goal with 2.5, so the 'Low' value of 1.5 is lost for those
    # patients -- confirm this precedence is intended.
    if glucose_rank == 'High':
        goals[1] = 2.5
    elif glucose_rank == 'Low':
        goals[1] = 1.5

    # Adjustments for high blood pressure
    if high_bp:
        goals[1] = 2.5  # Adjust sugar goal
        goals[4] = 500  # Adjust sodium goal
        base_weight += np.array([0.3, 0.25, 0.15, 0.05, 0.2, 0.05])

    # Adjustments for high cholesterol
    if high_chol:
        goals[1] = 2.5  # Adjust sugar goal
        goals[2] = 10   # Adjust fiber goal
        goals[5] = 15   # Adjust fat goal
        base_weight += np.array([0.3, 0.25, 0.25, 0.1, 0.05, 0.05])

    # Adjustments for smokers
    if smoker:
        base_weight += np.array([0.25, 0.25, 0.25, 0.1, 0.1, 0.05])

    # Adjustments for stroke or heart attacks
    if stroke or heart:
        base_weight += np.array([0.25, 0.15, 0.25, 0.1, 0.15, 0.1])

    # Normalise the accumulated weights so they sum to 1
    avg_weights = base_weight / base_weight.sum()

    # Sub-scores: carbs are scored by distance from the goal, sugar / sodium /
    # fat by how far below the goal they stay, fiber / protein by how much of
    # the goal is reached (capped at 1).
    scores = np.array([
        1 - abs(carbohydrates - goals[0]) / goals[0],
        1 - sugars / goals[1],
        min(fiber / goals[2], 1),
        min(proteins / goals[3], 1),
        1 if sodium_missing else 1 - (sodium / goals[4]),
        1 - fat / goals[5]
    ])

    # Overshooting a goal cannot contribute a negative sub-score
    scores = np.maximum(scores, 0)

    # Weighted Score
    final_score = (avg_weights * scores).sum()

    # BMI Penalty
    # NOTE(review): for BMI below 25 this term is negative and therefore
    # raises the score -- confirm a bonus is intended.
    bmi_penalty = 0.025 * (bmi - 25)
    final_score = max(0, final_score - bmi_penalty)

    return round(final_score, 3)

#### Menu Data Patient Scores

In [None]:
# Accumulate one record (dict) per patient/menu-item pair in a plain list
menu_recs = []

# Output column name -> source column in patient_df
_menu_patient_fields = {
    'Patient_ID': 'ID',
    'HighBP': 'HighBP',
    'HighChol': 'HighChol',
    'CholCheck': 'CholCheck',
    'BMI': 'BMI',
    'Smoker': 'Smoker',
    'Stroke': 'Stroke',
    'HeartDiseaseorAttack': 'HeartDiseaseorAttack',
    'PhysActivity': 'PhysActivity',
    'GenHlth': 'GenHlth',
    'MentHlth': 'MentHlth',
    'PhysHlth': 'PhysHlth',
    'Sex': 'Sex',
    'Age': 'Age',
    'Glucose Value': 'Glucose Value',
    'Time Checked': 'Time Checked',
    'GlucoseRank': 'GlucoseRank',
    'Cluster': 'Cluster',
}

# Output column name -> source column in menu_df
_menu_food_fields = {
    'Restaurant': 'restaurant_name',
    'Food_Name': 'food_name',
    'Calories': 'calories',
    'Carbohydrates': 'carbohydrates',
    'Sugars': 'sugars',
    'Fats': 'fats',
    'Saturated_Fats': 'saturated_fats',
    'Cholesterol': 'cholesterol',
    'Sodium': 'sodium',
    'Fiber': 'fiber',
    'Potassium': 'potassium',
    'Proteins': 'proteins',
    'General_Score': 'score',
}

# Score every patient against every menu item
for _, patient in patient_df.iterrows():
    # The patient columns are identical for all of this patient's records
    patient_part = {
        out_col: patient[src_col]
        for out_col, src_col in _menu_patient_fields.items()
    }

    for _, food in menu_df.iterrows():
        score = patient_suitability_score(patient, food)

        record = dict(patient_part)
        for out_col, src_col in _menu_food_fields.items():
            record[out_col] = food[src_col]
        record['Patient_Score'] = score
        menu_recs.append(record)

# Turn the collected records into a pandas DataFrame
menu_recs_df = pd.DataFrame(menu_recs)

In [22]:
# Preview the first 5 patient/menu-item records
menu_recs_df.head(5)

Unnamed: 0,Patient_ID,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,GenHlth,...,Sugars,Fats,Saturated_Fats,Cholesterol,Sodium,Fiber,Potassium,Proteins,General_Score,Patient_Score
0,0,1,1.0,0,30.0,1.0,0.0,1.0,0,5.0,...,0.56,0.14,0.02,0.0,3.76,0.99,116.09,0.58,0.308,0.247
1,0,1,1.0,0,30.0,1.0,0.0,1.0,0,5.0,...,7.16,28.66,14.0,95.52,1176.09,2.39,443.77,30.27,0.519,0.247
2,0,1,1.0,0,30.0,1.0,0.0,1.0,0,5.0,...,7.16,26.56,10.52,122.04,791.0,0.03,569.52,34.28,0.498,0.203
3,0,1,1.0,0,30.0,1.0,0.0,1.0,0,5.0,...,17.25,0.0,0.0,0.0,0.84,0.04,10.92,0.06,0.166,0.148
4,0,1,1.0,0,30.0,1.0,0.0,1.0,0,5.0,...,7.16,3.88,0.85,23.6,175.6,0.03,52.8,2.56,0.135,0.07


#### Individual Food Patient Scores

In [16]:
# Accumulate one record (dict) per patient/individual-food pair in a plain list
individual_food_recs = []

# Output column name -> source column in patient_df
_food_patient_fields = {
    'Patient_ID': 'ID',
    'HighBP': 'HighBP',
    'HighChol': 'HighChol',
    'CholCheck': 'CholCheck',
    'BMI': 'BMI',
    'Smoker': 'Smoker',
    'Stroke': 'Stroke',
    'HeartDiseaseorAttack': 'HeartDiseaseorAttack',
    'PhysActivity': 'PhysActivity',
    'GenHlth': 'GenHlth',
    'MentHlth': 'MentHlth',
    'PhysHlth': 'PhysHlth',
    'Sex': 'Sex',
    'Age': 'Age',
    'Glucose Value': 'Glucose Value',
    'Time Checked': 'Time Checked',
    'GlucoseRank': 'GlucoseRank',
    'Cluster': 'Cluster',
}

# Output column name -> source column in individual_foods_df
_individual_food_fields = {
    'Food_Name': 'food_name',
    'Calories': 'calories',
    'Carbohydrates': 'carbohydrates',
    'Sugars': 'sugars',
    'Fats': 'fats',
    'Fiber': 'fiber',
    'Proteins': 'proteins',
    'General_Score': 'score',
}

# Score every patient against every individual food
for _, patient in patient_df.iterrows():
    # The patient columns are identical for all of this patient's records
    patient_part = {
        out_col: patient[src_col]
        for out_col, src_col in _food_patient_fields.items()
    }

    for _, food in individual_foods_df.iterrows():
        score = patient_suitability_score(patient, food)

        record = dict(patient_part)
        for out_col, src_col in _individual_food_fields.items():
            record[out_col] = food[src_col]
        record['Patient_Score'] = score
        individual_food_recs.append(record)


# Turn the collected records into a pandas DataFrame
individual_food_recs_df = pd.DataFrame(individual_food_recs)

In [17]:
# Preview the first 5 patient/individual-food records
individual_food_recs_df.head(5)

Unnamed: 0,Patient_ID,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,GenHlth,...,Cluster,Food_Name,Calories,Carbohydrates,Sugars,Fats,Fiber,Proteins,General_Score,Patient_Score
0,0,1,1.0,0,30.0,1.0,0.0,1.0,0,5.0,...,2,Almond Butter,587.8,22.5,6.992,50.98,11.0,21.32,0.4612,0.418
1,0,1,1.0,0,30.0,1.0,0.0,1.0,0,5.0,...,2,Almond Flour,604.8,19.64,6.348,52.08,9.22,21.18,0.4304,0.385
2,0,1,1.0,0,30.0,1.0,0.0,1.0,0,5.0,...,2,Almonds,594.8,18.12,3.142,51.9,11.9,20.66,0.5462,0.392
3,0,1,1.0,0,30.0,1.0,0.0,1.0,0,5.0,...,2,Apples,52.8,14.18,10.48,0.0,2.76,0.554,0.1746,0.189
4,0,1,1.0,0,30.0,1.0,0.0,1.0,0,5.0,...,2,Asparagus,21.0,3.92,2.076,0.0,2.36,2.076,0.296,0.164


#### Recommend Top 5 Meals and Top 5 Foods per Patient

In [36]:
# For each patient, keep the 5 menu items with the highest Patient_Score.
# The ranking is done on the score column only and the winning row labels are
# then looked up in the full frame; this avoids groupby.apply operating on the
# grouping column (deprecated in recent pandas) while keeping Patient_ID in
# the result. Rows come out ordered by Patient_ID, then by descending score.
# Relies on menu_recs_df having unique row labels (default RangeIndex).
top_menu_idx = (
    menu_recs_df.groupby('Patient_ID')['Patient_Score']
    .nlargest(5)
    .index.get_level_values(-1)  # innermost level = original row label
)
top_menu_recs = menu_recs_df.loc[top_menu_idx].reset_index(drop=True)

# Display the result
top_menu_recs.head()

Unnamed: 0,Patient_ID,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,GenHlth,...,Sugars,Fats,Saturated_Fats,Cholesterol,Sodium,Fiber,Potassium,Proteins,General_Score,Patient_Score
0,0,1,1.0,0,30.0,1.0,0.0,1.0,0,5.0,...,0.55,0.93,0.24,0.0,1.72,14.96,610.6,15.24,0.843,0.732
1,0,1,1.0,0,30.0,1.0,0.0,1.0,0,5.0,...,0.55,0.93,0.24,0.0,1.72,14.96,610.6,15.24,0.843,0.732
2,0,1,1.0,0,30.0,1.0,0.0,1.0,0,5.0,...,0.55,0.93,0.24,0.0,1.72,14.96,610.6,15.24,0.843,0.732
3,0,1,1.0,0,30.0,1.0,0.0,1.0,0,5.0,...,0.98,1.23,0.32,0.0,1632.96,29.34,1309.77,25.64,0.791,0.568
4,0,1,1.0,0,30.0,1.0,0.0,1.0,0,5.0,...,3.56,0.75,0.1,0.0,3.96,15.64,730.62,17.86,0.727,0.567


In [18]:
# For each patient, keep the 5 individual foods with the highest Patient_Score.
# The ranking is done on the score column only and the winning row labels are
# then looked up in the full frame; this avoids groupby.apply operating on the
# grouping column (deprecated in recent pandas) while keeping Patient_ID in
# the result. Rows come out ordered by Patient_ID, then by descending score.
# Relies on individual_food_recs_df having unique row labels (default RangeIndex).
top_foods_idx = (
    individual_food_recs_df.groupby('Patient_ID')['Patient_Score']
    .nlargest(5)
    .index.get_level_values(-1)  # innermost level = original row label
)
top_foods_recs = individual_food_recs_df.loc[top_foods_idx].reset_index(drop=True)

# Display the result
top_foods_recs.head()

  top_foods_recs = individual_food_recs_df.groupby('Patient_ID').apply(


Unnamed: 0,Patient_ID,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,GenHlth,...,Cluster,Food_Name,Calories,Carbohydrates,Sugars,Fats,Fiber,Proteins,General_Score,Patient_Score
0,0,1,1.0,0,30.0,1.0,0.0,1.0,0,5.0,...,2,Lentils,233.0,50.18,0.726,0.702,15.4,19.804,0.7916,0.794
1,0,1,1.0,0,30.0,1.0,0.0,1.0,0,5.0,...,2,Chia seeds,488.0,43.1,0.0,30.5,33.3,16.7,0.894,0.745
2,0,1,1.0,0,30.0,1.0,0.0,1.0,0,5.0,...,2,Barley,593.5,50.38,0.684,1.158,9.12,6.678,0.5796,0.719
3,0,1,1.0,0,30.0,1.0,0.0,1.0,0,5.0,...,2,Farro,263.4,53.78,0.44,2.296,5.12,10.208,0.7128,0.673
4,0,1,1.0,0,30.0,1.0,0.0,1.0,0,5.0,...,2,Flax seeds,487.6,29.58,0.0,40.54,27.44,17.66,0.77,0.669


In [32]:
# Stratified sample of the menu recommendation data

# Use the 1% "test" side of a split stratified on Patient_ID as the sample;
# the remaining 99% is discarded
menu_patient_strata = menu_recs_df['Patient_ID']
_, menu_recs_samp = train_test_split(
    menu_recs_df,
    stratify=menu_patient_strata,
    test_size=0.01,
    random_state=42,
)

# Report the sample's dimensions and preview it
print(menu_recs_samp.shape)
menu_recs_samp.head(5)

(304754, 32)


Unnamed: 0,Patient_ID,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,GenHlth,...,Sugars,Fats,Saturated_Fats,Cholesterol,Sodium,Fiber,Potassium,Proteins,General_Score,Patient_Score
2957113,2290,1,0.0,1,36.0,1.0,0.0,0.0,1,4.0,...,1.45,28.07,12.26,230.14,1522.31,2.68,835.55,28.72,0.69,0.18
4598441,3561,1,1.0,0,27.0,1.0,1.0,1.0,1,4.0,...,0.09,11.76,2.7,46.75,78.97,0.1,364.69,20.63,0.6,0.485
19891142,15407,1,1.0,1,25.0,0.0,0.0,1.0,1,2.0,...,47.88,40.8,8.6,0.0,915.46,4.91,340.34,11.07,0.121,0.147
12000552,9295,0,1.0,0,35.0,1.0,1.0,1.0,1,5.0,...,4.6,4.05,2.22,6.36,8.16,0.3,35.64,0.75,0.175,0.0
16939183,13120,0,0.0,0,32.0,1.0,0.0,0.0,1,3.0,...,1.9,10.34,2.83,7.34,626.54,10.89,628.21,10.82,0.682,0.389


In [19]:
# Stratified sample of the individual food recommendation data

# Use the 1% "test" side of a split stratified on Patient_ID as the sample;
# the remaining 99% is discarded
food_patient_strata = individual_food_recs_df['Patient_ID']
_, individual_foods_samp = train_test_split(
    individual_food_recs_df,
    stratify=food_patient_strata,
    test_size=0.01,
    random_state=42,
)

# Report the sample's dimensions and preview it
print(individual_foods_samp.shape)
individual_foods_samp.head(5)

(32577, 27)


Unnamed: 0,Patient_ID,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,GenHlth,...,Cluster,Food_Name,Calories,Carbohydrates,Sugars,Fats,Fiber,Proteins,General_Score,Patient_Score
2996667,21714,0,0.0,0,32.0,1.0,0.0,0.0,1,3.0,...,0,Wild Rice,318.8,67.38,1.002,1.012,5.38,12.36,0.6688,0.539
798839,5788,1,1.0,1,32.0,1.0,0.0,1.0,1,4.0,...,2,Pumpkin seeds,591.75,12.475,0.8325,49.175,5.825,29.975,0.55475,0.368
589253,4269,0,0.0,1,34.0,0.0,0.0,1.0,1,4.0,...,1,White Rice,351.8,80.24,0.0,0.0,0.78,7.098,0.5816,0.346
1471899,10665,1,1.0,0,36.0,0.0,1.0,1.0,0,4.0,...,3,Wheat Bread,262.8,50.2,5.078,3.216,3.52,10.166,0.6712,0.312
3061159,22182,1,1.0,0,26.0,1.0,1.0,1.0,0,5.0,...,2,Eggplant,38.4,7.584,3.328,0.292,2.59,0.8784,0.2782,0.247


In [None]:
# Write both sampled recommendation DataFrames to CSV files in the repo,
# without the row index
data_dir = './Data Library/preprocessed_data/'
for sample_df, csv_name in (
    (menu_recs_samp, 'menu_recs_samp.csv'),
    (individual_foods_samp, 'individual_foods_samp.csv'),
):
    sample_df.to_csv(data_dir + csv_name, index=False)