# Modeling

### Imports

In [1]:
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder, StandardScaler

In [2]:
import os
os.chdir("C:/Users/tarad/OneDrive/Documents/USD_GRAD_SCHOOL-C/ADS599_CaptsoneProject/CapStoneProject/MADS-Capstone")


### Loading Data

In [3]:
# Load menu food CSV file
menu_recs_samp = pd.read_csv('./preprocessed_data/menu_recs_samp.csv')
menu_recs_samp.head()

Unnamed: 0,Patient_ID,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,GenHlth,...,Sugars,Fats,Saturated_Fats,Cholesterol,Sodium,Fiber,Potassium,Proteins,General_Score,Patient_Score
0,2290,1,0.0,1,36.0,1.0,0.0,0.0,1,4.0,...,1.45,28.07,12.26,230.14,1522.31,2.68,835.55,28.72,0.69,0.18
1,3561,1,1.0,0,27.0,1.0,1.0,1.0,1,4.0,...,0.09,11.76,2.7,46.75,78.97,0.1,364.69,20.63,0.6,0.485
2,15407,1,1.0,1,25.0,0.0,0.0,1.0,1,2.0,...,47.88,40.8,8.6,0.0,915.46,4.91,340.34,11.07,0.121,0.147
3,9295,0,1.0,0,35.0,1.0,1.0,1.0,1,5.0,...,4.6,4.05,2.22,6.36,8.16,0.3,35.64,0.75,0.175,0.0
4,13120,0,0.0,0,32.0,1.0,0.0,0.0,1,3.0,...,1.9,10.34,2.83,7.34,626.54,10.89,628.21,10.82,0.682,0.389


In [4]:
# Load individual food CSV file
individual_foods_samp = pd.read_csv('./preprocessed_data/individual_foods_samp.csv')
individual_foods_samp.head()

Unnamed: 0,Patient_ID,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,GenHlth,...,Cluster,Food_Name,Calories,Carbohydrates,Sugars,Fats,Fiber,Proteins,General_Score,Patient_Score
0,21714,0,0.0,0,32.0,1.0,0.0,0.0,1,3.0,...,0,Wild Rice,318.8,67.38,1.002,1.012,5.38,12.36,0.6688,0.539
1,5788,1,1.0,1,32.0,1.0,0.0,1.0,1,4.0,...,2,Pumpkin seeds,591.75,12.475,0.8325,49.175,5.825,29.975,0.55475,0.368
2,4269,0,0.0,1,34.0,0.0,0.0,1.0,1,4.0,...,1,White Rice,351.8,80.24,0.0,0.0,0.78,7.098,0.5816,0.346
3,10665,1,1.0,0,36.0,0.0,1.0,1.0,0,4.0,...,3,Wheat Bread,262.8,50.2,5.078,3.216,3.52,10.166,0.6712,0.312
4,22182,1,1.0,0,26.0,1.0,1.0,1.0,0,5.0,...,2,Eggplant,38.4,7.584,3.328,0.292,2.59,0.8784,0.2782,0.247


### Recommendations Column

In [5]:
# Calculate the median of Patient_Score for menu foods 
median_score = menu_recs_samp['Patient_Score'].median()

print(f"The median Patient_Score is: {median_score}")

The median Patient_Score is: 0.222


In [6]:
# Create a new column for recommendation labels for menu foods
menu_recs_samp['Recommendation'] = menu_recs_samp['Patient_Score'].apply(
    lambda x: 'Recommended' if x >= median_score else 'Not Recommended'
)

# Check the updated dataset
menu_recs_samp.head()


Unnamed: 0,Patient_ID,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,GenHlth,...,Fats,Saturated_Fats,Cholesterol,Sodium,Fiber,Potassium,Proteins,General_Score,Patient_Score,Recommendation
0,2290,1,0.0,1,36.0,1.0,0.0,0.0,1,4.0,...,28.07,12.26,230.14,1522.31,2.68,835.55,28.72,0.69,0.18,Not Recommended
1,3561,1,1.0,0,27.0,1.0,1.0,1.0,1,4.0,...,11.76,2.7,46.75,78.97,0.1,364.69,20.63,0.6,0.485,Recommended
2,15407,1,1.0,1,25.0,0.0,0.0,1.0,1,2.0,...,40.8,8.6,0.0,915.46,4.91,340.34,11.07,0.121,0.147,Not Recommended
3,9295,0,1.0,0,35.0,1.0,1.0,1.0,1,5.0,...,4.05,2.22,6.36,8.16,0.3,35.64,0.75,0.175,0.0,Not Recommended
4,13120,0,0.0,0,32.0,1.0,0.0,0.0,1,3.0,...,10.34,2.83,7.34,626.54,10.89,628.21,10.82,0.682,0.389,Recommended


In [7]:
# Calculate the median of Patient_Score for individual foods
median_score_individual = individual_foods_samp['Patient_Score'].median()

print(f"The median Patient_Score for individual foods is: {median_score_individual}")


The median Patient_Score for individual foods is: 0.289


In [8]:
# Create a new column for recommendation labels for indivdual foods
individual_foods_samp['Recommendation'] = individual_foods_samp['Patient_Score'].apply(
    lambda x: 'Recommended' if x >= median_score_individual else 'Not Recommended'
)

# Check the updated dataset
individual_foods_samp.head()

Unnamed: 0,Patient_ID,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,GenHlth,...,Food_Name,Calories,Carbohydrates,Sugars,Fats,Fiber,Proteins,General_Score,Patient_Score,Recommendation
0,21714,0,0.0,0,32.0,1.0,0.0,0.0,1,3.0,...,Wild Rice,318.8,67.38,1.002,1.012,5.38,12.36,0.6688,0.539,Recommended
1,5788,1,1.0,1,32.0,1.0,0.0,1.0,1,4.0,...,Pumpkin seeds,591.75,12.475,0.8325,49.175,5.825,29.975,0.55475,0.368,Recommended
2,4269,0,0.0,1,34.0,0.0,0.0,1.0,1,4.0,...,White Rice,351.8,80.24,0.0,0.0,0.78,7.098,0.5816,0.346,Recommended
3,10665,1,1.0,0,36.0,0.0,1.0,1.0,0,4.0,...,Wheat Bread,262.8,50.2,5.078,3.216,3.52,10.166,0.6712,0.312,Recommended
4,22182,1,1.0,0,26.0,1.0,1.0,1.0,0,5.0,...,Eggplant,38.4,7.584,3.328,0.292,2.59,0.8784,0.2782,0.247,Not Recommended


### Train-Test Split for Menu Recommendations Dataset

In [9]:
# Features and target for menu dataset
X_menu = menu_recs_samp.drop(columns=['Recommendation', 'Patient_Score'])  # Drop labels and score
y_menu = menu_recs_samp['Recommendation']  # Use labels as target

# Train-test split
X_menu_train, X_menu_test, y_menu_train, y_menu_test = train_test_split(
    X_menu, y_menu, test_size=0.2, stratify=y_menu, random_state=42
)

### Train-Test Split for Individual Food Recommendations Dataset

In [10]:
# Features and target for individual foods dataset
X_food = individual_foods_samp.drop(columns=['Recommendation', 'Patient_Score'])  # Drop labels and score
y_food = individual_foods_samp['Recommendation']  # Use labels as target

# Train-test split
X_food_train, X_food_test, y_food_train, y_food_test = train_test_split(
    X_food, y_food, test_size=0.2, stratify=y_food, random_state=42
)


### Feature Engineering

In [11]:
# Feature Engineering for Menu and Individual Food Datasets

# Function to process datetime columns and extract useful features
def process_time_column(df):
    df['Time Checked'] = pd.to_datetime(df['Time Checked'])
    df['Hour'] = df['Time Checked'].dt.hour
    df['Day'] = df['Time Checked'].dt.day
    df['Month'] = df['Time Checked'].dt.month
    df['Weekday'] = df['Time Checked'].dt.weekday
    df = df.drop(columns=['Time Checked'])  # Drop the original datetime column
    return df

# Function to handle categorical columns with one-hot encoding
def encode_categorical_columns(df, categorical_columns):
    df = pd.get_dummies(df, columns=categorical_columns, drop_first=True)
    return df

# Function to preprocess features for train and test datasets
def preprocess_features(X_train, X_test, categorical_columns, drop_columns):
    # Process datetime column
    if 'Time Checked' in X_train.columns:
        X_train = process_time_column(X_train)
        X_test = process_time_column(X_test)

    # One-hot encode categorical columns
    X_train = encode_categorical_columns(X_train, categorical_columns)
    X_test = encode_categorical_columns(X_test, categorical_columns)

    # Align train and test datasets to ensure consistent columns
    X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

    # Drop irrelevant columns
    X_train = X_train.drop(columns=drop_columns, errors='ignore')
    X_test = X_test.drop(columns=drop_columns, errors='ignore')

    return X_train, X_test

# Define categorical and irrelevant columns for preprocessing
categorical_menu = ['GlucoseRank']
irrelevant_menu = ['Patient_ID', 'Food_Name']

categorical_food = ['GlucoseRank']
irrelevant_food = ['Patient_ID', 'Food_Name']

# Preprocess Menu Dataset
X_menu_train, X_menu_test = preprocess_features(
    X_menu_train, X_menu_test, categorical_menu, irrelevant_menu
)

# Preprocess Individual Foods Dataset
X_food_train, X_food_test = preprocess_features(
    X_food_train, X_food_test, categorical_food, irrelevant_food
)

### Individual Foods Modeling

In [12]:
# Label encode the target variable
label_encoder = LabelEncoder()
y_food_train_encoded = label_encoder.fit_transform(y_food_train)
y_food_test_encoded = label_encoder.transform(y_food_test)

# Scale the features
scaler = StandardScaler()
X_food_train_scaled = scaler.fit_transform(X_food_train)
X_food_test_scaled = scaler.transform(X_food_test)

In [13]:
# Define a function to train and evaluate multiple models
def evaluate_models(X_train, X_test, y_train, y_test, label_encoder):
    models = {
        "Logistic Regression": LogisticRegression(random_state=42, max_iter=500),
        "Random Forest": RandomForestClassifier(random_state=42, n_estimators=100),
        "Support Vector Machine": SVC(kernel='rbf', random_state=42),
        "XGBoost": XGBClassifier(eval_metric='logloss', random_state=42),
        "Neural Network": MLPClassifier(hidden_layer_sizes=(100, 50), activation='relu', solver='adam', max_iter=300, random_state=42)
    }
    
    # Loop through each model, train, and evaluate
    for model_name, model in models.items():
        print(f"Evaluating {model_name}...")
        
        # Train the model
        model.fit(X_train, y_train)
        
        # Predict on the test set
        y_pred = model.predict(X_test)
        
        # If the labels were encoded, decode them back for readability
        if label_encoder is not None:
            y_pred = label_encoder.inverse_transform(y_pred)
            y_test_decoded = label_encoder.inverse_transform(y_test)
        else:
            y_test_decoded = y_test
        
        # Print the classification report
        print(f"\nClassification Report for {model_name}:")
        print(classification_report(y_test_decoded, y_pred))
        print("=" * 50)

# Call the function to evaluate models for the individual foods dataset
evaluate_models(
    X_food_train_scaled, 
    X_food_test_scaled, 
    y_food_train_encoded, 
    y_food_test_encoded, 
    label_encoder
)


Evaluating Logistic Regression...

Classification Report for Logistic Regression:
                 precision    recall  f1-score   support

Not Recommended       0.92      0.93      0.92      3256
    Recommended       0.93      0.92      0.92      3260

       accuracy                           0.92      6516
      macro avg       0.92      0.92      0.92      6516
   weighted avg       0.92      0.92      0.92      6516

Evaluating Random Forest...

Classification Report for Random Forest:
                 precision    recall  f1-score   support

Not Recommended       0.98      0.98      0.98      3256
    Recommended       0.98      0.98      0.98      3260

       accuracy                           0.98      6516
      macro avg       0.98      0.98      0.98      6516
   weighted avg       0.98      0.98      0.98      6516

Evaluating Support Vector Machine...

Classification Report for Support Vector Machine:
                 precision    recall  f1-score   support

Not Recommen