# Modeling

### Imports

In [31]:
import pandas as pd

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.metrics import classification_report, mean_squared_error, r2_score
from xgboost import XGBClassifier, XGBRegressor
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.svm import SVR
from sklearn.svm import SVC

In [32]:
import os
os.chdir("C:/Users/tarad/OneDrive/Documents/USD_GRAD_SCHOOL-C/ADS599_CaptsoneProject/CapStoneProject/MADS-Capstone")


### Loading Data

In [33]:
# Load menu food CSV file
menu_recs_samp = pd.read_csv('./preprocessed_data/menu_recs_samp.csv')
menu_recs_samp.head()

Unnamed: 0,Patient_ID,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,GenHlth,...,Sugars,Fats,Saturated_Fats,Cholesterol,Sodium,Fiber,Potassium,Proteins,General_Score,Patient_Score
0,2290,1,0.0,1,36.0,1.0,0.0,0.0,1,4.0,...,1.45,28.07,12.26,230.14,1522.31,2.68,835.55,28.72,0.69,0.18
1,3561,1,1.0,0,27.0,1.0,1.0,1.0,1,4.0,...,0.09,11.76,2.7,46.75,78.97,0.1,364.69,20.63,0.6,0.485
2,15407,1,1.0,1,25.0,0.0,0.0,1.0,1,2.0,...,47.88,40.8,8.6,0.0,915.46,4.91,340.34,11.07,0.121,0.147
3,9295,0,1.0,0,35.0,1.0,1.0,1.0,1,5.0,...,4.6,4.05,2.22,6.36,8.16,0.3,35.64,0.75,0.175,0.0
4,13120,0,0.0,0,32.0,1.0,0.0,0.0,1,3.0,...,1.9,10.34,2.83,7.34,626.54,10.89,628.21,10.82,0.682,0.389


In [34]:
# Load individual food CSV file
individual_foods_samp = pd.read_csv('./preprocessed_data/individual_foods_samp.csv')
individual_foods_samp.head()

Unnamed: 0,Patient_ID,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,GenHlth,...,Cluster,Food_Name,Calories,Carbohydrates,Sugars,Fats,Fiber,Proteins,General_Score,Patient_Score
0,21714,0,0.0,0,32.0,1.0,0.0,0.0,1,3.0,...,0,Wild Rice,318.8,67.38,1.002,1.012,5.38,12.36,0.6688,0.539
1,5788,1,1.0,1,32.0,1.0,0.0,1.0,1,4.0,...,2,Pumpkin seeds,591.75,12.475,0.8325,49.175,5.825,29.975,0.55475,0.368
2,4269,0,0.0,1,34.0,0.0,0.0,1.0,1,4.0,...,1,White Rice,351.8,80.24,0.0,0.0,0.78,7.098,0.5816,0.346
3,10665,1,1.0,0,36.0,0.0,1.0,1.0,0,4.0,...,3,Wheat Bread,262.8,50.2,5.078,3.216,3.52,10.166,0.6712,0.312
4,22182,1,1.0,0,26.0,1.0,1.0,1.0,0,5.0,...,2,Eggplant,38.4,7.584,3.328,0.292,2.59,0.8784,0.2782,0.247


### Recommendations Column

In [35]:
# Calculate medians for classification
median_score_menu = menu_recs_samp['Patient_Score'].median()
median_score_food = individual_foods_samp['Patient_Score'].median()

# Add recommendation labels based on medians
menu_recs_samp['Recommendation'] = menu_recs_samp['Patient_Score'].apply(
    lambda x: 'Recommended' if x >= median_score_menu else 'Not Recommended'
)
individual_foods_samp['Recommendation'] = individual_foods_samp['Patient_Score'].apply(
    lambda x: 'Recommended' if x >= median_score_food else 'Not Recommended'
)

Check the updated dataset

In [36]:
menu_recs_samp.head()

Unnamed: 0,Patient_ID,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,GenHlth,...,Fats,Saturated_Fats,Cholesterol,Sodium,Fiber,Potassium,Proteins,General_Score,Patient_Score,Recommendation
0,2290,1,0.0,1,36.0,1.0,0.0,0.0,1,4.0,...,28.07,12.26,230.14,1522.31,2.68,835.55,28.72,0.69,0.18,Not Recommended
1,3561,1,1.0,0,27.0,1.0,1.0,1.0,1,4.0,...,11.76,2.7,46.75,78.97,0.1,364.69,20.63,0.6,0.485,Recommended
2,15407,1,1.0,1,25.0,0.0,0.0,1.0,1,2.0,...,40.8,8.6,0.0,915.46,4.91,340.34,11.07,0.121,0.147,Not Recommended
3,9295,0,1.0,0,35.0,1.0,1.0,1.0,1,5.0,...,4.05,2.22,6.36,8.16,0.3,35.64,0.75,0.175,0.0,Not Recommended
4,13120,0,0.0,0,32.0,1.0,0.0,0.0,1,3.0,...,10.34,2.83,7.34,626.54,10.89,628.21,10.82,0.682,0.389,Recommended


In [37]:
individual_foods_samp.head()

Unnamed: 0,Patient_ID,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,GenHlth,...,Food_Name,Calories,Carbohydrates,Sugars,Fats,Fiber,Proteins,General_Score,Patient_Score,Recommendation
0,21714,0,0.0,0,32.0,1.0,0.0,0.0,1,3.0,...,Wild Rice,318.8,67.38,1.002,1.012,5.38,12.36,0.6688,0.539,Recommended
1,5788,1,1.0,1,32.0,1.0,0.0,1.0,1,4.0,...,Pumpkin seeds,591.75,12.475,0.8325,49.175,5.825,29.975,0.55475,0.368,Recommended
2,4269,0,0.0,1,34.0,0.0,0.0,1.0,1,4.0,...,White Rice,351.8,80.24,0.0,0.0,0.78,7.098,0.5816,0.346,Recommended
3,10665,1,1.0,0,36.0,0.0,1.0,1.0,0,4.0,...,Wheat Bread,262.8,50.2,5.078,3.216,3.52,10.166,0.6712,0.312,Recommended
4,22182,1,1.0,0,26.0,1.0,1.0,1.0,0,5.0,...,Eggplant,38.4,7.584,3.328,0.292,2.59,0.8784,0.2782,0.247,Not Recommended


### Train-Test Split for Menu Recommendations Dataset

In [38]:
# Features and targets for classification
X_menu = menu_recs_samp.drop(columns=['Patient_Score', 'Recommendation'])
y_menu_class = menu_recs_samp['Recommendation']

# Features and targets for regression
y_menu_reg = menu_recs_samp['Patient_Score']

# Train-test split
X_menu_train, X_menu_test, y_menu_class_train, y_menu_class_test = train_test_split(
    X_menu, y_menu_class, test_size=0.2, stratify=y_menu_class, random_state=42
)
X_menu_reg_train, X_menu_reg_test, y_menu_reg_train, y_menu_reg_test = train_test_split(
    X_menu, y_menu_reg, test_size=0.2, random_state=42
)

### Train-Test Split for Individual Food Recommendations Dataset

In [39]:
# Features and targets for classification
X_food = individual_foods_samp.drop(columns=['Patient_Score', 'Recommendation'])
y_food_class = individual_foods_samp['Recommendation']

# Features and targets for regression
y_food_reg = individual_foods_samp['Patient_Score']

# Train-test split
X_food_train, X_food_test, y_food_class_train, y_food_class_test = train_test_split(
    X_food, y_food_class, test_size=0.2, stratify=y_food_class, random_state=42
)
X_food_reg_train, X_food_reg_test, y_food_reg_train, y_food_reg_test = train_test_split(
    X_food, y_food_reg, test_size=0.2, random_state=42
)

### Preprocessing

In [40]:
# Function to process datetime columns and extract useful features
def process_time_column(df):
    df['Time Checked'] = pd.to_datetime(df['Time Checked'])
    df['Hour'] = df['Time Checked'].dt.hour
    df['Day'] = df['Time Checked'].dt.day
    df['Month'] = df['Time Checked'].dt.month
    df['Weekday'] = df['Time Checked'].dt.weekday
    df = df.drop(columns=['Time Checked'])  # Drop the original datetime column
    return df

# Function to handle categorical columns with one-hot encoding
def encode_categorical_columns(df, categorical_columns):
    df = pd.get_dummies(df, columns=categorical_columns, drop_first=True)
    return df

# Function to preprocess features for train and test datasets
def preprocess_features(X_train, X_test, categorical_columns, drop_columns):
    # Process datetime column
    if 'Time Checked' in X_train.columns:
        X_train = process_time_column(X_train)
        X_test = process_time_column(X_test)

    # One-hot encode categorical columns
    X_train = encode_categorical_columns(X_train, categorical_columns)
    X_test = encode_categorical_columns(X_test, categorical_columns)

    # Align train and test datasets to ensure consistent columns
    X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

    # Drop irrelevant columns
    X_train = X_train.drop(columns=drop_columns, errors='ignore')
    X_test = X_test.drop(columns=drop_columns, errors='ignore')

    return X_train, X_test

# Define categorical and irrelevant columns for preprocessing
categorical_food = ['GlucoseRank']
irrelevant_food = ['Food_Name']

categorical_menu = ['GlucoseRank']
irrelevant_menu = ['Food_Name', 'Restaurant']

# Preprocess Individual Foods Dataset
X_food_train, X_food_test = preprocess_features(
    X_food_train, X_food_test, categorical_food, irrelevant_food
)

# Preprocess Menu Recommendations Dataset
X_menu_train, X_menu_test = preprocess_features(
    X_menu_train, X_menu_test, categorical_menu, irrelevant_menu
)


### Feature Scaling

In [41]:
# Initialize scaler
scaler = StandardScaler()

# Scale individual foods
X_food_train_scaled = scaler.fit_transform(X_food_train)
X_food_test_scaled = scaler.transform(X_food_test)

# Scale menu recommendations
X_menu_train_scaled = scaler.fit_transform(X_menu_train)
X_menu_test_scaled = scaler.transform(X_menu_test)

### Regression and Classification Functions

In [42]:
def train_and_evaluate_regression(X_train, X_test, y_train, y_test, dataset_name):
    models = {
        'Linear Regression': LinearRegression(),
        'Random Forest Regressor': RandomForestRegressor(random_state=42),
        'XGBoost Regressor': XGBRegressor(objective='reg:squarederror', random_state=42)
    }
    
    # Add Support Vector Regressor conditionally
    if dataset_name == 'Individual Foods':
        models['Support Vector Regressor'] = SVR(kernel='rbf')
    elif dataset_name == 'Menu Recommendations' and len(X_train) <= 50000:
        models['Support Vector Regressor'] = SVR(kernel='rbf')
    
    for model_name, model in models.items():
        print(f"Training {model_name} on {dataset_name}...")
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        mse = mean_squared_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)
        print(f"{model_name}:")
        print(f"  Mean Squared Error: {mse:.4f}")
        print(f"  R^2: {r2:.4f}")
        print('-' * 40)

In [43]:
# Classification evaluation function
def train_and_evaluate_classification(X_train, X_test, y_train, y_test, dataset_name):
    models = {
        'Logistic Regression': LogisticRegression(random_state=42, max_iter=500),
        'Random Forest': RandomForestClassifier(random_state=42),
        'XGBoost': XGBClassifier(eval_metric='logloss', random_state=42),
        'Neural Network': MLPClassifier(hidden_layer_sizes=(100, 50), activation='relu', solver='adam', max_iter=300, random_state=42),
    }
    
    # Add SVC only for smaller datasets
    if dataset_name == 'Individual Foods':
        models['Support Vector Classifier'] = SVC(kernel='rbf', random_state=42)
    elif dataset_name == 'Menu Recommendations' and len(X_train) <= 50000:
        models['Support Vector Classifier'] = SVC(kernel='rbf', random_state=42)

    for model_name, model in models.items():
        print(f"Training {model_name} on {dataset_name}...")
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        print(f"\nClassification Report for {model_name} ({dataset_name}):")
        print(classification_report(y_test, y_pred))
        print("=" * 50)


### Model Evaluations

#### Menu Recommendations

In [44]:
# Encode target variable for classification
label_encoder_menu = LabelEncoder()
y_menu_class_train_encoded = label_encoder_menu.fit_transform(y_menu_class_train)
y_menu_class_test_encoded = label_encoder_menu.transform(y_menu_class_test)

In [45]:
# Regression
print("Regression - Menu Recommendations:")
train_and_evaluate_regression(X_menu_train_scaled, X_menu_test_scaled, y_menu_reg_train, y_menu_reg_test, "Menu Recommendations")

Regression - Menu Recommendations:
Training Linear Regression on Menu Recommendations...
Linear Regression:
  Mean Squared Error: 0.0274
  R^2: -0.0001
----------------------------------------
Training Random Forest Regressor on Menu Recommendations...
Random Forest Regressor:
  Mean Squared Error: 0.0295
  R^2: -0.0760
----------------------------------------
Training XGBoost Regressor on Menu Recommendations...
XGBoost Regressor:
  Mean Squared Error: 0.0278
  R^2: -0.0149
----------------------------------------


In [46]:
# Classification
# Call the function for Menu Recommendations
print("Classification - Menu Recommendations:")
train_and_evaluate_classification(X_menu_train_scaled, X_menu_test_scaled, y_menu_class_train_encoded, y_menu_class_test_encoded, "Menu Recommendations")

Classification - Menu Recommendations:
Training Logistic Regression on Menu Recommendations...

Classification Report for Logistic Regression (Menu Recommendations):
              precision    recall  f1-score   support

           0       0.89      0.89      0.89     30427
           1       0.89      0.90      0.89     30524

    accuracy                           0.89     60951
   macro avg       0.89      0.89      0.89     60951
weighted avg       0.89      0.89      0.89     60951

Training Random Forest on Menu Recommendations...

Classification Report for Random Forest (Menu Recommendations):
              precision    recall  f1-score   support

           0       0.99      0.99      0.99     30427
           1       0.99      0.99      0.99     30524

    accuracy                           0.99     60951
   macro avg       0.99      0.99      0.99     60951
weighted avg       0.99      0.99      0.99     60951

Training XGBoost on Menu Recommendations...

Classification Repor

#### Individual Food Recommendations

In [47]:
# Encode target variable for classification
label_encoder_food = LabelEncoder()
y_food_class_train_encoded = label_encoder_food.fit_transform(y_food_class_train)
y_food_class_test_encoded = label_encoder_food.transform(y_food_class_test)

In [48]:
# Regression
print("Regression - Individual Foods:")
train_and_evaluate_regression(X_food_train_scaled, X_food_test_scaled, y_food_reg_train, y_food_reg_test, "Individual Foods")

Regression - Individual Foods:
Training Linear Regression on Individual Foods...
Linear Regression:
  Mean Squared Error: 0.0441
  R^2: -0.0017
----------------------------------------
Training Random Forest Regressor on Individual Foods...
Random Forest Regressor:
  Mean Squared Error: 0.0477
  R^2: -0.0839
----------------------------------------
Training XGBoost Regressor on Individual Foods...
XGBoost Regressor:
  Mean Squared Error: 0.0486
  R^2: -0.1041
----------------------------------------
Training Support Vector Regressor on Individual Foods...
Support Vector Regressor:
  Mean Squared Error: 0.0500
  R^2: -0.1352
----------------------------------------


In [49]:
# Classification
print("Classification - Individual Foods:")
train_and_evaluate_classification(X_food_train_scaled, X_food_test_scaled, y_food_class_train_encoded, y_food_class_test_encoded, "Individual Foods")

Classification - Individual Foods:
Training Logistic Regression on Individual Foods...

Classification Report for Logistic Regression (Individual Foods):
              precision    recall  f1-score   support

           0       0.92      0.93      0.92      3256
           1       0.93      0.92      0.92      3260

    accuracy                           0.92      6516
   macro avg       0.92      0.92      0.92      6516
weighted avg       0.92      0.92      0.92      6516

Training Random Forest on Individual Foods...

Classification Report for Random Forest (Individual Foods):
              precision    recall  f1-score   support

           0       0.98      0.98      0.98      3256
           1       0.98      0.98      0.98      3260

    accuracy                           0.98      6516
   macro avg       0.98      0.98      0.98      6516
weighted avg       0.98      0.98      0.98      6516

Training XGBoost on Individual Foods...

Classification Report for XGBoost (Individua

### GridSearch on Best Two Models

### Evaluate Best Model