<a href="https://colab.research.google.com/github/nrodman/FoodRecommendation/blob/main/xgboostrecipes.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

from google.colab import drive

# Mount Google Drive into the Colab runtime, then read the recipes CSV
# from the project folder on the mounted drive.
drive.mount('/content/drive')
data_dir = '/content/drive/My Drive/project1/'
dataset_path = f"{data_dir}reduced.csv"
recipes_df = pd.read_csv(dataset_path)

# Quick sanity check of what was loaded: the column index first,
# then the first rows of the frame.
print("Columns in the merged dataset:", recipes_df.columns, sep="\n")
print("Merged Dataset:", recipes_df.head(), sep="\n")

# defining feature columns
feature_columns = ['calories', 'fatcontent', 'proteincontent']
# Rows missing any model feature cannot be scaled or fed to the classifier.
recipes_df = recipes_df.dropna(subset=feature_columns)
features = recipes_df[feature_columns]
target = (recipes_df['rating'] >= 4).astype(int)  # Binary classification: 1 for rating >= 4, else 0

# data splitting into training and test sets
# The split happens BEFORE scaling so that the scaler's mean/variance are
# estimated from training rows only; fitting it on the full dataset would
# leak test-set statistics into preprocessing.
features_train, features_test, y_train, y_test = train_test_split(
    features, target, test_size=0.2, random_state=42
)

# standardizing the features (fit on train, apply the same transform to test)
scaler = StandardScaler()
X_train = scaler.fit_transform(features_train)
X_test = scaler.transform(features_test)

# Scaled view of every row, produced with the train-fitted scaler.
# Kept so that any code relying on `features_scaled` keeps working.
features_scaled = scaler.transform(features)

# xgboost model + training and evaluating the model
def _new_xgb_classifier():
    """Return a fresh, unfitted XGBClassifier with the settings used throughout."""
    return XGBClassifier(use_label_encoder=False, eval_metric='logloss')


def _weighted_scores(y_true, y_pred):
    """Return (accuracy, weighted F1, weighted precision, weighted recall)."""
    # zero_division=0 yields the same value as the default ("warn" -> 0) but
    # without emitting a warning when a class is never predicted.
    return (
        accuracy_score(y_true, y_pred),
        f1_score(y_true, y_pred, average='weighted', zero_division=0),
        precision_score(y_true, y_pred, average='weighted', zero_division=0),
        recall_score(y_true, y_pred, average='weighted', zero_division=0),
    )


def train_evaluate_xgboost(X_train, y_train, X_test, y_test):
    """Cross-validate an XGBoost classifier, then evaluate it on the test set.

    Runs 5-fold shuffled cross-validation on the training data, training a
    fresh model in every fold, and afterwards fits a final model on the whole
    training set which is scored on the held-out test set.

    Args:
        X_train: Training features (array-like, rows are samples).
        y_train: Training labels (array-like or pandas Series).
        X_test: Test features.
        y_test: Test labels.

    Returns:
        A pair ``(fold_metrics, test_metrics)``. ``fold_metrics`` is a list
        with one ``(accuracy, f1, precision, recall)`` tuple per fold and
        ``test_metrics`` is a tuple of the same shape for the test set.

    Side effects:
        Prints the final test-set metrics.
    """
    # Positional indexing below must work for ndarrays, DataFrames and Series
    # alike, so normalise both inputs to plain arrays once.
    X_train = np.asarray(X_train)
    y_train = np.asarray(y_train)

    kfold = KFold(n_splits=5, shuffle=True, random_state=42)
    fold_metrics = []

    for train_index, val_index in kfold.split(X_train):
        # A new estimator per fold keeps the folds independent of each other.
        fold_model = _new_xgb_classifier()
        fold_model.fit(X_train[train_index], y_train[train_index])
        y_val_pred = fold_model.predict(X_train[val_index])
        fold_metrics.append(_weighted_scores(y_train[val_index], y_val_pred))

    # The reported test metrics must come from a model trained on ALL of the
    # training data, not from whichever model the last fold happened to leave
    # behind (which has only seen 4/5 of it).
    final_model = _new_xgb_classifier()
    final_model.fit(X_train, y_train)

    # accuracy test
    y_test_pred = final_model.predict(X_test)
    test_accuracy, test_f1, test_precision, test_recall = _weighted_scores(y_test, y_test_pred)

    print(f"\nFinal Test Set Evaluation - Accuracy: {test_accuracy}, F1 Score: {test_f1}, Precision: {test_precision}, Recall: {test_recall}")

    return fold_metrics, (test_accuracy, test_f1, test_precision, test_recall)

# training and evaluating the model
fold_metrics, test_metrics = train_evaluate_xgboost(X_train, y_train, X_test, y_test)

# printing the cross-validation metrics of every fold; they are returned by
# train_evaluate_xgboost but would otherwise never be shown
for fold_number, (fold_accuracy, fold_f1, fold_precision, fold_recall) in enumerate(fold_metrics, start=1):
    print(f"Fold {fold_number} - Accuracy: {fold_accuracy}, F1 Score: {fold_f1}, Precision: {fold_precision}, Recall: {fold_recall}")

# final test metrics: train_evaluate_xgboost prints them itself, so they are
# only unpacked here (printing them a second time duplicated the line)
test_accuracy, test_f1, test_precision, test_recall = test_metrics

# Simulate a user choosing a recipe: draw one row at random.
random_recipe = recipes_df.sample(n=1)
selected_recipe_id = random_recipe['recipeid'].iloc[0]
selected_recipe_name = random_recipe['name'].iloc[0]
print(f"\nUser Selected Recipe: {selected_recipe_name} (ID: {selected_recipe_id})")

# Reviewers who gave the selected recipe a rating of exactly 5.
is_selected_recipe = recipes_df['recipeid'] == selected_recipe_id
is_five_star = recipes_df['rating'] == 5
selected_recipe_reviewers = recipes_df.loc[is_selected_recipe & is_five_star, 'reviewer'].unique()

# Keep at most the first 4 of those reviewers.
selected_reviewers_limited = selected_recipe_reviewers[:4]

# get top-rated recipes by a reviewer without the selected recipe
def get_top_recipes_by_reviewer(reviewer_id, selected_recipe_id, max_recipes=1, recipes=None):
    """Return the highest-rated rows a reviewer has for other recipes.

    Args:
        reviewer_id: Value matched against the ``reviewer`` column.
        selected_recipe_id: Recipe to leave out (matched against ``recipeid``).
        max_recipes: Maximum number of rows to return.
        recipes: DataFrame with ``reviewer``, ``recipeid`` and ``rating``
            columns. Defaults to the module-level ``recipes_df`` so existing
            three-argument calls behave as before.

    Returns:
        A DataFrame with up to ``max_recipes`` rows, sorted by ``rating`` in
        descending order; empty when the reviewer has no other rows.
    """
    if recipes is None:
        recipes = recipes_df

    by_reviewer = recipes['reviewer'] == reviewer_id
    other_recipe = recipes['recipeid'] != selected_recipe_id
    reviewer_recipes = recipes[by_reviewer & other_recipe]

    # A stable sort keeps equally-rated rows in their original order, so the
    # recipe picked among ties is deterministic (the default quicksort gives
    # no such guarantee).
    top_recipes = reviewer_recipes.sort_values(by='rating', ascending=False, kind='mergesort')
    return top_recipes.head(max_recipes)

# get recommendations
# Collect each reviewer's top pick first and concatenate once; empty results
# are skipped so they never take part in the concatenation.
reviewer_recommendations = []
for reviewer in selected_reviewers_limited:
    top_recommendations = get_top_recipes_by_reviewer(reviewer, selected_recipe_id)
    if not top_recommendations.empty:
        reviewer_recommendations.append(top_recommendations)

if reviewer_recommendations:
    # Several reviewers may point at the same recipe; keep each recipe once so
    # the "fewer than 4" check below counts distinct recommendations.
    recommended_recipes = pd.concat(reviewer_recommendations).drop_duplicates(subset='recipeid')
else:
    # No usable reviewer picks: start from an empty frame that still has the
    # dataset's columns, so the 'recipeid' lookups below do not raise KeyError.
    recommended_recipes = recipes_df.iloc[0:0]

# getting at least 4 recommendations
missing_count = 4 - recommended_recipes.shape[0]
if missing_count > 0:
    # Top up with recipes rated >= 4 that are neither the selected recipe nor
    # already recommended.
    is_candidate = (
        (recipes_df['rating'] >= 4)
        & (recipes_df['recipeid'] != selected_recipe_id)
        & (~recipes_df['recipeid'].isin(recommended_recipes['recipeid']))
    )
    additional_recommendations = recipes_df[is_candidate].drop_duplicates(subset='recipeid').head(missing_count)
    if recommended_recipes.empty:
        recommended_recipes = additional_recommendations
    elif not additional_recommendations.empty:
        recommended_recipes = pd.concat([recommended_recipes, additional_recommendations])

# printing recommended recipes
print("\nRecommended Recipes Based on Similar Reviewers:")
print(recommended_recipes[['recipeid', 'name', 'reviewer']].head(4))


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Columns in the merged dataset:
Index(['recipeid', 'name', 'author', 'totaltime', 'datepublished',
       'description', 'images', 'recipecategory', 'keywords',
       'aggregatedrating', 'reviewcount', 'calories', 'fatcontent',
       'proteincontent', 'recipeinstructions', 'reviewid', 'reviewer',
       'rating', 'bigcategory'],
      dtype='object')
Merged Dataset:
   recipeid                              name  author totaltime  \
0     79703   Easy Seven Layer Rice Casserole   89831   PT1H35M   
1    284696     Baked Corn and Rice Casserole  128473   PT1H20M   
2    105682  Sugar and Cinnamon Spiced Pecans  131500   PT1H10M   
3     26039  Papa John's Garlic Dipping Sauce   27395      PT4M   
4    448521                  Baked Seitan Log  992845   PT1H35M   

               datepublished  \
0  2003-12-29 20:00:00+00:00   
1  2008-02-07 18:31:00+00:00   
2 

  _warn_prf(average, modifier, msg_start, len(result))



Final Test Set Evaluation - Accuracy: 0.945356231674346, F1 Score: 0.9190978784191952, Precision: 0.8942588145126541, Recall: 0.945356231674346
Fold 1 - Accuracy: 0.944880765386263, F1 Score: 0.9181959286018329, Precision: 0.9203992989804688, Recall: 0.944880765386263
Fold 2 - Accuracy: 0.9452139559236518, F1 Score: 0.9186387036331416, Precision: 0.893516943578056, Recall: 0.9452139559236518
Fold 3 - Accuracy: 0.9453541507996953, F1 Score: 0.9187987391055079, Precision: 0.8936944704342131, Recall: 0.9453541507996953
Fold 4 - Accuracy: 0.9451637471439451, F1 Score: 0.9185648261079346, Precision: 0.8934220272783987, Recall: 0.9451637471439451
Fold 5 - Accuracy: 0.9433549124143183, F1 Score: 0.9159965570452263, Precision: 0.8901803246352868, Recall: 0.9433549124143183

Final Test Set Evaluation - Accuracy: 0.945356231674346, F1 Score: 0.9190978784191952, Precision: 0.8942588145126541, Recall: 0.945356231674346

User Selected Recipe: Veggie Quesadilla Triangles (ID: 416463)

Recommended R