## Imports

In [24]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm
from sklearn.impute import KNNImputer
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier

from imblearn.over_sampling import SMOTE
from sklearn.metrics import mean_squared_error, make_scorer, r2_score, confusion_matrix

random_seed = 2024

## I. Data-Cleaning

### 1. diet.csv

In [2]:
users_diet_data = pd.read_csv('../aufgabe/training_dataset/diet.csv')
# print(users_diet.head())
# users_diet.shape
# users_diet.dtypes

# Nullwerte
# users_diet.isnull().sum()

# 1. Spalte "Diet"
# -> ein Nullwert drin 
#users_diet[users_diet["Diet"].isna()]

# Author 646062A ohne Wert für Diet -> Weg mit dem Hund
users_diet_data = users_diet_data.drop(users_diet_data[users_diet_data["Diet"].isna()].index).reset_index(drop=True)

# Weitere missing values finden: z.B. <empty field>, "0", ".", "999", "NA" ...
# users_diet[(users_diet["Diet"] == "") | (users_diet["Diet"] == ".") | (users_diet["Diet"] == "999")]


# 2. Spalte "Age"
# users_diet[users_diet["Age"] > 100]
# users_diet[users_diet["Age"] < 5]
# print("Range Alter", users_diet["Age"].min(), users_diet["Age"].max())

# 3. Spalte "AuthorId"
# print("Einzigartige IDs: ", users_diet["AuthorId"].nunique(), users_diet.shape[0])


# Datentyp bei "Diet" zu Category ändern
users_diet_data["Diet"] = users_diet_data['Diet'].astype("category")
users_diet_data.dtypes


AuthorId      object
Diet        category
Age            int64
dtype: object

### 2. reviews.csv

In [3]:
reviews_data = pd.read_csv("../aufgabe/training_dataset/reviews.csv")

# reviews_data_test.dtypes
reviews_userToPredict_data = reviews_data[reviews_data["Rating"].notna()]
print(reviews_userToPredict_data.head())

# Test-Daten -> TestSetId != NaN, die anderen nicht für Modelle verwenden
reviews_data = reviews_data[reviews_data["TestSetId"].isna()].reset_index(drop=True)
reviews_data.loc[reviews_data['Rating'].isna(), "Rating"] = 999
reviews_data["Rating"] = reviews_data["Rating"].astype("category")

# Like column zu boolean
reviews_data["Like"] = reviews_data["Like"].astype("bool")




      AuthorId  RecipeId  Rating Like  TestSetId
0     2492191A     33671     2.0  NaN        1.0
1  2002019979A     92647     2.0  NaN        2.0
3  2001625557E    108231     2.0  NaN        4.0
6      588901B     87380     2.0  NaN        7.0
7     1038235B      9475     2.0  NaN        8.0


  reviews_data = pd.read_csv("../aufgabe/training_dataset/reviews.csv")


### 3. requests.csv

In [4]:
requests_data = pd.read_csv("../aufgabe/training_dataset/requests.csv")
requests_data.head()

# Column: Time
# Runden
""" 
Column Time -> runden, da Nachkommastellen bei Kochzeit irrelevant
Beschreibung: The duration a recipe should take at most (including the time reserved
for the preparation and cooking).
"""
requests_data["Time"] = requests_data["Time"].round(1)

# Teilweise negative Werte -> 0
requests_data.loc[requests_data["Time"] <= 0, "Time"] = 0
"""
Test, ob negative Werte immer False als "Like" haben -> stimmt aber nicht siehe Code:
Requests mit [AuthorId, RecipeId] time <= 0 mit reviews [AuthorId, RecipeId] joinen und dort like checken
joined_data = requests_data.merge(reviews_data_test, on=["AuthorId", "RecipeId"], how="left")
joined_data = joined_data[joined_data["Time"] <= 0]
joined_data = joined_data[~joined_data["Like"].isna()]
"""

# Column: HighCalories
requests_data["HighCalories"] = requests_data["HighCalories"].astype("bool")

# Column: HighProtein
"""
2 Werte: Indifferent und Yes
Daraus wird boolean indifferent = False und Yes = True
"""
requests_data.loc[requests_data["HighProtein"] == "Indifferent", "HighProtein"] = 0
requests_data.loc[requests_data["HighProtein"] == "Yes", "HighProtein"] = 1
requests_data["HighProtein"] = requests_data["HighProtein"].astype("bool")

# Column: LowFat
requests_data["LowFat"] = requests_data["LowFat"].astype("bool")

# Column: LowSugar
"""
2 Werte: Indifferent und 0. Interpretation: 0 -> user braucht kein low-sugar Inhalt, Indifferent -> User ist es egal
Daraus wird boolean 0 = False und indifferent = True
"""
requests_data.loc[requests_data["LowSugar"] == "0", "LowSugar"] = 0
requests_data.loc[requests_data["LowSugar"] == "Indifferent", "LowSugar"] = 1
requests_data["LowSugar"] = requests_data["LowSugar"].astype("bool")

# Column HighFiber
requests_data["HighFiber"] = requests_data["HighFiber"].astype("bool")

# requests_data["HighFiber"].unique()
# requests_data.head(20)


### 4. recipes.csv

In [5]:
recipes_data = pd.read_csv("../aufgabe/training_dataset/recipes.csv")
threshold = 1
threshold_RecipeServings = 3.5

"""
Outlier Detection:
Außerhalb von 3,5*Standardabweichung -> Outlier

=> Outlier werden entfernt
"""
# Name to lower
recipes_data["Name"] = recipes_data["Name"].str.lower()

# Column CookTime und Column PrepTime
print("Shape vorher: ", recipes_data.shape)


# Column RecipeCategory
recipes_data["RecipeCategory"] = recipes_data["RecipeCategory"].apply(lambda x: "Beverage" if x == "Beverages" else ("Meals" if (x == "Lunch") or (x == "Breakfast")  else "Other"))
recipes_data["RecipeCategory"] = recipes_data["RecipeCategory"].astype("category")


# Column RecipeIngredientQuantities und Column RecipeIngredientParts
# zu liste von strings umwandeln
recipes_data["RecipeIngredientQuantities"] = recipes_data["RecipeIngredientQuantities"].str.replace('character(0)', '').str.lstrip('"c("').str.replace('"', '').str.replace(")", "").str.replace('\\', '').str.split(",")
recipes_data["RecipeIngredientParts"] = recipes_data["RecipeIngredientParts"].str.lower().replace('character(0)', '').str.lstrip('"c("').str.replace('"', '').str.replace(")", "").str.replace('\\', '').str.split(",")

# Einteilung ob die Zutat in der Liste ist oder nicht
# Fleisch
value_meat = ["chicken", "veal", "pork", "beef", "turkey", "ham", "bacon", "lamb", "duck", "goose", "rabbit", "venison", "quail", "pheasant", "alligator", "sausage"]
# Meeresfrüchte
value_sea = ["fish", "crab", "lobster", "shrimp", "prawn", "clam", "mussel", "scallop", "squid", "octopus", "anchovy", "sardine", "tuna", "salmon", "trout", "herring", "cod", "mackerel", "bass", "swordfish", "sturgeon", "walleye", "caviar", "crayfish", "cuttlefish", "sea cucumber", "sea snail", "sea bass", "sea bream", "sea trout", "seafood", "shellfish"]
# vegetarisch
value_vegetarian = ["tofu", "seitan", "tempeh", "plant-based"]
# vegan
value_vegan = ["vegan"]

def beinhaltet_substring(ingredient_list, category_list):
    for ingredient in ingredient_list:
        if any(cat in ingredient for cat in category_list):
            return 1
    return 0

recipes_data["Meat"] = recipes_data["RecipeIngredientParts"].apply(lambda x: beinhaltet_substring(x, value_meat))
recipes_data["Seafood"] = recipes_data["RecipeIngredientParts"].apply(lambda x: beinhaltet_substring(x, value_sea))
recipes_data["Vegetarian"] = recipes_data["RecipeIngredientParts"].apply(lambda x: beinhaltet_substring(x, value_vegetarian))
recipes_data["Vegan"] = recipes_data["RecipeIngredientParts"].apply(lambda x: beinhaltet_substring(x, value_vegan))


# weitere Unterteilung, falls mehrere Kategorien anschlagen und falls alles 0, dann vegetarisch 1
for index, row in recipes_data.iterrows():
    if(beinhaltet_substring([row["Name"]], value_vegan) == 1): 
        recipes_data.loc[index, ["Vegan"]] = 1
        recipes_data.loc[index, ["Vegetarian"]] = 0
        recipes_data.loc[index, ["Seafood"]] = 0
        recipes_data.loc[index, ["Meat"]] = 0
    elif(row["Meat"] == 1 or row["Seafood"] == 1):
        recipes_data.loc[index, ["Vegetarian"]] = 0
        recipes_data.loc[index, ["Vegan"]] = 0
    elif(row["Vegetarian"] == 0 and row["Vegan"] == 0):
        recipes_data.loc[index, ["Vegetarian"]] = 1


# print(recipes_data[["RecipeId", "Meat", "Seafood", "Vegetarian", "Vegan"]].head(20))
# print(recipes_data[(recipes_data["Vegan"] == 0) & (recipes_data["Vegetarian"] == 0) & (recipes_data["Meat"] == 0) & (recipes_data["Seafood"] == 0)].value_counts())


# Column Calories
# Outlier weg
recipes_data["Calories"] = recipes_data["Calories"].apply(lambda x: np.log(x) if x > 0 else x)

std_Calories = recipes_data['Calories'].std()
mean_Calories = recipes_data['Calories'].mean()
upper_limit_Calories = mean_Calories + threshold * std_Calories
lower_limit_Calories = mean_Calories - threshold * std_Calories

recipes_data["High-Calories-class"] = recipes_data["Calories"].apply(lambda x: 1 if x > upper_limit_Calories else 0)



# recipes_data = recipes_data[(recipes_data["Calories"] >= lower_limit_Calories) & (recipes_data['Calories'] <= upper_limit_Calories)]


# Column FatContent
# Oulier weg

recipes_data["FatContent"] = recipes_data["FatContent"].apply(lambda x: np.log(x+1) if x > 0 else x)

std_FatContent = recipes_data['FatContent'].std()
mean_FatContent = recipes_data['FatContent'].mean()
upper_limit_FatContent = mean_FatContent + threshold * std_FatContent
lower_limit_FatContent = mean_FatContent - threshold * std_FatContent

recipes_data["Low-FatContent-class"] = recipes_data["FatContent"].apply(lambda x: 1 if x > lower_limit_FatContent else 0)

# recipes_data = recipes_data[(recipes_data["FatContent"] >= lower_limit_FatContent) & (recipes_data['FatContent'] <= upper_limit_FatContent)]


# Column SaturatedFatContent
# Outlier weg
recipes_data["SaturatedFatContent"] = recipes_data["SaturatedFatContent"].apply(lambda x: np.log(x+1) if x > 0 else x)

std_SaturatedFatContent = recipes_data['SaturatedFatContent'].std()
mean_SaturatedFatContent = recipes_data['SaturatedFatContent'].mean()
upper_limit_SaturatedFatContent = mean_SaturatedFatContent + threshold * std_SaturatedFatContent
lower_limit_SaturatedFatContent = mean_SaturatedFatContent - threshold * std_SaturatedFatContent

recipes_data["Low-SaturatedFatContent-class"] = recipes_data["SaturatedFatContent"].apply(lambda x: 1 if x > lower_limit_SaturatedFatContent else 0)

# recipes_data = recipes_data[(recipes_data["SaturatedFatContent"] >= lower_limit_SaturatedFatContent) & (recipes_data['SaturatedFatContent'] <= upper_limit_SaturatedFatContent)]


# Column CholesterolContent
# Outlier weg
recipes_data["CholesterolContent"] = recipes_data["CholesterolContent"].apply(lambda x: np.log(x + 1) if x > 0 else x)

std_CholesterolContent = recipes_data['CholesterolContent'].std()
mean_CholesterolContent = recipes_data['CholesterolContent'].mean()
upper_limit_CholesterolContent = mean_CholesterolContent + threshold * std_CholesterolContent
lower_limit_CholesterolContent = mean_CholesterolContent - threshold * std_CholesterolContent

recipes_data["High-CholesterolContent-class"] = recipes_data["CholesterolContent"].apply(lambda x: 1 if x > upper_limit_CholesterolContent else 0)

# recipes_data = recipes_data[(recipes_data["CholesterolContent"] >= lower_limit_CholesterolContent) & (recipes_data['CholesterolContent'] <= upper_limit_CholesterolContent)]


# Column SodiumContent
# Outlier weg
recipes_data["SodiumContent"] = recipes_data["SodiumContent"].apply(lambda x: np.log(x + 1) if x > 0 else x)

std_SodiumContent = recipes_data['SodiumContent'].std()
mean_SodiumContent = recipes_data['SodiumContent'].mean()
upper_limit_SodiumContent = mean_SodiumContent + threshold * std_SodiumContent
lower_limit_SodiumContent = mean_SodiumContent - threshold * std_SodiumContent

recipes_data["High-SodiumContent-class"] = recipes_data["SodiumContent"].apply(lambda x: 1 if x > upper_limit_SodiumContent else 0)

# recipes_data = recipes_data[(recipes_data["SodiumContent"] >= lower_limit_SodiumContent) & (recipes_data['SodiumContent'] <= upper_limit_SodiumContent)]


# Column CarbohydrateContent
# Outlier weg
recipes_data["CarbohydrateContent"] = recipes_data["CarbohydrateContent"].apply(lambda x: np.log(x+1) if x > 0 else x)

std_CarbohydrateContent = recipes_data['CarbohydrateContent'].std()
mean_CarbohydrateContent = recipes_data['CarbohydrateContent'].mean()
upper_limit_CarbohydrateContent = mean_CarbohydrateContent + threshold * std_CarbohydrateContent
lower_limit_CarbohydrateContent = mean_CarbohydrateContent - threshold * std_CarbohydrateContent

recipes_data["Low-CarbsContent-class"] = recipes_data["CarbohydrateContent"].apply(lambda x: 1 if x > lower_limit_CarbohydrateContent else 0)

# recipes_data = recipes_data[(recipes_data["CarbohydrateContent"] >= lower_limit_CarbohydrateContent) & (recipes_data['CarbohydrateContent'] <= upper_limit_CarbohydrateContent)]


# Column FiberContent
# Outlier weg
recipes_data["FiberContent"] = recipes_data["FiberContent"].apply(lambda x: np.log(x+1) if x > 0 else x)

std_FiberContent = recipes_data['FiberContent'].std()
mean_FiberContent = recipes_data['FiberContent'].mean()
upper_limit_FiberContent = mean_FiberContent + threshold * std_FiberContent
lower_limit_FiberContent = mean_FiberContent - threshold * std_FiberContent

recipes_data["High-FiberContent-class"] = recipes_data["FiberContent"].apply(lambda x: 1 if x > upper_limit_FiberContent else 0)

# recipes_data = recipes_data[(recipes_data["FiberContent"] >= lower_limit_FiberContent) & (recipes_data['FiberContent'] <= upper_limit_FiberContent)]


# Column SugarContent
# Outlier weg
recipes_data["SugarContent"] = recipes_data['SugarContent'].apply(lambda x: np.log(x+1) if x > 0 else x)

std_SugarContent = recipes_data['SugarContent'].std()
mean_SugarContent = recipes_data['SugarContent'].mean()
upper_limit_SugarContent = mean_SugarContent + threshold * std_SugarContent
lower_limit_SugarContent = mean_SugarContent - threshold * std_SugarContent

recipes_data["Low-SugarContent-class"] = recipes_data["SugarContent"].apply(lambda x: 1 if x > lower_limit_SugarContent else 0)

# recipes_data = recipes_data[(recipes_data["SugarContent"] >= lower_limit_SugarContent) & (recipes_data['SugarContent'] <= upper_limit_SugarContent)]



# Column ProteinContent
# Outlier weg
recipes_data["ProteinContent"] = recipes_data["ProteinContent"].apply(lambda x: np.log(x+1) if x > 0 else x)

std_ProteinContent = recipes_data['ProteinContent'].std()
mean_ProteinContent = recipes_data['ProteinContent'].mean()
upper_limit_ProteinContent = mean_ProteinContent + threshold * std_ProteinContent
lower_limit_ProteinContent = mean_ProteinContent - threshold * std_ProteinContent

recipes_data["High-ProteinContent-class"] = recipes_data["ProteinContent"].apply(lambda x: 1 if x > upper_limit_ProteinContent else 0)

# recipes_data = recipes_data[(recipes_data["ProteinContent"] >= lower_limit_ProteinContent) & (recipes_data['ProteinContent'] <= upper_limit_ProteinContent)]


# Column RecipeServings & Column RecipeYield
"""
RecipeServings: Anzahl der Portionen, die das Rezept ergibt
RecipeYield: Gibt an, wie viele Stücke man aus dem Rezept erhält. Ein Rezept ergibt zum Beispiel 1/2 Liter Suppe, was 2 Portionen entspricht.
-> Versuch: RecipeYield zu standardisieren z.B. in Liter, Gramm, Stück, ... -> Problem: >2000 verschiedene Einheiten (zu viele) -> RecipeYield nicht verwenden
-> Stattdessen auf RecipeServings zurückgreifen und fehlende Werte durch randomforest imputieren

Wichtig: Da RecipeServings schlecht verteilt ist (z.b. meisten Werte zwischen 0 und 10, aber auch Werte >1000) werden die Modelle ungenau
-> Lösung: Verwendung von Klassen: Einteilung in Serving-Size: small, medium, large, ...
-> Wichtig: One-hot-encoding nutzen (da es sich um Kategorien handelt) -> 3 Spalten: small, medium, large

"""

# Correlation Matrix von numerischen Werten
# corr = recipes_data[["CookTime", "PrepTime", "Calories", "FatContent", "CarbohydrateContent", "FiberContent", "SugarContent", "ProteinContent", "RecipeServings"]].corr(numeric_only=True)
# print(corr)

# RecipeYield
# recipes_data["RecipeYield_Quantity"] = recipes_data["RecipeYield"].str.split(" ").str[0]
# recipes_data["RecipeYield_Unit"] = recipes_data["RecipeYield"].str.split(" ").str[1]
recipes_data.drop(columns=["RecipeYield"], inplace=True)


# RecipeServings
# Logarithmieren von RecipeServings, um die Verteilung zu verbessern
recipes_data["RecipeServings"] = recipes_data["RecipeServings"].apply(lambda x: np.log(x) if x > 0 else x)

# plt.figure(figsize=(10, 6))
# plt.hist(recipes_data['RecipeServings'], bins=10)
# plt.title('Histogram of Recipe Servings')
# plt.xlabel('Recipe Servings')
# plt.ylabel('Frequency')
# plt.show()

# Outlier weg
std_RecipeServings = recipes_data[recipes_data['RecipeServings'].notna()]["RecipeServings"].std()
mean_RecipeServings = recipes_data[recipes_data['RecipeServings'].notna()]["RecipeServings"].mean()
# print(recipes_data['RecipeServings'].describe())
"""
mean         1.770398
std          0.781950
min          0.000000
25%          1.386294 -> 25% der Werte sind 1.386294 oder kleiner -> small
50%          1.791759 -> 50% der Werte sind 1.791759 oder kleiner -> medium
75%          2.079442 -> 75% der Werte sind 2.079442 oder kleiner -> large
max          6.907755
"""

upper_limit_RecipeServings = mean_RecipeServings + threshold_RecipeServings * std_RecipeServings
lower_limit_RecipeServings = mean_RecipeServings - threshold_RecipeServings * std_RecipeServings

recipes_data = recipes_data[((recipes_data["RecipeServings"] >= lower_limit_RecipeServings) & (recipes_data['RecipeServings'] <= upper_limit_RecipeServings)) | (recipes_data['RecipeServings'].isna())]
print("Shape nach Kürzung der Outlier: ", recipes_data.shape)

recipes_data_description = recipes_data.describe()

# Conditions
conditions = [
    (recipes_data["RecipeServings"] <= recipes_data_description.loc["25%", "RecipeServings"]),
    (recipes_data["RecipeServings"] > recipes_data_description.loc["25%", "RecipeServings"]) & (recipes_data["RecipeServings"] <= recipes_data_description.loc["75%", "RecipeServings"]),
    (recipes_data["RecipeServings"] > recipes_data_description.loc["75%", "RecipeServings"])
]
# choices: small - 0, medium - 1, large - 2
choices = [0, 1, 2]

# Discretization von RecipeServings -> Klassen: small = 0, medium = 1, large = 2 -> one-hot-encoding
recipes_data["ServingClass"] = np.select(conditions, choices, default=np.nan)
# Werte mit NA werden durch median 1 ersetzt.
recipes_data.loc[recipes_data['RecipeServings'].isna(), "ServingClass"] = 1

# In Csv
# path_recipesData_marco = "../cvs/Marco/recipes_data.csv"
# recipes_data.to_csv(path_recipesData_marco, index=False)



# Versuch: Fehlende Daten zu imputieren, nicht geklappt!
"""
# fehlende Werte in RecipeServings durch random forest imputieren
features = ['CookTime', 'PrepTime', 'Calories', 'FatContent', 'FiberContent', 'ProteinContent']

# test_data with and without RecipeServings, training_data only with RecipeServings
known_servings = recipes_data[recipes_data['ServingClass'].notna()]
unknown_servings = recipes_data[recipes_data['ServingClass'].isna()]

X = known_servings[features]
y_known = known_servings["ServingClass"]

# print(X)
# print(y_known)
X_train, X_val, y_train, y_val = train_test_split(X, y_known, test_size=0.3, random_state=random_seed)
# print('Training Features Shape:', X_train.shape)
# print('Training Labels Shape:', y_train.shape)
# print('Validation Features Shape:', X_val.shape)
# print('Validation Labels Shape:', y_val.shape)


# Model trainieren
model = RandomForestClassifier(n_estimators=200, random_state=random_seed)
model.fit(X_train, y_train)

# Model evaluieren
y_pred = model.predict(X_val)

# Bewertung des Modells
mse = mean_squared_error(y_val, y_pred)
# calculate r squared
r_squared = model.score(X_val, y_val)
print('R Squared: ', r_squared)
print('MSE Mean Squred Error: ', mse) 


# y_val together with y_pred 
y_val_pred = pd.DataFrame({'y_val': y_val, 'y_pred': y_pred})
# Extract RecipeId for the validation set
recipe_ids_val = recipes_data.loc[y_val.index, 'RecipeId']
# Add RecipeId to the y_val_pred dataframe
y_val_pred['RecipeId'] = recipe_ids_val

# Display the dataframe
print(y_val_pred.head(20))


# Model anwenden um fehlende Daten zu analysieren
# X_unknown = unknown_servings[features]
# y_unknown = unknown_servings["RecipeServings"]
# y_unknown_pred = model.predict(X_unknown)


"""





Shape vorher:  (75604, 18)
Shape nach Kürzung der Outlier:  (75524, 30)


'\n# fehlende Werte in RecipeServings durch random forest imputieren\nfeatures = [\'CookTime\', \'PrepTime\', \'Calories\', \'FatContent\', \'FiberContent\', \'ProteinContent\']\n\n# test_data with and without RecipeServings, training_data only with RecipeServings\nknown_servings = recipes_data[recipes_data[\'ServingClass\'].notna()]\nunknown_servings = recipes_data[recipes_data[\'ServingClass\'].isna()]\n\nX = known_servings[features]\ny_known = known_servings["ServingClass"]\n\n# print(X)\n# print(y_known)\nX_train, X_val, y_train, y_val = train_test_split(X, y_known, test_size=0.3, random_state=random_seed)\n# print(\'Training Features Shape:\', X_train.shape)\n# print(\'Training Labels Shape:\', y_train.shape)\n# print(\'Validation Features Shape:\', X_val.shape)\n# print(\'Validation Labels Shape:\', y_val.shape)\n\n\n# Model trainieren\nmodel = RandomForestClassifier(n_estimators=200, random_state=random_seed)\nmodel.fit(X_train, y_train)\n\n# Model evaluieren\ny_pred = model.pre

## II. Feature Engineering


## III. Data Integration

In [6]:
# Verbinden von reviews_df mit diet_df über AuthorId
combined_df1 = pd.merge(reviews_data, users_diet_data, on='AuthorId', how='left')

# Verbinden von combined_df1 mit requests_df über AuthorId und RecipeId
combined_df2 = pd.merge(combined_df1, requests_data, on=['AuthorId', 'RecipeId'], how='left')

# Verbinden von combined_df2 mit recipes_df über RecipeId
final_combined_df = pd.merge(combined_df2, recipes_data, on='RecipeId', how='left')


# Clean data without nan values - z.B. falls Rezept gelöscht wurde usw.
print("Shape davor", final_combined_df.shape)
final_combined_df = final_combined_df[final_combined_df["Time"].notna()]
final_combined_df = final_combined_df[final_combined_df["Name"].notna()]
print("Shape danach", final_combined_df.shape)


csv_file_path = 'final_table.csv'

# Save the DataFrame to CSV
final_combined_df.to_csv(csv_file_path, index=False)

# Returning the file path for download
print(csv_file_path)


# print(final_combined_df["ServingClass"].describe())
# plt.hist(final_combined_df["ServingClass"], bins=3)
# plt.show()



Shape davor (97381, 43)
Shape danach (97281, 43)
final_table.csv


## III. Daten nochmal weiter verfeinern - Marco

In [34]:
from sklearn.preprocessing import StandardScaler

# 0. Nützliche Funktionen
def perfomance_measure(y_true, y_pred):
    TP, FP, TN, FN = confusion_matrix(y_true, y_pred).ravel()
    recall = TP / (TP + FN)
    specificity = TN / (TN + FP)
    BAC = (recall + specificity) / 2
    return BAC

def oneHotEncoding(df, columns):
    df = pd.get_dummies(df, columns=columns, prefix=columns)
    return df


def addNewColumns(df):

    # 1. Falls time - (cooktime + preptime) > 0 oder nur um 1% abweicht, dann in-time, ansonsten nicht in-time
    df["InTime"] = df["Time"] - (df["CookTime"] + df["PrepTime"])
    df["InTime_binary"] = df.apply(
        lambda row: 1 if row["Time"] - (row["CookTime"] + row["PrepTime"]) >= -0.02 * row["Time"] else 0, 
        axis=1
    )

    # df[(df["InTime_binary"] == 0) & (df["Like"] == True)]


    # 2. Falls (HighCalories == True & True und High-Calories-class == True) dann HighCaloriesMatch = True, (HighCalories == False & True und High-Calories-class == False) dann HighCaloriesMatch = True, ansonsten False

    df["HighCaloriesMatch"] = df.apply(
        lambda row: 1 if (row["HighCalories"] == True and row["High-Calories-class"] == True) or (row["HighCalories"] == False and row["High-Calories-class"] == False) else 0, 
        axis=1
    )

    # df["HighCaloriesMatch"] = df.apply(
    #     lambda row: 1 if row["HighCalories"] == True and row["High-Calories-class"] == True else 0, 
    #     axis=1
    # )

    # df[df["HighCaloriesMatch"] == 1][['HighCalories', 'High-Calories-class', 'HighCaloriesMatch']]


    # 3. Falls (LowFat == True & True und Low-FatContent-class == True) dann LowFatMatch = True, dann true, ansonsten False
    df["LowFatMatch"] = df.apply(
        lambda row: 1 if (row["LowFat"] == True and row["Low-FatContent-class"] == True) or (row["LowFat"] == False and row["Low-FatContent-class"] == False) else 0, 
        axis=1
    )


    # 4. Falls (LowSugar == True & True und Low-SugarContent-class == True) dann LowSugarMatch = True, dann true, ansonsten False
    df["LowSugarMatch"] = df.apply(
        lambda row: 1 if (row["LowSugar"] == True and row["Low-SugarContent-class"] == True) or (row["LowSugar"] == False and row["Low-SugarContent-class"] == False) else 0, 
        axis=1
    )


    # 5. Falls (HighProtein == True & True und High-ProteinContent-class == True) dann HighProteinMatch = True, dann HighProteinMatch = True, ansonsten False
    df["HighProteinMatch"] = df.apply(
        lambda row: 1 if (row["HighProtein"] == True and row["High-ProteinContent-class"] == True) or (row["HighProtein"] == False and row["High-ProteinContent-class"] == False) else 0, 
        axis=1
    )


    # 6. Falls (HighFiber == True & True und High-FiberContent-class == True) dann HighFiberMatch = True, dann HighFiberMatch = True, ansonsten False
    df["HighFiberMatch"] = df.apply(
        lambda row: 1 if (row["HighFiber"] == True and row["High-FiberContent-class"] == True) or (row["HighFiber"] == False and row["High-FiberContent-class"] == False) else 0, 
        axis=1
    )


    # 7. One-hot-encoding für Diet, RecipeCategory, ServingClass
    df = oneHotEncoding(df, ["Diet", "RecipeCategory"])


    # 8. Falls (Diet_Vegan == True & Vegan == True)dann VeganMatch = True, ansonsten False
    df["VeganMatch"] = df.apply(
        lambda row: 1 if row["Diet_Vegan"] == True and row["Vegan"] == True else 0,
        axis=1
    )


    # 9. Falls (Diet_Vegetarian == True & Vegetarian == True) or (Diet_Vegetarian == True & Vegan == True) dann VegetarianMatch = True, ansonsten False
    df["VegetarianMatch"] = df.apply(
        lambda row: 1 if (row["Diet_Vegetarian"] == True and row["Vegetarian"] == True) or (row["Diet_Vegetarian"] == True and row["Vegan"] == True) else 0,
        axis=1
    )
    df.to_csv("final_table_Marco.csv", index=False)


    df.columns
    # principal_components_final_df.columns

    return df


def logData(df, columns):
    for column in columns:
        df[column] = df[column].apply(lambda x: np.log(x) if x > 0 else x)
    return df

# Daten normalisieren
def dataNormalize(df, normalized_columns):
    scaler = StandardScaler()
    for column in normalized_columns:
        df[column] = scaler.fit_transform(df[[column]])
    return df



### Bereits hier in Testdaten und Trainingsdaten aufteilen

In [17]:
clean_final_combined_df = final_combined_df.copy()
clean_final_combined_df = addNewColumns(clean_final_combined_df)

# Prozentzahl Test- und Trainingsdaten
testSize = 0.1

# Aufteilen in Trainings- und Testdaten
X_train, X_test, y_train, y_test = train_test_split(clean_final_combined_df.drop("Like", axis=1), clean_final_combined_df["Like"].astype("int"), test_size=testSize, random_state=random_seed)

columns_die_logarithmiert_werden = ["Time", "CookTime", "PrepTime"]
X_train = logData(X_train, columns_die_logarithmiert_werden)

# Nach dem Aufteilen in Trainings- und Testdaten wird normalisiert -> davor schlecht, da sonst Testdaten mit in die Normalisierung einfließen -> Data Leakage
# Normalisierung hoffentlich besser für Modelle
columns_die_normalisiert_werden = ["Time", "CookTime", "PrepTime", "Calories", "FatContent", 
                                   "SaturatedFatContent", "CholesterolContent", "SodiumContent", "CarbohydrateContent", 
                                   "FiberContent", "SugarContent", "ProteinContent", "RecipeServings", "InTime"
                                   , "ServingClass"]


X_train = dataNormalize(X_train, columns_die_normalisiert_werden)
X_test = dataNormalize(X_test, columns_die_normalisiert_werden)

X_train.to_csv("final_table_Marco.csv", index=False)

# histogramme nach der Normalisierung
# X_train.hist(bins=50, figsize=(20,15))

# clean_final_combined_df.to_csv("final_table_Marco.csv", index=False)

In [18]:
necessary_columns = [
    "Age", 
    "Calories", "FiberContent", "ProteinContent", 
    # "SugarContent", 
    # "FatContent", "CarbohydrateContent",
    # "SaturatedFatContent", "CholesterolContent", "SodiumContent", 
    "InTime", 
    "HighCaloriesMatch", "HighProteinMatch", "HighFiberMatch", 
    # "LowFatMatch", 
    # "LowSugarMatch", 
    # "RecipeCategory_Beverage", "RecipeCategory_Meals", 
    # "RecipeCategory_Other", 
    # "ServingClass",
    # "VeganMatch", 
    # "VegetarianMatch",
    
    # nicht ganz sicher, ob die hier nötig sind, eigentlich abhängig von adernen Spalten
    "Time", 
    "CookTime", "PrepTime", 
    # "LowFat", "LowSugar",
    # "HighCalories", "HighProtein", "HighFiber",
    # "InTime_binary",
    # "Diet_Omnivore", "Diet_Vegan", "Diet_Vegetarian", 
    # "Meat", "Seafood", "Vegetarian", 
    # "Vegan", 
]

X_train_clean = X_train[necessary_columns]
X_test_clean = X_test[necessary_columns]

## IV. Modelle Marco

### 1. Linear Regression

In [29]:
from statsmodels.stats.outliers_influence import variance_inflation_factor as vif


# Model 1: Linear Regression
# Convariance Matrix
X_train_clean.corr().to_csv("covariance_matrix.csv")


X_train_clean = X_train_clean.astype(float)
X_train_clean = sm.add_constant(X_train_clean)
X_test_clean = X_test_clean.astype(float)
X_test_clean = sm.add_constant(X_test_clean)

# print(X_train_clean.columns)


# for index, variable_name in enumerate(X_train_clean.columns):
#     if variable_name == "const": 
#         continue
#     print(f"VIF for variable {variable_name} is {vif(X_train_clean, index)}")

# Gewichtung der Daten - 8 mal so viel Gewicht auf Like = True
weigths = y_train.apply(lambda x: 9 if x == 1 else 1)

model_OLS = sm.WLS(y_train, X_train_clean, weights=weigths).fit()
# print(model_OMS.summary())

# Prediction
# Training set
y_pred_train_OLS = model_OLS.predict(X_train_clean)
y_pred_train_OLS_binary = (y_pred_train_OLS > 0.5).astype(int)
tn_train, fp_train, fn_train, tp_train = confusion_matrix(y_train, y_pred_train_OLS_binary).ravel()
recall_train, specificity_train, BAC_train = perfomance_measure(tp_train, fp_train, tn_train, fn_train)
print("Training Set Evaluation:")
print("BAC train: ", BAC_train)
# r_squeared_train = r2_score(y_train, y_pred_train_OLS)
# print("R Squared train: ", r_squeared_train)

# Validation set
y_pred_OLS = model_OLS.predict(X_test_clean)
y_pred_OLS_binary = (y_pred_OLS > 0.5).astype(int)
tn, fp, fn, tp = confusion_matrix(y_test, y_pred_OLS_binary).ravel()
recall, specificity, BAC = perfomance_measure(tp, fp, tn, fn)
print("Test Set Evaluation:")
print("BAC: ", BAC)
# r_squeared = r2_score(y_test, y_pred_OLS)
# print("R Squared: ", r_squeared)



Training Set Evaluation:
BAC train:  0.6908756130224143
Test Set Evaluation:
BAC:  0.6845786883100315


### 2. Logistic Regression

In [30]:
from sklearn.linear_model import LogisticRegression


smote_LogReg = SMOTE(random_state=random_seed, sampling_strategy = 1)
X_train_resampled, y_train_resampled = smote_LogReg.fit_resample(X_train_clean, y_train)

print("After OverSampling, counts of label '1': {}".format(sum(y_train_resampled == 1)))
print("After OverSampling, counts of label '0': {}".format(sum(y_train_resampled == 0)))


model_LogReg = LogisticRegression(random_state=random_seed, max_iter=1000)
model_LogReg.fit(X_train_resampled, y_train_resampled)

# Prediction
y_pred_LogReg = model_LogReg.predict(X_test_clean)
print("y_pred: ", y_pred_LogReg)

# Bewertung
tn_train_LogReg, fp_train_LogReg, fn_train_LogReg, tp_train_LogReg = confusion_matrix(y_train_resampled, model_LogReg.predict(X_train_resampled)).ravel()
recall_train_LogReg, specificity_train, BAC_train_LogReg = perfomance_measure(tp_train_LogReg, fp_train_LogReg, tn_train_LogReg, fn_train_LogReg)
print("Training Set Evaluation:")
print("BAC train: ", BAC_train_LogReg)

tn, fp, fn, tp = confusion_matrix(y_test, y_pred_LogReg).ravel()
recall, specificity, BAC = perfomance_measure(tp, fp, tn, fn)
print("Test Set Evaluation:")
print("BAC: ", BAC)


After OverSampling, counts of label '1': 75972
After OverSampling, counts of label '0': 75972
y_pred:  [0 0 0 ... 1 0 1]
Training Set Evaluation:
BAC train:  0.7002316642973727
Test Set Evaluation:
BAC:  0.6889988121331405


### 3. Naive Bayes

In [47]:
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import BaggingClassifier


smote_NB = SMOTE(random_state=random_seed, sampling_strategy = 0.2)
X_train_resampled_NB, y_train_resampled_NB = smote_NB.fit_resample(X_train_clean, y_train)

model_NB = GaussianNB()
# model_NB.fit(X_train_resampled_NB, y_train_resampled_NB)

bagging_nb = BaggingClassifier(
    estimator=model_NB,
    n_estimators=60,
    random_state=random_seed
)

bagging_nb.fit(X_train_resampled_NB, y_train_resampled_NB)

# Prediction - Bias vs Variance
# Training Data 
y_train_pred_NB = bagging_nb.predict(X_train_resampled_NB)
BAC_train = perfomance_measure(y_train_resampled_NB, y_train_pred_NB)
print("Training Set Evaluation:")
print("BAC: ", BAC_train)

# Test Data
y_pred_NB = bagging_nb.predict(X_test_clean)
BAC = perfomance_measure(y_test, y_pred_NB)
print("Test Set Evaluation:")
print("BAC: ", BAC)

k = 10  # Anzahl folgs
cv = StratifiedKFold(n_splits=k, shuffle=True, random_state=random_seed)

scores = cross_val_score(bagging_nb, X_train_resampled_NB, y_train_resampled_NB, 
                         scoring=make_scorer(perfomance_measure), cv=cv)

print(scores)



Training Set Evaluation:
BAC:  0.937834106029238
Test Set Evaluation:
BAC:  0.9335271766943793
[0.92368464 0.9381871  0.95014273 0.93880881 0.94789656 0.95174777
 0.93818662 0.91864899 0.92075938 0.93521461]


### 4. Random Forest

In [117]:
from sklearn.ensemble import RandomForestClassifier

smote_RF = SMOTE(random_state=random_seed, sampling_strategy = 1)
X_train_resampled_RF, y_train_resampled_RF = smote_RF.fit_resample(X_train_clean, y_train)

model_RF = RandomForestClassifier(random_state=random_seed, 
                                  n_estimators=100,
                                  max_features="sqrt",   # Einsatz der Qaudratwurzel der Feature-Anzahl = default
                                  max_depth=17  
                                  )
model_RF.fit(X_train_resampled_RF, y_train_resampled_RF)
# model_RF.fit(X_train_clean, y_train)

# Prediction - Bias vs Variance Tradeoff
# Training Data
y_train_pred_RF = model_RF.predict(X_train_resampled_RF)
tn_train, fp_train, fn_train, tp_train = confusion_matrix(y_train_resampled_RF, y_train_pred_RF).ravel()
recall_train, specificity_train, BAC_train = perfomance_measure(tp_train, fp_train, tn_train, fn_train)
print("Training Set Evaluation:")
print("BAC: ", BAC_train)

# Validation Data
y_pred_RF = model_RF.predict(X_test_clean)
tn, fp, fn, tp = confusion_matrix(y_test, y_pred_RF).ravel()
recall, specificity, BAC = perfomance_measure(tp, fp, tn, fn)
print("Test Set Evaluation:")
print("BAC: ", BAC)

Training Set Evaluation:
BAC:  0.9478886958353077
Test Set Evaluation:
BAC:  0.6003122085211637


### 5. Stacking - Kombination der besten Modelle

In [72]:
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import RidgeClassifier


smote_Stacking = SMOTE(random_state=random_seed, sampling_strategy = 0.9)
X_train_resampled_Stacking, y_train_resampled_Stacking = smote_Stacking.fit_resample(X_train_clean, y_train)

# Base Models
# model_LogReg = LogisticRegression(random_state=random_seed, max_iter=1000)
model_NB = GaussianNB()
model_RF = RandomForestClassifier(random_state=random_seed, 
                                  n_estimators=100,
                                  max_features="sqrt",   # Einsatz der Qaudratwurzel der Feature-Anzahl = default
                                  max_depth=17   
                                  )
model_Ridge = RidgeClassifier(random_state=random_seed)

base_models = [
    # ("LogReg", model_LogReg),
    ("NB", model_NB),
    ("RF", model_RF),
    # ("Ridge", model_Ridge)
]

# Meta Model
meta_model = LogisticRegression(random_state=random_seed, max_iter=1000)

# Stacking Classifier
stacking = StackingClassifier(
    estimators=base_models,
    final_estimator=meta_model
    # , cv=5 # Cross Validation
)

stacking.fit(X_train_resampled_Stacking, y_train_resampled_Stacking)

# Prediction - Bias vs Variance Tradeoff
# Training Data
y_train_pred_Stacking = stacking.predict(X_train_resampled_Stacking)
tn_train, fp_train, fn_train, tp_train = confusion_matrix(y_train_resampled_Stacking, y_train_pred_Stacking).ravel()
recall_train, specificity_train, BAC_train = perfomance_measure(tp_train, fp_train, tn_train, fn_train)
print("Training Set Evaluation:")
print("BAC: ", BAC_train)

# Validation Data
y_pred_Stacking = stacking.predict(X_test_clean)
tn, fp, fn, tp = confusion_matrix(y_test, y_pred_Stacking).ravel()
recall, specificity, BAC = perfomance_measure(tp, fp, tn, fn)
print("Test Set Evaluation:")
print("BAC: ", BAC)


Training Set Evaluation:
BAC:  0.9709197258786894
Test Set Evaluation:
BAC:  0.7697667175279116


### 6. PCA nutzen