## Imports

In [46]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
from sklearn.impute import KNNImputer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import seaborn as sns

# Shared seed passed as `random_state` to train_test_split and the random
# forest further down, so that splits and fits are reproducible across runs.
random_seed = 2024

## I. Data-Cleaning

### 1. diet.csv

In [14]:
# Load the diet table and clean it in a single pipeline:
#   * "Diet"     - rows without a diet are dropped (the original author found
#                  exactly one such row, AuthorId 646062A) and the column is
#                  stored as a categorical.
#   * "Age"      - inspected by the original author: no values < 5 or > 100,
#                  so nothing to clean.
#   * "AuthorId" - inspected by the original author: no duplicate ids.
# Placeholder-style missing values ("", ".", "999", "NA", ...) were also
# searched for in "Diet" and none were found.
users_diet = (
    pd.read_csv('../aufgabe/training_dataset/diet.csv')
    .dropna(subset=["Diet"])
    .reset_index(drop=True)
    .astype({"Diet": "category"})
)

# Summary of the cleaned frame (cell output).
users_diet.describe(include="all")


Unnamed: 0,AuthorId,Diet,Age
count,271906,271906,271906.0
unique,271906,3,
top,10000120E,Vegetarian,
freq,1,143383,
mean,,,48.503674
std,,,17.898141
min,,,18.0
25%,,,33.0
50%,,,48.0
75%,,,64.0


### 2. reviews.csv

In [35]:
reviews_data = pd.read_csv("../aufgabe/training_dataset/reviews.csv")

# 1. "Rating": True when the rating equals 2, False otherwise (NaN -> False).
reviews_data["Rating"] = reviews_data["Rating"].eq(2).astype("bool")

# 2. Number of duplicated (AuthorId, RecipeId) pairs; the original author
#    reported none.
duplicate_entries_counts = reviews_data.duplicated(subset=['AuthorId', 'RecipeId']).sum()

# 3./4. Split on whether a TestSetId is present.
# NOTE(review): the variable names read inverted relative to their content:
#   * reviews_data_test     -> rows WITHOUT a TestSetId; keeps "Like" (cast to
#                              bool) and drops the TestSetId column.
#   * reviews_data_training -> rows WITH a TestSetId; the "Like" column is
#                              dropped.
# The names are kept because other cells refer to them.
has_test_set_id = reviews_data["TestSetId"].notna()

reviews_data_test = (
    reviews_data.loc[~has_test_set_id]
    .drop(columns="TestSetId")
    .reset_index(drop=True)
    .astype({"Like": "bool"})
)

reviews_data_training = (
    reviews_data.loc[has_test_set_id]
    .drop(columns="Like")
    .reset_index(drop=True)
)

# Resulting frames:
#   reviews_data_test:     AuthorId, RecipeId, Rating, Like
#   reviews_data_training: AuthorId, RecipeId, Rating, TestSetId
#   reviews_data:          AuthorId, RecipeId, Rating, Like, TestSetId
for frame in (reviews_data_test, reviews_data_training, reviews_data):
    print(frame.head())





   AuthorId  RecipeId  Rating   Like
0  1000036C    320576   False  False
1  1000216B    189335   False  False
2  1000221A    133043    True  False
3  1000221A     90537    True  False
4  1000221A    334314    True  False
      AuthorId  RecipeId  Rating  TestSetId
0     2492191A     33671    True        1.0
1  2002019979A     92647    True        2.0
2      408594E    161770   False        3.0
3  2001625557E    108231    True        4.0
4  2001427116E     71109   False        5.0
      AuthorId  RecipeId  Rating Like  TestSetId
0     2492191A     33671    True  NaN        1.0
1  2002019979A     92647    True  NaN        2.0
2      408594E    161770   False  NaN        3.0
3  2001625557E    108231    True  NaN        4.0
4  2001427116E     71109   False  NaN        5.0


  reviews_data = pd.read_csv("../aufgabe/training_dataset/reviews.csv")


### 3. requests.csv

In [68]:
requests_data = pd.read_csv("../aufgabe/training_dataset/requests.csv")

# --- 1. Column "Time" --------------------------------------------------------
# "Time" is the maximum duration a recipe should take (preparation + cooking).
# Digits beyond the first decimal are irrelevant for a cooking time.
requests_data["Time"] = requests_data["Time"].round(1)

# Outlier removal: keep rows with Time inside [Q1 - k*IQR, Q3 + k*IQR].
# k = 4 is wider than the conventional Tukey factor of 1.5.
TIME_IQR_FACTOR = 4
Q1 = np.quantile(requests_data["Time"], 0.25)
Q3 = np.quantile(requests_data["Time"], 0.75)
IQR = Q3 - Q1
lower_bound = Q1 - TIME_IQR_FACTOR * IQR
upper_bound = Q3 + TIME_IQR_FACTOR * IQR

# Keep the unfiltered values so the optional plots below really compare
# "before" with "after".
time_data_original = requests_data["Time"].copy()
within_bounds = requests_data["Time"].between(lower_bound, upper_bound)
time_data_filtered = requests_data.loc[within_bounds, "Time"]

# Number of rows kept and the share of the original rows they represent.
percentage_retained = len(time_data_filtered) / len(requests_data) * 100
print(len(time_data_filtered), percentage_retained)

# Filter the rows directly; .copy() gives an independent frame so the column
# assignments below do not trigger chained-assignment warnings.
requests_data = requests_data.loc[within_bounds].copy()

# Optional visual check of the "Time" distribution before/after the filter.
SHOW_TIME_PLOTS = False
if SHOW_TIME_PLOTS:
    fig, axes = plt.subplots(2, 2, figsize=(12, 12))

    sns.boxplot(x=time_data_original, ax=axes[0, 0])
    axes[0, 0].set(title='Original Time Data (with Outliers)', xlabel='Time')
    sns.boxplot(x=time_data_filtered, ax=axes[0, 1])
    axes[0, 1].set(title='Time Data After Removing Outliers', xlabel='Time')

    sns.histplot(time_data_original, bins=50, kde=True, ax=axes[1, 0])
    axes[1, 0].set(title='Histogram of Original Time Data', xlabel='Time')
    axes[1, 0].set_xlim(left=0)
    sns.histplot(time_data_filtered, bins=50, kde=True, ax=axes[1, 1])
    axes[1, 1].set(title='Histogram of Time Data (Without Outliers)', xlabel='Time')
    axes[1, 1].set_xlim(left=0)

    fig.tight_layout()
    plt.show()

# --- 2. Negative durations -> 0 ----------------------------------------------
# The original author checked (by joining requests with Time <= 0 against the
# labelled reviews on AuthorId/RecipeId) whether such requests always have
# Like == False; that turned out NOT to be the case, so the rows are kept and
# only clamped.
requests_data.loc[requests_data["Time"] <= 0, "Time"] = 0

# --- 3. Preference flags -----------------------------------------------------
preference_columns = ["HighCalories", "HighProtein", "LowFat", "LowSugar", "HighFiber"]
# Raw distinct values per flag column, recorded before any conversion.
unique_values = {column: requests_data[column].unique() for column in preference_columns}

# HighProtein has the two values "Indifferent" and "Yes":
# "Indifferent" -> False, "Yes" -> True.
requests_data.loc[requests_data["HighProtein"] == "Indifferent", "HighProtein"] = 0
requests_data.loc[requests_data["HighProtein"] == "Yes", "HighProtein"] = 1

# LowSugar has the two values "0" and "Indifferent". Author's interpretation:
# "0" = user does not need low-sugar content, "Indifferent" = user does not
# care. Encoded as "0" -> False, "Indifferent" -> True.
requests_data.loc[requests_data["LowSugar"] == "0", "LowSugar"] = 0
requests_data.loc[requests_data["LowSugar"] == "Indifferent", "LowSugar"] = 1

# All five flag columns end up as bool.
for flag_column in preference_columns:
    requests_data[flag_column] = requests_data[flag_column].astype("bool")

# --- 4. Duplicates -----------------------------------------------------------
# A request is identified by (AuthorId, RecipeId); the integration step merges
# on exactly this key, so duplicates must be checked on the key rather than on
# complete rows (a key duplicate would multiply rows in the left merge).
duplicate_entries_counts = requests_data.duplicated(subset=["AuthorId", "RecipeId"]).sum()
print("Number of duplicate entries:", duplicate_entries_counts)

# Result of this cell:
#   * columns: AuthorId, RecipeId, Time, HighCalories, HighProtein, LowFat,
#     LowSugar, HighFiber
#   * the five preference flags are bool, Time is float
#   * Time: rounded to one decimal, outliers outside 4*IQR removed, values
#     <= 0 set to 0
#   * the recorded run kept 132993 rows (about 94.86 %) after outlier removal
requests_data.describe(include="all")


132993 94.8628695745212
Number of duplicate entries: 0


Unnamed: 0,AuthorId,RecipeId,Time,HighCalories,HighProtein,LowFat,LowSugar,HighFiber
count,132993,132993.0,132993.0,132993,132993,132993,132993,132993
unique,47423,,,2,2,2,2,2
top,1930181E,,,False,False,False,False,False
freq,816,,,79416,79967,93160,93108,79713
mean,,153198.59373,2996.475371,,,,,
std,,130559.843136,2729.684319,,,,,
min,,40.0,0.0,,,,,
25%,,47111.0,1200.6,,,,,
50%,,109817.0,2397.6,,,,,
75%,,233056.0,3602.0,,,,,


### 4. recipes.csv

In [5]:
recipes_data = pd.read_csv("../aufgabe/training_dataset/recipes.csv")
# Outlier cut-off: values further than `threshold` standard deviations from
# the mean (measured on the log-transformed column) are removed.
threshold = 3.5


def _log_positive(values, shift=0):
    """Return log(x + shift) for entries x > 0; leave all other entries (<= 0, NaN) unchanged."""
    positive = values > 0
    # Masking first keeps np.log away from non-positive inputs.
    return np.log(values.where(positive) + shift).where(positive, values)


def _sigma_limits(values, n_sigma):
    """Return (lower, upper) = mean -/+ n_sigma * std of ``values``."""
    center, spread = values.mean(), values.std()
    return center - n_sigma * spread, center + n_sigma * spread


def _within(values, limits):
    """Boolean mask of entries inside the closed interval ``limits``; NaN entries are False."""
    lower, upper = limits
    return values.between(lower, upper)


def _parse_r_vector(column):
    """Turn R-style ``c("a", "b")`` strings into lists of strings.

    ``character(0)`` (R's empty vector) becomes ``['']``. Tokens are split on
    "," without trimming, and every ")" is removed, so items that themselves
    contain "," or ")" are not preserved faithfully.
    """
    return (
        column
        .str.replace('character(0)', '', regex=False)
        # Remove only the literal `c(` prefix. The previous lstrip('"c("')
        # stripped a *set* of characters and could eat a leading "c" of the
        # first item.
        .str.replace(r'^"?c\(', '', regex=True)
        .str.replace('"', '', regex=False)
        .str.replace(")", "", regex=False)
        .str.replace('\\', '', regex=False)
        .str.split(",")
    )


# (lower, upper) limits used per column, kept for later inspection.
outlier_limits = {}

# --- CookTime / PrepTime -------------------------------------------------------
# Both columns are log-transformed and both limit pairs are computed on the
# unfiltered data before any row is dropped.
for column in ("CookTime", "PrepTime"):
    recipes_data[column] = _log_positive(recipes_data[column])
    outlier_limits[column] = _sigma_limits(recipes_data[column], threshold)

recipes_data = recipes_data[
    _within(recipes_data["CookTime"], outlier_limits["CookTime"])
    & _within(recipes_data["PrepTime"], outlier_limits["PrepTime"])
].copy()
# NOTE(review): the label says "vorher" (before), but this is printed after the
# CookTime/PrepTime filter.
print("Shape vorher: ", recipes_data.shape)

# --- RecipeCategory -------------------------------------------------------------
recipes_data["RecipeCategory"] = recipes_data["RecipeCategory"].astype("category")

# --- RecipeIngredientQuantities / RecipeIngredientParts -> list of strings ------
for column in ("RecipeIngredientQuantities", "RecipeIngredientParts"):
    recipes_data[column] = _parse_r_vector(recipes_data[column])

# --- Nutrient columns -----------------------------------------------------------
# Processed one after another: transform, compute limits on the rows that are
# still present, then drop the outliers (rows with NaN in the column are dropped
# too). Calories uses log(x); the other columns use log(x + 1).
# NOTE(review): with shift 0, x = 0 stays 0 while 0 < x < 1 becomes negative.
nutrient_shifts = [
    ("Calories", 0),
    ("FatContent", 1),
    ("SaturatedFatContent", 1),
    ("CholesterolContent", 1),
    ("SodiumContent", 1),
    ("CarbohydrateContent", 1),
    ("FiberContent", 1),
    ("SugarContent", 1),
    ("ProteinContent", 1),
]
for column, shift in nutrient_shifts:
    recipes_data[column] = _log_positive(recipes_data[column], shift)
    outlier_limits[column] = _sigma_limits(recipes_data[column], threshold)
    recipes_data = recipes_data[_within(recipes_data[column], outlier_limits[column])].copy()

# --- RecipeServings / RecipeYield -----------------------------------------------
# RecipeServings: number of portions the recipe yields.
# RecipeYield:    amount produced (e.g. 1/2 litre of soup = 2 portions).
# Standardising RecipeYield was tried but there are > 2000 distinct units, so
# the column is dropped and RecipeServings is used instead; its missing values
# are meant to be imputed with a random forest.
# RecipeServings is badly distributed (most values between 0 and 10, some
# > 1000), so it is log-transformed, trimmed, and discretised into the classes
# small / medium / large, stored one-hot encoded in three columns.
recipes_data = recipes_data.drop(columns=["RecipeYield"])

recipes_data["RecipeServings"] = _log_positive(recipes_data["RecipeServings"])
outlier_limits["RecipeServings"] = _sigma_limits(recipes_data["RecipeServings"], threshold)

# Unlike the other columns, rows with a missing RecipeServings are kept here.
servings = recipes_data["RecipeServings"]
recipes_data = recipes_data[
    _within(servings, outlier_limits["RecipeServings"]) | servings.isna()
].copy()
print("Shape nach Kürzung der Outlier: ", recipes_data.shape)

# Class boundaries are the quartiles of log(RecipeServings) after trimming.
# Values noted by the original author (measured before the trimming step above):
#   25% = 1.386294, 50% = 1.791759, 75% = 2.079442
recipes_data_description = recipes_data.describe()
servings_q25 = recipes_data_description.loc["25%", "RecipeServings"]
servings_q75 = recipes_data_description.loc["75%", "RecipeServings"]

# small: <= Q1, medium: (Q1, Q3], large: > Q3. Rows with a missing
# RecipeServings get 0 in all three columns.
servings = recipes_data["RecipeServings"]
recipes_data["RecipeServings_small"] = (servings <= servings_q25).astype("int64")
recipes_data["RecipeServings_medium"] = (
    (servings > servings_q25) & (servings <= servings_q75)
).astype("int64")
recipes_data["RecipeServings_large"] = (servings > servings_q75).astype("int64")

# --- Random forest for the missing RecipeServings ------------------------------
features = ['CookTime', 'PrepTime', 'Calories', 'FatContent', 'FiberContent', 'ProteinContent']

# Rows with a known RecipeServings are used for training/validation; rows
# without one are the candidates for imputation.
known_servings = recipes_data[recipes_data['RecipeServings'].notna()]
unknown_servings = recipes_data[recipes_data['RecipeServings'].isna()]

X = known_servings[features]
# NOTE(review): the target is the binary "small" flag, fitted with a regressor.
# Confirm whether the continuous RecipeServings (followed by discretisation)
# or a classifier was intended.
y_known = known_servings['RecipeServings_small']

X_train, X_val, y_train, y_val = train_test_split(X, y_known, test_size=0.15, random_state=random_seed)

model = RandomForestRegressor(n_estimators=50, random_state=random_seed)
model.fit(X_train, y_train)

# Evaluate on the held-out 15 %.
y_pred = model.predict(X_val)
mse = mean_squared_error(y_val, y_pred)
r_squared = model.score(X_val, y_val)
print('R Squared: ', r_squared)
print('MSE Mean Squred Error: ', mse)

# Validation targets next to the predictions, with the RecipeId for lookup.
y_val_pred = pd.DataFrame({'y_val': y_val, 'y_pred': y_pred})
recipe_ids_val = recipes_data.loc[y_val.index, 'RecipeId']
y_val_pred['RecipeId'] = recipe_ids_val
print(y_val_pred.head(20))

# TODO: the model is not yet applied to `unknown_servings`; rows with a missing
# RecipeServings therefore keep NaN and all-zero class columns.
# Draft of the imputation step:
# X_unknown = unknown_servings[features]
# y_unknown_pred = model.predict(X_unknown)






Shape vorher:  (73253, 18)
Shape nach Kürzung der Outlier:  (72073, 17)
R Squared:  0.2908043238074334
MSE Mean Squred Error:  0.17361849290447004
       y_val  y_pred  RecipeId
23717      1    1.00    446595
24753      0    0.92     26389
32151      0    0.24    117789
21265      1    0.92     42256
9118       0    0.42    238978
43051      1    0.54    230105
68551      0    0.30      4174
15959      0    0.28     46544
6868       0    0.20    334664
59608      0    0.14       361
51736      0    0.34    127257
71585      0    0.24    361751
46800      0    0.32    280855
70073      1    0.54    414221
61243      1    0.28    327645
73773      0    0.24    179599
32079      1    0.54    132565
45637      0    0.14     79708
17698      1    0.10    277619
10983      1    0.62    130653


## II. Feature Engineering


## III. Data Integration

In [84]:
# Data integration: work on the frames cleaned in section I instead of
# re-reading the raw CSV files.
diet_df, recipes_df, requests_df, reviews_df = (
    users_diet,
    recipes_data,
    requests_data,
    reviews_data,
)
print (requests_data.describe(include="all"))

# The reviews (all rows, with and without TestSetId) are the left-hand side of
# every join, so each review row is kept even without a match on the right:
#   reviews -> diet      on AuthorId
#           -> requests  on (AuthorId, RecipeId)
#           -> recipes   on RecipeId
combined_df1 = reviews_df.merge(diet_df, how='left', on='AuthorId')
combined_df2 = combined_df1.merge(requests_df, how='left', on=['AuthorId', 'RecipeId'])
final_combined_df = combined_df2.merge(recipes_df, how='left', on='RecipeId')

# Persist the integrated table (without the index) and report where it went.
csv_file_path = 'final_table.csv'
final_combined_df.to_csv(csv_file_path, index=False)
print (csv_file_path)



        AuthorId       RecipeId           Time HighCalories HighProtein  \
count     132993  132993.000000  132993.000000       132993      132993   
unique     47423            NaN            NaN            2           2   
top     1930181E            NaN            NaN        False       False   
freq         816            NaN            NaN        79416       79967   
mean         NaN  153198.593730    2996.475371          NaN         NaN   
std          NaN  130559.843136    2729.684319          NaN         NaN   
min          NaN      40.000000       0.000000          NaN         NaN   
25%          NaN   47111.000000    1200.600000          NaN         NaN   
50%          NaN  109817.000000    2397.600000          NaN         NaN   
75%          NaN  233056.000000    3602.000000          NaN         NaN   
max          NaN  541195.000000   16019.200000          NaN         NaN   

        LowFat LowSugar HighFiber  
count   132993   132993    132993  
unique       2        2    