### Analytics Cup 2024 - Max Script

#### **Set up**

In [None]:
# Importing the packages
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns
import shap
from IPython.display import display, HTML


In [None]:
# Setting the seed (pandas, statsmodels, matplotlib and y_data_profiling rely on numpy's random generator, and thus we need to set the seed in numpy).
# The same value is reused below as `random_state` for the train/test split and the tree models.
seed = 2024
np.random.seed(seed)

In [None]:
# Load the four raw tables; the CSV files are read relative to the working directory,
# in the order diet -> recipes -> requests -> reviews.
diet_df, recipes_df, requests_df, reviews_df = map(
    pd.read_csv,
    ("diet.csv", "recipes.csv", "requests.csv", "reviews.csv"),
)

#### **Data Understanding and Cleaning**

In [None]:
# General Notes: 
# One diet per AuthorId
# One recipe per RecipeId 
# Several requests per author (But only one request per author per recipe) 
# Several reviews per author (But only one review per author per recipe) 

# 1) Merged diets and reviews -> Dataset with reviews and diet information 
# 2) Merged (diets & reviews) with requests on AuthorId and RecipeId


#### Diet Understanding ✅

In [None]:
# Get an overview of the diet dataset: first rows, schema, and missing values.
print(diet_df.head())
print()
# DataFrame.info() writes its report itself and returns None, so it must not be
# wrapped in print() (that would append a stray "None" line).
diet_df.info()
print()
print(diet_df.isnull().sum())  # original author's note: 1 missing value in the "Diet" column
print()

# Rows that contain at least one missing value
# (original author's note: AuthorId: 5, Diet: NaN)
print(diet_df[diet_df.isnull().any(axis=1)])


In [None]:
# Summary statistics of the numeric columns in the diet dataset
print(diet_df.describe())

# Boxplot of the age column. Age is plotted on the y-axis, so that is the axis
# that gets the "Age" label; a boxplot has no frequency axis.
sns.boxplot(y=diet_df["Age"], color="skyblue")
plt.title("Age Boxplot")
plt.ylabel("Age")
plt.show()

# Bar chart with the number of authors per diet, each bar annotated with its count
ax = sns.countplot(x='Diet', data=diet_df)
for p in ax.patches:
    # get_height() is a float; format without decimals so counts read as integers
    ax.annotate(f'{p.get_height():.0f}', (p.get_x() + p.get_width() / 2, p.get_height()), ha='center', va='bottom')
ax.set_title("Authors per diet")
ax.set_ylabel("Count")
plt.show()


In [None]:
# Count distinct authors and repeated AuthorIds in the diet dataset.
# Original author's note: 271907 unique authors --> every row is a unique author.
diet_author_ids = diet_df['AuthorId']
print(diet_author_ids.nunique())
print(diet_author_ids.duplicated().sum())

In [None]:
# General Insights: 
# - 3 attributes
# - In the diet column we have 1 row with a null value 


#### Diet Cleaning ✅

In [None]:
# Remove every diet row that has a missing value.
# Potential implication: authors removed here disappear from the merged dataset,
# because the later merges on AuthorId only keep matching rows.
diet_df = diet_df.dropna(axis=0, how="any")

# Dummy-encode the "Diet" column; the first level is dropped and acts as the baseline.
diet_cleaned = pd.get_dummies(data=diet_df, columns=["Diet"], drop_first=True)

diet_cleaned



#### Recipes Understanding 🚧

In [None]:
# First look at the recipes table: sample rows, then missing values per column.
display(recipes_df.head())
print()
# Original author's notes on the counts: no missing values in ProteinContent,
# 26713 missing in RecipeServings, 50295 missing in RecipeYield.
recipes_missing_counts = recipes_df.isnull().sum()
print(recipes_missing_counts)

In [None]:
# CookTime exploration
#
# Problems (original author's notes):
# - Values are incredibly far apart and don't make sense.
# - Probably recorded in seconds?
#
# What to do?
# - Delete abnormalities
# - Normalize it?

recipe_cook_time = recipes_df['CookTime']

# Rows where CookTime is missing (original author's note: 0 missing values)
print(recipes_df[recipe_cook_time.isnull()])

# Summary statistics of CookTime
print(recipe_cook_time.describe())

# Boxplot of CookTime (the values are very far apart)
sns.boxplot(y=recipe_cook_time, color="skyblue")

In [None]:
# PrepTime exploration
#
# Problems (original author's notes):
# - Values are incredibly far apart and don't make sense.
# - Probably recorded in seconds?
#
# What to do?
# - Delete outliers
# - Normalize it?

recipe_prep_time = recipes_df['PrepTime']

# Rows where PrepTime is missing (original author's note: 0 missing values)
print(recipes_df[recipe_prep_time.isnull()])

# Summary statistics of PrepTime
print(recipe_prep_time.describe())

# Boxplot of PrepTime (there are outliers, but most values sit near the bottom)
sns.boxplot(y=recipe_prep_time, color="skyblue")

In [None]:
# Describing the RecipeCategory column

# Rows where RecipeCategory is missing (original author's note: 0 missing values)
print(recipes_df[recipes_df['RecipeCategory'].isnull()])

# Number of recipes per category
# (original author's note: 7 unique categories, most recipes fall into "Other")
print(recipes_df['RecipeCategory'].value_counts())

# Draw the bar chart directly; wrapping the call in print() only printed the
# Axes object's repr in addition to the figure.
sns.countplot(x='RecipeCategory', data=recipes_df)
plt.show()

# Problems:
# - Recipes in the category "Other" don't tell us much.
# - We also don't have "Dinner".

# What to do?
# - Somehow split up the "Other" category
# - Delete the "Other" category
# - Compare this column with the requests and see what categories could make sense.


In [None]:
# RecipeIngredientQuantities exploration
#
# Insights (original author's notes):
# - Not all values are unique (e.g. "\"1\"", "\"1\"", "\"1\"" exists 211 times)
#
# Problems / what to do: still open.

recipe_ingredient_quantities = recipes_df['RecipeIngredientQuantities']

# Rows where the column is missing (original author's note: 0 missing values)
print(recipes_df[recipe_ingredient_quantities.isnull()])

# Frequency of each distinct quantity string
print(recipe_ingredient_quantities.value_counts())





#### Recipe Cleaning 🚧

In [None]:
# Start the cleaned recipe table from the raw one. This binds a second name to the
# same DataFrame object (no copy is made).
recipes_cleaned = recipes_df

In [None]:
# Name column: removed from the cleaned table (built from the raw recipes frame).
recipes_cleaned = recipes_df.drop(labels=['Name'], axis='columns')
recipes_cleaned

In [None]:
# Handling CookTime and PrepTime Column

# TODO: Handle outliers


In [None]:
# Handle RecipeCategory column: dummy-encode it (first level dropped as baseline).
# The guard keeps the cell re-runnable: once encoded, the original column is gone
# and a second pd.get_dummies call on it would raise a KeyError.
if 'RecipeCategory' in recipes_cleaned.columns:
    recipes_cleaned = pd.get_dummies(recipes_cleaned, columns=['RecipeCategory'], drop_first=True)


In [None]:
# Handling RecipeIngredientQuantities Column (TEMPORARY -> TODO: FIX)
# errors="ignore" keeps the cell re-runnable after the column has already been dropped.
recipes_cleaned = recipes_cleaned.drop(columns=['RecipeIngredientQuantities'], errors="ignore")


In [None]:
# Handling RecipeIngredientParts Column (TEMPORARY -> TODO: FIX)
# errors="ignore" keeps the cell re-runnable after the column has already been dropped.
recipes_cleaned = recipes_cleaned.drop(columns=['RecipeIngredientParts'], errors="ignore")

In [None]:
# Handle Nutritional Facts Columns 

In [None]:
# Handle RecipeServings Column (Temporary: TODO: FIX)
# errors="ignore" keeps the cell re-runnable after the column has already been dropped.
recipes_cleaned = recipes_cleaned.drop(columns=['RecipeServings'], errors="ignore")

In [None]:
# Handle RecipeYield Column (Temporary: TODO: FIX)
# errors="ignore" keeps the cell re-runnable after the column has already been dropped.
recipes_cleaned = recipes_cleaned.drop(columns=['RecipeYield'], errors="ignore")

In [None]:
# Display the current state of the cleaned recipe table (last expression of the cell).
recipes_cleaned

#### Requests Understanding

In [None]:
# General Request Insights: 
# - No missing values 

# - 90847 duplicate authors --> More than one request per author
# - 0 duplicate AuthorID + RecipeID combinations --> Every author has only one request per recipe


In [None]:
# First look at the requests table: show the leading rows, followed by a blank line.
# (Original author's note: the table has no missing values.)
display(requests_df.head(n=5))
print()

In [None]:
# Describing the numerical columns. The summary is not the last expression of the
# cell, so it has to be displayed explicitly to be visible.
display(requests_df.describe())

# Clean on an independent copy. A plain assignment would only alias the raw frame,
# so every `requests_cleaned[...] = ...` below would overwrite requests_df as well
# and re-running a `.map(...)` cleaning cell would turn the column into NaN.
requests_cleaned = requests_df.copy()

In [None]:
# Duplicate checks on the requests table.

# Repeated AuthorIds (original author's note: 90847 --> more than one request per author)
request_author_repeats = requests_df['AuthorId'].duplicated()
print(request_author_repeats.sum())

# Repeated (AuthorId, RecipeId) pairs
# (original author's note: 0 --> every author has only one request per recipe)
request_pair_repeats = requests_df.duplicated(subset=['AuthorId', 'RecipeId'])
print(request_pair_repeats.sum())


#### Requests Cleaning

In [None]:
# "Time" column: round to the nearest whole number, then store as integer.
rounded_request_time = requests_df["Time"].round()
requests_cleaned["Time"] = rounded_request_time.astype(int)



In [None]:
# HighCalories column: cast the flag to an integer dtype.
requests_cleaned['HighCalories'] = requests_df['HighCalories'].astype(int)



In [None]:
# HighProtein column: encode as 1 (high protein requested) / 0 (indifferent).
# Values not listed in the mapping become NaN.
high_protein_codes = {'Yes': 1, 'Indifferent': 0}
requests_cleaned['HighProtein'] = requests_df['HighProtein'].map(high_protein_codes)

In [None]:
# Handling the LowFat Column (Nothing)

In [None]:
# LowSugar column: encode as 1 (low sugar requested, stored as the string '0') / 0 (indifferent).
# Values not listed in the mapping become NaN.
low_sugar_codes = {'0': 1, 'Indifferent': 0}
requests_cleaned['LowSugar'] = requests_df['LowSugar'].map(low_sugar_codes)

In [None]:
# Handling the HighFiber Column (Nothing)


In [None]:
# Display the first rows of the cleaned requests table to check the recoded columns.
requests_cleaned.head()


#### Reviews Understanding

In [None]:
# First look at the reviews table: sample rows, then missing values per column.
# Original author's note: missing values occur in "Rating", "Like" and "TestSetId".
display(reviews_df.head())
print()
reviews_missing_counts = reviews_df.isnull().sum()
print(reviews_missing_counts)

In [None]:
# Find duplicate reviews.
# Both checks must run on reviews_df. The second one previously inspected
# requests_df (copy-paste slip), so its recorded result of 0 described the requests
# table, and the author count noted for the first check was carried over from the
# requests section as well -- re-read both numbers from the output below.
print(reviews_df['AuthorId'].duplicated().sum())  # > 0 --> more than one review per author
print(reviews_df.duplicated(subset=['AuthorId', 'RecipeId']).sum())  # 0 --> at most one review per author and recipe

In [None]:
# Frequency of each value in the "Rating" column
# (original author's note: the column is either NaN or 2.0).
review_rating_counts = reviews_df["Rating"].value_counts()
print(review_rating_counts)

In [None]:
# Frequency of each value in the "Like" column
# (plan: recode True as 1 and False as 0).
review_like_counts = reviews_df['Like'].value_counts()
print(review_like_counts)

In [None]:
# Splitting the Dataset into Modeling and Submission
# (only the working name is set up here; the split below is commented out)

# Binds a second name to the raw reviews frame (same object, no copy).
reviews_df_cleaned = reviews_df

# reviews_df

# reviews_df_submission = reviews_df[reviews_df["Like"].isnull()]

# reviews_df_modelling = reviews_df[reviews_df["Like"].notnull()]

# What's the deal here? 
# - reviews_df -> 140195 rows 
# - reviews_df_submission -> 42814 rows 
# - reviews_df_modelling -> 97381 rows

# What did Marcel do with the TestSetId?

# print(reviews_df.shape[0])
# print(reviews_df_submission.shape[0])
# print(reviews_df_modelling.shape[0])

#### Reviews Cleaning

In [None]:
# Handling "Rating" Column (dropping it)
# errors="ignore" keeps the cell re-runnable after the column has already been dropped.
reviews_df_cleaned = reviews_df_cleaned.drop(columns=['Rating'], errors="ignore")



In [None]:
# "Like" column: recode True as 1 and False as 0; values not in the mapping become NaN.
like_codes = {True: 1, False: 0}
reviews_df_cleaned['Like'] = reviews_df_cleaned['Like'].map(like_codes)

In [None]:
# Handling "TestSetId" Column (Dropping it)

#reviews_df_cleaned = reviews_df_cleaned.drop(columns=['TestSetId'])

In [None]:
# Display the current state of the cleaned reviews table (last expression of the cell).
reviews_df_cleaned

#### **Data Merging**

In [None]:
# Build the modelling table by joining, in order:
#   reviews -> diet (per author) -> requests (per author and recipe) -> recipes (per recipe).
# Every join uses pandas' default inner join, so rows without a match are dropped.
full_df = (
    reviews_df_cleaned
    .merge(diet_cleaned, on="AuthorId")
    .merge(requests_cleaned, on=["AuthorId", "RecipeId"])
    .merge(recipes_cleaned, on="RecipeId")
)

full_df

In [None]:
# Separate rows by whether the target is known:
# rows without a "Like" value are to be predicted, the rest are used for modelling.
like_is_missing = full_df["Like"].isnull()

full_df_submission = full_df[like_is_missing]
full_df_modelling = full_df[~like_is_missing]


## Modelling

In [None]:
# Logistic Regression: 
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import balanced_accuracy_score

In [None]:
# Build the feature matrix / target and hold out 30% of the rows for testing.

# Identifier and target columns are not used as features.
non_feature_columns = ["AuthorId", "RecipeId", "Like", "TestSetId"]
X = full_df_modelling.drop(columns=non_feature_columns)
y = full_df_modelling["Like"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, shuffle=True, random_state=seed
)

# Modeling
# The features live on very different scales, and with raw inputs the solver was cut
# off after 30 iterations before it could converge. Standardising the features inside
# a pipeline and allowing more iterations lets the fit actually converge; the scaler
# is fitted on the training split only, so nothing leaks from the test split.
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

logreg_model = make_pipeline(StandardScaler(), LogisticRegression(max_iter=1000))
logreg_model.fit(X_train, y_train)

# Predicting
predictions = logreg_model.predict(X_test)

# How many test rows were assigned to each class
count_1s = np.count_nonzero(predictions == 1)
count_0s = np.count_nonzero(predictions == 0)

print(f"Number of 1s: {count_1s}")
print(f"Number of 0s: {count_0s}")

# Evaluating the model
print(balanced_accuracy_score(y_test, predictions))

# Random forest baseline

from sklearn.ensemble import RandomForestClassifier

# Small forest (5 trees) with a fixed seed; fit() returns the fitted estimator itself.
rf_model = RandomForestClassifier(n_estimators=5, random_state=seed).fit(X_train, y_train)

# Predict the held-out rows
predictions = rf_model.predict(X_test)

# Report each metric under its own heading, in the order:
# confusion matrix, classification report, balanced accuracy.
for heading, metric in (
    ("confusion matrix", confusion_matrix),
    ("classification report", classification_report),
    ("balanced accuracy score", balanced_accuracy_score),
):
    print(heading)
    print(metric(y_test, predictions))

# How many test rows were assigned to each class
count_1s = int((predictions == 1).sum())
count_0s = int((predictions == 0).sum())

print()
print(f"Number of 1s: {count_1s}")
print(f"Number of 0s: {count_0s}")



### Finding best Classifier

In [None]:
# Candidate model families for the grid search:
#   1. LogisticRegression --> scale or increase number of iterations!!
#   2. RandomForestClassifier
#   3. GradientBoostingClassifier

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

# The tree ensembles get an explicit random_state. Without it they draw from NumPy's
# global random state, so CV scores and the selected "best" configuration can change
# between runs (notably with n_jobs > 1) even though the notebook sets a seed.
model_logistic_regression = LogisticRegression(max_iter=10)
model_random_forest = RandomForestClassifier(random_state=seed)
model_gradient_boosting = GradientBoostingClassifier(random_state=seed)

# Single-step pipeline; the "model" step is filled in by each parameter grid below.
pipeline = Pipeline(steps=[("model", None)])

#parameter_grid_preprocessing = {
#  "pca__n_components" : [1, 2, 3, 4],
#}

parameter_grid_logistic_regression = {
  "model" : [model_logistic_regression],
  "model__C" : [0.1, 1, 10],  # inverse regularization strength
}

parameter_grid_gradient_boosting = {
  "model" : [model_gradient_boosting],
  "model__n_estimators" : [10, 20, 30, 50]
}

parameter_grid_random_forest = {
  "model" : [model_random_forest],
  "model__n_estimators" : [1, 2, 3, 4, 5, 20, 50, 80],  # number of trees in the forest
  "model__max_depth" : [None, 50, 100],
}

# Grids that take part in the search (logistic regression is currently switched off).
meta_parameter_grid = [#parameter_grid_logistic_regression,
                       parameter_grid_random_forest
                       ,parameter_grid_gradient_boosting
]

#meta_parameter_grid = [{**parameter_grid_preprocessing, **model_grid}
#                       for model_grid in meta_parameter_grid]

search = GridSearchCV(pipeline,
                      meta_parameter_grid,
                      scoring="balanced_accuracy",
                      n_jobs=2,
                      cv=5,  # number of folds for cross-validation
                      error_score="raise"  # fail loudly instead of recording a NaN score
)
# here, the actual training and grid search happens
search.fit(X_train, y_train.values.ravel())

print("best parameter:", search.best_params_ ,"(CV score=%0.3f)" % search.best_score_)

In [None]:
# for a detailed look on the performance of the different models
def get_search_score_overview(cv_results=None):
    """Print every evaluated parameter combination next to its mean CV test score.

    Parameters
    ----------
    cv_results : mapping, optional
        A ``cv_results_``-style mapping with the keys ``"params"`` and
        ``"mean_test_score"``. When omitted, the results of the notebook-level
        ``search`` object are used, which is the previous behaviour.

    Returns
    -------
    None
        The overview is written to stdout, one combination per line.
    """
    if cv_results is None:
        # Fall back to the grid search fitted earlier in the notebook.
        cv_results = search.cv_results_
    for params, mean_score in zip(cv_results["params"], cv_results["mean_test_score"]):
        print(params, mean_score)

# The helper prints each line itself and returns None, so it is called directly;
# wrapping it in print() only appended a stray "None" to the output.
get_search_score_overview()

### Interpretability

In [None]:
import shap

# Explain a small random forest (5 trees, fixed seed) trained on the training split;
# fit() returns the fitted estimator itself.
model = RandomForestClassifier(n_estimators=5, random_state=seed).fit(
    X_train, y_train.to_numpy().ravel()
)

# Shapley values and pairwise interaction values for every training row
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X_train)
shap_interaction_values = explainer.shap_interaction_values(X_train)

# Value reported by the explainer as its expected model output
expected_value = explainer.expected_value
print(expected_value)

In [None]:
# class dependent plots of shapley values for each feature
# The modelling frame has no "variety" column (that lookup raised an AttributeError),
# so the class labels are taken from the fitted model instead.
for i, c in enumerate(model.classes_):
    # Depending on the SHAP version, shap_values is either a list with one array per
    # class or a single array whose last axis indexes the class -- support both.
    if isinstance(shap_values, list):
        class_shap_values = shap_values[i]
    else:
        class_shap_values = shap_values[..., i]
    shap.summary_plot(class_shap_values, X_train, show=False)
    plt.title("Shapley values for "+str(c))
    plt.show()

## Generating the output

In [None]:
import numpy as np

# Predict the submission rows with the winner of the grid search. Previously the
# untuned 5-tree baseline `rf_model` was used, so the tuning had no effect on the
# submitted predictions. GridSearchCV refits the best configuration on the data
# passed to fit() by default, which is what best_estimator_ exposes.
# Selecting X.columns guarantees the same features, in the same order, as in training.
submission_features = full_df_submission[X.columns]
submission_predictions = search.best_estimator_.predict(submission_features)



In [None]:
# Assemble the submission file: one row per test-set id with its predicted label,
# both stored as integers and ordered by id.
submission_ids = full_df_submission.TestSetId.astype(int)
submission_labels = submission_predictions.astype(int)

output = pd.DataFrame({'id': submission_ids, 'prediction': submission_labels}).sort_values('id')

# Written one directory above the notebook, without the DataFrame index.
output.to_csv('../predictions_analytics_acrobots_1.csv', index=False)



