# Team Name: **Analytics Acrobats**

### Submission: **1**

## Setup

In [None]:
# Import required packages

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Fix NumPy's global random state so repeated runs of the notebook
# give the same results.
seed = 2024
np.random.seed(seed=seed)

In [None]:
# Read the four raw CSV tables from the working directory
# (read in the same order as the names on the left-hand side).

diet, requests, reviews, recipes = map(
    pd.read_csv,
    ('diet.csv', 'requests.csv', 'reviews.csv', 'recipes.csv'),
)

## Data Understanding & Cleaning

### diet.csv

#### Overview

In [None]:
# Get overview of the diet dataset: first rows, column types and the
# number of missing values per column.

print(diet.head())
# DataFrame.info() writes its report to stdout itself and returns None,
# so it is called directly; wrapping it in print() adds a stray "None".
diet.info()
print(diet.isnull().sum())

# --> One missing value in "Diet" column

In [None]:
# Descriptives for diet dataset

print(diet.describe())

# Boxplot of the dataset (intended for the age column - TODO confirm that
# age is the only numeric column that ends up in the plot).
# Each plot gets its own figure; without that both plots are drawn onto
# the same implicit axes and overlay each other.
plt.figure()
sns.boxplot(data=diet)
plt.show()

# Count plot of the "Diet" categories
plt.figure()
sns.countplot(x='Diet', data=diet)
plt.show()

In [None]:
# List every row of the diet dataset that has at least one missing value

print(diet.loc[diet.isna().any(axis='columns')])

# what does this user do in the other tables? -> user has no requests or reviews

#### Handle Missing Values

In [None]:
# Only a single row is affected, so every row that contains a missing
# value is simply removed from the diet dataset

diet = diet.dropna(axis=0, how='any')

#### Fix data types

In [None]:
# Store the "Diet" column with the categorical dtype

diet = diet.astype({'Diet': 'category'})

#### Convert categorical to numeric

In [None]:
# Replace "Diet" by dummy columns; drop_first=True leaves out the first
# category so that it acts as the reference level

diet = pd.get_dummies(data=diet, columns=['Diet'], drop_first=True)

In [None]:
# Display the cleaned diet dataset
diet

### recipes.csv

#### Overview

In [None]:
# Get overview of the recipes dataset: first rows, column types and the
# number of missing values per column.

print(recipes.head())
# DataFrame.info() writes its report to stdout itself and returns None,
# so it is called directly; wrapping it in print() adds a stray "None".
recipes.info()
print(recipes.isnull().sum())

# --> Missing values in columns "RecipeServings" and "RecipeYield"

In [None]:
# Descriptives for recipes dataset

print(recipes.describe())

# Frequency table of the recipe categories
print(recipes['RecipeCategory'].value_counts())

# Count plot of the recipe categories. The plot is created on its own
# figure and shown explicitly; print() on the returned Axes would only
# emit its text representation.
plt.figure()
sns.countplot(x='RecipeCategory', data=recipes)
plt.show()

#### Handle name column

In [None]:
# The recipe name is not used for now, so the column is removed

recipes = recipes.drop(labels=['Name'], axis='columns')

# TO DO: Do we need the column?

#### Handle CookTime and PrepTime column

In [None]:
# Neither column has missing values

# TODO: Handle outliers in CookTime and PrepTime

#### Handle RecipeCategory column

In [None]:
# Replace "RecipeCategory" by dummy columns; drop_first=True leaves out
# the first category so that it acts as the reference level

recipes = pd.get_dummies(data=recipes, columns=['RecipeCategory'], drop_first=True)

#### Handle RecipeIngredientQuantities and RecipeIngredientParts column

In [None]:
# The two ingredient columns are not used for now and are removed

recipes = recipes.drop(columns=['RecipeIngredientQuantities', 'RecipeIngredientParts'])

# TO DO: Handle them better

#### Handle all nutrition fact columns

In [None]:
# TO DO: Handle potential outliers

#### Handle RecipeServings and RecipeYield column

In [None]:
# The servings and yield columns are not used for now and are removed

recipes = recipes.drop(columns=['RecipeServings', 'RecipeYield'])

# TO DO: Handle missing values, outliers and weird values in RecipeYield

In [None]:
# Display the cleaned recipes dataset
recipes

### reviews.csv

#### Overview

In [None]:
# Get overview of the reviews dataset: first rows, column types and the
# number of missing values per column.

print(reviews.head())
# DataFrame.info() writes its report to stdout itself and returns None,
# so it is called directly; wrapping it in print() adds a stray "None".
reviews.info()
print(reviews.isnull().sum())

# --> Missing values in columns "Rating", "Like" and "TestSetId"

In [None]:
# Summary statistics of the reviews dataset, followed by the value
# frequencies of the "Rating" and "Like" columns

print(reviews.describe())

for column in ('Rating', 'Like'):
    print(reviews[column].value_counts())

# "Like" --> make true = 1 and false = 0

#### Split dataset into rows to predict and rows to use

In [None]:
# Separate the reviews into the submission rows whose "Like" value has to
# be predicted and the remaining rows that can be used for modelling

# 1. Rows to predict: "TestSetId" is present while "Like" is missing

reviews_to_predict = reviews.loc[reviews['TestSetId'].notna() & reviews['Like'].isna()]

# 2. Rows to use: "TestSetId" is missing

reviews_to_use = reviews.loc[reviews['TestSetId'].isna()]

In [None]:
# Display the review rows that can be used for modelling
reviews_to_use

#### Remove unnecessary columns

In [None]:
# "Rating" is either missing or 2.0 and therefore carries no information;
# "TestSetId" only holds NA values in this subset. Both columns are removed.

reviews_to_use = reviews_to_use.drop(columns=['Rating', 'TestSetId'])

#### Change type of column Like to binary

In [None]:
# Turn "Like" into a binary integer column: true becomes 1, false becomes 0
# (assumes "Like" has no missing values in this subset - TODO confirm)

reviews_to_use['Like'] = reviews_to_use['Like'].astype(int)

In [None]:
# Display the usable reviews after the "Like" column was converted
reviews_to_use

### requests.csv

#### Overview

In [None]:
# Get overview of the requests dataset: first rows, column types and the
# number of missing values per column.

print(requests.head())
# DataFrame.info() writes its report to stdout itself and returns None,
# so it is called directly; wrapping it in print() adds a stray "None".
requests.info()
print(requests.isnull().sum())

# --> No missing values

#### Change types of columns with flags

In [None]:
# Cast the calorie flag to integers
requests['HighCalories'] = requests['HighCalories'].astype(int)

# Encode the two text flags as 0/1. Series.map turns every label that is
# not a key of the mapping into NaN.
flag_encodings = {
    # yes = 1 and indifferent = 0
    'HighProtein': {'Yes': 1, 'Indifferent': 0},
    # '0' (described as "no") = 1 and indifferent = 0
    'LowSugar': {'0': 1, 'Indifferent': 0},
}
for flag, encoding in flag_encodings.items():
    requests[flag] = requests[flag].map(encoding)

#### Round time column

In [None]:
# Round "Time" to whole numbers and store the result as integers

requests['Time'] = requests['Time'].round(0).astype(int)

### Merge datasets

In [None]:
# show the differences in columns "AuthorId / RecipeId" in reviews and requests dataset

#print(set(requests['AuthorId']) - set(reviews_to_use['AuthorId']))
#print(set(requests['RecipeId']) - set(reviews_to_use['RecipeId']))

# -> request has the exact same keys as reviews 
# -> merge on reviews_to_use where matches occur

#### Join requests and reviews_to_use

In [None]:
# Inner join of requests with the usable reviews; a row is kept only when
# its (AuthorId, RecipeId) pair occurs in both tables

requests_reviews = requests.merge(reviews_to_use, how='inner', on=['AuthorId', 'RecipeId'])

#### Join requests_reviews and diet

In [None]:
# Inner join with the diet table on AuthorId; rows whose author is not
# present in diet are dropped

requests_reviews_diet = requests_reviews.merge(diet, how='inner', on='AuthorId')

#### Join requests_reviews_diet and recipes

In [None]:
# Inner join with the recipes table on RecipeId; rows whose recipe is not
# present in recipes are dropped

requests_reviews_diet_recipes = requests_reviews_diet.merge(recipes, how='inner', on='RecipeId')

In [None]:
# Display the fully merged modelling table
requests_reviews_diet_recipes

### Logistic regression

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import balanced_accuracy_score

# Split data into train and test set.
# Features are all merged columns except the two join keys and the target.

#X = requests_reviews_diet_recipes.drop(['AuthorId', 'RecipeId', 'HighCalories', 'HighProtein', 'LowSugar', 'Like'], axis=1)
X = requests_reviews_diet_recipes.drop(['AuthorId', 'RecipeId', 'Like'], axis=1)
y = requests_reviews_diet_recipes['Like']

# stratify=y keeps the proportion of Like = 0 / Like = 1 the same in the
# train and the test partition. The model is scored with balanced accuracy
# below, so an unstratified split could distort the per-class recall when
# one class is rare (class imbalance is assumed here - TODO confirm).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=seed, stratify=y
)

# Fit logistic regression model; max_iter is raised to 1000 to give the
# solver more iterations to converge.

logmodel = LogisticRegression(max_iter=1000)
logmodel.fit(X_train, y_train)

# Make predictions on the held-out 30 %

predictions = logmodel.predict(X_test)

# Evaluate model

print(confusion_matrix(y_test, predictions))
print(classification_report(y_test, predictions))

# Calculate balanced accuracy

print(balanced_accuracy_score(y_test, predictions))