# Exploratory Data Analysis and Machine Learning Model for Starbucks Nutritional Data
This Jupyter notebook presents an exploratory data analysis (EDA) and a machine learning model for predicting the calorie content of Starbucks menu items. The data used in this notebook is a CSV file named 'starbucks.csv'.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Load the Starbucks nutrition table from the Kaggle input directory.
csv_path = '/kaggle/input/starbucks-nutrition/starbucks.csv'
data = pd.read_csv(csv_path)

In [None]:
# First look at the raw table, printed in order:
#   1. the first few rows,
#   2. summary statistics,
#   3. the count of missing values per column.
for report in (data.head(), data.describe(), data.isnull().sum()):
    print(report)

# Data Visualization


In [None]:
# Histogram of the calorie column (30 bins) on an explicitly created axes.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(data['calories'], bins=30, color='green', ax=ax)
ax.set_title('Distribution of Calories')
ax.set_xlabel('Calories')
ax.set_ylabel('Frequency')
plt.show()

In [None]:
# One box per menu-item type, showing the spread of calories within each type.
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(x='type', y='calories', data=data, ax=ax)
ax.set_title('Calories by Type')
ax.set_xlabel('Type')
ax.set_ylabel('Calories')
plt.show()


In [None]:
# Pairwise correlations between the numeric nutrition columns.
#
# `data` still contains the `item` column at this point (it is only dropped in
# the modelling cell further down). `item` is presumably free text, and
# DataFrame.corr() raises on non-numeric columns in pandas >= 2.0, so the frame
# is restricted to numeric dtypes rather than relying on a hand-written list of
# every non-numeric column.
columns_to_exclude = ["Unnamed: 0", "type"]
data_subset = (
    data
    .drop(columns=columns_to_exclude, errors="ignore")  # tolerate a CSV saved without the index column
    .select_dtypes(include="number")
)
corr = data_subset.corr()

# Correlation Matrix
plt.figure(figsize=(10, 6))
sns.heatmap(corr, annot=True, cmap='Greens')
plt.title('Correlation Matrix')
plt.show()


# Machine Learning
The data is split into a training set (80%) and a testing set (20%). A machine learning model is defined using scikit-learn's `Pipeline`. It consists of a preprocessing step (standard scaling of `fat`, `carb`, `fiber`, and `protein`, plus one-hot encoding of `type`) followed by a random forest regressor.

The model is trained on the training data and used to make predictions on the testing data.

## Evaluation metrics: MSE, RMSE, and MAE (the mean absolute error, in calories)

In [None]:
# Train a random forest to predict `calories` and report hold-out error metrics.

# One seed shared by the train/test split and the forest. The forest was
# previously unseeded, so the printed metrics changed on every run.
RANDOM_STATE = 123

# Remove unnecessary columns. errors='ignore' keeps the cell re-runnable: on a
# second run `item` is already gone and a plain drop would raise KeyError.
data = data.drop(columns=['item'], errors='ignore')

# Define preprocessing pipeline: scale the numeric nutrition columns and
# one-hot encode `type`. Columns not listed here (e.g. "Unnamed: 0") are
# discarded, because ColumnTransformer defaults to remainder='drop'.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['fat', 'carb', 'fiber', 'protein']),
        # handle_unknown='ignore': a type seen only in the test split encodes as all zeros
        ('cat', OneHotEncoder(handle_unknown='ignore'), ['type'])
    ])

# Single 80/20 hold-out split. Per the author's note, other validation schemes
# were tried and this one gave the best result given the small sample size.
X = data.drop('calories', axis=1)
y = data['calories']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE
)

# Define the full model: preprocessing followed by the regressor.
model = Pipeline(steps=[('preprocessor', preprocessor),
                        ('regressor', RandomForestRegressor(random_state=RANDOM_STATE))])

# Train+prediction
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Calculate MSE, RMSE, MAE on the held-out test set.
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse}")

# RMSE is in the same units as the target (calories), unlike MSE.
rmse = np.sqrt(mse)
print(f"Root Mean Squared Error: {rmse}")

mae = mean_absolute_error(y_test, y_pred)
print(f"Mean Absolute Error: {mae}")