In [1]:
import pandas as pd

In [2]:
wastageDF = pd.read_csv('../data/food_wastage_data.csv')

In [3]:
wastageDF.head(n=10)

Unnamed: 0,Type of Food,Number of Guests,Event Type,Quantity of Food,Storage Conditions,Purchase History,Seasonality,Preparation Method,Geographical Location,Pricing,Wastage Food Amount
0,Meat,310,Corporate,450,Refrigerated,Regular,All Seasons,Buffet,Urban,Low,25
1,Meat,400,Birthday,500,Room Temperature,Regular,Winter,Buffet,Suburban,High,40
2,Vegetables,302,Birthday,371,Refrigerated,Regular,Summer,Buffet,Suburban,Low,27
3,Meat,491,Birthday,497,Refrigerated,Regular,All Seasons,Finger Food,Rural,High,32
4,Meat,300,Corporate,400,Refrigerated,Regular,Winter,Finger Food,Urban,Moderate,25
5,Meat,302,Birthday,371,Refrigerated,Regular,Summer,Buffet,Suburban,Low,27
6,Meat,240,Wedding,450,Refrigerated,Regular,Winter,Sit-down Dinner,Urban,Low,20
7,Meat,300,Wedding,400,Refrigerated,Regular,Summer,Finger Food,Rural,Moderate,25
8,Fruits,320,Corporate,400,Refrigerated,Regular,All Seasons,Buffet,Urban,High,45
9,Meat,400,Corporate,500,Refrigerated,Occasional,Winter,Finger Food,Urban,High,40


In [15]:
wastageDF['Seasonality'].unique()

array([0, 2, 1])

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder

In [5]:
wastageDF.isnull().sum()

Type of Food             0
Number of Guests         0
Event Type               0
Quantity of Food         0
Storage Conditions       0
Purchase History         0
Seasonality              0
Preparation Method       0
Geographical Location    0
Pricing                  0
Wastage Food Amount      0
dtype: int64

In [8]:
# Encode categorical variables
label_encoders = {}
categorical_columns = ['Type of Food', 'Event Type', 'Storage Conditions', 'Purchase History', 'Seasonality', 'Preparation Method', 'Geographical Location', 'Pricing']

In [9]:
for col in categorical_columns:
    le = LabelEncoder()
    wastageDF[col] = le.fit_transform(wastageDF[col])
    label_encoders[col] = le

In [13]:
features = ['Type of Food', 'Number of Guests', 'Event Type', 'Quantity of Food', 'Storage Conditions', 'Purchase History', 'Seasonality', 'Preparation Method', 'Geographical Location', 'Pricing']

In [14]:
# Define features and target
target = 'Wastage Food Amount'

In [12]:
X = wastageDF[features]
y = wastageDF[target]

In [27]:
# Normalize numerical features
scaler = StandardScaler()
X[['Number of Guests', 'Quantity of Food']] = scaler.fit_transform(X[['Number of Guests', 'Quantity of Food']])

# Display the preprocessed data
X.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[['Number of Guests', 'Quantity of Food']] = scaler.fit_transform(X[['Number of Guests', 'Quantity of Food']])


Unnamed: 0,Type of Food,Number of Guests,Event Type,Quantity of Food,Storage Conditions,Purchase History,Seasonality,Preparation Method,Geographical Location,Pricing
0,3,-0.115096,1,0.596356,0,1,0,0,2,1
1,3,1.21213,0,1.363388,1,1,2,0,1,0
2,4,-0.233071,0,-0.615553,0,1,1,0,1,1
3,3,2.554103,0,1.317366,0,1,0,1,0,0
4,3,-0.262565,1,-0.170675,0,1,2,1,2,2


In [28]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [29]:
from sklearn.ensemble import RandomForestRegressor

# Initialize the model
model = RandomForestRegressor(n_estimators=100, random_state=42)

In [30]:
# Train the model
model.fit(X_train, y_train)

In [31]:
from sklearn.metrics import mean_squared_error, r2_score

# Make predictions on the testing data
y_pred = model.predict(X_test)

# Calculate performance metrics
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'Mean Squared Error: {mse}')
print(f'R-squared: {r2}')

Mean Squared Error: 7.44385490584351
R-squared: 0.9281881783872926


In [32]:
# Example new data
new_data = pd.DataFrame({
    'Type of Food': [label_encoders['Type of Food'].transform(['Meat'])],  # Replace with actual type of food
    'Number of Guests': [300],  # Replace with actual number of guests
    'Event Type': [label_encoders['Event Type'].transform(['Corporate'])],  # Replace with actual event type
    'Quantity of Food': [400],  # Replace with actual quantity of food
    'Storage Conditions': [label_encoders['Storage Conditions'].transform(['Refrigerated'])],  # Replace with actual storage condition
    'Purchase History': [label_encoders['Purchase History'].transform(['Regular'])],  # Replace with actual purchase history
    'Seasonality': [label_encoders['Seasonality'].transform(['Winter'])],  # Replace with actual seasonality
    'Preparation Method': [label_encoders['Preparation Method'].transform(['Buffet'])],  # Replace with actual preparation method
    'Geographical Location': [label_encoders['Geographical Location'].transform(['Urban'])],  # Replace with actual geographical location
    'Pricing': [label_encoders['Pricing'].transform(['Low'])]  # Replace with actual pricing
})

# Normalize new data
new_data[['Number of Guests', 'Quantity of Food']] = scaler.transform(new_data[['Number of Guests', 'Quantity of Food']])

# Make predictions
new_predictions = model.predict(new_data)

print(f'Predicted Wastage Food Amount: {new_predictions}')

Predicted Wastage Food Amount: [37.2]


In [34]:
import pickle

# Save the model to a pickle file
with open('../api/model/wastage_food_model.pkl', 'wb') as file:
    pickle.dump(model, file)