In [33]:
from pathlib import Path

import pandas as pd
import xgboost as xgb
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split


# Load the order data, discard rows that lack either target or the department,
# and remove the columns that are excluded from the feature set.
df = (
    pd.read_csv('/instacart.csv')
    .dropna(subset=['order_dow', 'order_hour_of_day', 'department_id'])
    .drop(
        columns=[
            'order_id', 'user_id', 'product_id', 'product_name',
            'eval_set', 'department', 'aisle', 'aisle_id',
        ]
    )
)

# Targets are the two ordering-time columns; every other remaining column is a feature.
y = df[['order_dow', 'order_hour_of_day']]
X = df.drop(columns=['order_dow', 'order_hour_of_day'])

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Mean-impute missing feature values. The per-column means are learned from the
# training split only, then applied to the training split, the test split, and
# the complete feature table (X_imputed is used later for full-dataset predictions).
imputer = SimpleImputer(strategy='mean')
X_train = imputer.fit_transform(X_train)
X_test, X_imputed = (imputer.transform(features) for features in (X_test, X))

# Fit three candidate regressors on the imputed training split and predict the
# held-out split. y_train has two columns (order_dow, order_hour_of_day), so
# every model produces two outputs per row.
#
# The tree-based models are seeded explicitly: without a fixed random_state the
# random forest gives different scores on every run, which can also flip the
# "best model" choice made from those scores.

# Linear Regression (deterministic, no seed needed)
lr = LinearRegression()
lr.fit(X_train, y_train)
lr_pred = lr.predict(X_test)

# Random Forest
rf = RandomForestRegressor(random_state=42)
rf.fit(X_train, y_train)
rf_pred = rf.predict(X_test)

# XGBoost
xg_reg = xgb.XGBRegressor(random_state=42)
xg_reg.fit(X_train, y_train)
xg_pred = xg_reg.predict(X_test)

# Score every model on the held-out split. multioutput='uniform_average'
# collapses the per-target errors into one averaged MSE per model.
lr_mse, rf_mse, xg_mse = [
    mean_squared_error(y_test, model_pred, multioutput='uniform_average')
    for model_pred in (lr_pred, rf_pred, xg_pred)
]

# One line of output per model, in the same order as the models were fit.
print(
    f'Linear Regression MSE: {lr_mse}',
    f'Random Forest MSE: {rf_mse}',
    f'XGBoost MSE: {xg_mse}',
    sep='\n',
)

# Keep the model with the lowest averaged MSE. min() returns the first of
# equally-scored candidates, so ties resolve in the listed order:
# linear regression, then random forest, then XGBoost.
best_model_mse, best_model = min(
    ((lr_mse, lr), (rf_mse, rf), (xg_mse, xg_reg)),
    key=lambda candidate: candidate[0],
)

# Run the best model on every row of the dataset.
# X_imputed is the mean-imputed version of the full feature table X, so no rows
# have to be dropped before predicting. X and y are deliberately left untouched:
# dropping NaN rows from them here had no effect on the predictions and only
# left X / y with fewer rows than X_imputed and df.
predictions = best_model.predict(X_imputed)

# The predictions are attached to df column-wise later on, so they must line up
# with it row for row.
assert len(predictions) == len(df), "predictions are not row-aligned with df"

Linear Regression MSE: 10.77372270663572
Random Forest MSE: 13.48993661807772
XGBoost MSE: 14.073575342400153


In [34]:
# Attach the per-row predictions to df. Column 0 / column 1 of `predictions`
# follow the target column order used for fitting: order_dow, order_hour_of_day.
df['predicted_order_dow'] = predictions[:, 0]
df['predicted_order_hour_of_day'] = predictions[:, 1]

# For each department keep the most frequent predicted day and hour
# (value_counts() sorts by frequency, so index[0] is the most common value).
# NOTE(review): the predictions are raw regressor outputs, so they are not
# necessarily whole days/hours and "most frequent" means the most frequent
# exact float. Consider rounding first if integer time slots are wanted.
best_time_df = (
    df.groupby('department_id')[['predicted_order_dow', 'predicted_order_hour_of_day']]
    .agg(lambda x: x.value_counts().index[0])
    .reset_index()
)

# Write the result to <output_path>/<output_filename>.
# The directory and file name are joined as path components; plain string
# concatenation would glue them together without a separator and write
# 'pathbest_time_to_shop_by_department.csv' instead.
output_path = 'path'  # looks like a placeholder directory — TODO confirm the real location
output_filename = 'best_time_to_shop_by_department.csv'
output_dir = Path(output_path)
output_dir.mkdir(parents=True, exist_ok=True)  # make sure the target directory exists
full_output_path = output_dir / output_filename

best_time_df.to_csv(full_output_path, index=False)
