In [1]:
import os

import pandas as pd
from sklearn.ensemble import AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

# Load the Instacart CSV into `df`, the frame the rest of the notebook reads from.
DATA_PATH = '/instacart.csv'
df = pd.read_csv(DATA_PATH)

In [2]:
from sklearn.impute import SimpleImputer

# Data Preparation
# Six raw columns are used as model inputs; `reordered` is the target.
feature_cols = [
    'user_id',
    'product_id',
    'order_number',
    'order_dow',
    'order_hour_of_day',
    'days_since_prior_order',
]
X = df[feature_cols]
y = df['reordered']

# Train-Test Split: hold out 20% of the rows, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Learn the per-column medians from the training rows only, so the test rows
# do not influence the fill values.
imputer = SimpleImputer(strategy="median").fit(X_train)

# Apply the same fitted imputer to both partitions. `transform` returns a bare
# array, so each result is wrapped back into a frame with the feature names
# (this gives the imputed frames a fresh 0..n-1 index).
X_train_imputed, X_test_imputed = (
    pd.DataFrame(imputer.transform(partition), columns=X.columns)
    for partition in (X_train, X_test)
)

# Model Training
# Fit each candidate classifier on the imputed training frame, score it by
# accuracy on the imputed test frame, and keep the best-scoring one in
# `best_model` (with its score in `best_accuracy`).
#
# The train/test split is seeded with 42, so the estimators that accept a seed
# get the same one; otherwise the tree-based models can fit differently from
# run to run and the selected model may change between executions.
models = {
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'Logistic Regression': LogisticRegression(),
    'AdaBoost': AdaBoostClassifier(random_state=42)
}

# Start below any achievable accuracy so the first model is always accepted and
# `best_model` cannot be left as None when `models` is non-empty.
best_accuracy = float('-inf')
best_model = None

for name, model in models.items():
    model.fit(X_train_imputed, y_train)
    y_pred = model.predict(X_test_imputed)
    acc = accuracy_score(y_test, y_pred)
    print(f"Accuracy for {name}: {acc}")

    # Strict comparison: on a tie the earlier model in `models` is kept.
    if acc > best_accuracy:
        best_accuracy = acc
        best_model = model

# Recommendations and Best Time for All Users
#
# For each user: take the rows the selected model predicts as reordered
# (prediction == 1), pick the up-to-three most frequent product_ids among them,
# and for each such product record the most common order_dow and
# order_hour_of_day across all of that user's rows for the product.
# Each output row in `results` is [user, product, best_dow, best_hour].
feature_columns = ['user_id', 'product_id', 'order_number', 'order_dow', 'order_hour_of_day', 'days_since_prior_order']

# Not needed by the loop below; kept in case a later cell refers to `all_users`.
all_users = df['user_id'].unique()
results = []

# Impute and score the whole frame in one pass instead of once per user.
# The fitted imputer fills with statistics learned at fit time and the model
# scores each row on its own, so batching does not change any prediction.
all_X_imputed = pd.DataFrame(imputer.transform(df[feature_columns]), columns=feature_columns)
predicted_reorder = best_model.predict(all_X_imputed) == 1

# Carry only the columns the loop reads, plus the prediction flag. The flag is
# a plain array, so `assign` attaches it row-by-row in positional order.
scored = df[['user_id', 'product_id', 'order_dow', 'order_hour_of_day']].assign(
    predicted_reorder=predicted_reorder
)

# One grouping pass replaces filtering the full frame once per user.
# sort=False yields users in order of first appearance, which is the same order
# df['user_id'].unique() produces. Rows with a missing user_id are skipped.
for user, user_data in scored.groupby('user_id', sort=False):
    # Top 3 Products
    recommended_products = user_data[user_data['predicted_reorder']]
    top_3_products = recommended_products['product_id'].value_counts().index[:3]

    for product in top_3_products:
        product_data = user_data[user_data['product_id'] == product]

        # Best Time and Day of Week
        # mode() may return several values on a tie; [0] takes the first one.
        best_dow = product_data['order_dow'].mode()[0]
        best_hour = product_data['order_hour_of_day'].mode()[0]

        results.append([user, product, best_dow, best_hour])

# Save to CSV
# Turn the collected [user, product, best_dow, best_hour] rows into a frame.
top_3_rec_df = pd.DataFrame(results, columns=['user_id', 'recommended_product_id', 'best_dow', 'best_hour'])

# Writing to a specific path with a specific file name
output_path = 'path'
output_filename = 'top_3_rec_time_week.csv'
# Join with the platform path separator. Plain string concatenation would fuse
# the directory and file name into 'pathtop_3_rec_time_week.csv'.
full_output_path = os.path.join(output_path, output_filename)

# Make sure the target directory exists before writing; an empty output_path
# means "current directory" and needs no creation.
if output_path:
    os.makedirs(output_path, exist_ok=True)

top_3_rec_df.to_csv(full_output_path, index=False)



Accuracy for Decision Tree: 0.6407522255907976
Accuracy for Logistic Regression: 0.6873860666351066
Accuracy for AdaBoost: 0.7018783167268446
