In [1]:
import os

import pandas as pd
from sklearn.ensemble import AdaBoostClassifier
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

# Load the order data from a CSV file resolved relative to the current working
# directory. Later code in this notebook reads the columns 'user_id',
# 'product_id', 'order_number', 'order_dow', 'order_hour_of_day',
# 'days_since_prior_order' and 'reordered' from this frame.
df = pd.read_csv('instacart.csv')



In [4]:
# Data Preparation
# `df` is the DataFrame loaded from 'instacart.csv' at the top of the notebook.
X = df[['user_id', 'product_id', 'order_number', 'order_dow', 'order_hour_of_day', 'days_since_prior_order']]
y = df['reordered']

# Train-Test Split
# Split BEFORE fitting the imputer: the medians must be learned from the
# training rows only, otherwise statistics of the held-out rows leak into the
# features the models are trained and evaluated on.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Impute NaN values (median per column, fitted on the training split only)
imputer = SimpleImputer(strategy="median")
X_train = pd.DataFrame(imputer.fit_transform(X_train_raw), columns=X.columns, index=X_train_raw.index)
X_test = pd.DataFrame(imputer.transform(X_test_raw), columns=X.columns, index=X_test_raw.index)

# Fully imputed feature frame, kept under its original name for any code that
# still refers to it. It is produced with the train-fitted imputer above.
X_imputed = pd.DataFrame(imputer.transform(X), columns=X.columns)

# Model Training
# Seed the estimators that use randomness so that a Restart & Run All
# reproduces the same accuracies and therefore the same model selection.
# LogisticRegression is left at its defaults.
RANDOM_STATE = 42

models = {
    'Decision Tree': DecisionTreeClassifier(random_state=RANDOM_STATE),
    'Logistic Regression': LogisticRegression(),
    'AdaBoost': AdaBoostClassifier(random_state=RANDOM_STATE)
}

best_accuracy = 0
best_model = None

for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    print(f"Accuracy for {name}: {acc}")

    # Keep the first model with the highest accuracy. The `best_model is None`
    # guard guarantees a model is selected even if every accuracy is 0.0, so
    # later code never calls .predict on None.
    if best_model is None or acc > best_accuracy:
        best_accuracy = acc
        best_model = model

# Smart Cart Optimization for All Users
# For every user: keep the rows the selected model predicts as reordered, take
# the most frequent day-of-week and hour among them, and build a cart from the
# 10 most frequent product ids.
feature_columns = ['user_id', 'product_id', 'order_number', 'order_dow', 'order_hour_of_day', 'days_since_prior_order']

# Impute and predict ONCE for the whole frame instead of once per user. The
# per-row results are the same, but the frame is no longer rescanned and
# re-predicted for every user.
features_imputed = pd.DataFrame(imputer.transform(df[feature_columns]), columns=feature_columns)
reorder_mask = best_model.predict(features_imputed) == 1

# Rows predicted as reordered, split by user in a single pass. The boolean
# array is applied positionally, so this does not depend on df's index labels.
recommended_by_user = {
    user: rows for user, rows in df[reorder_mask].groupby('user_id', sort=False)
}

all_users = df['user_id'].unique()
results = []

for user in all_users:
    recommended_products = recommended_by_user.get(user)

    # Users with no predicted reorders are absent from the mapping.
    if recommended_products is None or recommended_products.empty:
        results.append([user, None, None, None])
        continue

    # Best Time and Day of Week to Shop
    # mode() is empty when the column holds only NaN, hence the guard.
    dow_modes = recommended_products['order_dow'].mode()
    hour_modes = recommended_products['order_hour_of_day'].mode()
    best_dow = dow_modes.iloc[0] if not dow_modes.empty else None
    best_hour = hour_modes.iloc[0] if not hour_modes.empty else None

    # Creating a shopping cart with top 10 items
    top_10_products = recommended_products['product_id'].value_counts().index[:10]
    cart = top_10_products.tolist()

    results.append([user, best_dow, best_hour, ','.join(map(str, cart))])

# Save to CSV
smart_cart_df = pd.DataFrame(results, columns=['user_id', 'best_dow', 'best_hour', 'shopping_cart'])

# Writing to a specific path with a specific file name
output_path = 'path'  # output directory; replace with the real destination
output_filename = 'smart_cart.csv'

# Join with the platform path separator. Plain string concatenation produced
# 'pathsmart_cart.csv' (directory and file name fused together).
full_output_path = os.path.join(output_path, output_filename)

# Create the destination directory if it does not exist yet; an empty
# output_path means "current directory" and needs no creation.
if output_path:
    os.makedirs(output_path, exist_ok=True)

smart_cart_df.to_csv(full_output_path, index=False)


Accuracy for Decision Tree: 0.6775
Accuracy for Logistic Regression: 0.6575
Accuracy for AdaBoost: 0.7025
