# Instacart Product Recommender
**Michael Feeley**  
**Metis Bootcamp - Project 4**

**===================================================================================================================**

# Preprocessing

**===================================================================================================================**

### -----------------------

## Load Data

### -----------------------

#### Import Modules

In [1]:
# EDA
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Packing data
import pickle

#### Unpack Data

In [4]:
# NOTE(review): `path` is not used by any cell visible here; kept in case a later cell reads it
path = "Projects"

# Every CSV of the dataset lives in the same directory, so name it once.
# pandas expands the leading "~" to the user's home directory.
data_dir = '~/data/instacart-market-basket-analysis'

# Aisles
df_aisles = pd.read_csv(f'{data_dir}/aisles.csv')

# Departments
df_departments = pd.read_csv(f'{data_dir}/departments.csv')

# Line items (products) of every customer's prior orders
df_prior = pd.read_csv(f'{data_dir}/order_products__prior.csv')

# Training set containing the most recent carts (next order to predict)
df_train = pd.read_csv(f'{data_dir}/order_products__train.csv')

# Order information
df_orders = pd.read_csv(f'{data_dir}/orders.csv')

# Products
df_products = pd.read_csv(f'{data_dir}/products.csv')

FileNotFoundError: [Errno 2] File b'/Users/mffeeley/data/instacart-market-basket-analysis/aisles.csv' does not exist: b'/Users/mffeeley/data/instacart-market-basket-analysis/aisles.csv'

#### Merge the order history with order details

In [None]:
# Order-level columns to attach to each line item; eval_set is left out of the join
order_info = df_orders.drop(columns = ['eval_set'])

# Attach the order information to the prior-order line items
df_prior = df_prior.merge(order_info, on = 'order_id')

# Attach the same order information to the training line items
df_train = df_train.merge(order_info, on = 'order_id')

#### Subset train

In [None]:
# Shrink the training set: keep only rows whose user_id is at most 5000
keep_user = df_train['user_id'] <= 5000
df_train = df_train[keep_user]

#### Subset prior

In [None]:
# Restrict the order history to the users that remain in df_train
train_user_ids = df_train['user_id'].unique()
df_prior = df_prior[df_prior['user_id'].isin(train_user_ids)]

### -----------------------

## TARGET

### -----------------------

In [None]:
# Collapse the training rows to one row per user holding the set of
# product_ids found in that user's training order
carts_by_user = (df_train.groupby('user_id', as_index = False)
                 .agg({'product_id': lambda ids: set(ids)}))

# Name the aggregated column after what it now contains
current_cart = carts_by_user.rename(columns = {'product_id': 'current_cart'})

In [None]:
# Every product ordered for every user: one row per distinct (user_id, product_id) pair.
# drop_duplicates is used instead of groupby(...).size() because the shape and column
# name of the size() result differ between pandas versions, which made the old
# drop(columns = [0]) fail on newer releases.
# The sort reproduces the key-sorted row order the groupby used to give.
df_user_products = (df_prior[['user_id','product_id']]
                    .drop_duplicates()
                    .sort_values(['user_id','product_id'])
                    .reset_index(drop = True))

In [None]:
# Merge the user_product dataframe with the current cart for each user
df_ml = df_user_products.merge(current_cart, on = 'user_id')

# Create target variable (1 = product is in the user's current cart, 0 = it is not).
# A single pass over the two columns replaces the row-wise apply: it produces the same
# flags without building a Series per row, and still yields a Series when the merge is empty.
df_ml['TARGET'] = pd.Series(
    [int(prod in cart) for prod, cart in zip(df_ml['product_id'], df_ml['current_cart'])],
    index = df_ml.index, dtype = int)

# Drop the current cart
df_ml.drop(columns = ['current_cart'], inplace = True)

In [None]:
def reset_df(df_ml):
    '''
    Resets df_ml to user_id, product_id, and TARGET.

    The df_ml argument is not read; it is kept so existing calls keep working.
    The frame is rebuilt from the notebook-level df_user_products and
    current_cart dataframes.

    Returns a new dataframe with one row per (user_id, product_id) pair and an
    integer TARGET column: 1 when the product is in the user's current cart,
    0 otherwise.
    '''

    # Merge the user_product dataframe with the current cart for each user
    fresh = df_user_products.merge(current_cart, on = 'user_id')

    # Membership test done in one pass over the two columns instead of a
    # row-wise apply: same flags, no per-row Series, and it still produces a
    # Series when the merge is empty
    in_cart = [int(prod in cart)
               for prod, cart in zip(fresh['product_id'], fresh['current_cart'])]
    fresh['TARGET'] = pd.Series(in_cart, index = fresh.index, dtype = int)

    # Return the fresh df_ml without the helper column
    return fresh.drop(columns = ['current_cart'])

In [None]:
# Display the user_id / product_id / TARGET dataframe built above
df_ml

In [None]:
# Relative frequency of each TARGET class, drawn as a bar chart
target_classes = df_ml['TARGET'].value_counts(normalize = True)
target_classes.plot.bar();

In [None]:
# Create function to plot features for analysis
def plot_features(df, sample_size = 1000):
    '''
    Draws a seaborn pairplot of a random sample of df, coloured by TARGET.

    df:          dataframe containing user_id, product_id, TARGET and the
                 feature columns to plot.
    sample_size: maximum number of rows to plot (default 1000). When df has
                 fewer rows, all of them are plotted.
    '''

    # The id columns are dropped before plotting; only features and TARGET remain
    features = df.drop(['user_id','product_id'], axis = 1)

    # Honour sample_size (it used to be ignored in favour of a hard-coded 1000)
    # and cap it at the number of rows, since sample() raises when asked for
    # more rows than exist without replacement
    n_rows = min(sample_size, len(features))
    sample = features.sample(n_rows, random_state = 44)

    # Pairplot to display any possible class differentiation strength for the feature
    sns.pairplot(sample, hue='TARGET', plot_kws = dict(alpha = .3, edgecolor = 'none'))

### -----------------------

## Feature Engineering

### -----------------------

### User Features:
* **Total Orders**
* **Total Products**
* **Average Cartsize**
* **Average Days Between Orders**

In [None]:
# Output column names, listed in the same order as the aggregations below
user_feature_labels = ['user_total_prods','user_avg_days_btwn', 'user_total_orders', 'user_avg_cartsize']

# Per-user aggregations over the order history:
#   distinct products, mean days_since_prior_order, distinct orders,
#   and rows per distinct order (average cart size)
# NOTE(review): the mean is taken over df_prior rows, so an order's gap is
# counted once for every row that order contributes -- confirm this is intended
user_aggregations = {'product_id': 'nunique',
                     'days_since_prior_order': 'mean',
                     'order_id': ['nunique', lambda ids: ids.shape[0] / ids.nunique()]}

# Generate the user features
df_user_features = df_prior.groupby('user_id', as_index = False).agg(user_aggregations)

# Replace the aggregated column names with the flat labels
df_user_features.columns = ['user_id'] + user_feature_labels

# Add the user features to the machine learning dataframe
df_ml = df_ml.merge(df_user_features, on = 'user_id')

# Display
df_ml.head(3)

### Product Features

In [None]:
# Output column names, listed in the same order as the aggregations below
prod_feature_labels = ['prod_total_orders','prod_avg_cart_pos','prod_avg_hour','prod_avg_day']

# Per-product aggregations over the order history:
#   row count, mean add-to-cart position, mean hour of day, mean day of week
prod_aggregations = {'order_id': 'count',
                     'add_to_cart_order': 'mean',
                     'order_hour_of_day': 'mean',
                     'order_dow': 'mean'}

# Generate the product features
df_prod_features = df_prior.groupby('product_id', as_index = False).agg(prod_aggregations)

# Replace the aggregated column names with the labels
df_prod_features.columns = ['product_id'] + prod_feature_labels

# Add the product features to the machine learning dataframe
df_ml = df_ml.merge(df_prod_features, on = 'product_id')

### User-Product Features:
* **Total Orders**
* **Average Day**
* **Average Time**
* **Average Cart Position**
* **(...)**

In [None]:
# Output column names, listed in the same order as the aggregations below
user_prod_feature_labels = ['user_prod_total_orders','user_prod_avg_day','user_prod_avg_time','user_prod_avg_cart_position']

# Aggregations per (user, product) pair over the order history:
#   row count, mean day of week, mean hour of day, mean add-to-cart position
user_prod_aggregations = {'order_id': 'count',
                          'order_dow': 'mean',
                          'order_hour_of_day': 'mean',
                          'add_to_cart_order': 'mean'}

# Generate the user-product features
user_prod_groups = df_prior.groupby(['user_id','product_id'], as_index = False)
df_user_prod_features = user_prod_groups.agg(user_prod_aggregations)

# Replace the aggregated column names with the labels
df_user_prod_features.columns = ['user_id','product_id'] + user_prod_feature_labels

# Add the user-product features to the machine learning dataframe
df_ml = df_ml.merge(df_user_prod_features, on = ['user_id','product_id'])

* **(...)**
* **Order Since Last Ordered**
* **(...)**

In [None]:
# Name of the column that will hold each order's list of products
cart_info_features = ['cart']

# One row per (user, order_number) holding the list of product_ids in that order
orders_by_user = df_prior.groupby(['user_id','order_number'], as_index = False)
df_cart_info = orders_by_user.agg({'product_id': lambda ids: list(ids)})

# Rename the aggregated product_id column to 'cart'
df_cart_info.columns = ['user_id','order_number'] + cart_info_features

# Every cart for each user in sequential order
df_cart_info.head()

In [None]:
# Pair each of a user's carts with every product that user has ever ordered
# (join on user_id, so each user contributes carts x products rows)
df_cart_info = pd.merge(df_cart_info, df_user_products, on = 'user_id')
df_cart_info.head()

In [None]:
# Product_in_order indicator: 1 when the row's product_id is in the row's cart, else 0.
# Computed in one pass over the two columns instead of a row-wise apply: same flags,
# no per-row Series, and it still yields a Series when the frame is empty.
df_cart_info['prod_in_order'] = pd.Series(
    [int(prod in cart) for prod, cart in zip(df_cart_info['product_id'], df_cart_info['cart'])],
    index = df_cart_info.index, dtype = int)
df_cart_info.head()

In [None]:
# For every (user, product) pair, collect the 1/0 presence flags of the
# user's orders into one list.
# NOTE(review): the list follows the current row order of df_cart_info, which
# is presumably still order_number order from the earlier groupby -- confirm
flags_by_pair = df_cart_info.groupby(['user_id','product_id'], as_index = False)
df_cart_info = flags_by_pair.agg({'prod_in_order': lambda flags: list(flags)})

df_cart_info.head()

In [None]:
# How many orders have passed since the user last ordered the product:
# the position of the first 1 when the flag list is read back to front
# (0 means the product is in the last entry of the list)
df_cart_info['orders_since'] = (df_cart_info['prod_in_order']
                                .apply(lambda flags: list(reversed(flags)).index(1)))
df_cart_info.head()

In [None]:
# Only orders_since is wanted as a feature, so the flag lists are left behind
orders_since_feature = df_cart_info.drop(columns = ['prod_in_order'])

# Add the orders_since user_product feature to the dataframe
df_ml = df_ml.merge(orders_since_feature, on = ['user_id','product_id'])

df_ml.head()

* **(...)**
* **Order Frequency**

In [None]:
# Order frequency: the user's order count for the product divided by the user's total order count
df_ml['user_prod_order_freq'] = df_ml['user_prod_total_orders'].div(df_ml['user_total_orders'])

In [None]:
# Preview the first rows of the finished feature dataframe
df_ml.head()

### Packing the Data

In [None]:
# Serialize the finished machine learning dataframe to disk.
# The file is opened in binary write mode, as pickle requires.
with open("pickles/df_ml", "wb") as pickle_out:
    pickle.dump(df_ml, pickle_out)