# Feature Engineering

This notebook builds the features for the model

In [1]:
from constants import *
import pandas as pd
import numpy as np
import time

In [2]:
aisles = pd.read_csv(AISLES)
departments = pd.read_csv(DEPARTMENTS)
order_products_prior = pd.read_csv(ORDER_PRODUCTS_PRIOR)
order_products_train = pd.read_csv(ORDER_PRODUCTS_TRAIN)
orders = pd.read_csv(ORDERS)
products = pd.read_csv(PRODUCTS)

In [3]:
def collapseNames(c):
    '''Flattens multi-index'''
    return map(lambda x: "_".join([x] if type(x) == str else x).replace("<lambda>", "").strip("_"), c)

In [4]:
def buildFeatures(or_target, or_past, pr, op_past, op_train, verbose = False):
    '''Build feature matrix'''
    
    # get triplets (user_id, order_id, product_id) for all the candidates
    # in the target sets
    if verbose:
        start = time.time()
        print "Retrieving candidates..."
    joined_op = (or_target.join(or_past.set_index("user_id"), 
                                on = "user_id", rsuffix = "_past")
                 .join(op_past.set_index("order_id"), 
                       on = "order_id_past")
                 [["user_id", "order_id", "product_id", "eval_set"]]
                 .set_index("user_id")
                 .drop_duplicates()
                 .reset_index())
    
    # compute the response -- test response is irrelevant so those will
    # be padded with zeros for simplicity
    if verbose:
        print "Generating response..."
    y = (joined_op.join(op_train.set_index(["order_id", "product_id"]), 
                        on = ["order_id", "product_id"], how = "left")
         .rename(columns = {"reordered": "ordered"}))
    y["ordered"][y.ordered.isnull()] = 0
    
    if verbose:
        print "Beginning to build feature set..."
    
    # user order features
    if verbose:
        print "--->Getting user features"
    user_features = (or_past.groupby("user_id")
                     .agg({
                         "order_number": "max",
                         "days_since_prior_order": "mean"
                     })
                     .rename(columns = {
                         "order_number": "u_total_orders",
                         "days_since_prior_order": "u_days_since_prior_order"
                     }))
    user_features.columns = collapseNames(user_features.columns)
    
    # user products in past orders
    user_product_joined = (or_past.join(op_past.set_index("order_id"), 
                                        on = "order_id", how = "inner"))    
    
    # user reorder history by order features
    user_generic_op_features = (user_product_joined.groupby(["order_id", "user_id"])
                                .agg({
                                    "product_id": "count",
                                    "reordered": "mean"
                                })
                                .reset_index()
                                .groupby("user_id")
                                .agg({
                                    "product_id": "mean",
                                    "reordered": lambda x: np.sum(x)/(len(x) - 1) # reorder rate after first order
                                })
                                .rename(columns = {
                                    "product_id": "u_basket_size",
                                    "reordered": "u_reorder_rate_after_first_order"
                                }))
    
    # user overall product reorder history
    user_generic_pr_features = (user_product_joined.groupby("user_id")
                                .agg({
                                    "product_id": ["nunique", "count"],
                                    "reordered": "sum"
                                })
                                .rename(columns = {
                                    "product_id": "u_products",
                                    "reordered": "u_total_reorders"
                                }))
    user_generic_pr_features.columns = collapseNames(user_generic_pr_features.columns)
    
    # global product features
    if verbose:
        print "--->Getting product features"
    product_features = (op_past.groupby("product_id")
                        .agg({
                            "add_to_cart_order": "mean",
                            "reordered": ["sum", "mean"],
                            "order_id": "count"
                        })
                        .rename(columns = {
                            "add_to_cart_order": "pr_add_to_cart_order",
                            "reordered": "pr_reordered",
                            "order_id": "pr_order"
                        }))
    product_features.columns = collapseNames(product_features.columns)
    
    # order features
    if verbose:
        print "--->Getting order features"
    order_features = (or_target[["order_id", "days_since_prior_order"]]
                      .set_index("order_id")
                      .rename(columns = {
                          "days_since_prior_order": "o_days_since_prior_order"
                      }))
    
    ## user-product features
    if verbose:
        print "--->Getting user-product features"
    user_product_features = (user_product_joined.groupby(["user_id", "product_id"])
                             .agg({
                                 "reordered": "count",
                                 "add_to_cart_order": "mean",
                                 "order_number": ["min", "max"] # first and last order of product
                             })
                             .rename(columns = {
                                 "reordered": "up_reorder",
                                 "add_to_cart_order": "up_add_to_cart_order",
                                 "order_number": "up_order_number"
                             }))
    user_product_features.columns = collapseNames(user_product_features.columns)    
    
    ## custom features
    
    # Proportion of users who ordered product that reordered
    pr_reorder_prob = (user_product_features
                       .reset_index()
                       .assign(reordered_bool = 1*(user_product_features.up_reorder_count > 1).values)
                       .groupby("product_id")
                       .agg({
                           "reordered_bool": "mean",
                           "up_reorder_count": "mean"
                       })
                       .rename(columns = {
                           "reordered_bool": "pr_reorder_prob",
                           "up_reorder_count": "pr_reorder_times"
                       }))
    
    # TODO: days since last order
        
    ## join all features to build training set
    if verbose:
        print "--->Combining"
    feature_matrix = (joined_op.join(user_features, on = "user_id", how = "inner")
                      .join(user_generic_op_features, on = "user_id", how = "inner")
                      .join(user_generic_pr_features, on = "user_id", how = "inner")
                      .join(product_features, on = "product_id", how = "inner")
                      .join(order_features, on = "order_id", how = "inner")
                      .join(user_product_features, on = ["user_id", "product_id"], how = "inner")
                      .join(pr_reorder_prob, on = "product_id", how = "inner"))
    
    ## add feature transformations
    if verbose:
        print "--->Making transformations"
    
    #  number of orders since last order of product
    up_orders_since_last_order = feature_matrix.u_total_orders - feature_matrix.up_order_number_max
    # number of reorders of product divided by total number of orders
    up_reorder_rate = feature_matrix.up_reorder_count / feature_matrix.u_total_orders
    # reorder rate since first order of product
    up_reorder_rate_since_first = ((feature_matrix.up_reorder_count - 1) /
        (feature_matrix.u_total_orders - feature_matrix.up_order_number_min))
    
    feature_matrix = feature_matrix.assign(up_orders_since_last_order = up_orders_since_last_order,
                                           up_reorder_rate = up_reorder_rate,
                                           up_reorder_rate_since_first = up_reorder_rate_since_first)
    
    
    full = feature_matrix.join(y.set_index(["user_id", "order_id", "product_id"])[["ordered"]],
                               on = ["user_id", "order_id", "product_id"], how = "inner")
    
    if verbose:
        finish_time = time.time() - start
        print "Done in %s seconds" % str(finish_time)
    
    return full

In [5]:
# or_target = orders[(orders.eval_set == "train") | (orders.eval_set == "test")]
# or_past = orders[orders.eval_set == "prior"]
# pr = products
# op_past = order_products_prior
# op_train = order_products_train

In [6]:
mat = buildFeatures(orders[(orders.eval_set == "train") | (orders.eval_set == "test")],
                    orders[orders.eval_set == "prior"], products, order_products_prior, 
                    order_products_train, verbose = True)

Retrieving candidates...
Generating response...


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Beginning to build feature set...
--->Getting user features
--->Getting product features
--->Getting order features
--->Getting user-product features
--->Combining
--->Making transformations
Done in 210.048483849 seconds


In [7]:
mat[mat.eval_set == "train"].to_pickle(TRAIN)
mat[mat.eval_set == "test"].to_pickle(TEST)