In [20]:
import numpy as np
import pandas as pd
from datetime import date
import datetime as dt
import calendar as calendar

In [21]:
def load_csvs(customerfile, orderfile, verbose):
    customers = pd.read_csv(customerfile)

    if verbose: 
        print(f'{len(customers):,} read from customer file')


    orders = pd.read_csv(orderfile)
    orders['order_purchase_timestamp'] = pd.to_datetime(orders['order_purchase_timestamp'])
    orders['order_purchase_date'] = orders['order_purchase_timestamp'].dt.date
    orders['order_dow']= orders['order_purchase_timestamp'].dt.strftime(('%A'))

    
    if verbose :
        print((f'{len(orders):,} read from orders file'))

    ordercustsout = pd.merge(customers,orders, on = "customer_id", how = "inner")


    ordercustsout = ordercustsout.sort_values(['customer_unique_id', 'order_purchase_timestamp'])
    
    return ordercustsout

orders_file = 'Data\olist_orders_dataset.csv'
customers_file = 'Data\olist_customers_dataset.csv'



    ##Load data and create DF to last full month, june 2018
custorders_cleaned = load_csvs(customers_file, orders_file, 1)

99,441 read from customer file
99,441 read from orders file


In [22]:
snapshot_date = date(2017, 12, 31)
mask1 = custorders_cleaned['order_purchase_timestamp'].dt.date <= snapshot_date
df_raw = custorders_cleaned[mask1]

df_raw.size



636020

## Attributes 1 3:


In [23]:

## Attribute 1: Number of orders per customer ##
def cust_orders(df): 
    custorders =  df.groupby('customer_unique_id').size().reset_index()
    print("cust_orders completed")
    return custorders


cust_ordersreturned = cust_orders(df_raw)


cust_ordersreturned.size

cust_orders completed


88068

In [24]:
##Attribute 2: 
def get_customer_recency(df):
    cust_recency = df.groupby('customer_unique_id')['order_purchase_timestamp'].max().reset_index()
    delta = snapshot_date - cust_recency['order_purchase_timestamp'].dt.date
    cust_recency['order_recency'] = delta.dt.days / 30
    cust_recency['order_recency'] = cust_recency['order_recency'].astype(int)

    cust_recency = cust_recency.drop('order_purchase_timestamp', axis=1)

    return cust_recency


cust_recency = get_customer_recency(df_raw)

cust_recency.size

88068

In [25]:
##Attribute 3: 

df_raw12 = pd.get_dummies(df_raw, columns=['order_dow'])
dummies = [x for x in df_raw12.columns if 'order_dow_' in x]

# aggreagte data to the customer-level, so that we have one record per customer
cust_dow = df_raw12.groupby('customer_unique_id')[dummies].sum().reset_index()

cust_dow.shape



(44034, 8)

In [27]:
cust_dow.shape,cust_recency.shape, cust_ordersreturned.size

((44034, 8), (44034, 2), 88068)

## Attribute 4: Counts of orders by status 

In [28]:
def orderbystatus(df): 

    df_raw = pd.get_dummies(df, columns=['order_status'])
    dummies = [x for x in df_raw.columns if 'order_status' in x]
    count_by_orderstatus = df_raw.groupby('customer_unique_id')[dummies].sum().reset_index()
    return count_by_orderstatus



orderstatuscounts = orderbystatus(df_raw)

orderstatuscounts.size

396306

## Attribute 5: Time between estimated and actual delivery dates

In [29]:

    
df = df_raw

dummies = ['Morning', 'Afternoon', 'Evening']
df['order_purchase_timestamp'] = pd.to_datetime(df.order_purchase_timestamp, format='%Y-%m-%d %H:%M:%S')
df['Late Night'] = df['order_purchase_timestamp'].dt.time.between(dt.time(0), dt.time(5)).astype(int)
df['Morning'] = df['order_purchase_timestamp'].dt.time.between(dt.time(5,0,00,1), dt.time(11)).astype(int)
df['Afternoon'] = df['order_purchase_timestamp'].dt.time.between(dt.time(11,0,00,1), dt.time(16)).astype(int)
df['Evening'] = df['order_purchase_timestamp'].dt.time.between(dt.time(16,0,00,1), dt.time(23, 59, 59, 999999)).astype(int)

PM = df.groupby('customer_unique_id')[dummies].sum().reset_index()

PM.shape



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['order_purchase_timestamp'] = pd.to_datetime(df.order_purchase_timestamp, format='%Y-%m-%d %H:%M:%S')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Late Night'] = df['order_purchase_timestamp'].dt.time.between(dt.time(0), dt.time(5)).astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


(44034, 4)

In [None]:
Attribute 6

customer_id                         0
customer_unique_id                  0
customer_zip_code_prefix            0
customer_city                       0
customer_state                      0
order_id                            0
order_status                        0
order_purchase_timestamp            0
order_approved_at                  78
order_delivered_carrier_date     1192
order_delivered_customer_date    1732
order_estimated_delivery_date       0
order_purchase_date                 0
order_dow                           0
Morning                             0
Afternoon                           0
Evening                             0
Late Night                          0
dtype: int64

In [30]:
def time_between_estimated_and_actual(df):
    df['order_delivered_customer_date'] = pd.to_datetime(df.order_delivered_customer_date, format='%Y-%m-%d %H:%M:%S')
    df['order_estimated_delivery_date'] = pd.to_datetime(df.order_estimated_delivery_date, format='%Y-%m-%d %H:%M:%S')
    mask = df['order_delivered_customer_date'].dt.date <= snapshot_date  
    df = df[mask]
    
    delta = df['order_delivered_customer_date'].dt.date - df['order_estimated_delivery_date'].dt.date


    df['delta'] = delta.dt.days.astype(int)

    time_between =  df.groupby('customer_unique_id')['delta'].mean().reset_index()
    
    return time_between

timebetween = time_between_estimated_and_actual(df_raw)

timebetween.shape

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['order_delivered_customer_date'] = pd.to_datetime(df.order_delivered_customer_date, format='%Y-%m-%d %H:%M:%S')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['order_estimated_delivery_date'] = pd.to_datetime(df.order_estimated_delivery_date, format='%Y-%m-%d %H:%M:%S')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returnin

(39969, 2)

# Merge them 

In [31]:
df_merged = pd.merge(cust_recency, cust_ordersreturned, on='customer_unique_id').merge(cust_dow, on='customer_unique_id').merge(timebetween, on='customer_unique_id').merge(PM, on ='customer_unique_id' ).merge(orderstatuscounts, on='customer_unique_id')

(39969, 22)

### Assign labels (aka the target variable or the dependent variable)


In [45]:
start_date = dt.date(2018, 1, 1)
end_date = dt.date(2018, 7, 31)


mask= (custorders_cleaned['order_purchase_timestamp'].dt.date >= start_date) & (custorders_cleaned['order_purchase_timestamp'].dt.date < end_date)

target_events_raw = custorders_cleaned[mask]

print (target_events_raw['order_purchase_timestamp'].min(), target_events_raw['order_purchase_timestamp'].max())

2018-01-01 02:48:41 2018-07-30 23:54:48


In [46]:
target_events = target_events_raw.groupby('customer_unique_id').size().reset_index().rename(columns={0: 'purch'})

target_events.head()

Unnamed: 0,customer_unique_id,purch
0,0000366f3b9a7992bf8c76cfdf3221e2,1
1,0000b849f77a49e4a4ce2b2a4ca5be3f,1
2,0004bd2a26a76fe21f786e4fbd80607f,1
3,00050ab1314c0e55a6ca13cf7181fecf,1
4,00053a61a98854899e70ed204dd4bafe,1


In [47]:
print(f'Number of customers who made at least one purchase durnig the prediction window: {len(target_events):,}')

Number of customers who made at least one purchase durnig the prediction window: 46,096


In [48]:
df = pd.merge(df_merged, target_events, how="left", on="customer_unique_id")

df.shape

df.head()

df.purch.value_counts()

df['purch'] = [1 if x > 0 else 0 for x in df['purch']]
df.purch.value_counts()


0    39396
1      573
Name: purch, dtype: int64

In [49]:
df.purch.value_counts() / len(df)


df.corr()['purch']

order_recency              -0.012516
0                           0.073521
order_dow_Friday           -0.001087
order_dow_Monday            0.013487
order_dow_Saturday         -0.002927
order_dow_Sunday            0.012563
order_dow_Thursday          0.006610
order_dow_Tuesday           0.004898
order_dow_Wednesday         0.007857
delta                      -0.003191
Morning                     0.013796
Afternoon                   0.006352
Evening                     0.007979
order_status_approved            NaN
order_status_canceled      -0.003250
order_status_created             NaN
order_status_delivered      0.073467
order_status_invoiced      -0.002001
order_status_processing     0.012218
order_status_shipped        0.012383
order_status_unavailable    0.009497
purch                       1.000000
Name: purch, dtype: float64

# Model Building

In [51]:
preds = df.columns[1:-1]

X = df[preds]
y = df['purch']



## Two fold partition


In [52]:
## Splitting Data into train and test: 

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=314)

len(X_train), len(X_test)

(19984, 19985)

In [57]:
print(y_train.value_counts())

print(y_test.value_counts())

0    19705
1      279
Name: purch, dtype: int64
0    19691
1      294
Name: purch, dtype: int64


## Logistic Regression

In [177]:
from sklearn.linear_model import LogisticRegression

# define the model object ('liblinear' is recommended for small datasets)
clf = LogisticRegression(solver='liblinear', class_weight='balanced', random_state=314)

# train (fit) the model using the training sample
clf.fit(X_train, y_train)

# make predictions on the test sample
y_preds = clf.predict(X_test)
scores_all = clf.predict_proba(df[preds])[:, 1]
df['prob_to_purchase_from_log'] = scores_all

df.head()

Unnamed: 0,customer_unique_id,order_recency,0,order_dow_Friday,order_dow_Monday,order_dow_Saturday,order_dow_Sunday,order_dow_Thursday,order_dow_Tuesday,order_dow_Wednesday,...,order_status_canceled,order_status_created,order_status_delivered,order_status_invoiced,order_status_processing,order_status_shipped,order_status_unavailable,purch,prob_to_purchase_from_log,prob_to_purchase_fromtree
0,0000f46a3911fa3c0805444483337064,9,1,1,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0.386582,0.002706
1,0000f6ccb0745a6a4b88665a16c9f078,2,1,0,0,0,0,1,0,0,...,0,0,1,0,0,0,0,0,0.464632,0.015079
2,0004aac84e0df4da2b147fca70cf8255,1,1,0,0,0,0,0,1,0,...,0,0,1,0,0,0,0,0,0.53209,0.01208
3,0005e1862207bf6ccc02e4228effd9a0,10,1,0,0,1,0,0,0,0,...,0,0,1,0,0,0,0,0,0.380984,0.010554
4,0006fdc98a402fceb4eb0ee528f6a8d4,5,1,0,0,0,0,0,1,0,...,0,0,1,0,0,0,0,0,0.499743,0.015079


## Model Accuracy

In [183]:
from sklearn.metrics import accuracy_score

accuracy_score(y_test, y_preds)
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))
pd.crosstab(y_test, y_preds)



Accuracy: 0.9849887415561671


col_0,0,1
purch,Unnamed: 1_level_1,Unnamed: 2_level_1
0,13371,6320
1,177,117


### AUC 

In [179]:
from sklearn.metrics import roc_auc_score

# calculate the probabilities on the test sample
y_scores = clf.predict_proba(X_test)[:, 1]

# calcualte AUC
roc_auc_score(y_test, y_scores)

0.5393123934861639

### Precision

In [127]:
from sklearn import tree
from sklearn.model_selection import train_test_split # Import train_test_split function
from sklearn import metrics #Import scikit-learn metrics module for accuracy calculation


average_precision = average_precision_score(y_test, y_scores)

average_precision

0.03174239092272168

## Decision Tree

In [170]:
# define the model object


dec_tree = DecisionTreeClassifier(criterion="entropy", max_depth=3)

clf = dec_tree.fit(X_train, y_train)

y_train_hat = dec_tree.predict(X_train)

y_test_hat = dec_tree.predict(X_test)

scores_alltree = clf.predict_proba(df[preds])[:, 1]

df['prob_to_purchase_fromtree'] = scores_alltree

df.head()



## Confusion Matrix

In [154]:
pd.crosstab(y_test, y_pred)



col_0,0,1
purch,Unnamed: 1_level_1,Unnamed: 2_level_1
0,19684,7
1,293,1


In [155]:
from sklearn.metrics import (roc_curve, auc, roc_auc_score,
                             confusion_matrix)
import matplotlib.pyplot as plt
import numpy as np
import itertools

## AUC For Tree

In [157]:
def get_auc_scores(clf, X_train, X_test, y_train, y_test):
    ##Prints the AUC scores for training and testing data
   ##and returns testing score”””
 
 y_train_score = clf.predict_proba(X_train)[:, 1]
 y_test_score = clf.predict_proba(X_test)[:, 1]
 auc_train = roc_auc_score(y_train, y_train_score)
 auc_test = roc_auc_score(y_test, y_test_score)
 print(f" Training AUC: {auc_train} Testing AUC: {auc_test}")
 
 return y_test_score

 


In [158]:
get_auc_scores(clf, X_train, X_test, y_train, y_test)

 Training AUC: 0.5928652826320849 Testing AUC: 0.5232829356413735


array([0.01507867, 0.01507867, 0.01208026, ..., 0.01055352, 0.01208026,
       0.01507867])

## Pearson's distance

In [176]:
from scipy import stats

stats.pearsonr(df['prob_to_purchase_fromtree'], df['prob_to_purchase_from_log'])


(0.7322266286067082, 0.0)

In [171]:
print(df.columns)

Index([       'customer_unique_id',             'order_recency',
                                 0,          'order_dow_Friday',
                'order_dow_Monday',        'order_dow_Saturday',
                'order_dow_Sunday',        'order_dow_Thursday',
               'order_dow_Tuesday',       'order_dow_Wednesday',
                           'delta',                   'Morning',
                       'Afternoon',                   'Evening',
           'order_status_approved',     'order_status_canceled',
            'order_status_created',    'order_status_delivered',
           'order_status_invoiced',   'order_status_processing',
            'order_status_shipped',  'order_status_unavailable',
                           'purch', 'prob_to_purchase_from_log',
       'prob_to_purchase_fromtree'],
      dtype='object')


### K Fold 