In [1]:
# Loading libraries
from datetime import datetime, timedelta,date
import pandas as pd
%matplotlib inline
from sklearn.metrics import classification_report,confusion_matrix
import matplotlib.pyplot as plt
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.tree import export_graphviz
from six import StringIO
from IPython.display import Image  
import pydotplus

import warnings
warnings.filterwarnings("ignore")

In [2]:
# Loading online retail data
file_path = "C://Users//lenovo//Desktop//7LYTIX//online_retail_II.csv"
df = pd.read_csv(file_path, encoding= 'ISO-8859-1')
# Preview the original data
df.head()

Unnamed: 0,Invoice,StockCode,Description,Quantity,InvoiceDate,Price,Customer ID,Country
0,489434,85048,15CM CHRISTMAS GLASS BALL 20 LIGHTS,12,12/1/2009 7:45,6.95,13085.0,United Kingdom
1,489434,79323P,PINK CHERRY LIGHTS,12,12/1/2009 7:45,6.75,13085.0,United Kingdom
2,489434,79323W,WHITE CHERRY LIGHTS,12,12/1/2009 7:45,6.75,13085.0,United Kingdom
3,489434,22041,"RECORD FRAME 7"" SINGLE SIZE",48,12/1/2009 7:45,2.1,13085.0,United Kingdom
4,489434,21232,STRAWBERRY CERAMIC TRINKET BOX,24,12/1/2009 7:45,1.25,13085.0,United Kingdom


In [3]:
# Check for missing values
df.isnull().sum()

Invoice             0
StockCode           0
Description      4372
Quantity            0
InvoiceDate         0
Price               0
Customer ID    236682
Country             0
dtype: int64

In [4]:
# Remove missing values
df.dropna(inplace=True)

In [5]:
#create dataframe for InvoiceDate
onlineRetailer_data['InvoiceDate'] = pd.to_datetime(onlineRetailer_data['InvoiceDate'])

NameError: name 'onlineRetailer_data' is not defined

In [None]:
# For training the model, we will take into consideration the last 9 months in the data set
# so that, we will take 6 months behavioral data between March 2011 and August 2011,
# and for the next purchase date, we will take 3 month in advance (between September 2011 and October 2011)
first_period = onlineRetailer_data[(onlineRetailer_data.InvoiceDate >= datetime(2011,1,1)) & (onlineRetailer_data.InvoiceDate < datetime(2011,9,1))].reset_index(drop=True)
next_period = onlineRetailer_data[(onlineRetailer_data.InvoiceDate >= datetime(2011,9,1)) & (onlineRetailer_data.InvoiceDate < datetime(2011,12,1))].reset_index(drop=True)

In [None]:
#create dataframe for customer ides
tx_user = pd.DataFrame(first_period['Customer ID'].unique())
tx_user.columns = ['Customer ID']

In [None]:
# Creating the feature 'NextPurchaseDay' which represents the next purchase day between
# the first period (6 monthes) and the second period (next 3 monthes)
# in odrder to include it in the customer feature vectors

# Create a dataframe with customer id and first purchase date in next_period
next_period_first_purchase = next_period.groupby('Customer ID').InvoiceDate.min().reset_index()
next_period_first_purchase.columns = ['Customer ID','MinPurchaseDate']

# Create a dataframe with customer id and last purchase date in first_period
tx_last_purchase = first_period.groupby('Customer ID').InvoiceDate.max().reset_index()
tx_last_purchase.columns = ['Customer ID','MaxPurchaseDate']

# Merge two dataframes
tx_purchase_dates = pd.merge(tx_last_purchase,next_period_first_purchase,on='Customer ID',how='left')

# Calculate the time difference in days:
tx_purchase_dates['NextPurchaseDay'] = (tx_purchase_dates['MinPurchaseDate'] - tx_purchase_dates['MaxPurchaseDate']).dt.days

# Merge with tx_user 
tx_user = pd.merge(tx_user, tx_purchase_dates[['Customer ID','NextPurchaseDay']],on='Customer ID',how='left')

#fill NA values with -1
tx_user = tx_user.fillna(-1)

In [None]:
# Creating the feature 'Recency' which represents the number of days that have passed 
# since the customer last purchased (in the first period)
# in odrder to include it in the customer feature vectors

# Get max purchase date for Recency and create a dataframe
tx_max_purchase = first_period.groupby('Customer ID').InvoiceDate.max().reset_index()

tx_max_purchase.columns = ['Customer ID','MaxPurchaseDate']

# Calculate the recency in days
tx_max_purchase['Recency'] = (tx_max_purchase['MaxPurchaseDate'].max() - tx_max_purchase['MaxPurchaseDate']).dt.days
# Add 'recency' feature to tx_user
tx_user = pd.merge(tx_user, tx_max_purchase[['Customer ID','Recency']], on='Customer ID')

In [None]:
# Creating the feature 'Frequency' which represents the number of purchases by the customer (in the first period)
# in odrder to include it in the customer feature vectors

# Get total count of purchases
tx_frequency = first_period.groupby('Customer ID').InvoiceDate.count().reset_index()
tx_frequency.columns = ['Customer ID','Frequency']

# Add 'Frequency' feature to tx_user
tx_user = pd.merge(tx_user, tx_frequency, on='Customer ID')

In [None]:
# Creating the feature 'Revenue' wehere Revenue = quantity*price (in the first period)
# in odrder to include it in the customer feature vectors

first_period['Revenue'] = first_period['Price'] * first_period['Quantity']
tx_revenue = first_period.groupby('Customer ID').Revenue.sum().reset_index()

# Add 'Revenue' feature to tx_user
tx_user = pd.merge(tx_user, tx_revenue, on='Customer ID')

In [None]:
tx_user.head()

In [None]:
# Now, we will create new three features depedning on the shift() method (shift features), 
# to find out the time delay between the last three purchases

# Create a dataframe with CustomerID and Invoice Date
tx_day_order = first_period[['Customer ID','InvoiceDate']]
tx_day_order['InvoiceDate'] = tx_day_order['InvoiceDate'].dt.date

tx_day_order = tx_day_order.sort_values(['Customer ID','InvoiceDate'])
#drop duplicates
tx_day_order = tx_day_order.drop_duplicates(subset=['Customer ID','InvoiceDate'],keep='first')

In [None]:
#shifting last 3 purchase dates
tx_day_order['PrevInvoiceDate'] = tx_day_order.groupby('Customer ID')['InvoiceDate'].shift(1)
tx_day_order['T2InvoiceDate'] = tx_day_order.groupby('Customer ID')['InvoiceDate'].shift(2)
tx_day_order['T3InvoiceDate'] = tx_day_order.groupby('Customer ID')['InvoiceDate'].shift(3)

In [None]:
tx_day_order.head()

In [None]:
tx_day_order['DayDiff'] = (tx_day_order['InvoiceDate'] - tx_day_order['PrevInvoiceDate']).dt.days
tx_day_order['DayDiff2'] = (tx_day_order['InvoiceDate'] - tx_day_order['T2InvoiceDate']).dt.days
tx_day_order['DayDiff3'] = (tx_day_order['InvoiceDate'] - tx_day_order['T3InvoiceDate']).dt.days

In [None]:
# Preview the 'shift features'
tx_day_order.head()

In [None]:
# Calculating the mean and the standard deviation of the 'shift features' aggrigated by customer
tx_day_diff = tx_day_order.groupby('Customer ID').agg({'DayDiff': ['mean','std']}).reset_index()
tx_day_diff.columns = ['Customer ID', 'DayDiffMean','DayDiffStd']

In [None]:
#drop duplicates and keep the last values
tx_day_order_last = tx_day_order.drop_duplicates(subset=['Customer ID'],keep='last')

In [None]:
tx_day_order_last = tx_day_order_last.dropna()
tx_day_order_last = pd.merge(tx_day_order_last, tx_day_diff, on='Customer ID')
# Mergee shift feautes with the pervious feautres
tx_user = pd.merge(tx_user, tx_day_order_last[['Customer ID','DayDiff','DayDiff2','DayDiff3','DayDiffMean','DayDiffStd']], on='Customer ID')

In [None]:
# Preview the last version of tx_user included the final feature vectors
tx_user.head()

In [None]:
# Create tx_class as a copy of tx_user before applying get_dummies
# Kindly note the the class dataframe will contain the label attribute depending on the NextPurchaseDay feature:
# if NextPurchaseDay <= 20 --> class values = 2
# if NextPurchaseDay between 20 and 50 --> class values = 1
# if NextPurchaseDay > 50 --> class values = 0
tx_class = tx_user.copy()
tx_class = pd.get_dummies(tx_class)

In [None]:
tx_class['NextPurchaseDayRange'] = 2
tx_class.loc[tx_user.NextPurchaseDay>20,'NextPurchaseDayRange'] = 1
tx_class.loc[tx_user.NextPurchaseDay>50,'NextPurchaseDayRange'] = 0

In [None]:
# Preview the tx_class
tx_class.head()

In [None]:
feature_cols = ['Recency','Frequency','Revenue','DayDiffMean','DayDiffStd']
X = tx_class[feature_cols] # Features
y = tx_class.NextPurchaseDayRange # Target variable

In [None]:
#train & test split
#tx_class = tx_class.drop('NextPurchaseDay',axis=1)
#X, y = tx_class.drop('NextPurchaseDayRange',axis=1), tx_class.NextPurchaseDayRange
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

# Create Decision Tree classifer object
clf = DecisionTreeClassifier(criterion="entropy", max_depth=3)

# Train Decision Tree Classifer
clf = clf.fit(X_train,y_train)

#Predict the response for test dataset
y_pred = clf.predict(X_test)

In [None]:
# Measure the model accuracy 
kfold = KFold(n_splits=2, random_state=22)
acc = cross_val_score(clf,X_train,y_train, cv = kfold,scoring = "accuracy")
print("Accuracy:",acc)

In [None]:
dot_data = StringIO()
export_graphviz(clf, out_file=dot_data,  
                filled=True, rounded=True,
                special_characters=True,feature_names = feature_cols,class_names=['0','1','2'])
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())  
graph.write_png('diabetes.png')
Image(graph.create_png())