In [11]:
#import all the necessary libraries
import joblib
import os
import hyperopt
import pandas as pd 
import numpy as np 
import seaborn as sb
import scipy.stats as st
from datetime import datetime 
from sklearn.utils import resample
import chart_studio.plotly as py
import plotly.graph_objects as go
import plotly.offline as pyoff
import matplotlib.pyplot as plt 
import plotly.express as px
import plotly.io as pio
from warnings import simplefilter
from sklearn.cluster import KMeans
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from skopt  import BayesSearchCV
import json
from sklearn.linear_model import LogisticRegression
from scipy.stats import ( beta, expon, randint, uniform)
from sklearn.base import (TransformerMixin, BaseEstimator)
from sklearn.metrics import ( roc_curve, auc, accuracy_score, roc_auc_score,log_loss,confusion_matrix,classification_report)
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import ( FeatureUnion, Pipeline )
from sklearn.preprocessing import ( OneHotEncoder, LabelBinarizer, LabelEncoder, StandardScaler, MinMaxScaler, MaxAbsScaler )
from sklearn.model_selection import ( GridSearchCV, StratifiedKFold, train_test_split, cross_val_score, RandomizedSearchCV, KFold )
simplefilter(action='ignore', category=FutureWarning)

In [12]:
# Directory (relative to this notebook) that holds the persisted joblib artifacts loaded further down.
MODELS_PATH = '../../models/'

In [13]:

class DataPreprocessing(object):
    """Loading, cleaning and date-window helpers used to prepare purchase data."""

    def load_data(self, data):
        """Load a dataset into a DataFrame, choosing the reader from the file extension.

        Parameters
        ----------
        data : str
            Path to a ``.json``, ``.xlsx`` or ``.csv`` file. Any other input
            (unknown extension, or a value that is not an existing file) is
            handed to ``pandas.read_json``.

        Returns
        -------
        pandas.DataFrame
        """
        if os.path.isfile(data):
            extension = os.path.splitext(str(data))[1].lower()
            if extension == '.xlsx':
                return pd.read_excel(data)
            if extension == '.csv':
                return pd.read_csv(data)
        # JSON files, and anything that is not an existing file, keep going
        # through read_json as they did before.
        return pd.read_json(data)

    def drop_nan(self, data):
        """Drop rows in which every value is missing, then reset the index."""
        return data.dropna(how='all').reset_index(drop=True)

    def drop_duplicates(self, data):
        """Drop fully duplicated rows (keeping the last occurrence), then reset the index."""
        return data.drop_duplicates(keep='last').reset_index(drop=True)

    def string_to_date(self, data, column='paid_at'):
        """Parse ``column`` into datetimes.

        The converted column is written back into ``data`` (the input frame is
        modified) and the same frame is returned.
        """
        data[column] = pd.to_datetime(data[column])
        return data

    def previous_data(self, data, from_date, to_date, column='paid_at'):
        """Return the rows whose ``column`` lies in ``[from_date, to_date)``, re-indexed from 0."""
        in_window = (data[column] >= from_date) & (data[column] < to_date)
        return data[in_window].reset_index(drop=True)

    def next_data(self, data, column, from_date, to_date):
        """Return the rows whose ``column`` lies in ``[from_date, to_date)``, re-indexed from 0.

        Same filter as ``previous_data``; only the argument order differs
        (``column`` is the second positional argument here).
        """
        in_window = (data[column] >= from_date) & (data[column] < to_date)
        return data[in_window].reset_index(drop=True)

    def get_customers(self, data, column='customer_id'):
        """Return a one-column DataFrame with the unique values of ``column``."""
        return pd.DataFrame({column: data[column].unique()})

    def get_last_purchase(self, previous_data, column='customer_id', date_column='paid_at'):
        """Return, per ``column`` value, the latest ``date_column`` as ``max_purchase_date``."""
        last_purchase = previous_data.groupby(column)[date_column].max().reset_index()
        last_purchase.columns = [column, 'max_purchase_date']
        return last_purchase

    def get_next_first_purchase(self, next_data, column='customer_id', date_column='paid_at'):
        """Return, per ``column`` value, the earliest ``date_column`` as ``min_purchase_date``."""
        # The grouping key now honours ``column`` instead of a hard-coded 'customer_id'.
        next_first_purchase = next_data.groupby(column)[date_column].min().reset_index()
        next_first_purchase.columns = [column, 'min_purchase_date']
        return next_first_purchase

    def join_last_first_purchases(self, last_purchase, next_first_purchase, column='customer_id'):
        """Left-join the first-purchase frame onto the last-purchase frame on ``column``.

        Rows of ``last_purchase`` without a match keep a missing ``min_purchase_date``.
        """
        return pd.merge(last_purchase, next_first_purchase, on=column, how='left')

    def get_time_difference_between_purchases(self, purchases):
        """Add ``next_purchase_day``: whole days from ``max_purchase_date`` to ``min_purchase_date``.

        The column is added to ``purchases`` itself, which is also returned.
        """
        gap = purchases['min_purchase_date'] - purchases['max_purchase_date']
        purchases['next_purchase_day'] = gap.dt.days
        return purchases

    def assign_time_difference_to_customers(self, customers, time_difference, column='customer_id'):
        """Left-join ``next_purchase_day`` from ``time_difference`` onto ``customers`` via ``column``."""
        return pd.merge(
            customers,
            time_difference[[column, 'next_purchase_day']],
            on=column,
            how='left',
        )
        

In [14]:
# Instantiate the preprocessing helper used by the cells below.
data_preprocessor = DataPreprocessing()

In [15]:
# Load the data point to score from its JSON file.
data_point = data_preprocessor.load_data('../../../data/datapoint.json')

In [16]:
# Preview the first two loaded rows.
data_point.head(2)

Unnamed: 0.1,Unnamed: 0,id,customer_id,admin_id,deliverer_id,returned_items,discount,refund,total,items_count,...,paid_at,is_processed,is_informed,notification_count,status,is_received,processed_at,tagname,created_at,updated_at
4,4,6,6,0,,0,1158,0,38572,1,...,2022-05-01 07:38:06,0,0,0,Unpaid,0,0000-00-00 00:00:00,,2022-05-01 10:38:06,2022-05-01 10:38:13
5,5,7,6,0,2.0,0,0,0,1426,2,...,2022-05-01 10:44:03,1,0,0,Paid,0,0000-00-00 00:00:00,,2022-05-01 10:42:34,2022-05-19 10:36:29


#### Data Preprocessing

In [17]:
# Drop rows in which every column is missing (how='all'); partially filled rows are kept.
data_point = data_preprocessor.drop_nan(data_point)

In [18]:
# Convert the 'paid_at' column from string to datetime.
# The helper writes the converted column back into data_point and returns the same frame.
data_point = data_preprocessor.string_to_date(data_point,'paid_at')

In [19]:
# Behaviour window: 1 January 2022 (inclusive) up to the cut-off of 30 November 2022 23:59 (exclusive).
data_point_previous = data_preprocessor.previous_data(data = data_point, from_date = datetime(2022,1,1,00,00,00), to_date= datetime(2022,11,30,23,59,00))

In [20]:
# Data after the cut-off date (one-month window), used to look for the next purchase.
# next_data applies the same half-open filter [from_date, to_date) as previous_data,
# so starting at the behaviour window's to_date leaves no gap and no overlap.
data_point_next = data_preprocessor.next_data(data=data_point, column='paid_at', from_date=datetime(2022, 11, 30, 23, 59, 0), to_date=datetime(2022, 12, 31, 23, 59, 0))

In [21]:
# Unique customers with at least one purchase inside the behaviour window (data_point_previous).
data_point_customer_previous = data_preprocessor.get_customers(data_point_previous, column='customer_id')

In [22]:
# Inspect the unique customers found in the behaviour window.
data_point_customer_previous

Unnamed: 0,customer_id
0,6


In [23]:
# Latest 'paid_at' per customer inside the behaviour window, returned as 'max_purchase_date'.
last_purchase_data_point = data_preprocessor.get_last_purchase(data_point_previous, column='customer_id')

In [24]:
# Preview the last-purchase dates.
last_purchase_data_point.head(2)

Unnamed: 0,customer_id,max_purchase_date
0,6,2022-11-30 08:19:40


In [25]:
# Earliest 'paid_at' per customer in the window after the cut-off, returned as 'min_purchase_date'.
next_first_purchase_data_point = data_preprocessor.get_next_first_purchase(data_point_next, column='customer_id')

In [26]:
# Preview the first-purchase dates after the cut-off.
next_first_purchase_data_point.head(2)

Unnamed: 0,customer_id,min_purchase_date
0,6,2022-12-01 12:18:15


In [27]:
# Left-join the first purchase after the cut-off onto the last purchase before it;
# customers without a purchase after the cut-off keep a missing min_purchase_date.
purchase_dates = data_preprocessor.join_last_first_purchases(last_purchase_data_point, next_first_purchase_data_point, column='customer_id' )

In [28]:
# Preview the joined purchase dates.
purchase_dates.head(2)

Unnamed: 0,customer_id,max_purchase_date,min_purchase_date
0,6,2022-11-30 08:19:40,2022-12-01 12:18:15


In [29]:
# Whole days between the last purchase before the cut-off and the first purchase after it,
# stored in a new 'next_purchase_day' column.
purchase_dates = data_preprocessor.get_time_difference_between_purchases(purchases=purchase_dates)

In [30]:
# Preview the purchase dates together with the computed day gap.
purchase_dates.head(2)

Unnamed: 0,customer_id,max_purchase_date,min_purchase_date,next_purchase_day
0,6,2022-11-30 08:19:40,2022-12-01 12:18:15,1


#### Feature Engineering

In [31]:
class FeaturesGeneration():
    """Derives per-customer recency, frequency and revenue features, their cluster
    scores, an overall segment and purchase-gap statistics from the behaviour window."""

    def _add_cluster(self, customers, feature, model_file, cluster_column):
        """Load a persisted model from MODELS_PATH and store its prediction for ``feature``.

        ``customers`` is modified in place (``cluster_column`` is added) and returned.
        """
        cluster_model = joblib.load(MODELS_PATH + model_file)
        customers[cluster_column] = cluster_model.predict(customers[[feature]])
        return customers

    def recency(self, data_point_previous, data_point_customer_previous):
        """Add ``Recency``: days between a customer's last purchase and the most
        recent last purchase of any customer in ``data_point_previous``.

        Returns a new frame (inner merge on ``customer_id``).
        """
        invoices_max_purchase = data_point_previous.groupby('customer_id').paid_at.max().reset_index()
        invoices_max_purchase.columns = ['customer_id', 'max_purchase_date']

        latest_purchase = invoices_max_purchase['max_purchase_date'].max()
        invoices_max_purchase['Recency'] = (latest_purchase - invoices_max_purchase['max_purchase_date']).dt.days
        return pd.merge(
            data_point_customer_previous,
            invoices_max_purchase[['customer_id', 'Recency']],
            on='customer_id',
        )

    def recency_cluster(self, data_point_customer_previous):
        """Add ``RecencyCluster`` predicted by ``kmeans_recency.joblib`` from ``Recency``."""
        return self._add_cluster(
            data_point_customer_previous, 'Recency', 'kmeans_recency.joblib', 'RecencyCluster'
        )

    def frequency(self, data_point_previous, data_point_customer_previous):
        """Add ``Frequency``: number of non-null ``paid_at`` values per customer.

        Returns a new frame (inner merge on ``customer_id``).
        """
        invoices_frequency = data_point_previous.groupby('customer_id').paid_at.count().reset_index()
        invoices_frequency.columns = ['customer_id', 'Frequency']
        return pd.merge(data_point_customer_previous, invoices_frequency, on='customer_id')

    def frequency_cluster(self, data_point_customer_previous):
        """Add ``FrequencyCluster`` predicted by ``kmeans_frequency.joblib`` from ``Frequency``."""
        return self._add_cluster(
            data_point_customer_previous, 'Frequency', 'kmeans_frequency.joblib', 'FrequencyCluster'
        )

    def monetary_value(self, data_point_previous, data_point_customer_previous):
        """Add ``Revenue``: per-customer sum of ``amount * items_count``.

        Side effect kept from the original: a row-level ``Revenue`` column is
        also written into ``data_point_previous``.
        Returns a new frame (inner merge on ``customer_id``).
        """
        # NOTE(review): 'amount' is not among the columns visible in the data
        # preview (which shows 'total'); confirm the input really carries it.
        data_point_previous['Revenue'] = data_point_previous['amount'] * data_point_previous['items_count']
        invoices_revenue = data_point_previous.groupby('customer_id').Revenue.sum().reset_index()
        return pd.merge(data_point_customer_previous, invoices_revenue, on='customer_id')

    def revenue_cluster(self, data_point_customer_previous):
        """Add ``RevenueCluster`` predicted by ``kmeans_revenue.joblib`` from ``Revenue``."""
        return self._add_cluster(
            data_point_customer_previous, 'Revenue', 'kmeans_revenue.joblib', 'RevenueCluster'
        )

    def overall_score(self, data_point_customer_previous):
        """Add ``OverallScore`` = RecencyCluster + FrequencyCluster + RevenueCluster (in place)."""
        data_point_customer_previous['OverallScore'] = (
            data_point_customer_previous['RecencyCluster']
            + data_point_customer_previous['FrequencyCluster']
            + data_point_customer_previous['RevenueCluster']
        )
        return data_point_customer_previous

    def segments(self, data_point_customer_previous):
        """Add ``Segment`` (in place): 'Low-Value' by default, 'Mid-Value' when
        OverallScore > 2, 'High-Value' when OverallScore > 4."""
        data_point_customer_previous['Segment'] = 'Low-Value'
        data_point_customer_previous.loc[data_point_customer_previous['OverallScore'] > 2, 'Segment'] = 'Mid-Value'
        data_point_customer_previous.loc[data_point_customer_previous['OverallScore'] > 4, 'Segment'] = 'High-Value'
        return data_point_customer_previous

    def trace_back_three(self, data_point_previous, data_point_customer_previous):
        """Add the gaps (in days) between a customer's latest purchase day and the
        three purchase days before it, plus the mean/std of consecutive gaps.

        Adds ``DayDiff``, ``DayDiff2``, ``DayDiff3``, ``DayDiffMean`` and
        ``DayDiffStd``. Customers whose latest purchase day has any missing gap
        (fewer than four distinct purchase days) are dropped by the inner merge.
        Returns a new frame; ``data_point_previous`` is not modified.
        """
        # Work on an explicit copy: adding columns to a bare column selection is
        # what triggers pandas' SettingWithCopyWarning.
        invoices_day_order = data_point_previous[['customer_id', 'paid_at']].copy()
        # Calendar day of each purchase, kept as datetime64 so the subtractions
        # below stay vectorised and the .dt accessor is always available.
        invoices_day_order['InvoiceDay'] = pd.to_datetime(invoices_day_order['paid_at'].dt.date)
        invoices_day_order = invoices_day_order.sort_values(['customer_id', 'paid_at'])
        # One row per customer per calendar day.
        invoices_day_order = invoices_day_order.drop_duplicates(subset=['customer_id', 'InvoiceDay'], keep='first')

        # Previous three purchase days of the same customer.
        purchase_days = invoices_day_order.groupby('customer_id')['InvoiceDay']
        invoices_day_order['PrevInvoiceDate'] = purchase_days.shift(1)
        invoices_day_order['T2InvoiceDate'] = purchase_days.shift(2)
        invoices_day_order['T3InvoiceDate'] = purchase_days.shift(3)

        # Day gaps back to each of those three purchase days.
        invoices_day_order['DayDiff'] = (invoices_day_order['InvoiceDay'] - invoices_day_order['PrevInvoiceDate']).dt.days
        invoices_day_order['DayDiff2'] = (invoices_day_order['InvoiceDay'] - invoices_day_order['T2InvoiceDate']).dt.days
        invoices_day_order['DayDiff3'] = (invoices_day_order['InvoiceDay'] - invoices_day_order['T3InvoiceDate']).dt.days

        # Mean and standard deviation of the consecutive-purchase gap per customer.
        invoices_day_diff = invoices_day_order.groupby('customer_id').agg({'DayDiff': ['mean', 'std']}).reset_index()
        invoices_day_diff.columns = ['customer_id', 'DayDiffMean', 'DayDiffStd']

        # Keep each customer's latest purchase day, then drop customers for whom
        # any of the three look-back values is missing.
        invoices_day_order_last = invoices_day_order.drop_duplicates(subset=['customer_id'], keep='last')
        invoices_day_order_last = invoices_day_order_last.dropna()
        invoices_day_order_last = pd.merge(invoices_day_order_last, invoices_day_diff, on='customer_id')

        gap_columns = ['customer_id', 'DayDiff', 'DayDiff2', 'DayDiff3', 'DayDiffMean', 'DayDiffStd']
        return pd.merge(data_point_customer_previous, invoices_day_order_last[gap_columns], on='customer_id')

    def dummy_data(self, data_point_customer_previous):
        """One-hot encode a copy of the frame with ``pd.get_dummies`` and drop the
        columns not used as model features.

        Returns a new frame; the input is left untouched.
        """
        # NOTE(review): get_dummies only creates columns for the values present,
        # so a single-customer frame yields one Segment_* column; confirm this
        # matches the feature layout the model was trained on.
        invoices_class = pd.get_dummies(data_point_customer_previous.copy())
        dropped_columns = ['DayDiff', 'DayDiff3', 'DayDiff2', 'Recency', 'DayDiffMean', 'DayDiffStd']
        return invoices_class.drop(dropped_columns, axis=1)
        



In [32]:
# Instantiate the feature builder used by the cells below.
features_generator = FeaturesGeneration()

In [33]:
# Add the Recency feature (days since each customer's last purchase in the behaviour window).
data_point_customer_previous = features_generator.recency(data_point_previous=data_point_previous, data_point_customer_previous=data_point_customer_previous)

In [34]:
# Add RecencyCluster, predicted from Recency by the persisted kmeans_recency model.
data_point_customer_previous = features_generator.recency_cluster(data_point_customer_previous=data_point_customer_previous)

In [35]:
# Add the Frequency feature (number of purchases per customer in the behaviour window).
data_point_customer_previous = features_generator.frequency(data_point_previous=data_point_previous, data_point_customer_previous=data_point_customer_previous)

In [36]:
# Preview the customer features built so far.
data_point_customer_previous.head(2)

Unnamed: 0,customer_id,Recency,RecencyCluster,Frequency
0,6,0,4,88


In [37]:
# Add FrequencyCluster, predicted from Frequency by the persisted kmeans_frequency model.
data_point_customer_previous = features_generator.frequency_cluster(data_point_customer_previous=data_point_customer_previous)

In [38]:
# Add the Revenue feature (per-customer sum of amount * items_count).
# Note: monetary_value also writes a row-level 'Revenue' column into data_point_previous.
data_point_customer_previous = features_generator.monetary_value(data_point_previous=data_point_previous, data_point_customer_previous=data_point_customer_previous)

In [39]:
# Add RevenueCluster, predicted from Revenue by the persisted kmeans_revenue model.
data_point_customer_previous = features_generator.revenue_cluster(data_point_customer_previous=data_point_customer_previous)

In [40]:
# Add OverallScore = RecencyCluster + FrequencyCluster + RevenueCluster.
data_point_customer_previous = features_generator.overall_score(data_point_customer_previous=data_point_customer_previous)

In [41]:
# Add the Segment label (Low-Value / Mid-Value / High-Value) derived from OverallScore.
data_point_customer_previous = features_generator.segments(data_point_customer_previous=data_point_customer_previous)

In [42]:
# Add the day gaps to the previous three purchases, plus their mean and standard deviation.
data_point_customer_previous = features_generator.trace_back_three(data_point_previous=data_point_previous, data_point_customer_previous=data_point_customer_previous)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [43]:
# Inspect the customer frame after all features have been added.
data_point_customer_previous

Unnamed: 0,customer_id,Recency,RecencyCluster,Frequency,FrequencyCluster,Revenue,RevenueCluster,OverallScore,Segment,DayDiff,DayDiff2,DayDiff3,DayDiffMean,DayDiffStd
0,6,0,4,88,4,218880079,3,11,High-Value,2.0,6.0,8.0,4.018868,3.499948


In [44]:
# One-hot encode the categorical columns and drop the columns that dummy_data excludes from the model features.
data_point_customer_previous_features = features_generator.dummy_data(data_point_customer_previous=data_point_customer_previous)

In [45]:
# Transformer class for numerical data standardisation
class Transform(TransformerMixin,BaseEstimator):
    """Standardise numerical features with a ``StandardScaler`` pipeline.

    ``fit`` learns the scaling statistics and ``transform`` applies them, so a
    fitted instance can be reused (or persisted) to scale new data with the
    statistics it was fitted on. ``fit_transform`` returns the same values as
    before: the scaler is fitted on ``X`` and applied to ``X``.
    """

    def fit(self,X,y=None):
        """Fit the scaling pipeline on ``X`` and return ``self``."""
        self.num_pipeline_ = Pipeline([('scaler',StandardScaler())]).fit(X)
        return self

    def transform(self,X,y=0):
        """Return ``X`` scaled, as a ``pandas.DataFrame`` with default integer column labels.

        When the instance has not been fitted, a fresh scaler is fitted on ``X``
        itself, which is what every call did originally.
        """
        num_pipeline = getattr(self, 'num_pipeline_', None)
        if num_pipeline is None:
            # Legacy path for callers that never called fit(); nothing is stored.
            return pd.DataFrame(Pipeline([('scaler',StandardScaler())]).fit_transform(X))
        return pd.DataFrame(num_pipeline.transform(X))

In [46]:
# Instantiate the scaling transformer.
transform =  Transform()

In [47]:
# Standardise the generated features.
# NOTE(review): fit_transform fits the StandardScaler on this inference frame itself,
# not on the training data; with a single row each column's mean equals its value,
# so the scaled features presumably all come out as 0 — confirm, and consider
# loading the scaler that was fitted at training time instead.
data_point_customer_previous_features = transform.fit_transform(data_point_customer_previous_features)

In [48]:
def inference(data_point_features):
    """Score a data point with the persisted ``lgbm_optim.joblib`` model and print the verdict.

    Parameters
    ----------
    data_point_features : array-like
        Feature matrix passed unchanged to ``model.predict_proba``.

    Returns
    -------
    tuple
        ``(inference_results, inference_results_df_probas, inference_label)``:
        the raw ``predict_proba`` output, the same values as a DataFrame with
        columns ``'Purchase'`` and ``'Not Purchase'``, and the label, which is
        1 when the first row's ``'Purchase'`` probability is at least 0.5 and
        0 otherwise. The printed message also describes the first row.
    """
    model = joblib.load(MODELS_PATH+'lgbm_optim.joblib')
    inference_results = model.predict_proba(data_point_features)
    inference_results_df_probas = pd.DataFrame(data=inference_results, columns=['Purchase','Not Purchase'])

    # The label must come from the 'Purchase' probability. The maximum of two
    # class probabilities is always >= 0.5, so thresholding the maximum made
    # every customer "likely to purchase".
    # NOTE(review): predict_proba columns follow model.classes_; confirm the
    # first class is the purchase class, as the column names above assume.
    purchase_probability = float(inference_results_df_probas['Purchase'].iloc[0])
    not_purchase_probability = float(inference_results_df_probas['Not Purchase'].iloc[0])

    inference_label = 1 if purchase_probability >= 0.5 else 0
    if inference_label == 1:
        print("Customer is {:.2f}% likely to purchase in the coming 1 month".format(purchase_probability*100))
    else:
        print("Customer is {:.2f}% unlikely to purchase in the coming 1 month".format(not_purchase_probability*100))
    return inference_results, inference_results_df_probas, inference_label


In [49]:
# Score the data point: inference() prints the verdict and returns the raw probabilities,
# the probability table and the predicted label.
inference_results,inference_results_df_probas,inference_label = inference(data_point_customer_previous_features)

Customer is 75.58% likely to purchase in the coming 1 month
