In [1]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix
from sklearn.cluster import KMeans
# import plotly.express as px
#import plotly.offline as pyoff
# import plotly.graph_objs as go 
#import plotly.figure_factory as ff

# avoid displaying warnings
import warnings
warnings.filterwarnings("ignore")

df = pd.read_csv('cleaned_ecommerce_data.csv')
data = df.copy()
df.head()

Unnamed: 0,Customer ID,Purchase Date,Product Category,Product Price,Quantity,Total Purchase Amount,Customer Age,Customer Name,Gender,Churn,Purchase Year,Purchase Month
0,44605,5/3/2023,Home,177,1,177,31,John Rivera,Female,0,2023,5
1,44605,5/16/2021,Electronics,174,3,522,31,John Rivera,Female,0,2021,5
2,44605,7/13/2020,Books,413,1,413,31,John Rivera,Female,0,2020,7
3,44605,1/17/2023,Electronics,396,3,1188,31,John Rivera,Female,0,2023,1
4,44605,5/1/2021,Books,259,4,1036,31,John Rivera,Female,0,2021,5


In [2]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 250000 entries, 0 to 249999
Data columns (total 12 columns):
 #   Column                 Non-Null Count   Dtype 
---  ------                 --------------   ----- 
 0   Customer ID            250000 non-null  int64 
 1   Purchase Date          250000 non-null  object
 2   Product Category       250000 non-null  object
 3   Product Price          250000 non-null  int64 
 4   Quantity               250000 non-null  int64 
 5   Total Purchase Amount  250000 non-null  int64 
 6   Customer Age           250000 non-null  int64 
 7   Customer Name          250000 non-null  object
 8   Gender                 250000 non-null  object
 9   Churn                  250000 non-null  int64 
 10  Purchase Year          250000 non-null  int64 
 11  Purchase Month         250000 non-null  int64 
dtypes: int64(8), object(4)
memory usage: 22.9+ MB


In [3]:
df = df.drop(['Purchase Year','Purchase Month','Churn'],axis=1)
# def convert_to_snake_case(column_name):
#   return column_name.lower().replace(" ", "_")
# df.columns = [convert_to_snake_case(col) for col in df.columns]
# df = df.rename(columns={"total_purchase_amount": "revenue"})
df

Unnamed: 0,Customer ID,Purchase Date,Product Category,Product Price,Quantity,Total Purchase Amount,Customer Age,Customer Name,Gender
0,44605,5/3/2023,Home,177,1,177,31,John Rivera,Female
1,44605,5/16/2021,Electronics,174,3,522,31,John Rivera,Female
2,44605,7/13/2020,Books,413,1,413,31,John Rivera,Female
3,44605,1/17/2023,Electronics,396,3,1188,31,John Rivera,Female
4,44605,5/1/2021,Books,259,4,1036,31,John Rivera,Female
...,...,...,...,...,...,...,...,...,...
249995,33807,1/24/2023,Home,436,1,436,63,Gabriel Williams,Male
249996,20455,6/4/2021,Electronics,233,1,233,66,Barry Foster,Female
249997,28055,11/10/2022,Electronics,441,5,2205,63,Lisa Johnson,Female
249998,15023,6/27/2021,Electronics,44,2,88,64,Melissa Fernandez,Male


In [4]:
# Chuyển về format date
import datetime
df['Purchase Date'] = pd.to_datetime(df['Purchase Date']) #String->Date
current_date = max(df['Purchase Date']) + datetime.timedelta(days=1)

# Group by Customer ID to calculate R,F,M

df = df.groupby(['Customer ID']).agg({'Purchase Date': lambda x: (current_date - x.max()).days,'Customer Name':'count','Total Purchase Amount':'sum'}).reset_index(drop=False)
df.rename(columns={'Purchase Date':'RECENCY','Customer Name':'FREQUENCY','Total Purchase Amount':'MONETARY'}, inplace=True)

df["RSCORE"]  = pd.qcut(df["RECENCY"], 5, labels = [5, 4, 3, 2, 1])
df["FSCORE"]= pd.qcut(df["FREQUENCY"].rank(method="first"),5, labels=[1, 2, 3, 4, 5])
df["MSCORE"] = pd.qcut(df['MONETARY'].rank(method="first"), 5, labels = [1, 2, 3, 4, 5])
df['RFM cell'] = df['RSCORE'].astype(str) + df['FSCORE'].astype(str) + df['MSCORE'].astype(str)
print(df)


       Customer ID  RECENCY  FREQUENCY  MONETARY RSCORE FSCORE MSCORE RFM cell
0                1      289          3      5600      2      1      5      215
1                2       73          6      6459      4      4      5      445
2                3      223          4      3613      3      2      3      323
3                4      442          5      4339      1      3      4      134
4                5      425          5      2263      2      3      2      232
...            ...      ...        ...       ...    ...    ...    ...      ...
49656        49996      360          7      6107      2      5      5      255
49657        49997      389          2      1592      2      1      1      211
49658        49998       14         10      8440      5      5      5      555
49659        49999      357          6      4188      2      4      4      244
49660        50000      123          7      3871      4      5      3      453

[49661 rows x 8 columns]


In [5]:
# Đưa dữ liệu về PPC
import scipy.stats as stats
df_transform = pd.DataFrame()
 
df_transform['RECENCY'] = stats.boxcox(df['RECENCY'])[0]
df_transform['FREQUENCY'] = stats.boxcox(df['FREQUENCY'])[0]
df_transform['MONETARY'] = pd.Series(np.cbrt(df['MONETARY'])).values
df_transform

Unnamed: 0,RECENCY,FREQUENCY,MONETARY
0,13.711573,1.531362,17.758080
1,8.227962,3.131870,18.623234
2,12.512866,2.119383,15.344615
3,15.871755,2.647332,16.310348
4,15.661549,2.647332,13.128895
...,...,...,...
49656,14.796753,3.583053,18.278588
49657,15.195469,0.851451,11.676545
49658,3.885970,4.793711,20.360143
49659,14.754196,3.131870,16.118906


In [6]:
from sklearn.preprocessing import StandardScaler
df_transform_copy = df_transform.copy()
scaler = StandardScaler()
scaler.fit(df_transform_copy)

df_transform_copy = scaler.transform(df_transform_copy)
df_transform_copy = pd.DataFrame(df_transform_copy)
df_transform_copy = df_transform_copy.rename(columns={0: 'R Score', 1: 'M Score', 2: 'F Score'})
# df_transform = df.insert(1, 'CustomerID', df['Customer ID'])
df_transform_copy

Unnamed: 0,R Score,M Score,F Score
0,0.445101,-0.909238,0.850317
1,-0.751769,0.511719,1.123032
2,0.183468,-0.387183,0.089543
3,0.916590,0.081539,0.393962
4,0.870709,0.081539,-0.608898
...,...,...,...
49656,0.681956,0.912286,1.014392
49657,0.768981,-1.512875,-1.066709
49658,-1.699466,1.987128,1.670542
49659,0.672668,0.511719,0.333616


In [7]:
df_transform = pd.concat([df[['Customer ID']], df_transform], axis=1)


In [8]:
# df_transform = pd.concat([df[['Customer ID']], df_transform_copy], axis=1)
df_transform = df_transform.join(df_transform_copy)

In [9]:
df_transform

Unnamed: 0,Customer ID,RECENCY,FREQUENCY,MONETARY,R Score,M Score,F Score
0,1,13.711573,1.531362,17.758080,0.445101,-0.909238,0.850317
1,2,8.227962,3.131870,18.623234,-0.751769,0.511719,1.123032
2,3,12.512866,2.119383,15.344615,0.183468,-0.387183,0.089543
3,4,15.871755,2.647332,16.310348,0.916590,0.081539,0.393962
4,5,15.661549,2.647332,13.128895,0.870709,0.081539,-0.608898
...,...,...,...,...,...,...,...
49656,49996,14.796753,3.583053,18.278588,0.681956,0.912286,1.014392
49657,49997,15.195469,0.851451,11.676545,0.768981,-1.512875,-1.066709
49658,49998,3.885970,4.793711,20.360143,-1.699466,1.987128,1.670542
49659,49999,14.754196,3.131870,16.118906,0.672668,0.511719,0.333616


In [10]:
data = data.drop(['Purchase Year','Purchase Month','Churn'],axis=1)
import datetime
data['Purchase Date'] = pd.to_datetime(data['Purchase Date']) #String->Date
data

Unnamed: 0,Customer ID,Purchase Date,Product Category,Product Price,Quantity,Total Purchase Amount,Customer Age,Customer Name,Gender
0,44605,2023-05-03,Home,177,1,177,31,John Rivera,Female
1,44605,2021-05-16,Electronics,174,3,522,31,John Rivera,Female
2,44605,2020-07-13,Books,413,1,413,31,John Rivera,Female
3,44605,2023-01-17,Electronics,396,3,1188,31,John Rivera,Female
4,44605,2021-05-01,Books,259,4,1036,31,John Rivera,Female
...,...,...,...,...,...,...,...,...,...
249995,33807,2023-01-24,Home,436,1,436,63,Gabriel Williams,Male
249996,20455,2021-06-04,Electronics,233,1,233,66,Barry Foster,Female
249997,28055,2022-11-10,Electronics,441,5,2205,63,Lisa Johnson,Female
249998,15023,2021-06-27,Electronics,44,2,88,64,Melissa Fernandez,Male


In [11]:
product_dummies = pd.get_dummies(data['Product Category'])
data= data.join(product_dummies)

In [12]:
data

Unnamed: 0,Customer ID,Purchase Date,Product Category,Product Price,Quantity,Total Purchase Amount,Customer Age,Customer Name,Gender,Books,Clothing,Electronics,Home
0,44605,2023-05-03,Home,177,1,177,31,John Rivera,Female,False,False,False,True
1,44605,2021-05-16,Electronics,174,3,522,31,John Rivera,Female,False,False,True,False
2,44605,2020-07-13,Books,413,1,413,31,John Rivera,Female,True,False,False,False
3,44605,2023-01-17,Electronics,396,3,1188,31,John Rivera,Female,False,False,True,False
4,44605,2021-05-01,Books,259,4,1036,31,John Rivera,Female,True,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...
249995,33807,2023-01-24,Home,436,1,436,63,Gabriel Williams,Male,False,False,False,True
249996,20455,2021-06-04,Electronics,233,1,233,66,Barry Foster,Female,False,False,True,False
249997,28055,2022-11-10,Electronics,441,5,2205,63,Lisa Johnson,Female,False,False,True,False
249998,15023,2021-06-27,Electronics,44,2,88,64,Melissa Fernandez,Male,False,False,True,False


In [13]:
data[['Books', 'Clothing', 'Electronics', 'Home']] = data[['Books', 'Clothing', 'Electronics', 'Home']].replace({True: 1, False: 0})
data['Gender']= data['Gender'].replace({'Female': 0, 'Male': 1})
data

Unnamed: 0,Customer ID,Purchase Date,Product Category,Product Price,Quantity,Total Purchase Amount,Customer Age,Customer Name,Gender,Books,Clothing,Electronics,Home
0,44605,2023-05-03,Home,177,1,177,31,John Rivera,0,0,0,0,1
1,44605,2021-05-16,Electronics,174,3,522,31,John Rivera,0,0,0,1,0
2,44605,2020-07-13,Books,413,1,413,31,John Rivera,0,1,0,0,0
3,44605,2023-01-17,Electronics,396,3,1188,31,John Rivera,0,0,0,1,0
4,44605,2021-05-01,Books,259,4,1036,31,John Rivera,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
249995,33807,2023-01-24,Home,436,1,436,63,Gabriel Williams,1,0,0,0,1
249996,20455,2021-06-04,Electronics,233,1,233,66,Barry Foster,0,0,0,1,0
249997,28055,2022-11-10,Electronics,441,5,2205,63,Lisa Johnson,0,0,0,1,0
249998,15023,2021-06-27,Electronics,44,2,88,64,Melissa Fernandez,1,0,0,1,0


In [14]:
data = data.groupby("Customer ID").aggregate({
    "Books": "sum",
    "Clothing": "sum",
    "Electronics": "sum",
    "Home": "sum",
    "Gender": "first", # Giữ nguyên giới tính của bản ghi đầu tiên
})
data.info()
# df["home_category"] = df.groupby("customer_id")["home"].transform("count")
# df["electronics_category"] = df.groupby("customer_id")["electronics"].transform("count")
# df["clothing_category"] = df.groupby("customer_id")["clothing"].transform("count")
# df['Age']= df.groupby("customer_id")["customer_age"].transform("mean")
# df["Gender"]= df.groupby("customer_id")["gender"].transform("mean")

<class 'pandas.core.frame.DataFrame'>
Index: 49661 entries, 1 to 50000
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype
---  ------       --------------  -----
 0   Books        49661 non-null  int64
 1   Clothing     49661 non-null  int64
 2   Electronics  49661 non-null  int64
 3   Home         49661 non-null  int64
 4   Gender       49661 non-null  int64
dtypes: int64(5)
memory usage: 2.3 MB


In [15]:
data

Unnamed: 0_level_0,Books,Clothing,Electronics,Home,Gender
Customer ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,1,1,1,0,0
2,1,1,3,1,0
3,0,1,2,1,1
4,2,1,1,1,1
5,2,0,0,3,0
...,...,...,...,...,...
49996,3,0,1,3,0
49997,0,1,1,0,1
49998,6,2,2,0,0
49999,2,1,2,1,0


In [16]:
data = df_transform.join(data, on="Customer ID")
data

Unnamed: 0,Customer ID,RECENCY,FREQUENCY,MONETARY,R Score,M Score,F Score,Books,Clothing,Electronics,Home,Gender
0,1,13.711573,1.531362,17.758080,0.445101,-0.909238,0.850317,1,1,1,0,0
1,2,8.227962,3.131870,18.623234,-0.751769,0.511719,1.123032,1,1,3,1,0
2,3,12.512866,2.119383,15.344615,0.183468,-0.387183,0.089543,0,1,2,1,1
3,4,15.871755,2.647332,16.310348,0.916590,0.081539,0.393962,2,1,1,1,1
4,5,15.661549,2.647332,13.128895,0.870709,0.081539,-0.608898,2,0,0,3,0
...,...,...,...,...,...,...,...,...,...,...,...,...
49656,49996,14.796753,3.583053,18.278588,0.681956,0.912286,1.014392,3,0,1,3,0
49657,49997,15.195469,0.851451,11.676545,0.768981,-1.512875,-1.066709,0,1,1,0,1
49658,49998,3.885970,4.793711,20.360143,-1.699466,1.987128,1.670542,6,2,2,0,0
49659,49999,14.754196,3.131870,16.118906,0.672668,0.511719,0.333616,2,1,2,1,0


In [17]:
data

Unnamed: 0,Customer ID,RECENCY,FREQUENCY,MONETARY,R Score,M Score,F Score,Books,Clothing,Electronics,Home,Gender
0,1,13.711573,1.531362,17.758080,0.445101,-0.909238,0.850317,1,1,1,0,0
1,2,8.227962,3.131870,18.623234,-0.751769,0.511719,1.123032,1,1,3,1,0
2,3,12.512866,2.119383,15.344615,0.183468,-0.387183,0.089543,0,1,2,1,1
3,4,15.871755,2.647332,16.310348,0.916590,0.081539,0.393962,2,1,1,1,1
4,5,15.661549,2.647332,13.128895,0.870709,0.081539,-0.608898,2,0,0,3,0
...,...,...,...,...,...,...,...,...,...,...,...,...
49656,49996,14.796753,3.583053,18.278588,0.681956,0.912286,1.014392,3,0,1,3,0
49657,49997,15.195469,0.851451,11.676545,0.768981,-1.512875,-1.066709,0,1,1,0,1
49658,49998,3.885970,4.793711,20.360143,-1.699466,1.987128,1.670542,6,2,2,0,0
49659,49999,14.754196,3.131870,16.118906,0.672668,0.511719,0.333616,2,1,2,1,0


In [18]:

# Get the distinct customers in the dataframe ctm_bhvr_dt
DataFrame = pd.read_csv('cleaned_ecommerce_data.csv')
df_data = DataFrame.drop(['Purchase Year','Purchase Month','Churn'],axis=1)
def convert_to_snake_case(column_name):
  return column_name.lower().replace(" ", "_")
df_data.columns = [convert_to_snake_case(col) for col in df_data.columns]
df_data = df_data.rename(columns={"total_purchase_amount": "revenue"})
df_data.purchase_date = pd.to_datetime(df_data.purchase_date)
ctm_bhvr_dt = df_data[(df_data.purchase_date >= pd.Timestamp(2020, 1, 1)) & (df_data.purchase_date <= pd.Timestamp(2023, 5, 31))].reset_index(drop=True)


ctm_next_quarter = df_data[(df_data.purchase_date < 
                            pd.Timestamp(2023,9,30)) & 
                            (df_data.purchase_date >= pd.Timestamp(2023,6,1))].reset_index(drop=True)
print(ctm_bhvr_dt.info())


ctm_dt = pd.DataFrame(ctm_bhvr_dt['customer_id'].unique())

# Rename the column to CustomerID.
ctm_dt.columns = ['customer_id']

ctm_dt.head()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 230701 entries, 0 to 230700
Data columns (total 9 columns):
 #   Column            Non-Null Count   Dtype         
---  ------            --------------   -----         
 0   customer_id       230701 non-null  int64         
 1   purchase_date     230701 non-null  datetime64[ns]
 2   product_category  230701 non-null  object        
 3   product_price     230701 non-null  int64         
 4   quantity          230701 non-null  int64         
 5   revenue           230701 non-null  int64         
 6   customer_age      230701 non-null  int64         
 7   customer_name     230701 non-null  object        
 8   gender            230701 non-null  object        
dtypes: datetime64[ns](1), int64(5), object(3)
memory usage: 15.8+ MB
None


Unnamed: 0,customer_id
0,44605
1,13738
2,33969
3,42650
4,24053


In [19]:
ctm_1st_purchase_in_next_quarter = ctm_next_quarter.groupby('customer_id').purchase_date.min().reset_index()
ctm_1st_purchase_in_next_quarter.columns = ['customer_id','MinPurchaseDate']
ctm_1st_purchase_in_next_quarter.head()

Unnamed: 0,customer_id,MinPurchaseDate
0,2,2023-07-03
1,8,2023-06-10
2,11,2023-06-03
3,15,2023-06-10
4,17,2023-06-03


In [20]:
ctm_last_purchase_bhvr_dt = ctm_bhvr_dt.groupby('customer_id').purchase_date.max().reset_index()
ctm_last_purchase_bhvr_dt.columns = ['customer_id','MaxPurchaseDate']
ctm_last_purchase_bhvr_dt.head()

Unnamed: 0,customer_id,MaxPurchaseDate
0,1,2022-11-29
1,2,2023-04-26
2,3,2023-02-03
3,4,2022-06-29
4,5,2022-07-16


In [21]:
ctm_purchase_dates = pd.merge(ctm_last_purchase_bhvr_dt, ctm_1st_purchase_in_next_quarter, on='customer_id', 
                              how='left')
ctm_purchase_dates.head()
ctm_purchase_dates['NextPurchaseDay'] = (ctm_purchase_dates['MinPurchaseDate'] - ctm_purchase_dates['MaxPurchaseDate']).dt.days

ctm_purchase_dates.head()
# merge with ctm_dt 
ctm_dt = pd.merge(ctm_dt, ctm_purchase_dates[['customer_id','NextPurchaseDay']], on='customer_id', how='left')
ctm_dt.head()

Unnamed: 0,customer_id,NextPurchaseDay
0,44605,
1,13738,166.0
2,33969,127.0
3,42650,
4,24053,


In [22]:
ctm_dt = ctm_dt.fillna(9999)
ctm_dt.head()

Unnamed: 0,customer_id,NextPurchaseDay
0,44605,9999.0
1,13738,166.0
2,33969,127.0
3,42650,9999.0
4,24053,9999.0


In [23]:
ctm_dt.rename(columns={'customer_id': 'Customer ID'}, inplace=True)


In [24]:
data

Unnamed: 0,Customer ID,RECENCY,FREQUENCY,MONETARY,R Score,M Score,F Score,Books,Clothing,Electronics,Home,Gender
0,1,13.711573,1.531362,17.758080,0.445101,-0.909238,0.850317,1,1,1,0,0
1,2,8.227962,3.131870,18.623234,-0.751769,0.511719,1.123032,1,1,3,1,0
2,3,12.512866,2.119383,15.344615,0.183468,-0.387183,0.089543,0,1,2,1,1
3,4,15.871755,2.647332,16.310348,0.916590,0.081539,0.393962,2,1,1,1,1
4,5,15.661549,2.647332,13.128895,0.870709,0.081539,-0.608898,2,0,0,3,0
...,...,...,...,...,...,...,...,...,...,...,...,...
49656,49996,14.796753,3.583053,18.278588,0.681956,0.912286,1.014392,3,0,1,3,0
49657,49997,15.195469,0.851451,11.676545,0.768981,-1.512875,-1.066709,0,1,1,0,1
49658,49998,3.885970,4.793711,20.360143,-1.699466,1.987128,1.670542,6,2,2,0,0
49659,49999,14.754196,3.131870,16.118906,0.672668,0.511719,0.333616,2,1,2,1,0


In [25]:
ctm_dt.rename(columns={'Customer ID': 'CustomerID'}, inplace=True)
data.rename(columns={'Customer ID': 'CustomerID'}, inplace=True)
print(ctm_dt.columns,data.columns)


Index(['CustomerID', 'NextPurchaseDay'], dtype='object') Index(['CustomerID', 'RECENCY', 'FREQUENCY', 'MONETARY', 'R Score', 'M Score',
       'F Score', 'Books', 'Clothing', 'Electronics', 'Home', 'Gender'],
      dtype='object')


In [26]:
data = pd.merge(data, ctm_dt[['CustomerID', 'NextPurchaseDay']], on='CustomerID', how='left')
data

Unnamed: 0,CustomerID,RECENCY,FREQUENCY,MONETARY,R Score,M Score,F Score,Books,Clothing,Electronics,Home,Gender,NextPurchaseDay
0,1,13.711573,1.531362,17.758080,0.445101,-0.909238,0.850317,1,1,1,0,0,9999.0
1,2,8.227962,3.131870,18.623234,-0.751769,0.511719,1.123032,1,1,3,1,0,68.0
2,3,12.512866,2.119383,15.344615,0.183468,-0.387183,0.089543,0,1,2,1,1,9999.0
3,4,15.871755,2.647332,16.310348,0.916590,0.081539,0.393962,2,1,1,1,1,9999.0
4,5,15.661549,2.647332,13.128895,0.870709,0.081539,-0.608898,2,0,0,3,0,9999.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
49656,49996,14.796753,3.583053,18.278588,0.681956,0.912286,1.014392,3,0,1,3,0,9999.0
49657,49997,15.195469,0.851451,11.676545,0.768981,-1.512875,-1.066709,0,1,1,0,1,9999.0
49658,49998,3.885970,4.793711,20.360143,-1.699466,1.987128,1.670542,6,2,2,0,0,103.0
49659,49999,14.754196,3.131870,16.118906,0.672668,0.511719,0.333616,2,1,2,1,0,9999.0


In [27]:
data['NextPurchaseDayRange'] = 1  ## less than 3 months
data.loc[data.NextPurchaseDay>90,'NextPurchaseDayRange'] = 0 # more than 3 months
data.head(50)

Unnamed: 0,CustomerID,RECENCY,FREQUENCY,MONETARY,R Score,M Score,F Score,Books,Clothing,Electronics,Home,Gender,NextPurchaseDay,NextPurchaseDayRange
0,1,13.711573,1.531362,17.75808,0.445101,-0.909238,0.850317,1,1,1,0,0,9999.0,0
1,2,8.227962,3.13187,18.623234,-0.751769,0.511719,1.123032,1,1,3,1,0,68.0,1
2,3,12.512866,2.119383,15.344615,0.183468,-0.387183,0.089543,0,1,2,1,1,9999.0,0
3,4,15.871755,2.647332,16.310348,0.91659,0.081539,0.393962,2,1,1,1,1,9999.0,0
4,5,15.661549,2.647332,13.128895,0.870709,0.081539,-0.608898,2,0,0,3,0,9999.0,0
5,6,10.592656,4.409929,21.13369,-0.235644,1.646401,1.91438,2,3,2,2,0,9999.0,0
6,7,10.648102,4.007518,16.437165,-0.223542,1.289134,0.433938,4,1,2,1,1,9999.0,0
7,8,5.286626,3.583053,20.158737,-1.393755,0.912286,1.607054,2,1,2,2,1,77.0,1
8,9,10.62045,3.583053,16.773228,-0.229577,0.912286,0.539872,3,1,2,1,0,9999.0,0
9,10,10.306551,4.007518,18.845469,-0.29809,1.289134,1.193085,3,0,5,0,0,9999.0,0


In [28]:
corr_matrix = data[data.columns].corr()
corr_df = pd.DataFrame(corr_matrix.min())
corr_df.columns = ['MinCorrelationCoeff']
corr_df['MaxCorrelationCoeff'] = corr_matrix[corr_matrix < 1].max()
corr_df

Unnamed: 0,MinCorrelationCoeff,MaxCorrelationCoeff
CustomerID,-0.007047,0.009855
RECENCY,-0.392811,1.0
FREQUENCY,-0.392811,1.0
MONETARY,-0.318086,1.0
R Score,-0.392811,1.0
M Score,-0.392811,1.0
F Score,-0.318086,1.0
Books,-0.180778,0.487071
Clothing,-0.189141,0.486414
Electronics,-0.186475,0.490013


In [29]:
from sklearn.svm import SVC
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold, cross_val_score, train_test_split, GridSearchCV, cross_validate
from multiscorer import MultiScorer
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix
from sklearn.cluster import KMeans
import xgboost as xgb
import time

In [30]:
ctm_class = data.copy()
ctm_class.drop(columns=['Books', 'Clothing', 'Electronics', 'Home', 'Gender'], inplace=True)

ctm_class = ctm_class.dropna(subset=['NextPurchaseDay'])

print(data.shape)

X, y = ctm_class.drop('NextPurchaseDayRange', axis=1), ctm_class.NextPurchaseDayRange
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=None, shuffle=True)

(49661, 14)


In [31]:
# Create an array of models
models = []
models.append(("LogisticRegression", LogisticRegression()))
models.append(("GaussianNB", GaussianNB()))
models.append(("RandomForestClassifier", RandomForestClassifier()))
models.append(("SVC", SVC()))
models.append(("DecisionTreeClassifier", DecisionTreeClassifier()))
models.append(("xgb.XGBClassifier", xgb.XGBClassifier(eval_metric='mlogloss')))
models.append(("KNeighborsClassifier", KNeighborsClassifier()))

In [32]:

print(X_train.isnull().sum())

CustomerID         0
RECENCY            0
FREQUENCY          0
MONETARY           0
R Score            0
M Score            0
F Score            0
NextPurchaseDay    0
dtype: int64


In [33]:
scorer = MultiScorer({'accuracy'  : (accuracy_score , {}), 
                      'f1_score'  : (f1_score       , {'pos_label': 3, 'average':'macro'}), 
                      'recall'    : (recall_score   , {'pos_label': 3, 'average':'macro'}), 
                      'precision' : (precision_score, {'pos_label': 3, 'average':'macro'})
                     })

# A dictionary for all the distinct models and their respective metrics
model_scores_dict = {'model_name' : [], 
                     'accuracy'   : [], 
                     'f1_score'   : [], 
                     'recall'     : [], 
                     'precision'  : [],
                     'time'       : []
                    }

# For each model name and model in models
for model_name, model in models: 
    
    # Add model_name to model_scores_dict 
    model_scores_dict['model_name'].append(model_name)
    #print(model_name)
    kfold = KFold(n_splits=2, random_state=24, shuffle=True)
    start = time.time()
    _ = cross_val_score(model, X_train, y_train, cv = kfold, scoring = scorer)
    cv_result = scorer.get_results()
    
    # For each metric in cv_result.keys()
    for metric_name in cv_result.keys():
        # Get the average of cv_result[metric_name]
        average_score = np.average(cv_result[metric_name])
        # Update model_scores_dict with average_score for model_name
        model_scores_dict[metric_name].append(average_score)
        #print('%s : %f' %(metric_name, average_score))

    model_scores_dict['time'].append((time.time() - start))
    #print('time : ', time.time() - start, '\n\n')
        
model_score_df = pd.DataFrame(model_scores_dict).set_index("model_name")
model_score_df.sort_values(by=["accuracy", "f1_score", "time"], ascending=False)

Unnamed: 0_level_0,accuracy,f1_score,recall,precision,time
model_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
LogisticRegression,0.995411,0.973822,0.977643,0.970086,0.588904
RandomForestClassifier,0.985137,0.936334,0.981751,0.91246,3.269191
xgb.XGBClassifier,0.98481,0.882365,0.907397,0.868202,1.372239
KNeighborsClassifier,0.982989,0.872412,0.887322,0.870184,3.345731
DecisionTreeClassifier,0.981881,0.859446,0.889051,0.842875,0.120307
GaussianNB,0.977705,0.9045,0.972627,0.86869,0.086601
SVC,0.977351,0.824307,0.861313,0.803594,11.072353
