# PA005 - High Value Customer Indetification

In [1]:
import re

import pandas     as pd
import numpy      as np
import seaborn    as sns
import umap.umap_ as umap

from sklearn.manifold     import TSNE
from sklearn.neighbors    import NearestNeighbors

from sklearn              import decomposition as dd
from matplotlib           import pyplot        as plt
from sklearn              import cluster       as c
from sklearn              import metrics       as m
from plotly               import express       as px
from sklearn              import preprocessing as pp
from sklearn              import ensemble      as en
from sklearn              import mixture       as mx
from scipy.cluster        import hierarchy     as hc

## Loading Data

In [2]:
df_raw = pd.read_csv('C:/Users/Pedro/repos/pa005_insiders_clustering/insiders_cluster/data/raw/data.csv', encoding='unicode_escape')

# 1.0 Data Description

In [3]:
df1 = df_raw.copy()

## 1.1 Rename Columns

In [4]:
cols_new = ['invoice_no', 'stock_code', 'description', 'quantity', 'invoice_date',
       'unit_price', 'customer_id', 'country']

df1.columns = cols_new

## 1.2 Data Dimensions

In [5]:
print('The number of rows is: {}'.format( df1.shape[0] ))
print('The number of columns is: {}'.format( df1.shape[1] ))

The number of rows is: 541909
The number of columns is: 8


## 1.3 Data Types

In [6]:
df1.dtypes

invoice_no       object
stock_code       object
description      object
quantity          int64
invoice_date     object
unit_price      float64
customer_id     float64
country          object
dtype: object

## 1.4 Check NA

In [7]:
df1.isna().sum()

invoice_no           0
stock_code           0
description       1454
quantity             0
invoice_date         0
unit_price           0
customer_id     135080
country              0
dtype: int64

## 1.5 Replace NA

In [8]:
# Spliting NA's into two differents dataframes

df_missing = df1.loc[ df1['customer_id'].isna(), : ]
df_not_missing = df1.loc[~df1['customer_id'].isna(), :]

In [9]:
# Creating Reference
df_backup = pd.DataFrame( df_missing['invoice_no'].drop_duplicates() )
df_backup['customer_id'] = np.arange( 19000, 19000+len( df_backup ), 1 )

# Merge original with reference dataframe
df1 = pd.merge( df1, df_backup, on='invoice_no', how='left' )

# coalesce
df1['customer_id'] = df1['customer_id_x'].combine_first( df1['customer_id_y'] )

# Dropping extra columns
df1 = df1.drop(columns=['customer_id_x', 'customer_id_y'], axis=1)

## 1.6 Change Types

In [10]:
# Invoice Date
df1['invoice_date'] = pd.to_datetime( df1['invoice_date'] )

# Customer ID
df1['customer_id'] = df1['customer_id'].astype(np.int64)

# 2.0 Filtragem de Variaveis

In [11]:
df2 = df1.copy()

In [12]:
# ========= Numerical Attributes ==========

# Pegando preços unitários maiores que 0.04
df2 = df2.loc[ df2['unit_price'] >= 0.04, : ]


# ========= Categorical Attributes ==========

# Removing Stock Code letters
df2 = df2[~df2['stock_code'].isin( ['POST', 'D', 'DOT', 'M', 'S', 'AMAZONFEE', 'm', 'DCGSSBOY', 'DCGSSGIRL', 'PADS', 'B', 'CRUK'] ) ]

# Description
df2 = df2.drop( columns=['description'], axis=1 )

# Map
df2 = df2[~df2['country'].isin( [ 'european Community', 'Unpecified' ] ) ]


# Bad Users
df2 = df2[~df2['customer_id'].isin( [16446] )]


# Dividindo datasets em compras e retornos
df2_purchases = df2.loc[df2['quantity'] >= 0, :]
df2_returns = df2.loc[df2['quantity'] < 0, : ]

# 3.0. Feature Engineering

In [13]:
df3 = df2.copy()

## 3.1. Feature Creation

In [14]:
# Data reference
df_ref = df3.drop( ['invoice_no', 'stock_code', 'quantity', 'invoice_date', 'unit_price', 'country'], axis=1 ).drop_duplicates(ignore_index=True)

### 3.1.1 Gross Revenue

In [15]:
# Gross Revenue (Faturamento) quantity * price
df2_purchases.loc[:, 'gross_revenue'] = df2_purchases.loc[:, 'quantity'] * df2_purchases.loc[:, 'unit_price']

# Monetary
df_monetary = df2_purchases.loc[:, ['customer_id', 'gross_revenue']].groupby('customer_id').sum().reset_index()
df_ref = pd.merge( df_ref, df_monetary, on='customer_id', how='left' )
df_ref.isna().sum()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2_purchases.loc[:, 'gross_revenue'] = df2_purchases.loc[:, 'quantity'] * df2_purchases.loc[:, 'unit_price']


customer_id       0
gross_revenue    91
dtype: int64

### 3.1.2 Recency - Days from last purchase

In [16]:
# Recency - Last Day Purchase
df_recency = df2_purchases.loc[:, ['customer_id', 'invoice_date']].groupby('customer_id').max().reset_index()
df_recency['recency_days'] = ( df3['invoice_date'].max() - df_recency['invoice_date'] ).dt.days
df_recency = df_recency[['customer_id', 'recency_days']].copy()
df_ref = pd.merge( df_ref, df_recency, on='customer_id', how='left' )
df_ref.isna().sum()

customer_id       0
gross_revenue    91
recency_days     91
dtype: int64

### 3.1.5 Quantity of products purchased (variety)

In [17]:
df_stock = df2_purchases.loc[:, ['customer_id', 'stock_code']].groupby('customer_id').count().reset_index().rename(columns={'stock_code':'qnt_products'})
df_ref = pd.merge( df_ref, df_stock, on='customer_id', how='left' )
df_ref.isna().sum()

customer_id       0
gross_revenue    91
recency_days     91
qnt_products     91
dtype: int64

### 3.1.8 Frequency Purchase

In [18]:
df_aux = ( df2_purchases[['customer_id', 'invoice_no', 'invoice_date']].drop_duplicates()
                                                             .groupby('customer_id')
                                                             .agg( max_ = ( 'invoice_date', 'max' ),
                                                                   min_ = ( 'invoice_date', 'min' ),
                                                                   days_ = ( 'invoice_date', lambda x: ( ( x.max() - x.min() ).days ) + 1 ),
                                                                   buy_ = ( 'invoice_no', 'count') ) ).reset_index()

df_aux['frequency'] = df_aux[['buy_', 'days_']].apply( lambda x: x['buy_'] / x['days_'] if x['days_'] != 0 else 0, axis=1 )


df_ref = pd.merge( df_ref, df_aux[['customer_id', 'frequency']], on='customer_id', how='left' )

df_ref.isna().sum()

customer_id       0
gross_revenue    91
recency_days     91
qnt_products     91
frequency        91
dtype: int64

### 3.1.9 Number of Returns

In [19]:
df_returns = df2_returns[['customer_id', 'quantity']].groupby( 'customer_id' ).sum().reset_index().rename(columns={'quantity': 'qnt_returns'})
df_returns['qnt_returns'] = df_returns['qnt_returns'] * -1

df_ref = pd.merge( df_ref, df_returns, how='left', on='customer_id' )
df_ref.loc[df_ref['qnt_returns'].isna(), 'qnt_returns'] = 0
df_ref.isna().sum()

customer_id       0
gross_revenue    91
recency_days     91
qnt_products     91
frequency        91
qnt_returns       0
dtype: int64

# 4.0. Exploratory Data Analysis

In [20]:
df4 = df_ref.dropna().copy()

## 4.3. Estudo do Espaço

In [21]:
# Selected Features
cols_selected = ['customer_id', 'gross_revenue', 'recency_days', 'qnt_products', 'frequency', 'qnt_returns']
df43 = df4[cols_selected].copy()

In [22]:
mm = pp.MinMaxScaler()

df43['gross_revenue']          = mm.fit_transform( df43[['gross_revenue']] )
df43['recency_days']           = mm.fit_transform( df43[['recency_days']] )
df43['qnt_products']           = mm.fit_transform( df43[['qnt_products']] )
df43['frequency']              = mm.fit_transform( df43[['frequency']] )
df43['qnt_returns']            = mm.fit_transform( df43[['qnt_returns']] )

### Tree-Based Embedding

In [23]:
# Training Dataset
X = df43.drop( columns=['gross_revenue', 'customer_id'], axis=1 )
y = df43['gross_revenue']

# Model Definition
rf_model = en.RandomForestRegressor( n_estimators=100, random_state=42 )

# Model Training
rf_model.fit( X, y )

# Leaf
df_leaf = pd.DataFrame( rf_model.apply( X ) )

In [24]:
reducer = umap.UMAP( random_state=42 )
embedding = reducer.fit_transform( df_leaf )

# Embedding
df_tree = pd.DataFrame()
df_tree['embedding_x'] = embedding[:, 0]
df_tree['embedding_y'] = embedding[:, 1]



# 7.0. Hyper Parameter Fine Tuning

In [25]:
X = df_tree.copy()

# 8.0 Model Training

## 8.1. K-Means

In [33]:
# Model Definition
k = 8
kmeans = c.KMeans( init='random', n_clusters=k, n_init=300, max_iter=300, random_state=42 )

# Model Training
kmeans.fit( X )

# Clustering
labels = kmeans.labels_

## 8.2. Cluster Validation

In [34]:
# SS (Silhouette Score)
print( 'SS Value: {}'.format( m.silhouette_score( X, labels, metric='euclidean' ) ) )

SS Value: 0.4400531053543091


# 9.0 Cluster Analyses

In [35]:
df9 = X.copy()
df9['clusters'] = labels
df9.head()

Unnamed: 0,embedding_x,embedding_y,clusters
0,8.913966,22.450937,0
1,10.913033,22.796009,0
2,9.891003,22.673065,0
3,-4.264469,3.080198,6
4,15.201887,-1.125108,3


## 9.4. Cluster Profile

In [47]:
# Creating DataFrame
df92 = df4.copy()
df92['clusters'] = labels


# Change Data Types
df92['recency_days'] = df92['recency_days'].astype( np.int64 )
df92['clusters'] = df92['clusters'].astype( np.int64 )
df92['qnt_products'] = df92['qnt_products'].astype( np.int64 )
df92['qnt_returns'] = df92['qnt_returns'].astype( np.int64 )

In [48]:
# Numbers of Customers
df_cluster = df92[['clusters', 'customer_id']].groupby('clusters').count().reset_index().rename(columns={'customer_id': 'qnt_customers'})
df_cluster['perc_customer'] = 100 * ( df_cluster['qnt_customers'] / df_cluster['qnt_customers'].sum() )

# Avg Gross Revenue
df_avg_gross_revenue = df92[['clusters', 'gross_revenue']].groupby( 'clusters' ).mean().reset_index()
df_cluster = pd.merge( df_cluster, df_avg_gross_revenue, on='clusters', how='inner' )

# Avg Recency Days
df_avg_recency_days = df92[['clusters', 'recency_days']].groupby( 'clusters' ).mean().reset_index()
df_cluster = pd.merge( df_cluster, df_avg_recency_days, on='clusters', how='inner' )

# Avg Invoice_no
df_avg_products = df92[['clusters', 'qnt_products']].groupby( 'clusters' ).mean().reset_index()
df_cluster = pd.merge( df_cluster, df_avg_products, on='clusters', how='inner' )

# Frequecy
df_frequency = df92[['clusters', 'frequency']].groupby('clusters').mean().reset_index()
df_cluster = pd.merge( df_cluster, df_frequency, on='clusters', how='inner' )

# qnt_returns
df_returns = df92[['clusters', 'qnt_returns']].groupby('clusters').mean().reset_index()
df_cluster = pd.merge( df_cluster, df_returns, on='clusters', how='inner' )

In [49]:
df_cluster

Unnamed: 0,clusters,qnt_customers,perc_customer,gross_revenue,recency_days,qnt_products,frequency,qnt_returns
0,0,1141,20.0,5630.984575,56.370727,292.321648,0.254526,138.811569
1,1,881,15.442594,1770.314064,109.505108,114.595914,0.511919,2.986379
2,2,703,12.322524,305.592916,151.820768,18.74111,1.017402,2.146515
3,3,676,11.849255,494.938802,115.313609,14.431953,0.409867,4.051775
4,4,613,10.744961,1101.094992,71.269168,45.831974,0.036962,7.228385
5,5,651,11.411043,597.010169,174.360983,30.376344,1.023331,0.40553
6,6,509,8.921998,520.030079,122.986248,19.066798,0.583331,10.950884
7,7,531,9.307625,467.489058,185.632768,24.312618,0.811349,4.789077


- 1. Cluster Insiders
- 6. Cluster Spend Money 
- 2. Cluster Spend More Money
- 7. Cluster Products and Money
- 3. Cluster More Products and Money
- 0. Cluster Money Products and Recency
- 4. Cluster Weakest
- 5. Cluster Best Frequency

# 11.0 Deploy to Production

In [50]:
df92.dtypes

customer_id        int64
gross_revenue    float64
recency_days       int64
qnt_products       int64
frequency        float64
qnt_returns        int64
clusters           int64
dtype: object

## 11.1 Insert into SQLite

In [58]:
import sqlite3
from sqlalchemy import create_engine

In [61]:
# Create Table
#query_create_table_insiders = """
        
#        CREATE TABLE insiders (
#        customer_id     INTERGER,
#        gross_revenue   REAL, 
#        recency_days    INTEGER,
#        qnt_products    INTEGER,
#        frequency       REAL,
#        qnt_returns     INTEGER,
#        clusters        INTEGER
#        )  
#"""


#conn = sqlite3.connect('insiders_db.sqlite')
#conn.execute( query_create_table_insiders )
#conn.commit()
#conn.close()


# Insert Data
conn = create_engine('sqlite:///insiders_db.sqlite')
#df92.to_sql( 'insiders', con=conn, if_exists='append', index=False )

# Select Data

In [62]:
# Consulting Database
query = """
    SELECT * FROM insiders
"""

df = pd.read_sql_query( query, conn )

In [64]:
df.shape

(5705, 7)