In [1]:
from env import conn
import pandas as pd
import numpy as np
import matplotlib as plt
%matplotlib inline
import datetime as dt
import plotly.express as px


In [2]:
def return_df(query):
    cs = conn.cursor()
    query = query
    result = cs.execute(query)
    df = result.fetch_pandas_all()
    cs.close()
    return df


# Investigating launch for Stonegate

In [3]:
tables = return_df(
    """
    SELECT TABLE_SCHEMA, TABLE_NAME
FROM INFORMATION_SCHEMA.TABLES
WHERE TABLE_TYPE = 'BASE TABLE'
order by table_schema, table_name;    
    """
        )
tables


Unnamed: 0,TABLE_SCHEMA,TABLE_NAME
0,JIRA,JIRA__SPRINT__SPRINT
1,JIRA,JIRA__TEAM__SPRINT
2,LOYALTY_CARDS,LC201__LOYALTY_CARD_JOURNEY_FUNNEL__USER_LEVEL...
3,LOYALTY_CARDS,LC__ERRORS__DAILY_STATUS_ROLLUP_USER_LEVEL
4,LOYALTY_CARDS,LC__LINKS_JOINS__DAILY_RETAILER
5,LOYALTY_CARDS,LC__LINKS_JOINS__DAILY_RETAILER_CHANNEL
6,LOYALTY_CARDS,LC__LINKS_JOINS__DAILY_RETAILER_CHANNEL__FORECAST
7,LOYALTY_CARDS,LC__LINKS_JOINS__MONTHLY_RETAILER
8,LOYALTY_CARDS,LC__LINKS_JOINS__MONTHLY_RETAILER_CHANNEL
9,LOYALTY_CARDS,LC__LINKS_JOINS__MONTHLY_RETAILER_CHANNEL__GROWTH


### Generating dataframnes for analysis

In [4]:
lc_daily_metrics = return_df(
    """select * from METRICS.LOYALTY_CARDS.LC__LINKS_JOINS__DAILY_RETAILER
        where loyalty_plan_company = 'Stonegate Group'
        order by date;    
    """
        )

txn_daily_metrics = return_df(
    """select * from METRICS.TRANSACTIONS.TRANS__TRANS__DAILY_RETAILER
        where loyalty_plan_company = 'Stonegate Group'
         order by date;    
    """
        )

user_level_txns = return_df(
    """select * from PROD.BINK_SECURE.FACT_TRANSACTION
        where loyalty_plan_company = 'Stonegate Group'
         order by event_date_time;    
    """
        )


## Categorising users based on txns


### Inspecting and Cleaning data

In [5]:
user_level_txns.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 139554 entries, 0 to 139553
Data columns (total 24 columns):
 #   Column                 Non-Null Count   Dtype         
---  ------                 --------------   -----         
 0   EVENT_ID               139554 non-null  int32         
 1   EVENT_DATE_TIME        139554 non-null  datetime64[ns]
 2   USER_ID                139554 non-null  object        
 3   EXTERNAL_USER_REF      139554 non-null  object        
 4   CHANNEL                139554 non-null  object        
 5   BRAND                  139554 non-null  object        
 6   TRANSACTION_ID         139554 non-null  object        
 7   PROVIDER_SLUG          139554 non-null  object        
 8   FEED_TYPE              0 non-null       object        
 9   DUPLICATE_TRANSACTION  139554 non-null  bool          
 10  LOYALTY_PLAN_NAME      139554 non-null  object        
 11  LOYALTY_PLAN_COMPANY   139554 non-null  object        
 12  TRANSACTION_DATE       139554 non-null  date

### Creating features for analysis

In [6]:
#hash external_user_ref
user_level_txns["EXTERNAL_USER_REF"] = user_level_txns["EXTERNAL_USER_REF"].apply(hash)

user_level_txns_agg = user_level_txns.groupby(by="EXTERNAL_USER_REF", ).agg(
    TRANSACTION_ID_count=('TRANSACTION_ID', 'count'),
    SPEND_AMOUNT_sum = ('SPEND_AMOUNT','sum'),
    SPEND_AMOUNT_mean = ('SPEND_AMOUNT','mean'),
    SPEND_AMOUNT_min = ('SPEND_AMOUNT','min', ),
    SPEND_AMOUNT_max = ('SPEND_AMOUNT','max', ),
    SPEND_AMOUNT_range = ('SPEND_AMOUNT',lambda x: x.max() - x.min()),
    EVENT_DATE_TIME_range = ('EVENT_DATE_TIME', lambda x: (x.max() - x.min()).days),
    EVENT_DATE_TIME_max = ('EVENT_DATE_TIME', 'min'),
    EVENT_DATE_TIME_min = ('EVENT_DATE_TIME', 'max'),
)

user_level_txns_agg


Unnamed: 0_level_0,TRANSACTION_ID_count,SPEND_AMOUNT_sum,SPEND_AMOUNT_mean,SPEND_AMOUNT_min,SPEND_AMOUNT_max,SPEND_AMOUNT_range,EVENT_DATE_TIME_range,EVENT_DATE_TIME_max,EVENT_DATE_TIME_min
EXTERNAL_USER_REF,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
-9222797808601043830,3,14.30,4.766667,0.70,10.45,9.75,4,2023-11-15 07:47:27.116623,2023-11-19 16:10:17.720062
-9222787950909804758,9,49.41,5.490000,1.95,13.39,11.44,17,2023-11-11 06:16:44.226618,2023-11-28 11:05:37.285751
-9221030177566349485,5,15.35,3.070000,1.45,4.15,2.70,0,2023-11-28 10:15:24.399811,2023-11-28 10:15:30.759386
-9219915394050613945,7,46.70,6.671429,3.65,8.75,5.10,7,2023-11-15 14:48:51.795368,2023-11-23 07:44:28.855924
-9219309818027451022,5,15.46,3.092000,1.00,4.50,3.50,9,2023-11-18 11:16:14.319983,2023-11-28 03:18:47.892417
...,...,...,...,...,...,...,...,...,...
9221385129529590744,23,128.30,5.578261,2.55,13.10,10.55,16,2023-11-08 08:59:43.036754,2023-11-24 10:55:19.689918
9221485181004617911,4,33.08,8.270000,4.55,11.99,7.44,4,2023-11-23 02:43:17.536174,2023-11-27 07:35:15.732064
9221690631932845443,2,28.70,14.350000,10.25,18.45,8.20,8,2023-11-11 07:18:31.415399,2023-11-19 16:06:35.003837
9222336799614184287,3,15.00,5.000000,3.00,9.00,6.00,0,2023-11-17 04:16:38.195863,2023-11-17 04:16:38.829915


In [7]:
user_level_txns_agg.info()


<class 'pandas.core.frame.DataFrame'>
Index: 22920 entries, -9222797808601043830 to 9222587754733527419
Data columns (total 9 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   TRANSACTION_ID_count   22920 non-null  int64         
 1   SPEND_AMOUNT_sum       22920 non-null  float64       
 2   SPEND_AMOUNT_mean      22920 non-null  float64       
 3   SPEND_AMOUNT_min       22920 non-null  float64       
 4   SPEND_AMOUNT_max       22920 non-null  float64       
 5   SPEND_AMOUNT_range     22920 non-null  float64       
 6   EVENT_DATE_TIME_range  22920 non-null  int64         
 7   EVENT_DATE_TIME_max    22920 non-null  datetime64[ns]
 8   EVENT_DATE_TIME_min    22920 non-null  datetime64[ns]
dtypes: datetime64[ns](2), float64(5), int64(2)
memory usage: 1.7 MB


In [8]:
user_level_txns.describe()


Unnamed: 0,EVENT_ID,EVENT_DATE_TIME,EXTERNAL_USER_REF,TRANSACTION_DATE,SPEND_AMOUNT,INSERTED_DATE_TIME,UPDATED_DATE_TIME
count,139554.0,139554,139554.0,139554,139554.0,139554,139554
mean,15269620.0,2023-11-20 21:11:44.105058560,-1.463546e+17,2023-11-18 18:45:12.785846528,7.548613,2023-11-27 19:50:06.239813888,2023-11-27 19:50:06.239813888
min,13961990.0,2023-10-28 04:03:13.085397,-9.222798e+18,2023-10-27 19:42:22,0.01,2023-11-27 11:44:28.631000,2023-11-27 11:44:28.631000
25%,14951930.0,2023-11-16 22:37:04.909153024,-4.65504e+18,2023-11-12 21:55:21,3.6,2023-11-27 11:44:28.631000064,2023-11-27 11:44:28.631000064
50%,15277420.0,2023-11-21 17:49:31.717180928,-2.256061e+17,2023-11-18 20:20:34,5.25,2023-11-27 11:44:28.631000064,2023-11-27 11:44:28.631000064
75%,15680880.0,2023-11-25 09:31:31.277750528,4.271869e+18,2023-11-24 17:17:38.750000128,9.05,2023-11-28 02:13:44.625999872,2023-11-28 02:13:44.625999872
max,15866280.0,2023-11-29 11:48:00.871752,9.222588e+18,2023-11-29 02:45:39,956.7,2023-11-29 12:14:07.795000,2023-11-29 12:14:07.795000
std,411471.4,,5.266853e+18,,8.894217,,


### Further feature selection
ATV, modal txn value, time of day, stores visited?, 

In [41]:
user_level_txns_agg['ATF'] = user_level_txns_agg['TRANSACTION_ID_count']/user_level_txns_agg['EVENT_DATE_TIME_range']


In [42]:
px.scatter(user_level_txns_agg['SPEND_AMOUNT_sum'])


### Creating an algorithm
The goal is to categorise our users based on their features
For this we will explore a simple Model called K-means clustering

In [37]:
import sklearn
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler



#dropping timesersies data points
user_level_txns_agg_reduced = user_level_txns_agg.drop(['EVENT_DATE_TIME_max', 'EVENT_DATE_TIME_min'], axis=1)


##### Scale the data

In [27]:
#Scale the data
scaler = StandardScaler()
scaled_data = scaler.fit_transform(user_level_txns_agg_reduced)
scaled_data


array([[-0.3033688 , -0.43143771, -0.39317807, ..., -0.19036098,
         0.16837922, -0.16460742],
       [ 0.28593647,  0.04699019, -0.32813487, ..., -0.01863318,
         0.28405721,  1.81458741],
       [-0.10693371, -0.41712984, -0.54574482, ..., -0.55834914,
        -0.3141828 , -0.77359044],
       ...,
       [-0.40158634, -0.2352155 ,  0.46856933, ...,  0.27692556,
         0.06228403,  0.4443756 ],
       [-0.3033688 , -0.42189913, -0.37219639, ..., -0.27505667,
        -0.0883027 , -0.77359044],
       [-0.40158634, -0.41167923, -0.11367217, ..., -0.18744044,
        -0.13963909, -0.77359044]])

##### Fit model and produce outcome

In [36]:
#tune the model
kmeans = KMeans(n_clusters=10)  # Specify the number of clusters
user_level_txns_agg_reduced['cluster'] = kmeans.fit_predict(scaled_data)
user_level_txns_agg_reduced['cluster'].value_counts().sort_index()






cluster
0    13401
1     1013
2     5025
3       67
4     1479
5        1
6     1658
7        2
8      254
9       20
Name: count, dtype: int64

##### Iterate through number of clusters to fine error value and plot

In [39]:
dist_df = []
for num_clusters in range (1,21):
    km = KMeans(n_clusters=num_clusters)
    km.fit(user_level_txns_agg_reduced)
    dist_df.append([num_clusters, km.inertia_])
dist_df = pd.DataFrame(dist_df, columns=["number_of_clusters", "distance"])
fig = px.line(dist_df, x="number_of_clusters", y="distance", title='Error vs Cluster')
fig.show()












































#### Adjust clustering and re run

In [40]:
#tune the model
kmeans = KMeans(n_clusters=4)  # Specify the number of clusters
user_level_txns_agg_reduced['cluster'] = kmeans.fit_predict(scaled_data)
user_level_txns_agg_reduced['cluster'].value_counts().sort_index()






cluster
0    15633
1       37
2     6358
3      892
Name: count, dtype: int64