In [11]:
# Libraries
import warnings
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.pipeline import Pipeline
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
from sklearn.preprocessing import MinMaxScaler, RobustScaler, StandardScaler
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score
from yellowbrick.cluster import KElbowVisualizer, SilhouetteVisualizer, InterclusterDistance
from sklearn.neighbors import NearestNeighbors

# Set-up environment
# Display options: two-decimal floats and untruncated column contents.
pd.options.display.float_format = '{:.2f}'.format
pd.set_option('display.max_colwidth', None)
sns.set_theme(style="whitegrid", context="paper")

# Project root that the relative "./data/..." paths are resolved against.
# Set the SEGMENTING_CUSTOMERS_DIR environment variable to run the notebook
# from a different checkout; when it is unset the original location is used.
PROJECT_DIR = os.environ.get(
    'SEGMENTING_CUSTOMERS_DIR',
    '/Users/nataschajademinnitt/Documents/5. Data Analysis/segmenting_customers/',
)
os.chdir(PROJECT_DIR)
print("Current directory:", os.getcwd())

# NOTE(review): this silences every warning for the rest of the session,
# including deprecation and convergence warnings -- consider narrowing it
# to specific categories.
warnings.filterwarnings("ignore")

Current directory: /Users/nataschajademinnitt/Documents/5. Data Analysis/segmenting_customers


In [88]:
# Read the processed maintenance dataset (path is relative to the working
# directory) and print its schema summary.
maintenance_csv = "./data/processed/df_maintenance_database.csv"
df = pd.read_csv(maintenance_csv)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 72620 entries, 0 to 72619
Data columns (total 5 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   order_purchase_timestamp  72620 non-null  object 
 1   recent_flag               72620 non-null  int64  
 2   f_returning               72620 non-null  int64  
 3   m_price_log               72620 non-null  float64
 4   s_delivery_diff_binary    72620 non-null  int64  
dtypes: float64(1), int64(3), object(1)
memory usage: 2.8+ MB


In [90]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score
from scipy.stats import ks_2samp
from sklearn.preprocessing import MinMaxScaler

# Build a sorted DatetimeIndex on `df` and carve out the reference window.
TIME_COL = 'order_purchase_timestamp'

# Guarded so the cell can be re-run safely: after the first run the timestamp
# lives in the index, and an unconditional df[TIME_COL] would raise KeyError.
if TIME_COL in df.columns:
    df[TIME_COL] = pd.to_datetime(df[TIME_COL])
    df = df.set_index(TIME_COL)
df = df.sort_index()

# Reference window = the most recent year of data, anchored on the latest
# timestamp present rather than on a hard-coded date.
end   = df.index.max()                  # a Timestamp once the index is datetime
start = end - pd.DateOffset(years=1)

# Label-based slice on the sorted index; both endpoints are inclusive.
ref_window = df.loc[start:end]

In [92]:
# Fit the reference scaler and the reference KMeans model on the reference window.
FEATURES = ['recent_flag','f_returning','m_price_log','s_delivery_diff_binary']
N_CLUSTERS = 4
RANDOM_STATE = 42

# The scaler is fitted on the reference window only, so any other period is
# later mapped with the reference min/max instead of its own.
scaler = MinMaxScaler().fit(ref_window[FEATURES])
X_ref  = scaler.transform(ref_window[FEATURES])

# A single fit is enough: with a fixed random_state, refitting on the same
# X_ref would only repeat the work and reproduce the same model.
kmeans     = KMeans(n_clusters=N_CLUSTERS, random_state=RANDOM_STATE).fit(X_ref)
labels_ref = kmeans.labels_

In [98]:
from scipy.stats import ks_2samp
from sklearn.metrics import adjusted_rand_score

def evaluate_period(start, end):
    """Measure cluster stability and feature drift for one date window.

    The rows of ``df`` between ``start`` and ``end`` are scaled with the
    reference ``scaler`` and then labelled twice: once by the reference
    ``kmeans`` model and once by a KMeans model fitted from scratch on the
    window. The adjusted Rand index (ARI) between the two labelings is
    reported together with a two-sample Kolmogorov-Smirnov p-value per
    feature, comparing the window against ``ref_window``.

    Relies on the notebook globals ``df``, ``ref_window``, ``scaler``,
    ``kmeans`` and ``FEATURES``.

    Parameters
    ----------
    start, end : str or datetime-like
        Window bounds, converted with ``pd.to_datetime``. Both bounds are
        inclusive (label-based slice on the datetime index).

    Returns
    -------
    dict
        ``'period'`` (``"<start date> to <end date>"``), ``'ARI'`` and one
        ``'KS_<feature>'`` p-value for every entry of ``FEATURES``.

    Raises
    ------
    TypeError
        If ``df`` is not indexed by a ``DatetimeIndex``.
    ValueError
        If the window holds fewer rows than the number of clusters, in
        which case it cannot be re-clustered.
    """
    # Fail fast with a readable message instead of an opaque slicing error
    # when the indexing cell has not been run on `df`.
    if not isinstance(df.index, pd.DatetimeIndex):
        raise TypeError(
            "df must be indexed by a sorted DatetimeIndex; run "
            "df.set_index('order_purchase_timestamp').sort_index() first"
        )

    # 1) Turn the inputs into Timestamp objects
    start_ts = pd.to_datetime(start)
    end_ts   = pd.to_datetime(end)

    # 2) Label-based slice on the datetime index
    sub = df.loc[start_ts:end_ts]

    # Mirror the reference model's configuration so both clusterings are
    # directly comparable.
    n_clusters = kmeans.n_clusters
    if len(sub) < n_clusters:
        raise ValueError(
            f"Window {start_ts.date()} to {end_ts.date()} contains {len(sub)} "
            f"row(s); at least {n_clusters} are required to fit "
            f"{n_clusters} clusters"
        )

    # 3) Extract & scale with the scaler fitted on the reference window
    X_sub = scaler.transform(sub[FEATURES])

    # 4a) Predict with the reference model
    pred_labels = kmeans.predict(X_sub)

    # 4b) Re-cluster the window from scratch
    new_labels = KMeans(
        n_clusters=n_clusters,
        random_state=kmeans.random_state,
    ).fit_predict(X_sub)

    # 4c) ARI is invariant to label permutations, so the arbitrary cluster
    #     numbering of the two models does not matter.
    ari = adjusted_rand_score(pred_labels, new_labels)

    result = {
        'period': f"{start_ts.date()} to {end_ts.date()}",
        'ARI': ari,
    }

    # 4d) KS test for each feature vs. the reference window.
    # NOTE(review): the KS p-value assumes continuous distributions; for
    # columns that look like 0/1 flags treat it as a rough indicator only.
    for feat in FEATURES:
        result[f"KS_{feat}"] = ks_2samp(ref_window[feat], sub[feat]).pvalue

    return result


In [106]:
res1 = evaluate_period('2017-08-29', '2018-02-28')
res1

{'period': '2017-08-29 to 2018-02-28',
 'ARI': 0.30251297977424085,
 'KS_recent_flag': 0.0,
 'KS_f_returning': 0.9835021407357827,
 'KS_m_price_log': 0.002200170982152518,
 'KS_s_delivery_diff_binary': 0.6553971357169683}

In [108]:
res2 = evaluate_period('2018-03-01', '2018-08-28')
res2

{'period': '2018-03-01 to 2018-08-28',
 'ARI': 0.7675645434707628,
 'KS_recent_flag': 0.0,
 'KS_f_returning': 0.9942307153118107,
 'KS_m_price_log': 0.004732420371023207,
 'KS_s_delivery_diff_binary': 0.46017729378285155}