In [None]:
import numpy as np
import pandas as pd

In [None]:
%%bigquery df

SELECT
#-- Select/deselect the feature groups that are not required for this use-case.
#-- The join-key columns of the right-hand tables are excluded with EXCEPT because
#-- the same keys are already returned through cust.*.

cust.*,
#-- wls_b.* EXCEPT(billg_acct_num),
fc.* EXCEPT(MOB_BAN),
tac.* EXCEPT(BAN, MSISDN),
usg.* EXCEPT(imsi_num) 
FROM `cto-wln-sa-data-pr-bb5283.customer_personas_features.cust_mapping` as cust
#-- Disabled optional join to gcp_wls_avg_bill_per_ban. The base table alias is `cust`
#-- (this comment previously referenced an undefined alias `c`).
#-- LEFT JOIN `cto-wln-sa-data-pr-bb5283.customer_personas_features.gcp_wls_avg_bill_per_ban` as wls_b on cust.BAN = wls_b.billg_acct_num
#-- INNER JOIN: only customers that have a row in the usage table are kept.
INNER JOIN `cto-wln-sa-data-pr-bb5283.5G_speed_tiers.usage_per_imsi_5g_plus` as usg on cust.IMSI = usg.imsi_num
#-- LEFT JOINs: customers without a match keep NULLs in the tac / fc columns.
#-- NOTE(review): if tac or fc hold more than one row per join key, customer rows
#-- will be duplicated -- confirm the keys are unique in those tables.
LEFT JOIN `cto-wln-sa-data-pr-bb5283.customer_personas_features.gcp_tacs_per_ban_msisdn` as tac on cust.MSISDN = tac.MSISDN and cust.BAN = tac.BAN
LEFT JOIN `cto-wln-sa-data-pr-bb5283.customer_personas_features.gcp_fibre_copper_per_ban_custid` as fc on cust.BAN = fc.MOB_BAN

#-- Add filters according to business logic (currently disabled), e.g.:
#WHERE
#cust.CustomerType = 'Consumer'

In [None]:
df.columns

## Data Pre-processing

In [None]:
# Identifier and descriptive columns that must not be fed to the clustering as features.
# NOTE(review): 'account_ype' looks like a typo for 'account_type' — confirm against df.columns.
features_to_drop = [
    'MSISDN', 'BAN', 'IMSI', 'CustomerType', 'Brand', 'AccountType', 'CUST_ID', 'Province',
    'CUST_TYPE_TXT', 'PRIM_PRICE_PLAN_TXT', 'province', 'account_ype', 'brand', 'whsia_flag',
    'customer_type',
    # 'labels' is written onto `df` by a later cell; excluding it keeps this cell
    # idempotent when it is re-run after the clustering.
    'labels',
]
# Keep the frame's own column order (de-duplicated) rather than using a set difference,
# whose ordering is arbitrary and can differ between kernel sessions.
_drop_lookup = set(features_to_drop)
features = [col for col in dict.fromkeys(df.columns) if col not in _drop_lookup]
# Categorical columns that are one-hot encoded instead of scaled.
cat_cols = ['FibreCopper']

In [None]:
df_dropped = df[features]

In [None]:
df_dropped.info()

In [None]:
df_dropped['FibreCopper'].unique()

In [None]:
# One-hot encode the categorical columns listed in `cat_cols`.
# NOTE(review): with get_dummies' defaults a missing value gets no indicator column, so
# rows without a FibreCopper value would be all zeros — confirm this is intended.
df_cat = pd.get_dummies(df_dropped[cat_cols])

In [None]:
# Preview only the first rows instead of rendering the whole frame.
df_cat.head()

In [None]:
import scipy.cluster
from sklearn import datasets, cluster

from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import silhouette_score, silhouette_samples
from sklearn.metrics import calinski_harabasz_score
from sklearn.cluster import FeatureAgglomeration
from sklearn.preprocessing import StandardScaler, MinMaxScaler

import matplotlib.pyplot as plt

In [None]:
from sklearn.decomposition import PCA 

In [None]:
# Preview only the first rows instead of rendering the whole frame.
df_dropped.head()

In [None]:
# Standardise the non-categorical features; missing values are replaced with 0 first.
numeric_filled = df_dropped.drop(columns=cat_cols).fillna(0)
scaler = StandardScaler()
df_scaled = scaler.fit_transform(numeric_filled)

In [None]:
# Re-wrap the scaled array with the source frame's column names AND row index, so the
# column-wise concat with the one-hot frame aligns rows by label instead of relying on
# `df_dropped` carrying a default RangeIndex (a mismatch would silently yield NaN rows).
df_scaled_frame = pd.DataFrame(
    df_scaled,
    columns=df_dropped.drop(columns=cat_cols).columns,
    index=df_dropped.index,
)
df_processed = pd.concat([df_scaled_frame, df_cat], axis=1)

In [None]:
df_processed.describe()

### Feature Agglomeration

In [None]:
len(df_processed.columns)

In [None]:
# Reduce the columns of `df_processed` to `n` agglomerated features.
n = 12
agglo = FeatureAgglomeration(n_clusters=n).fit(df_processed)
df_processed_reduced = agglo.transform(df_processed)

In [None]:
print(df_processed_reduced.shape, df_processed.shape)

## Clustering

In [None]:
def do_AgglomerativeClustering(X, k):
    """Fit Ward agglomerative clustering with ``k`` clusters and score the partition.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Observations to cluster (a feature matrix, not a linkage matrix).
    k : int
        Number of clusters to form.

    Returns
    -------
    list
        ``[k, calinski_harabasz_score, silhouette_score]`` for the fitted labels.
    """
    # Ward linkage only supports Euclidean distances, which is also the estimator's
    # default, so no distance keyword is passed: `affinity` was deprecated in
    # scikit-learn 1.2 and removed in 1.4, while its replacement `metric` does not
    # exist in older releases.
    agg = AgglomerativeClustering(n_clusters=k, linkage='ward')
    agg.fit(X)
    c_h = calinski_harabasz_score(X, agg.labels_)
    sil = silhouette_score(X, agg.labels_)
    return [k, c_h, sil]

In [None]:
def my_elbow_plot(score_df):
    """Plot silhouette and Calinski-Harabasz scores against the number of clusters.

    ``score_df`` must provide the columns 'K', 'C_H' and 'Sil'. The silhouette score
    is drawn in red on the left y-axis and the Calinski-Harabasz score in blue on a
    twin right y-axis; both share the K x-axis. The figure is shown immediately.
    """
    cluster_counts = score_df[['K']].to_numpy()

    def _draw_series(axis, column, colour):
        # Label, plot and colour-code one score series on its own y-axis.
        axis.set_ylabel(column, color=colour)
        axis.plot(cluster_counts, score_df[[column]].to_numpy(), color=colour)
        axis.tick_params(axis='y', labelcolor=colour)

    fig, left_axis = plt.subplots()
    left_axis.set_xlabel('K : # Clusters')
    _draw_series(left_axis, 'Sil', 'tab:red')
    left_axis.grid(axis='x')

    # Second y-axis sharing the same x-axis; the x-label is already set on the left axis.
    right_axis = left_axis.twinx()
    _draw_series(right_axis, 'C_H', 'tab:blue')

    fig.tight_layout()  # otherwise the right y-label is slightly clipped
    plt.show()


In [None]:
# SciPy Ward linkage matrix of the feature-agglomerated data (`df_processed_reduced`),
# i.e. the input expected by scipy.cluster.hierarchy.dendrogram. It records the merge
# history and is not a feature matrix.
agg = scipy.cluster.hierarchy.linkage(df_processed_reduced, method='ward', metric='euclidean')

In [None]:
# Evaluation: score Ward clustering for K = 2..9 on the feature-agglomerated data.
# The observations (`df_processed_reduced`) are scored here, not the SciPy linkage
# matrix `agg`, whose rows describe merges rather than customers.
# Rows are collected in a list and framed once: DataFrame.append was removed in
# pandas 2.0, and growing a frame row by row is quadratic.
score_rows = [do_AgglomerativeClustering(df_processed_reduced, k) for k in range(2, 10)]
score_df_aggl = pd.DataFrame(score_rows, columns=['K', 'C_H', 'Sil'])

my_elbow_plot(score_df_aggl)

In [None]:
# Plot the dendrogram of the Ward linkage `agg` (computed from the feature-agglomerated data)
# plt.figure(figsize=(16, 8));
# plt.grid(False)
# plt.title("Dendrogram for Entire Data");  
# dend = scipy.cluster.hierarchy.dendrogram(agg); 


In [None]:
# Final model: Ward agglomerative clustering with K clusters.
# No distance keyword is passed: Ward only supports Euclidean (the default), and
# `affinity` was removed from scikit-learn in 1.4.
# NOTE(review): the name `cluster` rebinds the `sklearn.cluster` module imported
# earlier in the notebook; it is kept because later cells read `cluster.labels_`.
# NOTE(review): this fits on `df_processed`, while the PCA plot projects
# `df_processed_reduced` — confirm which feature set is intended.
K = 6
cluster = AgglomerativeClustering(n_clusters=K, linkage='ward')
cluster.fit_predict(df_processed)

## Results

In [None]:
# Project the feature-agglomerated data onto two principal components for plotting.
pca = PCA(n_components=2)
pc_scores = pca.fit_transform(df_processed_reduced)
pca_data = pd.DataFrame(pc_scores, columns=['PC1', 'PC2'])

In [None]:
# Customers in PCA space, coloured by cluster label. Axis labels and a title are set
# so the figure can be read on its own.
fig, ax = plt.subplots(figsize=(10, 7))
ax.scatter(pca_data['PC1'], pca_data['PC2'], c=cluster.labels_)
ax.set(xlabel='PC1', ylabel='PC2', title='Clusters in PCA space')
plt.show()

In [None]:
# Preview only the first rows instead of rendering the whole frame.
df_processed.head()

In [None]:
# Unscaled feature frame (missing values filled with 0) used to describe the clusters.
# fillna already returns a new DataFrame with the same columns, so it is used directly
# instead of being re-wrapped in another DataFrame constructor.
df_orig_cat = df_dropped.fillna(0)

In [None]:
df_orig_cat['labels']=cluster.labels_

In [None]:
df_orig_cat.labels.value_counts()

In [None]:
def _usage_cols(day_part, stat, skip_num=False):
    """Names of df_orig_cat columns containing `day_part` and `stat`, followed by 'labels'.

    With skip_num=True, columns whose name contains "num" are left out.
    """
    picked = []
    for name in df_orig_cat.columns:
        if day_part not in name or stat not in name:
            continue
        if skip_num and "num" in name:
            continue
        picked.append(name)
    return picked + ['labels']


# NOTE(review): only the weekend groups exclude "num" columns; the weekday groups
# keep them — confirm the asymmetry is intended.
avg_weekday_cols = _usage_cols("weekday", "avg")
avg_weekend_cols = _usage_cols("weekend", "avg", skip_num=True)

tot_weekday_cols = _usage_cols("weekday", "total")
tot_weekend_cols = _usage_cols("weekend", "total", skip_num=True)

In [None]:
cols_list = [avg_weekday_cols, tot_weekday_cols, avg_weekend_cols, tot_weekend_cols ]

The clustering results are not good: clusters 0 and 4 contain so many customers that their per-cluster averages are minimal.


In [None]:
np.arange(0, K)

In [None]:

# Per-cluster MIN / MAX / MEAN of every usage feature group: one row of panels per
# entry of `cols_list`, one column per statistic.
stat_titles = ('MIN', 'MAX', 'MEAN')
row_titles = ('AVG WEEKDAY', 'TOTAL WEEKDAY', 'AVG WEEKEND', 'TOTAL WEEKEND')

# squeeze=False keeps `axes` two-dimensional regardless of the grid size.
fig, axes = plt.subplots(
    nrows=len(cols_list), ncols=len(stat_titles), figsize=(16, 16), squeeze=False
)

for row_axes, group_cols in zip(axes, cols_list):
    # Group once per feature group and reuse it for all three statistics.
    per_cluster = df_orig_cat[sorted(group_cols)].groupby('labels')
    stats = (per_cluster.min(), per_cluster.max(), per_cluster.mean())

    for ax, stat in zip(row_axes, stats):
        # rot=0 keeps the real cluster ids as horizontal tick labels, so no panel has to
        # borrow tick positions from another panel. Only the last panel of a row draws
        # a legend.
        stat.plot.bar(ax=ax, rot=0, legend=(ax is row_axes[-1]))
        # pandas labels the x-axis with the index name; it is set on the bottom row only.
        ax.set(xlabel=None)

    # Move the row's legend outside the plotting area, to the right of the last panel.
    row_axes[-1].legend(bbox_to_anchor=(2, 1))

for ax, title in zip(axes[0], stat_titles):
    ax.set_title(title, fontsize=14)

for ax, title in zip(axes[:, 0], row_titles):
    ax.set_ylabel(title, fontsize=14)

for ax in axes[-1]:
    ax.set_xlabel('Cluster #', fontsize=14)

In [None]:
# Attach the cluster labels to the raw query result for per-cluster inspection.
# NOTE: this mutates `df` in place, so any earlier cell that reads df.columns will see
# the extra 'labels' column if it is re-run after this one.
df['labels']=cluster.labels_

In [None]:
df.loc[df.labels==2].describe()

In [None]:
df_orig_cat.loc[df_orig_cat.labels==4].describe()

In [None]:
df_orig_cat[sorted(tot_weekend_cols)].groupby('labels').mean()