# Import libraries

In [None]:
import os
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.graph_objects as go
from mpl_toolkits.mplot3d import Axes3D
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.cluster import KMeans
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import StandardScaler, FunctionTransformer
from statsmodels.tsa.holtwinters import ExponentialSmoothing

# Suppress every warning for the rest of the session and render floats
# with two decimal places in pandas output.
warnings.filterwarnings(action="ignore")
pd.set_option("display.float_format", "{:.2f}".format)

# Download file

In [None]:
# Identify the dataset to fetch. The download notebook run below presumably
# reads these two variables — confirm against download.ipynb.
file_id = "1DopC7bm_EWX_ocqbOIkME9usE5rG3pZE" # ID of the file on Google Drive
file_name = 'Customer_data_2021&2022.csv'  # CSV file name; reused later to build the read path

# IPython magic: run the download.ipynb notebook
%run download.ipynb

# Import Data

In [None]:
# Resolve the dataset location: <parent of the working directory>/Data/<file_name>
current_dir = os.getcwd()
parent_dir = os.path.split(current_dir)[0]
file_path = os.path.join(parent_dir, 'Data', file_name)

# Load the raw rows and preview the first few
df = pd.read_csv(file_path)
df.head()

# Aggregate

In [None]:
# Collapse the row-level data into one row per customer account.
cus_df = (
    df.groupby('BET_ACCOUNT_NUM_HASH', as_index=False)
      .agg({
          'AGE': 'max',
          'DATE_DIM': 'count',
          'RACING_TURNOVER': 'sum',
          'FOB_TURNOVER': 'sum',
          'TURNOVER': 'sum',
          'DIVIDENDS_PAID': 'sum',
          'GROSS_MARGIN': 'sum',
          'TICKETS': 'sum',
      })
)

# Create ratio columns: each component as a percentage of total turnover.
# A zero total turnover is masked to NaN before dividing so the ratio can
# never become +/-inf (which KMeans rejects); those ratios are then set to 0.
component_cols = ['RACING_TURNOVER', 'FOB_TURNOVER', 'DIVIDENDS_PAID', 'GROSS_MARGIN']
safe_turnover = cus_df['TURNOVER'].replace(0, np.nan)
for col in component_cols:
    cus_df[f'{col}_PER'] = (cus_df[col] / safe_turnover * 100).fillna(0)

# Drop the raw component columns now that they are expressed as ratios
cus_df.drop(columns=component_cols, inplace=True)

# Impute missing ages only; a frame-wide fillna would also write 44 into
# any other missing value (e.g. an undefined ratio).
cus_df['AGE'] = cus_df['AGE'].fillna(44)

# Define X: every column except the account identifier in the first position
X = cus_df.iloc[:, 1:]

# Preview
cus_df.head()

# Clustering without scaling

In [None]:
# Elbow analysis on the raw (unscaled) features: fit KMeans for k = 1..15
# and record the inertia of each fit.
X = cus_df.iloc[:, 1:]
inertias = []
K = range(1, 16)

for k in K:
    # Fixed seed so the inertia curve is reproducible between runs
    kmeans_model = KMeans(n_clusters=k, random_state=42)
    kmeans_model.fit(X)
    inertias.append(kmeans_model.inertia_)

plt.figure(figsize=(8, 6))

plt.plot(K, inertias, 'bx-')
plt.xlabel('Values of K')
plt.ylabel('Inertia')
plt.title('The Elbow Method using Inertia')
plt.show()

# Clustering with scaling

In [None]:
# Elbow analysis on standardized features: scale, fit KMeans for k = 1..15
# and record the inertia of each fit.
X = cus_df.iloc[:, 1:]
inertias = []
silhouettes = []  # not populated in this cell
K = range(1, 16)

for k in K:
    pipeline = Pipeline([
        ("std_scaler", StandardScaler()),
        # Fixed seed so the inertia curve is reproducible between runs
        ("kmeans", KMeans(n_clusters=k, random_state=42)),
    ])

    pipeline.fit(X)
    inertias.append(pipeline.named_steps["kmeans"].inertia_)

plt.figure(figsize=(8, 6))

plt.plot(K, inertias, 'bx-')
plt.xlabel('Values of K')
plt.ylabel('Inertia')
plt.title('The Elbow Method using Inertia')
plt.show()

Choose n_clusters = 7

# PCA to visualize clusters

In [None]:
# Final model on standardized features with the chosen number of clusters
kmeans_pipeline = Pipeline([
    ("std_scaler", StandardScaler()),
    ("kmeans", KMeans(n_clusters=7, random_state=42)),
])

# `inertias` is deliberately left untouched here so it stays aligned with
# the K range of the elbow loop.
kmeans_pipeline.fit(X)

# Cluster assignment for every row of X
labels = kmeans_pipeline.predict(X)
labels

### 2D

In [None]:
# Standardize the features, then project them onto two principal components
pca_steps = [
    ("std_scaler", StandardScaler()),
    ("PCA", PCA(n_components=2, random_state=42)),
]
pca_pipeline = Pipeline(pca_steps)

X_PCA = pca_pipeline.fit(X).transform(X)

# Share of the total variance captured by each retained component
explained_variance_ratio = pca_pipeline['PCA'].explained_variance_ratio_
explained_variance_ratio

In [None]:
# Scatter the first two principal components, coloured by cluster label.
# The axis limits crop the view to x in [-3, 10] and y in [-3, 20].
plt.figure(figsize=(16, 10))
ax = plt.gca()
ax.scatter(X_PCA[:, 0], X_PCA[:, 1], c=labels, s=1)
ax.set_xlim(-3, 10)
ax.set_ylim(-3, 20)

In [None]:
# Same preprocessing as the 2D projection, but keep three principal
# components for the 3D plot.
scaler_step = ("std_scaler", StandardScaler())
pca_step = ("PCA", PCA(n_components=3, random_state=42))
pca_pipeline = Pipeline([scaler_step, pca_step])

pca_pipeline.fit(X)
X_PCA = pca_pipeline.transform(X)

# Share of the total variance captured by each of the three components
fitted_pca = pca_pipeline.named_steps['PCA']
explained_variance_ratio = fitted_pca.explained_variance_ratio_
explained_variance_ratio

In [None]:
# 3D scatter of the three principal components. Marker colour encodes the
# cluster label of each point, mapped through the Viridis colour scale.
marker_style = dict(size=2, color=labels, colorscale='Viridis', opacity=0.8)
pca_trace = go.Scatter3d(
    x=X_PCA[:, 0],
    y=X_PCA[:, 1],
    z=X_PCA[:, 2],
    mode='markers',
    marker=marker_style,
)
fig = go.Figure(data=pca_trace)

# Name the three axes and fix the figure size
axis_titles = dict(xaxis_title='PCA1', yaxis_title='PCA2', zaxis_title='PCA3')
fig.update_layout(scene=axis_titles, width=1000, height=800)

# Render the interactive figure
fig.show()

# With log transformation

In [None]:
# Feature matrix for this section: cus_df without the account hash and
# the TICKETS column.
excluded_cols = ['BET_ACCOUNT_NUM_HASH', 'TICKETS']
X = cus_df.drop(columns=excluded_cols)
X

In [None]:
# log1p compresses the right tail of the selected columns
log_transform = FunctionTransformer(np.log1p)

log_cols = ['TURNOVER']
# Every remaining column is forwarded unchanged. Log-transformed columns are
# excluded so they are not fed to the model twice (once raw, once logged).
passthrough_cols = [c for c in X.columns if c not in log_cols]

# Output column order: log-transformed columns first, then the passthrough ones
log_pipeline = ColumnTransformer([
    ("log", log_transform, log_cols),
    ("passthrough", "passthrough", passthrough_cols),
])

In [None]:
# Elbow analysis with the log transform applied before scaling:
# fit KMeans for k = 1..15 and record the inertia of each fit.
inertias = []
silhouettes = []  # not populated in this cell
K = range(1, 16)

for k in K:
    pipeline = Pipeline([
        ("log_trans", log_pipeline),
        ("std_scaler", StandardScaler()),
        # Fixed seed so the inertia curve is reproducible between runs
        ("kmeans", KMeans(n_clusters=k, random_state=42)),
    ])

    pipeline.fit(X)
    inertias.append(pipeline.named_steps["kmeans"].inertia_)

plt.figure(figsize=(8, 6))

plt.plot(K, inertias, 'bx-')
plt.xlabel('Values of K')
plt.ylabel('Inertia')
plt.title('The Elbow Method using Inertia')
plt.show()

Choose k=9

In [None]:
# Final model for the log-transformed features.
# k = 9 follows the choice recorded for this section's elbow plot.
# NOTE(review): re-check the elbow if the feature set changes.
kmeans_pipeline = Pipeline([
    ("log_trans", log_pipeline),
    ("std_scaler", StandardScaler()),
    ("kmeans", KMeans(n_clusters=9, random_state=42)),
])

# `inertias` is deliberately left untouched here so it stays aligned with
# the K range of the elbow loop.
kmeans_pipeline.fit(X)

# Cluster assignment for every row of X
labels = kmeans_pipeline.predict(X)
labels

In [None]:
# Project the log-transformed, standardized features onto two principal
# components so the plot shows the same feature space the clusters were
# fitted in, rather than reusing a projection computed for another feature set.
pca_2d_pipeline = Pipeline([
    ("log_trans", log_pipeline),
    ("std_scaler", StandardScaler()),
    ("PCA", PCA(n_components=2, random_state=42)),
])
X_PCA_2D = pca_2d_pipeline.fit(X).transform(X)

plt.figure(figsize=(16, 10))
plt.scatter(X_PCA_2D[:, 0], X_PCA_2D[:, 1], c=labels, s=1)
# Same view window as the earlier 2D plot; retune if points fall outside it
plt.xlim(-3, 10)
plt.ylim(-3, 20)

In [None]:
# Three-component projection for the 3D plot. The log transform is included
# so the projection uses the same preprocessing as the clustering pipeline.
pca_pipeline = Pipeline([
    ("log_trans", log_pipeline),
    ("std_scaler", StandardScaler()),
    ("PCA", PCA(n_components=3, random_state=42)),
])

pca_pipeline.fit(X)
X_PCA = pca_pipeline.transform(X)

# Share of the total variance captured by each of the three components
explained_variance_ratio = pca_pipeline.named_steps['PCA'].explained_variance_ratio_
explained_variance_ratio

In [None]:
# Interactive 3D view of the principal components; each point is coloured
# by its cluster label using the Viridis colour scale.
pc1, pc2, pc3 = X_PCA[:, 0], X_PCA[:, 1], X_PCA[:, 2]

fig = go.Figure(
    data=go.Scatter3d(
        x=pc1,
        y=pc2,
        z=pc3,
        mode='markers',
        marker={
            'size': 2,
            'color': labels,
            'colorscale': 'Viridis',
            'opacity': 0.8,
        },
    )
)

# Axis names and overall figure size
fig.update_layout(
    width=1000,
    height=800,
    scene={
        'xaxis_title': 'PCA1',
        'yaxis_title': 'PCA2',
        'zaxis_title': 'PCA3',
    },
)

# Render the figure
fig.show()

In [None]:
# Display the current feature matrix X (cus_df without the account hash and TICKETS columns)
X