# PyCaret Clustering

## Prepare Data

In [None]:
# Prophet model for time series forecast
from prophet import Prophet

# Data processing
import numpy as np
import pandas as pd
import os 

# Visualization
import seaborn as sns
import matplotlib.pyplot as plt

# clustering 
from pycaret.clustering import *

In [None]:
# Read the anonymized merchant transaction table from disk and preview it.
data_path = "../../../data/raw/Time_Series_Merchants_Transactions_Anonymized.csv"
data = pd.read_csv(filepath_or_buffer=data_path)
data.head()

In [None]:
# Count, for every merchant (row), how many of its cells are exactly zero,
# then tally the merchants with more than 23 zeros out of 26 (months).
#
# DataFrame.isin is evaluated on the whole table at once instead of slicing
# one row at a time with iloc. The 'Merchant Name' column presumably holds
# strings, which never equal 0, so it does not contribute to the counts.
zero_count = data.isin([0]).sum(axis=1).tolist()
counter = sum(count > 23 for count in zero_count)
counter

In [None]:
# Remove the merchants with more than 23 zeros out of 26 (months).
#
# The cut-off is `> 23`, the same one used when counting these merchants;
# the former `> 24` check silently kept merchants with exactly 24 zeros.
# Rows are selected with a boolean mask computed from the data itself, so
# the result does not rely on the merchants being named 'Merchant 1',
# 'Merchant 2', ... in row order, and re-running the cell is harmless.
zeros_per_merchant = data.isin([0]).sum(axis=1)
data = data.loc[zeros_per_merchant <= 23].copy()
data.shape

In [None]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# Min-max scaling is what is applied here (StandardScaler is imported but
# not used in this cell).
scaler = MinMaxScaler()
# Fit on, and transform, every column except the merchant identifier.
dataset_np = scaler.fit_transform(data.drop('Merchant Name', axis=1))

In [None]:
# Wrap the scaled array in a DataFrame and preview the first rows.
dataset = pd.DataFrame(data=dataset_np)
dataset.head()

In [None]:
# Relabel the scaled columns with monthly timestamps in standard date format.
stddates = pd.date_range(
    start='2020-08',
    end='2022-10',
    freq="M",
)
dataset.columns = stddates

In [None]:
# Drop the merchant identifier column from the raw table, then preview it.
data = data.drop('Merchant Name', axis=1)
data.head()

In [None]:
# Rename the columns to consecutive integers 0..n-1 and preview the result.
data.columns = range(len(data.columns))
data.head()

## Setting up Environment in PyCaret

In [None]:
#!pip install scikit-learn==0.23.2

In [None]:
# Initialize the PyCaret clustering experiment on `data`, with normalization
# enabled and a fixed session_id.
# If the cell appears to hang, press Enter to proceed.
exp_cluster = setup(
    data=data,
    normalize=True,
    session_id=123,
)

## Create a Model

### K-means clustering

In [None]:
# Train k-means with the library's default number of clusters
# (to override, pass e.g. num_clusters=10).
kmeans = create_model(model='kmeans')

In [None]:
# Cluster plot for the k-means model.
plot_model(model=kmeans, plot='cluster')

In [None]:
# Elbow plot for the k-means model.
plot_model(model=kmeans, plot='elbow')

In [None]:
# Silhouette plot for the k-means model.
plot_model(model=kmeans, plot='silhouette')

In [None]:
# Distribution plot: shows the size of each k-means cluster.
plot_model(model=kmeans, plot='distribution')

In [None]:
# Attach the k-means cluster assignments to the data.
kmeans_results = assign_model(model=kmeans)
# To export the labels:
#kmeans_results['Cluster'].to_csv('kmeans_clustering.csv')

### K-modes clustering

In [None]:
# Train k-modes with the library's default number of clusters
# (to override, pass e.g. num_clusters=20).
kmodes = create_model(model='kmodes')

### Assign a Model

In [None]:
# Attach the k-modes cluster assignments to the data and preview the labels.
kmode_results = assign_model(model=kmodes)
kmode_results['Cluster'].head()

In [None]:
#kmode_results['Cluster'].to_csv('kmodes_clustering.csv')#

### Plot a Model

The plot_model() function can be used to analyze different aspects of the clustering model. This function takes a trained model object and returns a plot. 

### Cluster PCA Plot

In [None]:
# Cluster PCA plot for the k-modes model.
plot_model(model=kmodes, plot='cluster')

### Elbow Plot

In [None]:
# Elbow plot for the k-modes model.
plot_model(model=kmodes, plot='elbow')

### Silhouette Plot

In [None]:
# Silhouette plot for the k-modes model.
plot_model(model=kmodes, plot='silhouette')

### Distribution Plot

In [None]:
# Distribution plot: shows the size of each k-modes cluster.
plot_model(model=kmodes, plot='distribution')

### mean shift clustering

In [None]:
# Train a mean-shift clustering model with default settings.
meanshift = create_model(model='meanshift')

In [None]:
# Attach the mean-shift cluster assignments to the data and preview them.
meanshift_results = assign_model(model=meanshift)
meanshift_results.head()

In [None]:
# Cluster plot for the mean-shift model.
plot_model(model=meanshift, plot='cluster')

In [None]:
# Distribution plot: shows the size of each mean-shift cluster.
plot_model(model=meanshift, plot='distribution')

### affinity propagation clustering

In [None]:
# Train an affinity-propagation model with default settings
# (a num_clusters = 9 argument was tried and left disabled).
ap = create_model(model='ap')

In [None]:
# Attach the affinity-propagation cluster assignments and preview them.
ap_results = assign_model(model=ap)
# To export the labels:
#ap_results['Cluster'].to_csv('ap_clustering.csv')
ap_results.head()

In [None]:
# Cluster plot for the affinity-propagation model.
plot_model(model=ap, plot='cluster')

In [None]:
# Distribution plot: shows the size of each affinity-propagation cluster.
plot_model(model=ap, plot='distribution')

# The silhouette and elbow plots are left disabled for this model:
#plot_model(ap, plot = 'silhouette')

#plot_model(ap, plot = 'elbow')

### optics clustering

In [None]:
# Train an OPTICS clustering model with default settings.
optics = create_model(model='optics')

In [None]:
# Attach the OPTICS cluster assignments to the data and preview them.
optics_results = assign_model(model=optics)
optics_results.head()

In [None]:
# Cluster plot for the OPTICS model.
plot_model(model=optics, plot='cluster')

In [None]:
# Distribution plot: shows the size of each OPTICS cluster.
plot_model(model=optics, plot='distribution')

In [None]:
# Recompute the OPTICS assignments, then draw the silhouette plot.
optics_results = assign_model(model=optics)
# To export the labels:
#optics_results['Cluster'].to_csv('optics_clustering.csv')

plot_model(model=optics, plot='silhouette')

### birch clustering

In [None]:
# Train a BIRCH model with the library's default number of clusters
# (to override, pass e.g. num_clusters=9).
birch = create_model(model='birch')

In [None]:
# Attach the BIRCH cluster assignments to the data and preview them.
birch_results = assign_model(model=birch)
# To export the labels:
#birch_results['Cluster'].to_csv('birch_clustering.csv')
birch_results.head()

In [None]:
# Cluster plot for the BIRCH model.
plot_model(model=birch, plot='cluster')

In [None]:
# Distribution plot: shows the size of each BIRCH cluster.
plot_model(model=birch, plot='distribution')

In [None]:
# Silhouette plot for the BIRCH model.
plot_model(model=birch, plot='silhouette')

### Spectral Clustering

In [None]:
# Train spectral clustering with the library's default number of clusters
# (to override, pass e.g. num_clusters=20).
sc = create_model(model='sc')

In [None]:
# Attach the spectral-clustering assignments to the data and preview them.
sc_results = assign_model(model=sc)
sc_results.head()

In [None]:
# Cluster plot for the spectral-clustering model.
plot_model(model=sc, plot='cluster')

In [None]:
# Distribution plot: shows the size of each spectral-clustering cluster.
plot_model(model=sc, plot='distribution')

# Silhouette plot for the same model.
plot_model(model=sc, plot='silhouette')

# Elbow plot for the same model.
plot_model(model=sc, plot='elbow')

### Agglomerative Clustering 

In [None]:
# Train agglomerative clustering with the library's default number of
# clusters (to override, pass e.g. num_clusters=10).
hclust = create_model(model='hclust')

In [None]:
# Attach the agglomerative-clustering assignments and preview them.
hclust_results = assign_model(model=hclust)
hclust_results.head()

In [None]:
#hclust_results['Cluster'].to_csv('hclust_clustering.csv')#

In [None]:
# Cluster plot for the agglomerative-clustering model.
plot_model(model=hclust, plot='cluster')

In [None]:
# Distribution plot: shows the size of each agglomerative cluster.
plot_model(model=hclust, plot='distribution')

### fuzzy c-means

In [None]:
#!pip install fuzzy-c-means

In [None]:
from fcmeans import FCM
from matplotlib import pyplot as plt

In [None]:
# Fit fuzzy c-means with 10 clusters on the min-max scaled table.
# random_state is fixed (same seed as the PyCaret session_id) so the
# cluster assignments are reproducible from one run to the next.
dataset_np = dataset.to_numpy()
fcm = FCM(n_clusters=10, random_state=123)
fcm.fit(dataset_np)

In [None]:
# Cluster centers and the predicted cluster label of every row.
fcm_centers = fcm.centers
fcm_labels = fcm.predict(dataset_np)

# Two side-by-side scatter plots using only the first two columns:
# left = raw points, right = points colored by cluster with the centers
# overlaid as white '+' markers.
f, axes = plt.subplots(nrows=1, ncols=2, figsize=(11, 5))
axes[0].scatter(x=dataset_np[:, 0], y=dataset_np[:, 1], alpha=.1)
axes[1].scatter(x=dataset_np[:, 0], y=dataset_np[:, 1], c=fcm_labels, alpha=.1)
axes[1].scatter(x=fcm_centers[:, 0], y=fcm_centers[:, 1], marker="+", s=500, c='w')
plt.show()

In [None]:
# Display the cluster labels returned by fcm.predict for dataset_np.
fcm_labels

In [None]:
# Put the fuzzy c-means labels in a one-column DataFrame named 'Cluster'.
column = ['Cluster']
fcm_labels_dataframe = pd.DataFrame(fcm_labels, columns=column)
# To export the labels:
#fcm_labels_dataframe.to_csv('fcm_clustering.csv')
fcm_labels_dataframe.head()

In [None]:
#fcm_centers