### SIX - Time Series Prediction & Outlier Detection on Anonymized Transaction Dataset
-- Notebook by Mukund Pondkule

### Packages

In [None]:
# ----------------------------------GENERAL---------------------------------
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
import matplotlib.mlab as mlab
import matplotlib.cm as cm
warnings.filterwarnings("ignore")

# ----------------------------------VISUALIZATION---------------------------

from IPython.display import (
    Image,
)  # I-Python For allowing user to display images in notebooks
import seaborn as sns  # Visualization library based on matplotlib. It provides a high-level interface for drawing attractive and informative statistical graphics
import matplotlib.pyplot as plt  # Importation of pyplot sub-library from Matplotlib library

#----------------------------------CLUSTERING-------------------------------

from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn.pipeline import Pipeline

#from sklearn.datasets import make_classification
from sklearn.datasets import make_blobs

### Custom classes and functions

In [None]:
import sys

# Directory added to the import path (presumably holds the project's custom
# classes and functions — TODO confirm).
SCRIPT_DIR = "../src"
# Guard the append so that re-running this cell does not pile up duplicate
# entries on sys.path.
if SCRIPT_DIR not in sys.path:
    sys.path.append(SCRIPT_DIR)

### Global Parameters Setting

In [None]:
# Matplotlib 3.6 renamed its bundled seaborn style sheets to "seaborn-v0_8*" and
# later releases removed the old names, so use whichever name this installation
# actually provides.
_SEABORN_STYLE = "seaborn-v0_8" if "seaborn-v0_8" in plt.style.available else "seaborn"
plt.style.use(_SEABORN_STYLE)

### User-Dependent Variables

In [None]:
# Location of the raw transactions CSV, relative to this notebook.
# Adjust this path to match your local checkout before running.
data_path = "../../../data/raw/Time_Series_Merchants_Transactions_Anonymized.csv"
# Load the whole file into a DataFrame using pandas' default parsing options.
df_merchant_transactions = pd.read_csv(data_path)

In [None]:
# Preview the first rows of the freshly loaded frame.
df_merchant_transactions.head()

In [None]:
# Column dtypes and non-null counts of the loaded frame.
df_merchant_transactions.info()

In [None]:
import missingno as msno

# Missing-value matrix of the data, with rows ordered by merchant name.
msno.matrix(df_merchant_transactions.sort_values("Merchant Name", ascending=True))

In [None]:
# For every row, count how many cells are equal to 0. DataFrame.isin evaluates
# the whole frame at once, so no per-row .iloc loop is needed; non-numeric cells
# (such as the merchant name) never match 0 and therefore do not contribute.
zero_count = df_merchant_transactions.isin([0]).sum(axis=1).tolist()
# The first column is taken as the merchant identifier (positional lookup).
merchant = df_merchant_transactions.iloc[:, 0].tolist()

In [None]:
# Bar chart of the zero count per merchant: one bar per row of the data,
# labelled on the x-axis with the merchant identifier.
y_pos = np.arange(len(merchant))
bar_ax = plt.gca()
bar_ax.bar(y_pos, zero_count)
bar_ax.set_xticks(y_pos)
bar_ax.set_xticklabels(merchant)
plt.show()

In [None]:
# Histogram of the per-merchant zero counts computed above.
num_bins = 26  # presumably one bin per monthly column — TODO confirm
n, bins, patches = plt.hist(zero_count, num_bins, facecolor='blue', alpha=0.5)

# x-axis: the quantity being binned, i.e. how many zero-valued entries a merchant has.
plt.xlabel("zero-valued entries per merchant", fontweight='bold')

# y-axis: a histogram's bar height is the number of observations (merchants) in the bin.
plt.ylabel("number of merchants", fontweight='bold')

plt.title("Distribution of zero-valued entries per merchant")
plt.show()

#### Data preparation

In [None]:
# Keep only the non-identifier columns for clustering. errors='ignore' makes the
# cell safe to re-run: once the column is gone, a second drop would otherwise
# raise KeyError.
df_merchant_transactions = df_merchant_transactions.drop(columns='Merchant Name', errors='ignore')

In [None]:
# Preview the frame after the 'Merchant Name' column has been removed.
df_merchant_transactions.head()

In [None]:
# Synthetic reference data: 1613 samples in 26 dimensions drawn around 3 centres.
# NOTE(review): the sizes presumably mirror the real dataset — TODO confirm.
# X is reassigned further down (`X = df_merchant_transactions.to_numpy()`),
# so these blobs are only used by the scatter plot that follows.
X, y_true = make_blobs(n_samples=1613, centers=3 , n_features=26,
                       cluster_std=1.5, random_state=42)

In [None]:
# Scatter of the first two columns of X; the trailing ';' hides the artist repr.
plt.scatter(X[:, 0], X[:, 1], s=50);

In [None]:
# Annotated correlation heatmap of all columns.
# The figure is sized explicitly instead of through sns.set(rc=...): sns.set
# changes the global rcParams (and re-applies the seaborn theme), which would
# silently turn every plot drawn afterwards into a 40x40 inch figure.
corr_fig, corr_ax = plt.subplots(figsize=(40, 40))
sns.heatmap(df_merchant_transactions.corr(), annot=True, ax=corr_ax);

### using clustering method

#### Method 1: Choosing the optimal K in K-means: The Elbow Sum-of-Squares Method

In [None]:
# Elbow method: collect the KMeans inertia (within-cluster sum of squares,
# measured on the standardised features) for k = 2..10.
cluster_errors = []

for n_clusters in range(2, 11):
    kmeans_pipeline = Pipeline([
        ("scaler", StandardScaler()),
        ("cluster", KMeans(n_clusters=n_clusters, random_state=17, verbose=0))])
    # fit() alone populates inertia_; no predict() pass is needed because the
    # labels are not used here.
    kmeans_pipeline.fit(df_merchant_transactions)
    cluster_errors.append(kmeans_pipeline.named_steps["cluster"].inertia_)

In [None]:
# Elbow curve: inertia collected above against the number of clusters tried.
elbow_ax = plt.gca()
elbow_ax.plot(range(2, 11), cluster_errors, "o-")
elbow_ax.set_xlabel("No. Clusters")
elbow_ax.set_ylabel("SSE")
plt.show()

#### Method 2: Choosing the optimal K in K-means: The Silhouette Method

In [None]:
# Average silhouette score of a scaled KMeans model for k = 2..10.
silhouette_s = []

for n_clusters in range(2, 11):
    # Seeded like the elbow-method models so the curve is reproducible between runs.
    kmeans_pipeline = Pipeline([
        ("scaler", StandardScaler()),
        ("cluster", KMeans(n_clusters=n_clusters, random_state=17, verbose=0))])

    cluster_labels = kmeans_pipeline.fit_predict(df_merchant_transactions)
    # Compute the (quadratic-cost) score once and reuse it for printing and plotting.
    # NOTE(review): the score is evaluated on the unscaled frame although the
    # labels come from standardised features — confirm this is intended.
    score = silhouette_score(df_merchant_transactions, cluster_labels)
    print("For n_clusters =", n_clusters,"The average silhouette_score is :", score.round(4))
    silhouette_s.append(score)

plt.plot(range(2, 11),silhouette_s, "o-")
plt.xlabel("No. Clusters")
plt.ylabel("Silhouette Score")
plt.show()

### Agglomerative clustering

In [None]:
from sklearn.cluster import AgglomerativeClustering

# Average silhouette score of a scaled agglomerative clustering for k = 2..10.
silhouette_s = []

for n_clusters in range(2, 11):
    agglo_pipeline = Pipeline([
        ("scaler", StandardScaler()),
        ("cluster", AgglomerativeClustering(n_clusters=n_clusters))])

    cluster_labels = agglo_pipeline.fit_predict(df_merchant_transactions)
    # Compute the (quadratic-cost) score once and reuse it for printing and plotting.
    # NOTE(review): the score is evaluated on the unscaled frame although the
    # labels come from standardised features — confirm this is intended.
    score = silhouette_score(df_merchant_transactions, cluster_labels)
    print("For n_clusters =", n_clusters,"The average silhouette_score is :", score.round(4))
    silhouette_s.append(score)

plt.plot(range(2, 11),silhouette_s, "o-")
plt.xlabel("No. Clusters")
plt.ylabel("Silhouette Score")
plt.show()

In [None]:
# Replace the synthetic blobs in X with the real data as a plain NumPy array.
X = df_merchant_transactions.to_numpy()

In [None]:
# Note: the code below is adapted from the scikit-learn documentation example on
# silhouette analysis, so there is no need to study it line by line.
# For each candidate k it draws two panels: the per-sample silhouette plot (left)
# and a scatter of the first two features coloured by cluster (right).

range_n_clusters = range(2, 10)

# Seed shared by the KMeans models here and by later cells.
r_seed = 17

cols = df_merchant_transactions.columns

for n_clusters in range_n_clusters:

    # Create a subplot with 1 row and 2 columns
    fig, (ax1, ax2) = plt.subplots(1, 2)
    fig.set_size_inches(18, 7)

    # The 1st subplot is the silhouette plot

    # The silhouette coefficient can range from -1 to 1; the axis is limited to [-0.2, 1] here.
    ax1.set_xlim([-0.2, 1])
    # The (n_clusters+1)*10 is for inserting blank space between silhouette plots of individual clusters, to demarcate them clearly.
    ax1.set_ylim([0, len(X) + (n_clusters + 1) * 10])

    # Initialize the Pipeline with n_clusters value and a random generator seed for reproducibility.
    kmeans_pipeline = Pipeline([
            ("scaler", StandardScaler()),
            ("cluster", KMeans(n_clusters=n_clusters, random_state=r_seed, verbose=0))])

    cluster_labels = kmeans_pipeline.fit_predict(X)
    print('cluster_labels ', cluster_labels,' ', len(cluster_labels))
    # The silhouette_score gives the average value for all the samples.
    # This gives a perspective into the density and separation of the formed clusters
    # NOTE(review): scores are computed on the unscaled X although the labels come
    # from standardised features — confirm this is intended.
    silhouette_avg = silhouette_score(X, cluster_labels)
    print("For n_clusters =",
          n_clusters,
          "The average silhouette_score is :",
          silhouette_avg.round(4))

    # Compute the silhouette scores for each sample
    sample_silhouette_values = silhouette_samples(X, cluster_labels)

    y_lower = 10
    for i in range(n_clusters):
        # Aggregate the silhouette scores for samples belonging to cluster i, and sort them
        ith_cluster_silhouette_values = sample_silhouette_values[cluster_labels == i]
        ith_cluster_silhouette_values.sort()
        size_cluster_i = ith_cluster_silhouette_values.shape[0]
        y_upper = y_lower + size_cluster_i

        color = cm.nipy_spectral(float(i + 1) / n_clusters)
        ax1.fill_betweenx(np.arange(y_lower, y_upper),
                          0,
                          ith_cluster_silhouette_values,
                          facecolor=color,
                          edgecolor=color,
                          alpha=0.7)

        # Label the silhouette plots with their cluster numbers at the middle
        ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))

        # Compute the new y_lower for next plot
        y_lower = y_upper + 10  # 10 for the 0 samples

    ax1.set_title("The silhouette plot for the various clusters.")
    ax1.set_xlabel("The silhouette coefficient values")
    ax1.set_ylabel("Cluster label")

    # The vertical line for average silhouette score of all the values
    ax1.axvline(x=silhouette_avg, color="red", linestyle="--")

    ax1.set_yticks([])  # Clear the yaxis labels / ticks
    ax1.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])

    # 2nd Plot showing the actual clusters formed
    colors = cm.nipy_spectral((cluster_labels.astype(float) + 1) / n_clusters)
    ax2.scatter(X[:, 0],
                X[:, 1],
                marker=".",
                s=30,
                lw=0,
                alpha=0.7,
                c=colors,
                edgecolor="k")

    # Labeling the clusters: the centres live in the scaled space, so map them
    # back to the original feature space before plotting them over X.
    pipeline_centers = kmeans_pipeline.named_steps["cluster"].cluster_centers_
    centers = kmeans_pipeline.named_steps["scaler"].inverse_transform(pipeline_centers)
    # Draw white circles at cluster centers
    ax2.scatter(
        centers[:, 0],
        centers[:, 1],
        marker="o",
        c="white",
        alpha=1,
        s=200,
        edgecolor="k",
    )

    # Write each cluster's number inside its white circle (mathtext marker, as in
    # the scikit-learn example); without the marker only an unlabeled dot is drawn.
    for i, c in enumerate(centers):
        ax2.scatter(c[0], c[1], marker="$%d$" % i, alpha=1, s=50, edgecolor="k")

    ax2.set_title("The visualization of the clustered data.")
    ax2.set_xlabel("Feature space for the 1st feature : " + cols[0])
    ax2.set_ylabel("Feature space for the 2nd feature :" + cols[1])

    plt.suptitle(
        (
            "Silhouette analysis for KMeans clustering on sample data "
            "with n_clusters = %d" % n_clusters
        ),
        fontsize=14,
        fontweight="bold",
    )

print("\n")
plt.show()

### Clustering with the best model

In [None]:
# Fit the final 3-cluster KMeans and attach the labels to the frame.
# The label column itself is excluded from the features, so re-running this cell
# does not feed the previous run's 'merchant_clusters' back into the model.
cluster_features = df_merchant_transactions.drop(columns='merchant_clusters', errors='ignore')

# NOTE(review): unlike the model-selection cells, no StandardScaler step is used
# here (it had been commented out) — confirm that clustering on unscaled values
# is intentional.
kmeans_pipeline = Pipeline([
        ("cluster", KMeans(n_clusters=3, random_state=17, verbose=0))])
kmeans_pipeline.fit(cluster_features)
clusters = kmeans_pipeline.predict(cluster_features)
df_merchant_transactions['merchant_clusters'] = clusters
df_merchant_transactions.head()

In [None]:
# Dtype and non-null count of the newly added cluster-label column.
df_merchant_transactions['merchant_clusters'].info()

In [None]:
# Number of rows assigned to each cluster label.
df_merchant_transactions['merchant_clusters'].value_counts()

In [None]:
# Scatter of the 'Aug 20' column coloured by cluster label.
# NOTE(review): x and y are the same column, so every point falls on the
# diagonal — presumably one axis was meant to be a different month; confirm.
sns.scatterplot(
    data=df_merchant_transactions,
    x='Aug 20',
    y='Aug 20',
    hue='merchant_clusters',
);

### 𝑘-means clustering for time series

In [None]:
# Preview the frame, which now includes the 'merchant_clusters' column.
df_merchant_transactions.head()

In [None]:
# Work on a copy so the time-series experiments do not modify the frame above.
df_merchant_transactions_tslearn = df_merchant_transactions.copy()
df_merchant_transactions_tslearn.head()

In [None]:
# Remove the KMeans labels so only the original feature columns remain.
# errors='ignore' keeps the cell re-runnable after the column is already gone.
df_merchant_transactions_tslearn = df_merchant_transactions_tslearn.drop(columns='merchant_clusters', errors='ignore')

In [None]:
# Define a MinMaxScaler (not a StandardScaler): by default it rescales each
# column independently to the [0, 1] range.
scaler = MinMaxScaler()
# Fit the scaler on the feature frame and transform it into a NumPy array.
X_series = scaler.fit_transform(df_merchant_transactions_tslearn)

In [None]:
from tslearn.utils import to_time_series_dataset
# Convert the scaled 2-D array into tslearn's time-series dataset format.
X_tslearn = to_time_series_dataset(X_series)
# Show the resulting container type.
type(X_tslearn)

In [None]:
from tslearn.clustering import TimeSeriesKMeans
# Import tslearn's silhouette_score under an alias: importing it by its plain
# name would shadow sklearn.metrics.silhouette_score, which the earlier
# model-selection cells rely on when they are re-run.
from tslearn.clustering import silhouette_score as ts_silhouette_score

# DTW-based k-means with 3 clusters, seeded (r_seed) so the result is reproducible.
km = TimeSeriesKMeans(n_clusters=3, metric="dtw", random_state=r_seed)
labels = km.fit_predict(X_tslearn)
# Silhouette score of the clustering, measured with the same DTW metric.
ts_silhouette_score(X_tslearn, labels, metric="dtw")

In [None]:
from yellowbrick.cluster import SilhouetteVisualizer
from tslearn.clustering import TimeSeriesKMeans

# Silhouette plot of a DTW k-means model for k = 2 and k = 3.
for k in range(2, 4):
    model = TimeSeriesKMeans(k, metric="dtw", random_state=r_seed, verbose=0)
    # Give every k its own axes and finalise the figure with show(); otherwise the
    # visualizers all draw onto the current axes and the plots overlap.
    sil_fig, sil_ax = plt.subplots()
    visualizer = SilhouetteVisualizer(model, ax=sil_ax, colors='yellowbrick')
    visualizer.fit(X_series)
    visualizer.show()