# Importing The Libraries

In [1]:
# Core Libraries.
import pandas as pd
import numpy as np
import datetime as dt

#Visualization
import matplotlib.pyplot as plt
import matplotlib
%matplotlib inline
import seaborn as sns
from yellowbrick.cluster import KElbowVisualizer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from yellowbrick.cluster import SilhouetteVisualizer

# Display and precision handling settings
# Show every column and every row when a frame is printed (None = no limit).
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
# Allow up to 160 characters per printed line before pandas wraps the output.
pd.set_option('display.width', 160)
# Render floats with exactly three decimal places.
pd.set_option('display.float_format', lambda x: '%.3f' % x)

# Silence FutureWarning messages for the rest of the session.
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# Functions for EDA

In [2]:
def data_overview1(df, head=5):
    """Print a full textual overview of a DataFrame.

    Sections are printed in this order: shape, dtype counts, columns that
    contain missing values (sorted by count, descending), the number of
    duplicated rows, and the ``df.info()`` report.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to summarise.
    head : int, default 5
        Currently unused; kept so existing calls that pass it keep working.

    Returns
    -------
    None
        Everything is written to standard output.
    """
    banner_width = 125

    print(" SHAPE OF DATASET ".center(banner_width, '-'))
    print('Rows: {}'.format(df.shape[0]))
    print('Columns: {}'.format(df.shape[1]))
    print('Shape of Dataset: {}'.format(df.shape))

    print("DATA TYPES".center(banner_width, '-'))
    print(df.dtypes.value_counts())

    print(" MISSING VALUES ".center(banner_width, '-'))
    # Count nulls once and reuse the result for both the filter and the display.
    missing_counts = df.isnull().sum()
    print(missing_counts[missing_counts > 0].sort_values(ascending=False))

    print(" DUPLICATED VALUES ".center(banner_width, '-'))
    print(df.duplicated().sum())

    print("DATA INFO".center(banner_width, '-'))
    # df.info() writes its report itself and returns None; wrapping it in
    # print() would append a stray "None" line to the output.
    df.info()

In [3]:
def data_overview2(df, head=5):
    """Print a short textual overview of a DataFrame.

    Sections are printed in this order: shape, dtype counts, and the
    ``df.info()`` report.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to summarise.
    head : int, default 5
        Currently unused; kept so existing calls that pass it keep working.

    Returns
    -------
    None
        Everything is written to standard output.
    """
    banner_width = 125

    print(" SHAPE OF DATASET ".center(banner_width, '-'))
    print('Shape of Dataset: {}'.format(df.shape))

    print("DATA TYPES".center(banner_width, '-'))
    print(df.dtypes.value_counts())

    print("DATA INFO".center(banner_width, '-'))
    # df.info() writes its report itself and returns None; wrapping it in
    # print() would append a stray "None" line to the output.
    df.info()

In [4]:
def outlier_thresholds(df, variable, low_quantile=0.01, up_quantile=0.99, multiplier=1.5):
    """Compute upper and lower outlier limits for one column.

    The limits follow the Tukey-fence formula, applied to a configurable
    quantile pair instead of the classic quartiles:

        up_limit  = Q_up  + multiplier * (Q_up - Q_low)
        low_limit = Q_low - multiplier * (Q_up - Q_low)

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column.
    variable : str
        Name of the column to analyse.
    low_quantile : float, default 0.01
        Lower quantile (the default is the 1st percentile, not the 1st quartile).
    up_quantile : float, default 0.99
        Upper quantile (the default is the 99th percentile).
    multiplier : float, default 1.5
        Factor applied to the inter-quantile range.

    Returns
    -------
    tuple
        ``(up_limit, low_limit)`` -- note that the upper limit comes first.
    """
    lower_value = df[variable].quantile(low_quantile)
    upper_value = df[variable].quantile(up_quantile)
    interquantile_range = upper_value - lower_value
    up_limit = upper_value + multiplier * interquantile_range
    low_limit = lower_value - multiplier * interquantile_range
    return up_limit, low_limit

In [5]:
def replace_with_threshold(df, variable):
    """Cap values of ``df[variable]`` that exceed the upper outlier limit.

    The limit comes from ``outlier_thresholds(df, variable)``. Only the upper
    side is capped; values below the lower limit are left untouched. The
    frame is modified in place and nothing is returned.
    """
    limits = outlier_thresholds(df, variable)
    upper_cap = limits[0]  # outlier_thresholds returns (up_limit, low_limit)
    above_cap = df[variable] > upper_cap
    df.loc[above_cap, variable] = upper_cap

# Functions for Modelling

In [6]:
def compute_silhouette_scores(df, cluster_list=[2, 3, 4, 5]):
    """Draw a silhouette plot and print the average silhouette score per k.

    One KMeans model is fitted for every value in ``cluster_list`` and shown
    in its own panel of a two-column grid. The grid is sized from the number
    of requested values and panels are filled in list order, so any cluster
    counts (and any number of them) can be passed.

    Parameters
    ----------
    df : array-like
        Feature matrix handed to ``SilhouetteVisualizer.fit`` and
        ``silhouette_score``.
    cluster_list : iterable of int, default [2, 3, 4, 5]
        Numbers of clusters to evaluate. The default list is never mutated.

    Returns
    -------
    None
        Scores are printed; the figure is left open on the current pyplot state.
    """
    cluster_counts = list(cluster_list)
    if not cluster_counts:
        # Nothing to evaluate; avoid creating an empty figure.
        return

    n_cols = 2
    n_rows = -(-len(cluster_counts) // n_cols)  # ceiling division
    # squeeze=False keeps `ax` two-dimensional even when there is a single row.
    fig, ax = plt.subplots(n_rows, n_cols, figsize=(15, 4 * n_rows), squeeze=False)
    panels = ax.ravel()

    for panel, k in zip(panels, cluster_counts):
        km = KMeans(n_clusters=k, init='k-means++', n_init=10, max_iter=100, random_state=42)
        visualizer = SilhouetteVisualizer(km, colors='yellowbrick', ax=panel)
        # Fitting the visualizer also fits `km`, which makes km.labels_ available.
        visualizer.fit(df)
        silhouette_avg = silhouette_score(df, km.labels_)
        print("For n_clusters =", k, "The average silhouette_score is :", silhouette_avg)

    # With an odd number of models the last panel has nothing to show.
    for spare_panel in panels[len(cluster_counts):]:
        spare_panel.set_visible(False)

In [7]:
def plot_clusters(one, two, three, four, kmeans):
    """Scatter four cluster subsets on Recency vs. Frequency with centroids.

    Parameters
    ----------
    one, two, three, four : mapping or DataFrame
        Each must provide ``"Recency"`` and ``"Frequency"`` entries; they are
        drawn in green, red, grey and blue respectively.
    kmeans : fitted clustering model
        Must expose ``cluster_centers_``; columns 0 and 1 are plotted as the
        centroid coordinates.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    # Checking the quality of clustering in the data set
    _, ax = plt.subplots(figsize=(10, 5))

    coloured_subsets = ((one, 'green'), (two, 'red'), (three, 'grey'), (four, 'blue'))
    for subset, colour in coloured_subsets:
        ax.scatter(subset["Recency"], subset["Frequency"], color=colour)

    # NOTE(review): this assumes the model's first two features are Recency and
    # Frequency, in the same units as the plotted columns (e.g. not scaled) --
    # confirm against how `kmeans` was fitted.
    centres = kmeans.cluster_centers_
    ax.scatter(centres[:, 0], centres[:, 1], color="yellow", marker="*", label="centroid")

    ax.set_xlabel("Recency")
    ax.set_ylabel("Frequency")
    ax.legend()
    plt.show()

In [8]:
def plot_metrics_vs_clusters(df, lists):
    """Show one box plot per column in ``lists``, grouped by ``"Clusters"``.

    ``df`` must contain a ``"Clusters"`` column plus every column named in
    ``lists``. The seaborn figure size is set globally to 8x4 before plotting,
    and each plot is displayed immediately.
    """
    figure_options = {"figure.figsize": (8, 4)}
    sns.set(rc=figure_options)

    for metric in lists:
        box_axes = sns.boxplot(x="Clusters", y=metric, data=df)
        box_axes.set_title(f"{metric} v/s Clusters")
        plt.show()

In [9]:
# Distance metrics to compare when scoring a clustering.
metrics = [
    'euclidean',
    'manhattan',
    'cosine',
    'chebyshev',
    'minkowski',
    'sqeuclidean',
    'seuclidean',
    'mahalanobis',
    'correlation',
]


def print_silhouette_scores(data, labels, metrics):
    """Print the silhouette score of ``labels`` on ``data`` for each metric.

    ``metrics`` is an iterable of metric names passed one at a time to
    ``silhouette_score`` via its ``metric`` argument. The parameter shadows
    the module-level ``metrics`` list, so the caller chooses which names to use.
    """
    for metric in metrics:
        score = silhouette_score(
            data,
            labels,
            metric=metric,
        )
        print(f"Silhouette score with {metric} metric: {score}")

In [10]:
def plot_segment_counts(clusters, titles):
    """Draw count plots of the ``"Segment"`` column, two frames per figure.

    Parameters
    ----------
    clusters : sequence of DataFrame
        Each frame must contain a ``"Segment"`` column.
    titles : sequence of str
        Title for each frame, matched to ``clusters`` by position.

    Returns
    -------
    None
        Each figure is displayed as soon as it is drawn. The seaborn figure
        size is set globally to 20x4.
    """
    sns.set(rc={"figure.figsize": (20, 4)})
    n_frames = len(clusters)

    for i in range(0, n_frames, 2):
        fig, ax = plt.subplots(1, 2)
        sns.countplot(x="Segment", data=clusters[i], ax=ax[0]).set_title(titles[i])
        if i + 1 < n_frames:
            sns.countplot(x="Segment", data=clusters[i + 1], ax=ax[1]).set_title(titles[i + 1])
        else:
            # Odd number of frames: hide the empty right-hand panel.
            ax[1].set_visible(False)
        # plt.show() matches the other plotting helpers and works on non-GUI
        # (inline) backends, where Figure.show() only emits a warning.
        plt.show()