# Product Segmentation

In [None]:
# yellowbrick is only needed for the silhouette plots later in this notebook.
# Uncomment to install it into the kernel's environment:
# %pip install yellowbrick

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sn

In [None]:
import warnings
# Silence every warning for the rest of the session to keep cell output clean.
# NOTE(review): this also hides deprecation/convergence warnings raised by
# scikit-learn; consider narrowing the filter while developing.
warnings.filterwarnings('ignore')

In [None]:
# Path of the input CSV, relative to the notebook's working directory.
data_file = "beer.csv"

In [None]:
# Load the raw data; later cells read the 'calories', 'sodium', 'alcohol'
# and 'cost' columns from this frame.
beer_df = pd.read_csv( data_file )

In [None]:
# Display the loaded frame to inspect its contents.
beer_df

In [None]:
# Relationship between alcohol content and calories.
sn.scatterplot(x = 'alcohol', y = 'calories', data = beer_df);

In [None]:
# Relationship between alcohol content and cost.
sn.scatterplot(x = 'alcohol', y = 'cost', data = beer_df);

## Scaling

#### Rescaling the dataset

In [None]:
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Numeric features used for clustering. They are min-max rescaled so that no
# single feature dominates the distance computation.
feature_columns = ['calories', 'sodium', 'alcohol', 'cost']

scaler = MinMaxScaler()
# NOTE: despite the name, fit_transform returns a NumPy array by default,
# not a DataFrame.
scaled_beer_df = scaler.fit_transform( beer_df[feature_columns] )

In [None]:
from sklearn.cluster import KMeans

# Elbow method: fit KMeans for k = 1..9 and record each model's inertia_
# (within-cluster sum of squared distances) to see where adding clusters
# stops paying off.
cluster_range = range( 1, 10 )
cluster_errors = []

for num_clusters in cluster_range:
  # Fixed seed (same value as the other KMeans fits in this notebook) so the
  # curve is identical on every Restart & Run All.
  clusters = KMeans( n_clusters = num_clusters, random_state = 42 )
  clusters.fit( scaled_beer_df )
  cluster_errors.append( clusters.inertia_ )

plt.figure(figsize=(8,4))
plt.plot( cluster_range, cluster_errors, marker = "o" )
plt.xlabel( "Number of clusters (k)" )
plt.ylabel( "Inertia" );

#### Creating 4 Clusters

We will set k to 4 for running the *KMeans* algorithm and create a new column *clusterid* in *beer_df* to capture the cluster number each row is assigned to.

In [None]:
# Number of segments chosen for the final model.
k = 4

# Seeded fit so cluster ids stay stable across runs; fit() returns the
# estimator itself, so the call can be chained.
clusters = KMeans( n_clusters = k, random_state = 42 ).fit( scaled_beer_df )

# Attach each row's cluster label to the original (unscaled) frame.
beer_df["clusterid"] = clusters.labels_

In [None]:
# Cluster label assigned to each row, in the same order as the rows of
# scaled_beer_df.
clusters.labels_

## Understanding the clusters

In [None]:
# Distribution of calories within each cluster.
sn.violinplot(x = 'clusterid', y = 'calories', data = beer_df);

In [None]:
# Distribution of cost within each cluster.
sn.violinplot(x = 'clusterid', y = 'cost', data = beer_df);

In [None]:
# Cost vs. calories, coloured by assigned cluster.
sn.scatterplot(x = 'cost',
               y = 'calories',
               hue = 'clusterid',
               data = beer_df);

#### Cluster 0

In [None]:
# Rows assigned to cluster 0.
beer_df.loc[beer_df["clusterid"] == 0]

#### Cluster 1

In [None]:
# Rows assigned to cluster 1.
beer_df.loc[beer_df["clusterid"] == 1]

#### Cluster 2

In [None]:
# Rows assigned to cluster 2.
beer_df.loc[beer_df["clusterid"] == 2]

#### Cluster 3

In [None]:
# Rows assigned to cluster 3.
beer_df.loc[beer_df["clusterid"] == 3]

## Understanding the number of clusters using a dendrogram

In [None]:
# Light-to-dark sequential colormap for the heatmap cells.
cmap = sn.cubehelix_palette(rot=-.3, light=1, as_cmap=True)

# Heatmap of the scaled features, with dendrograms drawn along both axes.
sn.clustermap(scaled_beer_df,
              figsize = (6,6),
              linewidths=.2,
              cmap=cmap);

In [None]:
# Show the rows at positions 9 and 15 side by side.
# NOTE(review): presumably a pair that the dendrogram joins early - confirm
# against the clustermap output.
beer_df.iloc[[9,15]]

In [None]:
# Show the rows at positions 16 and 14 side by side.
# NOTE(review): presumably another pair that the dendrogram joins early -
# confirm against the clustermap output.
beer_df.iloc[[16,14]]

## Evaluating the quality of clusters using Silhouette score

In [None]:
from yellowbrick.cluster import SilhouetteVisualizer

# One silhouette plot per candidate k on a 2x2 grid, in reading order:
# k=2 and k=3 on the top row, k=4 and k=5 on the bottom row.
# NOTE: the loop variable rebinds the notebook-level `k`; it holds 5 after
# this cell has run.
fig, ax = plt.subplots(2, 2, figsize=(15,8))
num_clusters = [2, 3, 4, 5]
for i, k in enumerate(num_clusters):
    km = KMeans(n_clusters=k,
                random_state=42)
    # divmod maps the flat position i to (row, column) on the grid.
    q, mod = divmod(i, 2)
    # Use q as the row index as-is. Indexing with q-1 gives -1 for the first
    # two plots, which wraps around to the bottom row and swaps the rows.
    visualizer = SilhouetteVisualizer(km,
                                      colors='yellowbrick',
                                      ax=ax[q][mod])
    visualizer.fit(scaled_beer_df)
    # finalize() adds the title, axis labels and legend, so each panel shows
    # which k it belongs to.
    visualizer.finalize()

# Evaluating Cluster Performance

## Calinski-Harabasz Index

In [None]:
from sklearn.metrics import calinski_harabasz_score

In [None]:
# Compare candidate cluster counts (3, 4, 5) with the Calinski-Harabasz index;
# per the scikit-learn documentation, a higher score indicates better-defined
# clusters.
# NOTE: this loop rebinds `clusters`, so after this cell it refers to the
# last model fitted here rather than the model whose labels are stored in
# beer_df["clusterid"].
cluster_range = range( 3, 6 )
ch_scores = []

for num_clusters in cluster_range:
  # Fixed seed (same value as the other KMeans fits in this notebook) so the
  # reported scores are reproducible.
  clusters = KMeans( n_clusters = num_clusters, random_state = 42 )
  clusters.fit( scaled_beer_df )
  ch_score = calinski_harabasz_score( scaled_beer_df, clusters.labels_ )
  # Keep the score so it can be compared or plotted later, not just printed.
  ch_scores.append( ch_score )
  print(f"n_cluster: {num_clusters} - CH Score: {ch_score}" )