### Non-hierarchical clustering (k-means): Wholesale data

#### Import libraries  

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

#### Parameters  

In [None]:
# Input CSV file (relative path)
csv_in = '../ai-0102/pandas_training-utf8.csv'

# Raise pandas' display limits so up to 999 rows and 999 columns are shown
pd.set_option('display.max_rows', 999)
pd.set_option('display.max_columns', 999)

#### Read CSV file  

In [None]:
# Load the CSV: skip the first 13 lines of the file and treat the next line
# as the header row.
df = pd.read_csv(csv_in, delimiter=',', skiprows=13, header=0)
print(df.shape)
# DataFrame.info() prints its summary itself and returns None, so it is called
# directly; wrapping it in print() would emit an extra "None" line.
df.info()
display(df.head())
display(df.describe())

#### Separate data  

In [None]:
# Keep only the columns from 'FRESH' through the last column.
# .copy() makes df_data an independent frame instead of a slice of df, so
# adding a column to it later does not trigger pandas' SettingWithCopyWarning.
df_data = df.loc[:, 'FRESH':].copy()
print(df_data.shape)
display(df_data.head())

#### Standardization  

In [None]:
# Standardize every feature column with StandardScaler.
# A 'kmeans' label column is added to df_data further down in this notebook;
# dropping it here (only if it exists) keeps the cluster labels out of the
# features when this cell is re-run after clustering.
sc = StandardScaler()
X_std = sc.fit_transform(df_data.drop(columns='kmeans', errors='ignore'))

#### Elbow method  

In [None]:
# Fit k-means for k = 1..10 and record each fit's inertia_ so the "elbow"
# in the curve can be inspected.
k_values = range(1, 11)  # single definition shared by the loop and the plot
distortions = []
for i in k_values:
    # Fixed seed (the same value the final clustering uses) so the curve is
    # reproducible instead of changing with every random initialization.
    km = KMeans(n_clusters=i, random_state=7)
    km.fit(X_std)
    distortions.append(km.inertia_)
plt.plot(k_values, distortions, marker='o')
plt.xlabel('Number of clusters')
plt.ylabel('Distortion')
plt.show()

#### Execute clustering  

In [None]:
# Number of clusters to form
n_cls = 3
# Fit k-means with a fixed seed, then read the cluster label of each sample
km = KMeans(n_clusters=n_cls, random_state=7).fit(X_std)
cls = km.labels_

#### Add clustering results to df_data

In [None]:
# Attach the cluster label of each row as column 'kmeans'.
# assign() returns a new DataFrame, so this works without a
# SettingWithCopyWarning even when df_data is a slice of df, and it simply
# overwrites the column if the cell is run again.
df_data = df_data.assign(kmeans=cls)

#### Check the number of members in each cluster

In [None]:
# Count how many rows were assigned to each cluster label
cluster_sizes = df_data['kmeans'].value_counts()
print(cluster_sizes)

#### Calculate the average of each category per cluster

In [None]:
# Mean of every other column, computed separately for each cluster label
by_cluster = df_data.groupby('kmeans')
df_ave = by_cluster.mean()
display(df_ave)

#### Draw stacked bar plot  

In [None]:
# Stacked bar chart: one bar per cluster, one segment per column of df_ave.
xx = df_ave.index.tolist()   # x positions = cluster labels
height = np.zeros(n_cls)     # running top of each cluster's stack
plt.xticks(xx, df_ave.index)
for col_name, col in df_ave.items():
    # Draw this column's segment on top of what has been stacked so far
    plt.bar(xx, col, bottom=height, label=col_name)
    height += col.to_numpy()
plt.xlabel('Cluster No.')
plt.ylabel('Average of Annual Spending (m.u.)')
plt.legend()
plt.show()