### Importing required packages

In [None]:
import pandas as pd
import pandas_profiling
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn.decomposition import PCA
import matplotlib.cm as cm
from sklearn.cluster import MiniBatchKMeans
import pickle

### Importing the csv dataset file to pandas df

In [None]:
# Load the CSV dataset into the DataFrame `data`, which the following cells operate on.
# NOTE(review): `filename` is not defined in any visible cell; set it to the
# CSV path before running this cell.
data= pd.read_csv(filename)

### Know the dimensions of your data

In [None]:
# (number of rows, number of columns) of the freshly loaded DataFrame.
data.shape

### Creating pandas profile.

##### A pandas profile gives us a statistical overview of the data. Important metrics in the profile report, such as missing values, the quantile distribution of the data, skewness, correlation, and standard deviation, give us deeper insight into our data.

In [None]:
# Build the profiling report for `data`.
# NOTE(review): upstream has renamed the `pandas_profiling` package to
# `ydata-profiling`; confirm which one is installed in the target environment.
pandas_profiling.ProfileReport(data)

### Data cleaning

##### Remove features with more than 80% missing values
##### Eliminate outliers 
##### Remove highly correlated features

In [None]:
# Code to remove features that are not required goes here



### Relook at the dimensions of the data

In [None]:
# (number of rows, number of columns) after the cleaning step above.
data.shape

### Calculating the skewness of features

In [None]:
# Skewness per feature, sorted from most negative to most positive.
# numeric_only=True restricts the computation to numeric columns; without it
# recent pandas versions raise as soon as a non-numeric column is present.
data.skew(numeric_only=True).sort_values()

### Data transformation if the data is highly skewed

#### There are many ways to deal with skewed data. Here I have demonstrated how to use the Quantile transformation to deal with skewed data

In [None]:
def _quantile_bin(column):
    """Return `column` relabelled into ordinal quantile bins.

    The quartiles q1/q2/q3 of the column decide which bins exist:

    * q3 == 0: values > 0 become 3.
    * q2 == 0: values strictly between 0 and q3 become 2, values >= q3
      become 3.
    * q1 == 0: values strictly between 0 and q2 become 1, values in
      [q2, q3] become 2, values > q3 become 3.
    * otherwise: values <= q1 become 0, (q1, q2] become 1, (q2, q3] become 2,
      values > q3 become 3.

    Values matching none of the conditions (including NaN) are left unchanged.

    Every condition is evaluated against the ORIGINAL values and the first
    matching condition wins.  Relabelling step by step in place would compare
    already-assigned labels (0-3) against the quantiles and silently move them
    into another bin whenever a label falls inside a later quantile range.
    """
    q1, q2, q3 = (column.quantile(q) for q in (0.25, 0.5, 0.75))

    if q3 == 0:
        conditions = [column > 0]
        bins = [3]
    elif q2 == 0:
        # Explicit comparisons instead of between(..., inclusive=False): the
        # boolean form of `inclusive` is rejected by current pandas releases.
        conditions = [(column > q2) & (column < q3), column >= q3]
        bins = [2, 3]
    elif q1 == 0:
        conditions = [
            (column > q1) & (column < q2),
            column.between(q2, q3),
            column > q3,
        ]
        bins = [1, 2, 3]
    else:
        conditions = [
            column <= q1,
            column.between(q1, q2),
            column.between(q2, q3),
            column > q3,
        ]
        bins = [0, 1, 2, 3]

    binned = np.select(
        [condition.to_numpy() for condition in conditions],
        bins,
        default=column.to_numpy(),
    )
    return pd.Series(binned, index=column.index, name=column.name)


data.reset_index(drop=True, inplace=True)

# Columns are addressed by position so that duplicate column names cannot
# cause the wrong column to be read or overwritten.
for i in range(len(data.columns)):
    data.iloc[:, i] = _quantile_bin(data.iloc[:, i])
    # Show how many rows ended up in each bin of this column.
    print(data.iloc[:, i].value_counts())


### Dimension reduction using PCA

#### K-means performs poorly when the number of features is high. PCA helps us reduce the number of features, thus improving the performance of K-means.

In [None]:
# Scale features if the features have different scales
X_std = StandardScaler().fit_transform(data)

# PCA cannot extract more components than min(n_samples, n_features), so cap
# the requested 5 components for small or narrow datasets instead of raising.
n_components = min(5, *X_std.shape)
pca = PCA(n_components=n_components)
principalComponents = pca.fit_transform(X_std)

# Plot the explained variances
features = range(pca.n_components_)
plt.bar(features, pca.explained_variance_ratio_, color='blue')
plt.xlabel('PCA features')
plt.ylabel('variance %')
plt.xticks(features)
# Flush this figure so later plots do not get drawn onto the same axes.
plt.show()

# One column per principal component; consumed by the clustering cells below.
PCA_components = pd.DataFrame(principalComponents)

### Elbow method to determine k

#### Select the first n PCA components where the variance drop amongst components is the least

In [None]:
# Evaluate k on the same leading two PCA components that the clustering cells
# cluster on, so the elbow reflects the feature space actually used.
elbow_features = PCA_components.iloc[:, :2]

ks = range(1, 10)
inertias = []
for k in ks:
    # Create a KMeans instance with k clusters; the fixed seed makes the
    # inertia curve reproducible across runs.
    model = KMeans(n_clusters=k, random_state=10)

    # Fit model to samples
    model.fit(elbow_features)

    # Append the inertia to the list of inertias
    inertias.append(model.inertia_)

# The "elbow" is the k after which inertia stops dropping sharply.
plt.plot(ks, inertias, '-o', color='black')
plt.xlabel('number of clusters, k')
plt.ylabel('inertia')
plt.xticks(ks)
plt.show()

### K-means and accuracy metrics

In [None]:
# Candidate cluster counts, evaluated on the first two PCA components.
range_n_clusters = list(range(2, 9))
X = PCA_components.iloc[:, :2]

# One array of cluster assignments per candidate, in evaluation order.
labels = []

for n_clusters in range_n_clusters:
    # Seed fixed at 10 so repeated runs produce the same clustering.
    clusterer = KMeans(random_state=10, n_clusters=n_clusters)
    cluster_labels = clusterer.fit_predict(X)
    labels.append(cluster_labels)

    # Mean silhouette over all samples: a summary of how dense and how well
    # separated the resulting clusters are.
    silhouette_avg = silhouette_score(X, cluster_labels)

    print("For n_clusters =", n_clusters,
          "The average silhouette_score is :", silhouette_avg)
    print("For n_clusters =", n_clusters,
          "The inertia is :", clusterer.inertia_ )

    # Number of samples assigned to each cluster.
    print(pd.Series(cluster_labels).value_counts())
    

### MiniBatchKmeans and accuracy metrics (If our dataset is very large we use the Mini Batch version of K means)

In [None]:
X = PCA_components.iloc[:, :2]
range_n_clusters = [2, 3, 4, 5, 6, 7, 8]
labels=[]

# silhouette_score works on pairwise distances between samples, which does not
# scale to the large datasets this cell is meant for.  Above this many rows the
# score is estimated on a seeded random subsample; smaller inputs are scored in
# full, exactly as before.
max_silhouette_samples = 10000
silhouette_sample_size = max_silhouette_samples if len(X) > max_silhouette_samples else None

for n_clusters in range_n_clusters:

    # Initialize the clusterer with n_clusters value and a random generator
    # seed of 10 for reproducibility.
    clusterer = MiniBatchKMeans(n_clusters=n_clusters, random_state=10, batch_size=100)
    cluster_labels = clusterer.fit_predict(X)
    labels.append(cluster_labels)

    # The silhouette_score gives the average value over the scored samples.
    # This gives a perspective into the density and separation of the formed
    # clusters
    silhouette_avg = silhouette_score(
        X,
        cluster_labels,
        sample_size=silhouette_sample_size,
        random_state=10,
    )
    print("For n_clusters =", n_clusters,
          "The average silhouette_score is :", silhouette_avg)

    print("For n_clusters =", n_clusters,
          "The inertia is :", clusterer.inertia_ )

    # Number of samples assigned to each cluster.
    print(pd.Series(cluster_labels).value_counts())

### Saving our K means model using Pickle

In [None]:
# NOTE(review): `clusterer` is whatever the most recently executed clustering
# loop left behind, i.e. the model for the last n_clusters value it tried.
# Refit with the chosen k before saving if that is not the model you want.
# The context manager guarantees the file is flushed and closed after the dump.
with open("save.pkl", "wb") as model_file:
    pickle.dump(clusterer, model_file)