# Unsupervised learning

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from pandas.api.types import CategoricalDtype
#import statsmodels.api as sm
import seaborn as sns
#import pickle

import warnings


def warn(*args, **kwargs):
    """Accept any arguments and do nothing (no-op stand-in for ``warnings.warn``)."""
    return None


# Swap the no-op in for the standard function so that code calling
# ``warnings.warn(...)`` through the module attribute emits nothing.
warnings.warn = warn


from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_samples, silhouette_score

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

from sklearn.utils import resample


# Set up plotting options for seaborn and matplotlib
# seaborn: use the 'notebook' scaling context and the 'ticks' axes style.
sns.set_context('notebook') 
sns.set_style('ticks') 
# IPython magic (not plain Python): show figures inline in the notebook output.
%matplotlib inline
# Default figure size as (width, height) for the plots that follow.
# NOTE(review): presumably set after the magic so backend activation cannot override it — confirm.
plt.rcParams['figure.figsize'] = (9, 6)

## Dataset
We will use the Iris dataset to demonstrate unsupervised learning methods.

In [None]:
# Load seaborn's example 'iris' dataset into `iris` and preview the first rows.
# NOTE(review): load_dataset presumably fetches the data online unless it is cached — confirm for offline use.
iris = sns.load_dataset('iris')
iris.head()

In [None]:
# Show the distinct values found in the 'species' column.
iris['species'].unique()

In [None]:
# Split the table by position: columns 0-3 are kept as the features and
# column 4 as the target. The slice end is exclusive, so 0:4 is needed to
# include column 3 (0:3 would silently drop it).
iris_features = iris.iloc[:, 0:4]
iris_target = iris.iloc[:, 4]
print(iris_features.head())

In [None]:
# Scatter plot of petal length (x) against sepal width (y) for all rows;
# assigning to `_` keeps the returned object out of the cell output.
_ = sns.scatterplot(x='petal_length',y='sepal_width', data=iris)

> ## Challenge 1
>
> 1. Take a few moments to explore the Iris dataset. What can you learn? Which species do you think will be easier to separate?
> 
> {: .source}
>
> > ## Solution
> > 
> > 1.
> > {: .output}
> {: .solution}
{: .challenge}

## K-means clustering
K-means looks for a fixed number (k) of clusters in a dataset. The K-means algorithm:
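- chooses k initial centroids (cluster centres)
- assigns every data point to its nearest centroid, so that the spread of points around each centroid (the within-cluster variance) is as small as possible
- iteratively recomputes each centroid as the mean of its assigned points and reassigns the points, repeating until the centroid positions stop changing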

### scale data

In [None]:
# Standardise the feature columns; the scaler is fitted on, and then applied
# to, the same data. The result is stored separately from iris_features.
iris_features_sc = StandardScaler().fit(iris_features).transform(iris_features)

### Try 2 clusters

In [None]:
# Fit K-means with two clusters (fixed seed) on the scaled features and keep
# the cluster index assigned to each sample.
model_kmeans_k2 = KMeans(n_clusters=2, random_state=0).fit(iris_features_sc)
labels_k2 = model_kmeans_k2.labels_

In [None]:
# Petal length vs sepal width, each point coloured by its k=2 cluster label.
_ = plt.scatter(iris['petal_length'], iris['sepal_width'], c=labels_k2, cmap='viridis', edgecolor='k')

### Try 3 clusters

In [None]:
# Fit K-means with three clusters (fixed seed) on the scaled features and keep
# the cluster index assigned to each sample.
model_kmeans_k3 = KMeans(n_clusters=3, random_state=0).fit(iris_features_sc)
labels_k3 = model_kmeans_k3.labels_

In [None]:
# Petal length vs sepal width, each point coloured by its k=3 cluster label.
_ = plt.scatter(iris['petal_length'], iris['sepal_width'], c=labels_k3, cmap='viridis', edgecolor='k')

### Optimal number of clusters
One of the trickier tasks in clustering is identifying the appropriate number of clusters k.

In [None]:
from scipy.spatial.distance import cdist

# Candidate numbers of clusters to evaluate: 2 through 11.
k_range = np.arange(2,12)
silhouette_averages = []  # average silhouette score, one entry per k
distortions = []  # mean distance from a sample to its nearest centroid, one entry per k

for k in k_range:
    model_kmeans = KMeans(n_clusters=k, random_state=2)
    label_kmeans = model_kmeans.fit_predict(iris_features_sc)

    silhouette_avg = silhouette_score(iris_features_sc, label_kmeans)
    print("For %i clusters, the average silhouette_score is %.3f"%(k, silhouette_avg))
    silhouette_averages.append(silhouette_avg)

    # Euclidean distance from every sample (rows) to every centroid (columns);
    # the row-wise minimum is each sample's distance to its nearest centroid.
    centroid_distances = cdist(iris_features_sc, model_kmeans.cluster_centers_, 'euclidean')
    distortions.append(centroid_distances.min(axis=1).mean())

In [None]:
# Average silhouette score (y) for each candidate number of clusters k (x).
_ = plt.plot(k_range, silhouette_averages)

In [None]:
# Mean nearest-centroid distance (y) for each candidate number of clusters k (x).
_ = plt.plot(k_range, distortions)

## Hierarchical Clustering

In [None]:
# Agglomerative (hierarchical) clustering model; no arguments are passed, so
# every setting, including the number of clusters, is the library default.
model_hclust = AgglomerativeClustering()

In [None]:
# Fit the hierarchical model and keep whatever fit() returns in iris_hc.
# NOTE(review): this uses the unscaled iris_features, whereas the K-means
# cells are fitted on iris_features_sc — confirm this is intentional.
iris_hc = model_hclust.fit(iris_features)

## Principal Component Analysis (PCA)

In [None]:
# Fit a 3-component PCA and store the transformed coordinates in iris_pca.
# NOTE(review): this uses the unscaled iris_features, whereas the K-means
# cells are fitted on iris_features_sc — confirm this is intentional.
iris_pca = PCA(n_components=3).fit_transform(iris_features)

In [None]:
# Scatter the samples on the first two principal components, coloured by the
# target label. A scatter plot is used because plt.plot with linestyle='' and
# no marker draws nothing, and `label=` cannot colour individual points.
ax = sns.scatterplot(x=iris_pca[:,0], y=iris_pca[:,1], hue=iris_target)
plt.xlabel('PC1')
plt.ylabel('PC2')

## t-SNE