In [None]:
# Set default matplotlib figure size (width, height) for every plot below
import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = [10, 5]

# current version of seaborn generates a bunch of warnings that we'll ignore.
# NOTE: the filter below has no category or module restriction, so it silences
# ALL warnings raised after this point (not only seaborn's). It is installed
# before seaborn is imported so that import-time warnings are hidden as well.
import warnings
warnings.filterwarnings("ignore")

# Plotting, data handling and machine-learning helpers used by the cells below
import seaborn as sns
from pandas import DataFrame, read_excel
from sklearn import preprocessing
from sklearn.cluster import KMeans

# Distinguish Iris species based on flower morphological features

<img align="right" src="../data/iris_petal_sepal.png">
The Iris flower data set, also known as Fisher's Iris data set, is a multivariate data set introduced by the British statistician and biologist Ronald Fisher.

The data set consists of 50 samples from each of three species of Iris (*Iris setosa*, *Iris virginica* and *Iris versicolor*). Four features were measured from each sample: the length and the width of the sepals and petals, in centimeters.

Toolkit: 
  * **Seaborn, statistical data visualization [docs](https://seaborn.pydata.org/api.html)**
  * **Scikit-learn, machine learning in python [docs](http://scikit-learn.org/stable/documentation.html)**

In [None]:
# Load the Iris workbook into a DataFrame; the path is relative to the notebook's directory.
df = read_excel(io='../data/iris-dataset.xls')

In [None]:
# Pairwise scatter plots of the four measurements, coloured by species.
# `vars` pins the plot to the first four columns (the same ones used as
# features further down) so that re-running this cell after extra columns
# have been added to `df` still produces the same figure.
f1 = sns.pairplot(df, hue="species", vars=list(df.columns[:4]))

In [None]:
# The features live on different scales (eg sepallength is ~4x petalwidth).
# Standardize each column so that the larger-valued features do not dominate
# distance-based methods such as clustering.

# The measurement columns are the first four columns of the frame.
features = df.iloc[:, 0:4]

# Learn the per-column scaling and apply it in a single step; `scaler`
# remains fitted afterwards and can be reused on new samples.
scaler = preprocessing.StandardScaler()
features_scaled_array = scaler.fit_transform(features)

# Same values as a DataFrame, keeping the original column names.
features_scaled = DataFrame(data=features_scaled_array, columns=features.columns)

## Species identification by clustering: k-means

In [None]:
# Cluster the standardized measurements into three groups (one per expected species).
# random_state pins the centroid initialisation so the labels, and the table
# below, are identical on every Restart & Run All. n_init is set explicitly
# because its default value differs between scikit-learn releases.
estimator = KMeans(n_clusters=3, n_init=10, random_state=0)
estimator.fit(features_scaled_array)
df['kmeans'] = estimator.labels_

# Contingency table: rows are the true species, columns are the k-means cluster ids.
# Cluster ids are arbitrary, so a good clustering shows one dominant column per row.
df.groupby(['species', 'kmeans']).size().unstack(fill_value=0)

## Species identification using Principal Component Analysis

In [None]:
from sklearn.decomposition import PCA

# Project the standardized measurements onto their first two principal components.
pca = PCA(n_components=2)
Y_sklearn = pca.fit_transform(features_scaled_array)

# Share of the total variance captured by each component, followed by the
# loadings (contribution of every original feature to each component).
print('explained variance ratio (first two components): {0}'.format(pca.explained_variance_ratio_))
print(DataFrame(pca.components_, columns=features.columns, index=['PC-1','PC-2']))

# matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8-*' and
# later dropped the old names, so use whichever spelling this installation
# provides (falling back to the default style if neither exists).
whitegrid_style = next(
    (name for name in ('seaborn-v0_8-whitegrid', 'seaborn-whitegrid')
     if name in plt.style.available),
    'default',
)

# Plain NumPy labels give boolean masks that index the projected array
# positionally, independent of the DataFrame's index.
species_labels = df['species'].to_numpy()

with plt.style.context(whitegrid_style):
    for lab, col in zip(df['species'].unique(),
                        ('blue', 'red', 'green')):
        is_lab = species_labels == lab
        plt.scatter(Y_sklearn[is_lab, 0],
                    Y_sklearn[is_lab, 1],
                    label=lab,
                    c=col)
    plt.xlabel('Principal Component 1')
    plt.ylabel('Principal Component 2')
    plt.legend(loc='lower right')
    plt.show()

In [None]:
from sklearn import tree

# Fit a decision tree that predicts the species from the four raw (unscaled)
# measurements. scikit-learn permutes the candidate features at every split,
# so random_state is fixed to make the fitted tree reproducible between runs.
clf = tree.DecisionTreeClassifier(random_state=0)
clf = clf.fit(features, df['species'])

In [None]:
import graphviz

# Render the fitted tree as a Graphviz diagram.
# class_names must follow the classifier's own (ascending) class order, so it
# is taken from clf.classes_ rather than from the order in which species
# happen to appear in the data. Both name sequences are passed as plain lists
# because some scikit-learn releases reject pandas Index / ndarray here.
dot_data = tree.export_graphviz(
    clf,
    out_file=None,
    feature_names=list(features.columns),
    class_names=[str(label) for label in clf.classes_],
    filled=True,
    rounded=True,
    special_characters=True,
)
graphviz.Source(dot_data)