In [None]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

import sklearn
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics
from sklearn.model_selection  import cross_val_score
from sklearn.model_selection import GridSearchCV

In [None]:
# Load the iris dataset that ships with scikit-learn. Later cells read
# iris.data, iris.target, iris.target_names and iris.feature_names from it.
iris = load_iris()

In [None]:
# The feature matrix is 2-D: unpack its (rows, columns) shape.
n_samples, n_features = iris.data.shape
print(f"Number of samples: {n_samples}")
print(f"Number of features: {n_features}")
# Feature values of the first sample (first flower):
# sepal length, sepal width, petal length and petal width.
print(iris.data[0])

In [None]:
# Show the container type, the element dtype and the first five rows of the
# feature matrix; each heading is followed by its value on the next line.
print("Object type:", type(iris.data), sep="\n")
print("\nData type:", iris.data.dtype, sep="\n")
print("\nFirst 5 observations:", iris.data[:5], sep="\n")

In [None]:
# Compare the shape of the feature matrix with the shape of the label vector.
print(iris.data.shape)
print(iris.target.shape)

In [None]:
# Print the full vector of class labels, one entry per sample.
print(iris.target)

In [None]:
# Number of samples per class: np.bincount counts the occurrences of each
# non-negative integer label.
np.bincount(iris.target)

In [None]:
# Class names; later cells index this array with the integer label
# (iris.target_names[label]) to get the name of a class.
print(iris.target_names)

In [None]:
# Names of the features; used below as axis labels and DataFrame column names
# for the columns of iris.data.
print(iris.feature_names)

In [None]:
# Distribution of one feature (column `x_index` of iris.data), drawn as one
# histogram per class on shared axes.
x_index = 0
colors = ["blue", "red", "green"]

for label, color in zip(range(len(iris.target_names)), colors):
    plt.hist(iris.data[iris.target == label, x_index],
             label=iris.target_names[label],
             color=color,
             alpha=0.5)  # translucent bars: overlapping classes no longer hide each other

plt.xlabel(iris.feature_names[x_index])
plt.ylabel("count")
plt.legend(loc="upper right")
plt.show()

In [None]:
# Scatter one feature against another (columns `x_index` and `y_index` of
# iris.data), with one colour and one legend entry per class.
x_index, y_index = 3, 0
colors = ["blue", "red", "green"]

for label, (species, color) in enumerate(zip(iris.target_names, colors)):
    in_class = iris.target == label  # boolean mask selecting this class's rows
    plt.scatter(iris.data[in_class, x_index],
                iris.data[in_class, y_index],
                label=species,
                c=color)

plt.xlabel(iris.feature_names[x_index])
plt.ylabel(iris.feature_names[y_index])
plt.legend(loc="upper left")
plt.show()

In [None]:
# Wrap the feature matrix in a DataFrame, using the feature names as column labels.
iris_df = pd.DataFrame(iris.data, columns=iris.feature_names)

In [None]:
# Preview the first rows of the DataFrame.
iris_df.head()

In [None]:
# Summary statistics for the DataFrame's columns.
iris_df.describe()

In [None]:
# Translate each integer class code into its species name and store the result
# as a new "species_name" column.
target_mapper = dict(enumerate(iris.target_names))
iris_df["species_name"] = [target_mapper[code] for code in iris.target]

In [None]:
# Preview the last rows of the DataFrame.
iris_df.tail()

In [None]:
# Data type of each column of iris_df.
iris_df.dtypes

In [None]:
# Pairwise plots of the DataFrame's columns, coloured by the "species_name" column.
sns.pairplot(iris_df, hue="species_name")

In [None]:
# Short aliases used by the modelling cells: X is the feature matrix, y the label vector.
X, y = iris.data, iris.target

In [None]:
# Show the first five feature rows and the complete label vector, each under its heading.
print("The features for our first five observations:", X[:5], sep="\n")
print("\nThe targets:", y, sep="\n")

In [None]:
# Split into 80% training / 20% test data; the fixed random_state makes the
# split repeatable. No stratification is requested here.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.8, test_size=0.2, random_state=42
)
print("Labels for our training data:", y_train, sep="\n")
print("\nLabels for our testing data:", y_test, sep="\n")

In [None]:
# Percentage of samples per class in the full data and in each split.
for split_name, split_labels in (("All:", y), ("Training:", y_train), ("Test:", y_test)):
    print(split_name, np.bincount(split_labels) / float(len(split_labels)) * 100.0)

In [None]:
# Redo the 80/20 split, this time stratified on y so that each split keeps
# the class proportions of the full data.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.8, test_size=0.2, random_state=42, stratify=y
)

# Percentage of samples per class in the full data and in each split.
for split_name, split_labels in (("All:", y), ("Training:", y_train), ("Test:", y_test)):
    print(split_name, np.bincount(split_labels) / float(len(split_labels)) * 100.0)

In [None]:
# Decision tree classifier with a fixed seed for repeatable results.
dtc = DecisionTreeClassifier(random_state=42)
# Bare name as the last expression: the notebook displays the estimator's repr.
dtc

In [None]:
# Fit the decision tree on the training split.
dtc.fit(X_train, y_train)

In [None]:
# Predict class labels for the held-out test split and display them.
y_pred_dt = dtc.predict(X_test)
y_pred_dt

In [None]:
# Print the true test labels and the tree's predictions one after the other
# so they can be compared by eye.
print("True labels:", y_test, sep="\n")
print("\nPredicted labels:", y_pred_dt, sep="\n")

In [None]:
# Accuracy on the held-out test split, then on the data the tree was fitted on.
print("Test accuracy: ", metrics.accuracy_score(y_test, y_pred_dt))
print("Train accuracy: ", metrics.accuracy_score(y_train, dtc.predict(X_train)))

In [None]:
# Confusion matrix of the decision tree's test predictions. The explicit
# `labels` list fixes which classes appear and in which order.
cnf_matrix = metrics.confusion_matrix(y_test, y_pred_dt, labels=[0, 1, 2])
cnf_matrix

In [None]:
def plot_confusion_matrix(cnf_matrix, labels="auto"):
    """Render a confusion matrix as an annotated seaborn heatmap and show it.

    Parameters
    ----------
    cnf_matrix
        The matrix of values to draw; passed straight to ``sns.heatmap``.
    labels
        Tick labels used for both axes. The default ``"auto"`` is forwarded
        unchanged to seaborn's ``xticklabels``/``yticklabels``.

    The x axis is labelled "Predicted class" and the y axis "Actual class".
    The figure is displayed with ``plt.show()``; nothing is returned.
    """
    heatmap_options = dict(
        cmap="coolwarm_r",
        annot=True,        # write each cell's value inside the cell
        fmt="g",
        linewidths=0.5,
        xticklabels=labels,
        yticklabels=labels,
        cbar=False,
    )
    sns.heatmap(cnf_matrix, **heatmap_options)

    plt.title("Confusion matrix")
    plt.xlabel("Predicted class")
    plt.ylabel("Actual class")
    plt.show()

In [None]:
# Plot the decision tree's confusion matrix with the class names as tick labels.
plot_confusion_matrix(cnf_matrix, labels=iris.target_names)

In [None]:
# Random forest with 10 trees and a fixed seed: fit on the training split,
# then predict the held-out test split.
rf = RandomForestClassifier(n_estimators=10, random_state=42)
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)

In [None]:
# Accuracy on the held-out test split, then on the data the forest was fitted on.
print("Test accuracy: ", metrics.accuracy_score(y_test, y_pred_rf))
print("Train accuracy: ", metrics.accuracy_score(y_train, rf.predict(X_train)))

In [None]:
# Confusion matrix of the random forest's test predictions. The label order is
# fixed explicitly so the matrix is always 3x3 and its rows/columns line up with
# iris.target_names, even if a class is missing from both y_test and y_pred_rf.
plot_confusion_matrix(metrics.confusion_matrix(y_test, y_pred_rf, labels=[0, 1, 2]),
                      labels=iris.target_names)

In [None]:
# k-nearest-neighbours classifier using a single neighbour: fit on the training
# split, then predict the held-out test split.
knn = KNeighborsClassifier(n_neighbors=1)
knn.fit(X_train, y_train)
y_pred_knn = knn.predict(X_test)

In [None]:
# Accuracy on the held-out test split, then on the data the model was fitted on.
print("Test accuracy: ", metrics.accuracy_score(y_test, y_pred_knn))
print("Train accuracy: ", metrics.accuracy_score(y_train, knn.predict(X_train)))

In [None]:
# Display the estimator's repr to inspect its parameters.
knn

In [None]:
# Candidate n_neighbors values for the grid search: the integers 1 through 24.
k_list = [*range(1, 25)]
k_list

In [None]:
# GridSearchCV takes a dictionary mapping the parameter name to a list of parameter settings
params = {"n_neighbors": k_list}
# The deprecated `iid` argument is intentionally not passed: it was removed from
# GridSearchCV in scikit-learn 0.24, where supplying it raises a TypeError.
grid = GridSearchCV(KNeighborsClassifier(),
                    param_grid=params,
                    cv=3,  # 3-fold cross-validation: every candidate is scored on 3 train/validation splits
                    scoring="accuracy",
                    return_train_score=True)

In [None]:
# Run the grid search on the training split only; the test split stays untouched.
grid.fit(X_train, y_train)

In [None]:
# Tabulate the search results, sorted by the "rank_test_score" column.
pd.DataFrame(grid.cv_results_).sort_values("rank_test_score")

In [None]:
# Keep the estimator the grid search selected and display its repr.
best_knn = grid.best_estimator_
best_knn

In [None]:
# GridSearchCV (refit=True by default) has already refitted best_estimator_ on
# the full data given to grid.fit, i.e. X_train/y_train, so best_knn is ready to
# predict and does not need to be fitted again.
y_pred_best_knn = best_knn.predict(X_test)

In [None]:
# Accuracy of the selected k-NN model on the held-out test split, then on the training split.
print("Test accuracy: ", metrics.accuracy_score(y_test, y_pred_best_knn))
print("Train accuracy: ", metrics.accuracy_score(y_train, best_knn.predict(X_train)))