<img src="res/itm_logo.jpg" width="300px">

## Inteligencia Artificial - IAI84
### Instituto Tecnológico Metropolitano
#### Pedro Atencio Ortiz - 2020

__Agenda__:

<ul>
    <li>Metrics</li>
    <li>Train / Dev Data Splitting</li>
    <li>K-Fold</li>
</ul>

In [None]:
import numpy as np
import time
import matplotlib.pyplot as plt

from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn import datasets

In [None]:
'''
    Utility functions
'''

def generate_data(data_type):
    """
    Generate a binary dataset with distribution data_type

    Arguments:
    data_type -- distribution of dataset {moons,circles,blobs}

    Returns:
    X -- features
    Y -- labels

    Raises:
    ValueError -- if data_type is not one of the supported distributions
    """
    # Seed NumPy's global RNG before sampling so repeated calls are meant to
    # return the same moons/circles data (blobs is pinned via random_state).
    np.random.seed(0)
    if data_type == 'moons':
        X, Y = datasets.make_moons(200, noise=0.10)
    elif data_type == 'circles':
        # Go through the imported `datasets` module: the bare name `sklearn`
        # is not bound by `from sklearn import datasets`.
        X, Y = datasets.make_circles(200, noise=0.1)
    elif data_type == 'blobs':
        X, Y = datasets.make_blobs(centers=2, random_state=0)
    else:
        # Fail with a clear message instead of an UnboundLocalError on return
        raise ValueError(
            "Unknown data_type {!r}; expected 'moons', 'circles' or 'blobs'".format(data_type)
        )
    return X, Y

def visualize(X, y, model):
    """
    Plot the decision regions of model together with the samples

    Arguments:
    X -- features forwarded to plot_decision_boundary
    y -- labels forwarded to plot_decision_boundary
    model -- object exposing a predict(points) method
    """
    def predict(points):
        # Resolve model.predict at call time, one batch of points per call
        return model.predict(points)

    plot_decision_boundary(predict, X, y)

def plot_decision_boundary(pred_func, X, y):
    """
    Plot the regions predicted by pred_func and overlay the samples

    Arguments:
    pred_func -- callable that receives an array with one (x0, x1) point per
                 row and returns one prediction per point
    X -- samples; columns 0 and 1 are used as the plot axes
    y -- per-sample values used to colour the scatter points
    """
    margin = .5   # padding added around the data range on every side
    step = 0.01   # distance between neighbouring grid points

    # Padded bounding box of the samples
    x_lo, x_hi = X[:, 0].min() - margin, X[:, 0].max() + margin
    y_lo, y_hi = X[:, 1].min() - margin, X[:, 1].max() + margin

    # Regular grid of points covering the bounding box
    xx, yy = np.meshgrid(np.arange(x_lo, x_hi, step), np.arange(y_lo, y_hi, step))

    # Predict the function value for the whole grid, then fold the flat
    # predictions back into the grid layout
    grid_points = np.c_[xx.ravel(), yy.ravel()]
    Z = pred_func(grid_points).reshape(xx.shape)

    # Filled contour of the predictions with the samples drawn on top
    plt.figure(figsize=(5,5))
    plt.contourf(xx, yy, Z, cmap=plt.cm.Spectral)
    plt.scatter(X[:, 0], X[:, 1], c=y, cmap=plt.cm.Spectral)
    plt.show()

<hr>

# 1. Metrics

In [None]:
# Build the 'circles' dataset; X and Y are reused by the cells below
X, Y = generate_data('circles')

In [None]:
# Map each label to a display colour: class 1 -> red, anything else -> green
color = ['green' if label != 1 else 'red' for label in Y]

# Scatter the two feature columns against each other
plt.figure(figsize=(5, 5))
plt.scatter(X[:, 0], X[:, 1], color=color)

plt.show()

In [None]:
# 1. Train on the complete dataset (no samples are held out here)
model = KNeighborsClassifier(n_neighbors=3, metric='euclidean')
model.fit(X,Y)

In [None]:
# Decision regions of the model drawn over the samples it was trained on
visualize(X, Y, model)

In [None]:
from sklearn.metrics import accuracy_score, f1_score

# Score the model on the very same samples it was fitted on
preds = model.predict(X)
acc, f1 = accuracy_score(Y, preds), f1_score(Y, preds)

print("Accuracy over training set: ", acc)
print("f1-score over training set: ", f1)

<hr>

# 2. Data Splitting

In [None]:
# Shuffle the sample indices so the split below is drawn at random,
# which helps keep the dataset balanced across both subsets

perm_index = np.random.permutation(len(X))

In [None]:
# Take 70% of the samples for training and the remaining 30% for evaluation

n_train = int(0.7 * len(X))
train_index, eval_index = perm_index[:n_train], perm_index[n_train:]

# Build the training and evaluation datasets

X_train, X_eval = X[train_index], X[eval_index]
Y_train, Y_eval = Y[train_index], Y[eval_index]

print(Y_train)
print(Y_eval)

In [None]:
# Train on one dataset and evaluate on a different one

model = KNeighborsClassifier(n_neighbors=3, metric='euclidean')
model.fit(X_train,Y_train)

In [None]:
# Score the model on the held-out evaluation samples only
Y_pred = model.predict(X_eval)
acc, f1 = accuracy_score(Y_eval, Y_pred), f1_score(Y_eval, Y_pred)

print("Accuracy over evaluation set: ", acc)
print("f1-score over evaluation set: ", f1)

In [None]:
# Decision regions of the model drawn over the training samples
visualize(X_train, Y_train, model)

In [None]:
# Same decision regions, now drawn over the evaluation samples
visualize(X_eval, Y_eval, model)

<hr>

# 3. K-Fold

In [None]:
from sklearn.model_selection import KFold

In [None]:
# Number of folds
k = 5

# Splitter that partitions the sample indices into k train/test pairs
kfolds = KFold(n_splits=k)

# Running total of the per-fold accuracies; averaged over k afterwards
sum_accuracy = 0

for train_index, test_index in kfolds.split(X):
    # Data splitting
    X_train, X_test = X[train_index], X[test_index]
    Y_train, Y_test = Y[train_index], Y[test_index]

    # Model training: a fresh classifier per fold, configured like the
    # models trained in the previous sections
    model = KNeighborsClassifier(n_neighbors=3, metric='euclidean')
    model.fit(X_train, Y_train)

    # Prediction and evaluation on the samples left out of this fold
    Y_pred = model.predict(X_test)
    sum_accuracy += accuracy_score(y_true=Y_test, y_pred=Y_pred)

In [None]:
# Report the mean accuracy across the k folds
print("K-fold accuracy over evaluation dataset: ", sum_accuracy / k)