In [None]:
%matplotlib inline

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from sklearn import neighbors, datasets, linear_model
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_squared_error, r2_score

# Load the Iris dataset once; the cells below reuse it through the IRIS name
# (IRIS.data for the feature matrix, IRIS.target for the class labels).
IRIS = datasets.load_iris()

In [None]:
def mesh(X, h=.01):
    """Build a 2D grid covering the first two columns of X.

    The grid spans from one unit below the minimum to one unit above the
    maximum of each of the two columns, sampled every `h` units.

    Parameters
    ----------
    X : array indexable as X[:, 0] and X[:, 1]
        Samples whose first two columns define the area to cover.
    h : float, optional
        Step between consecutive grid points (default .01).

    Returns
    -------
    tuple
        The pair (xx, yy) of coordinate matrices produced by np.meshgrid.
    """
    first_col, second_col = X[:, 0], X[:, 1]

    # One-unit margin on every side so no sample sits on the border
    x_axis = np.arange(first_col.min() - 1, first_col.max() + 1, h)
    y_axis = np.arange(second_col.min() - 1, second_col.max() + 1, h)

    xx, yy = np.meshgrid(x_axis, y_axis)
    return (xx, yy)

# Index

* [About classification algorithms](#About-classification-algorithms)
* [Train and test subsamples](#Train-and-test-subsamples)
* [K nearest neighbors](#KNN)
* [Decision Trees](#Decision-Trees)
    * [Splitting the dataset](#Splitting-the-dataset)
    * [Avoiding overfitting](#Avoiding-overfitting)
* [Logistic regression](#Logistic-Regression)

## About classification algorithms
Some classification algorithms can only distinguish between two classes. How can we use them in multi-class problems? There are two approaches:
    
* **One vs one:** is the approach where we evaluate the classes in pairs. Say we have three classes, A, B and C. The OVO ensemble will be composed of 3 (= 3 * (3 - 1) / 2) binary classifiers. The first will discriminate A from B, the second A from C, and the third B from C. At prediction stage, the class that gets the highest number of "+1" predictions is our winner. Notice that the number of classifiers grows as $O(n^2)$ with the number of classes $n$.
      
* **One vs rest:** (aka one-vs-all) is the strategy that involves training one classifier (estimator) per class and then taking the one which gives the highest confidence.

[wiki](https://en.wikipedia.org/wiki/Multiclass_classification#Transformation_to_binary)

## Train and test subsamples
In general we should split the given data into two parts: one for training and the other for testing. Usually the testing slice is 1/3 of the dataset.

## KNN
K nearest neighbors is a *lazy* algorithm: it does not learn a model in advance but does its computations at classification time. That is, it finds a predefined number (k) of training samples closest in distance to the new point and predicts the label from these.

Notice that, by default, KNN takes the k closest samples regardless of how far away they are. To mitigate this effect, a weight parameter can be added.

KNN can also be applied to time series, but those are pretty much regression problems; we'll see them in due time.

**Key features of KNN:**
* Easy to understand and implement.
* Computationally efficient in general (with small datasets).
* Requires defining a similarity (distance) measure between samples.
* The first thing that should be tried when approaching a ML problem.
* They suffer especially from the [curse of dimensionality](./../Glossary.ipynb/#C).

In [None]:
# --- Step 1: load and prepare the data ---

# Keep only the first two features so the result can be drawn on a plane
y = IRIS.target
X = IRIS.data[:, :2]
cols = IRIS['feature_names'][:2]


# --- Step 2: build the model ---

# Number of neighbours and the weighting scheme handed to the classifier
k = 30
w = 'distance'

# Instantiate the nearest-neighbours classifier and fit it on the whole dataset
clf = neighbors.KNeighborsClassifier(n_neighbors=k, weights=w).fit(X, y)


# --- Step 3: plot the outcome ---

# Pale colours for the predicted regions, saturated ones for the real samples.
# cmap_light is reused by later cells.
cmap_light = ListedColormap(['#FFAAAA', '#AAFFAA', '#AAAAFF'])
cmap_bold = ListedColormap(['#FF0000', '#00FF00', '#0000FF'])

# Build a dense grid over the feature plane and classify every grid point;
# the flat predictions are folded back into the grid shape for plotting
xx, yy = mesh(X)
grid_points = np.c_[xx.ravel(), yy.ravel()]
Z = clf.predict(grid_points).reshape(xx.shape)

# Background: the predicted class for each grid point
plt.figure()
plt.pcolormesh(xx, yy, Z, cmap=cmap_light)

# Foreground: the samples the model was fitted on
plt.scatter(X[:, 0], X[:, 1], c=y, cmap=cmap_bold, edgecolor='k', s=20)

plt.title("3-Class classification (k = %i)" % k)
plt.xlabel(cols[0])
plt.ylabel(cols[1])
plt.xlim(xx.min(), xx.max())
plt.ylim(yy.min(), yy.max())

plt.show()

## Decision Trees
[Nice visualization](http://www.r2d3.us/visual-intro-to-machine-learning-part-1/) 

Decision trees divide the space into high dimensional rectangles. They are simple to understand and interpret (white box model), but they tend to overfit the data. However, they are useful in other ML techniques like bagging or random forests.

In [None]:
# We take the features in pairs (Uncomment to see other pairs)
pair = [0,1]
#pair = [1,2] 
#pair = [2,3] 
X = IRIS.data[:, pair]
y = IRIS.target

# Train. A fixed random_state keeps the fitted tree (and therefore the plotted
# boundaries) identical between runs.
clf = DecisionTreeClassifier(random_state=0).fit(X, y)

# Display the score (computed on the very same samples used for training, so
# it says nothing about how the tree generalises)
print('score was: {}'.format(clf.score(X, y)))

# Again, create a mesh of points that will represent a bunch of predictions
xx, yy = mesh(X)

# Classify every grid point and fold the predictions back into the grid shape
Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)

# Background: predicted class for each grid point (cmap_light comes from the KNN cell)
plt.pcolormesh(xx, yy, Z, cmap=cmap_light)

plt.xlabel(IRIS.feature_names[pair[0]])
plt.ylabel(IRIS.feature_names[pair[1]])

# Plot the training points, one scatter call per class so each one gets its
# own colour and legend entry. plot_colors and n_classes are reused by the
# next cell.
plot_colors, n_classes = 'ryb', 3
for i, color in zip(range(n_classes), plot_colors):
    idx = y == i  # boolean mask selecting the samples of class i
    plt.scatter(X[idx, 0], X[idx, 1], c=color, label=IRIS.target_names[i], edgecolor='black', s=15)

# The labels passed to scatter are only shown once a legend is requested
plt.legend()
plt.show()

### Splitting the dataset
Let's split the dataset into training and testing subsamples so we can check how effective our training is.

In [None]:
# Hold out a third of the samples for testing, keeping the class proportions
# (stratify). The fixed random_state makes the split, and so the printed score,
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, stratify=y, random_state=0)

# Train on the training subsample only
clf = DecisionTreeClassifier(random_state=0).fit(X_train, y_train)

# Display the score, this time measured on samples the tree has never seen
print('score was: {}'.format(clf.score(X_test, y_test)))

# Plot the decision boundary over the area covered by the full dataset
xx, yy = mesh(X)

Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)

plt.pcolormesh(xx, yy, Z, cmap=cmap_light)

plt.xlabel(IRIS.feature_names[pair[0]])
plt.ylabel(IRIS.feature_names[pair[1]])

# Plot the test points (not the training ones): a point lying on a background
# region of a different class is a misclassified test sample.
# n_classes and plot_colors are defined in the previous cell.
for i, color in zip(range(n_classes), plot_colors):
    idx = y_test == i  # boolean mask selecting the test samples of class i
    plt.scatter(X_test[idx, 0], X_test[idx, 1], c=color, label=IRIS.target_names[i], edgecolor='black', s=15)

# The labels passed to scatter are only shown once a legend is requested
plt.legend()
plt.show()


### Avoiding overfitting
As we can see, decision trees tend to overfit to the training data. There are two ways to mitigate this:
* Bagging **(B**ootstrap **agg**regat**ing**) [[wiki]](https://en.wikipedia.org/wiki/Bootstrap_aggregating)
* Random Forests [[wiki]](https://en.wikipedia.org/wiki/Random_forest)

**A: Bagging:** take several random subsets of the data, train a model on each of them independently, and finally aggregate the results by voting for the best one.

**B: Random Forests:** In addition to taking random subsets of the data, we also take random subsets of the features.

In [None]:
# Create an artificial dataset: 10000 samples with 6 features, of which only
# 2 are informative and none is redundant
X, y = datasets.make_classification(n_samples=10000, n_features=6,
                           n_informative=2, n_redundant=0,
                           random_state=0, shuffle=False)

# Now split the train and the test. The split is seeded like the dataset and
# the model, so the whole cell gives the same numbers on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, stratify=y, random_state=0)

# Train the model: 100 shallow trees (depth 2)
clf = RandomForestClassifier(
    n_estimators=100, max_depth=2, random_state=0)
clf.fit(X_train, y_train)

# Display the score on the held-out test subsample
print('score was: {}'.format(clf.score(X_test, y_test)))

# Relative importance the forest assigned to each of the 6 features
print(clf.feature_importances_)

# Predicted class for a single sample whose features are all zero
print(clf.predict([[0, 0, 0, 0, 0, 0]]))

## Logistic Regression

[Regression](https://en.wikipedia.org/wiki/Regression_analysis) is a wide topic in maths and machine learning that tries to estimate an outcome (target) given several independent variables called predictors or features so we can forecast a future output.

The basic idea of regression is the following:


$$\hat{y}(\mathbf{w},\mathbf{x}) = w_0 + w_1 x_1 + \dots + w_p x_p$$


We'll try to predict a $\hat{y}$ by assigning a coefficient ($w_i$, weight) to each component (feature) $x_i$ of the vector $\mathbf{x}$ we input, plus an intercept point (constant term) $w_0$.

That is: we assume that **every target in the data can be approximated by a linear combination of its features.**

In the case of logistic regression, we can plug this line as an argument of the logistic function to get a probability for a certain sample $\mathbf{x}$ to be classified as 0 or 1. Odds under 1:1 (that is, a probability below $0.5$) will be classified as $0$ and $1$ otherwise.

#### **Example #1) logistic regression with random samples**
An adaptation from a [[scikit-learn]](https://scikit-learn.org/stable/auto_examples/linear_model/plot_logistic.html#sphx-glr-auto-examples-linear-model-plot-logistic-py) example. 

* **Black dots:** a random sample where values over 0 yield $y=1$ (mostly but not always) and $y=0$ otherwise.

* **Blue line:** a logistic model that predicts a certain value. Every prediction for $X>0.222$ ($-w_0/w_1$) will be classified as $1$, otherwise $0$.

* **Red curve:** the probability that the prediction will be 1. Notice that the point where the probability is $0.5$ is precisely $-w_0/w_1$.

There is also a [Desmos graph](https://www.desmos.com/calculator/binjtdtjry) to get a feeling of how those coefficients affect the final logistic curve.


In [None]:
# Create an array of 300 random samples centered at 0
n_samples = 300
np.random.seed(0)
X = np.random.normal(size=n_samples)

# Now, assign target 1.0 to samples over 0 and 0.0 to the rest.
# The builtin float is used here: np.float was only an alias for it and the
# alias no longer exists in current NumPy releases (AttributeError).
y = (X > 0).astype(float).ravel()

# Add some noise
X[X > 0] *= 4  # stretch out values over 0

# Be sure to find samples under 0 with y=1 and vice versa
X += .3 * np.random.normal(size=n_samples)

# Finally, make it a column vector (one feature per sample), the layout
# expected by fit()
X = X[:, np.newaxis]

# Plot the point distribution
plt.figure(figsize=(17, 5))
plt.scatter(X, y, color='black')

# Instantiate the classifier and fit it; keep the intercept (w0) and the
# coefficient (w1) to draw the probability curve afterwards
clf = linear_model.LogisticRegression(C=1e7, solver='lbfgs')
clf.fit(X, y)
w0, w1 = clf.intercept_, clf.coef_

def log(x):
    """Evaluate the logistic function 1 / (1 + e^(-x)) at x.

    Works element-wise on arrays; the result lies between 0 and 1, so it can
    be read as a probability.
    """
    denominator = 1 + np.exp(-x)
    return 1 / denominator

# Evaluate the fitted model on evenly spaced points between -1 and 2.
# Despite its name, `odds` holds the logistic function applied to the fitted
# line w1 * x + w0; `y_hat` holds the hard class predictions.
X_test = np.linspace(-1, 2, n_samples)
linear_term = w1 * X_test + w0
odds = log(linear_term).ravel()
y_hat = clf.predict(X_test[:, np.newaxis])

# Draw the probability curve first and the step-like prediction on top of it
plt.plot(X_test, odds, color='red', linewidth=3, label='Probability')
plt.plot(X_test, y_hat, color='blue', linewidth=3, label='Prediction (y_hat)')

# Reference line at probability 0.5 (even odds), then the finishing touches
plt.axhline(.5, color='.5')
plt.xlim(-1, 2)
plt.xlabel('X')
plt.ylabel('y')
plt.legend(loc="lower right")
plt.show()

#### **Example #2) logistic regression with iris dataset**
An adaptation from a [[scikit-learn]](https://scikit-learn.org/stable/auto_examples/linear_model/plot_iris_logistic.html#sphx-glr-auto-examples-linear-model-plot-iris-logistic-py) example. 

In this example, we'll try to set linear boundaries between the points so blue ones are in blue areas, yellows in yellow and browns in brown.

We can imagine these boundaries as different heights (Z). We can see that the blue ones are classified quite accurately, whereas most of the difficulty lies with the brown ones.

The second graph shows how some pairs of features are more suitable for prediction than others.

In [None]:
# Reuse the Iris dataset loaded at the top of the notebook
iris = IRIS
X = iris.data[:, 2:4]  # we only take the last two features (columns 2 and 3)
Y = iris.target

# Axis labels come from the dataset itself so they always match the columns
# selected above
x_label, y_label = iris.feature_names[2:4]

# Create an instance of Logistic Regression Classifier and fit the data.
logreg = linear_model.LogisticRegression(
    C=1, solver='lbfgs', multi_class='multinomial')
logreg.fit(X, Y)

# Plot the decision boundary.
xx, yy = mesh(X)

# Drop all the points into the model and predict the values for them
Z = logreg.predict(np.c_[xx.ravel(), yy.ravel()])

# Put the result into a color plot (predictions folded back into the grid shape)
Z = Z.reshape(xx.shape)
plt.figure(1, figsize=(7,7))
plt.pcolormesh(xx, yy, Z, cmap=plt.cm.Paired)

# Plot also the training points
plt.scatter(X[:, 0], X[:, 1], c=Y, edgecolors='k', cmap=plt.cm.Paired)
plt.xlabel(x_label)
plt.ylabel(y_label)

plt.xlim(xx.min(), xx.max())
plt.ylim(yy.min(), yy.max())
plt.xticks(())
plt.yticks(())

plt.show()

In [None]:
# Multi-comparison between pairs of features.

# Column indices of the feature pairs to compare, one subplot per pair.
# Both the data slices and the axis labels are derived from this single list.
feature_pairs = [(0, 1), (1, 2), (2, 3)]
X = tuple(IRIS.data[:, list(columns)] for columns in feature_pairs)
Y = IRIS.target

# Create an instance of Logistic Regression Classifier; it is refitted on
# each pair of features inside the loop.
logreg = linear_model.LogisticRegression(
    C=1e5, solver='lbfgs', multi_class='multinomial')

# Instantiate the plot: one row with a panel per feature pair
_, axs = plt.subplots(1, len(feature_pairs), figsize=(24, 8))

# The loop variables deliberately avoid the name `pair`, which earlier cells
# use for the list of feature indices behind their axis labels.
for ax, (x_col, y_col), pair_data in zip(axs, feature_pairs, X):
    # Fit the model (fit returns the estimator itself)
    clf = logreg.fit(pair_data, Y)

    # Get the boundaries
    xx, yy = mesh(pair_data)

    # Make a bunch of predictions and fold them back into the grid shape
    y_hat = clf.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = y_hat.reshape(xx.shape)

    # Plot the classification boundaries
    ax.pcolormesh(xx, yy, Z, cmap=plt.cm.Paired)

    # And the training points
    ax.scatter(pair_data[:, 0], pair_data[:, 1], c=Y, edgecolors='k', cmap=plt.cm.Paired)

    # Now set the limits, labels and remove the ticks
    ax.set_xlim(xx.min(), xx.max())
    ax.set_ylim(yy.min(), yy.max())
    ax.set_xlabel(IRIS.feature_names[x_col])
    ax.set_ylabel(IRIS.feature_names[y_col])
    ax.set_xticks(())
    ax.set_yticks(())

plt.show()