 # Demo 1 : Loading a Dataset

In [None]:
from sklearn.linear_model import LinearRegression

import numpy as np
import pandas as pd

## Load the data set

scikit-learn ships with several built-in data sets that are widely used across the machine learning community.

In [None]:
# Import the Boston housing data set.
#
# `load_boston` was deprecated in scikit-learn 1.0 and removed in 1.2, where the
# import itself raises ImportError. Older releases keep using the bundled
# loader; newer ones rebuild an equivalent Bunch from the original StatLib file
# (requires network access), so the rest of the notebook runs unchanged.
try:
    from sklearn.datasets import load_boston
    boston_ds = load_boston()
except ImportError:
    from sklearn.utils import Bunch

    # The StatLib file has 22 lines of preamble; every record is then split
    # across two physical lines (11 values followed by 3 values).
    raw_boston = pd.read_csv(
        "http://lib.stat.cmu.edu/datasets/boston",
        sep=r"\s+",
        skiprows=22,
        header=None,
    )
    boston_ds = Bunch(
        # 11 columns from the first line + 2 from the second = 13 features
        data=np.hstack([raw_boston.values[::2, :], raw_boston.values[1::2, :2]]),
        # third value on the second line is the target (MEDV)
        target=raw_boston.values[1::2, 2],
        feature_names=np.array([
            'CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE',
            'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT',
        ]),
        DESCR=(
            "Boston house prices data set (506 samples, 13 numeric features, "
            "target MEDV = median value of owner-occupied homes in $1000s). "
            "Rebuilt from http://lib.stat.cmu.edu/datasets/boston because "
            "load_boston is no longer shipped with scikit-learn."
        ),
    )

## Explore the data set

In [None]:
type(boston_ds)

In [None]:
dir(boston_ds)

In [None]:
print(boston_ds.DESCR)

In [None]:
boston_ds.feature_names

In [None]:
type(boston_ds.feature_names)

In [None]:
type(boston_ds.data)

In [None]:
boston_ds.data.shape

In [None]:
type(boston_ds.target)

In [None]:
boston_ds.target.shape

In [None]:
# Build a DataFrame from the feature ndarray, labelling each column with the
# corresponding feature name of the data set.
df_boston_ds = pd.DataFrame(
    data=boston_ds.data,
    columns=boston_ds.feature_names,
)

In [None]:
df_boston_ds.describe()

In [None]:
df_boston_ds.head()

# Demo 2 : Linear Regression

https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LinearRegression.html

## Features of the data set

In [None]:
# Inputs (feature matrix) and the value to predict (target vector)
x_features, y_target = boston_ds.data, boston_ds.target

## Choose the right estimator/model object

Since the target is continuous and our goal is to estimate it from the inputs, a linear regression estimator is chosen.

In [None]:
# import linear model from scikit library
from sklearn.linear_model import LinearRegression
lineReg = LinearRegression()

## Train & test the model

### Train & Test split

https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html 


In [None]:
from sklearn.model_selection import train_test_split

# Split features and target into train and test subsets (sklearn's default is
# a 75% / 25% split). The fixed random_state pins the shuffle so the fitted
# coefficients, score and plots below are the same on every Restart & Run All.
x_train, x_test, y_train, y_test = train_test_split(
    x_features, y_target, random_state=42
)

- [ ] **30 sec: find out the types of variables returned from train_test_split method**
- [ ] **10 sec: explore the shapes of the splits**

In [None]:
print(x_train.shape,y_train.shape,x_test.shape,y_test.shape)

### Train the model

In [None]:
lineReg.fit(x_train,y_train)

In [None]:
lineReg.intercept_

In [None]:
lineReg.coef_

### Test

In [None]:
y_pred = lineReg.predict(x_test)

In [None]:
lineReg.score(x_test,y_test)

In [None]:
import matplotlib.pyplot as plt

# Overlay predicted (red) and actual (green) target values, one point per
# test sample, to eyeball how closely the model tracks the truth.
a = plt.scatter(range(y_pred.size), y_pred, c='red')
b = plt.scatter(range(y_test.size), y_test, c='green')
plt.legend((a, b), ('Predicted', 'Actual'))  # legend label typo fixed
plt.xlabel('test sample index')
plt.ylabel('target value')
plt.show()

In [None]:
# Pairwise correlation between the features, rounded to two decimals.
corr = df_boston_ds.corr().round(2)
# The matrix was computed but never shown; end the cell with it so it renders.
corr

# Model Evaluation

## Selecting the best model

https://jakevdp.github.io/PythonDataScienceHandbook/05.03-hyperparameters-and-model-validation.html

# Demo 3 : Logistic Regression

https://scikit-learn.org/stable/auto_examples/linear_model/plot_logistic.html#sphx-glr-auto-examples-linear-model-plot-logistic-py 

https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html

## Load the data set

For this example we use the iris data set.

More on the iris data set: https://archive.ics.uci.edu/ml/datasets/iris

In [None]:
# import the iris data set

from sklearn.datasets import load_iris
iris_dataset = load_iris()

## Explore the data set

In [None]:
print(iris_dataset.DESCR)

In [None]:
# Tabular view of the iris measurements, one named column per feature
iris_data_frame = pd.DataFrame(
    data=iris_dataset.data,
    columns=iris_dataset.feature_names,
)

In [None]:
iris_data_frame.head()

In [None]:
iris_dataset.target_names

### Visualize the data !!!

Code from : https://scipy-lectures.org/packages/scikit-learn/auto_examples/plot_iris_scatter.html 

**We will cover matplotlib in an upcoming session. For now, concentrate on the scatter plot itself and don't worry too much about how it is generated.**



In [None]:
# Scatter plot of two iris features, coloured by target class.
from sklearn.datasets import load_iris
from matplotlib import pyplot as plt

iris = load_iris()

# Column positions of the two features drawn on the x and y axes
x_index, y_index = 0, 1


def _target_name_for_tick(tick_value, *args):
    """Return the iris target name for a colorbar tick value."""
    return iris.target_names[int(tick_value)]


# Labels the colorbar ticks with target names instead of 0/1/2
formatter = plt.FuncFormatter(_target_name_for_tick)

plt.figure(figsize=(5, 4))
plt.scatter(
    iris.data[:, x_index],
    iris.data[:, y_index],
    c=iris.target,
)
plt.colorbar(ticks=[0, 1, 2], format=formatter)
plt.xlabel(iris.feature_names[x_index])
plt.ylabel(iris.feature_names[y_index])

plt.tight_layout()
plt.show()

In [None]:
iris.target

## Identify the features

In [None]:
# Feature matrix and class labels used by the classifiers below
x_iris_features, y_iris_target = iris_dataset.data, iris_dataset.target

In [None]:
iris_dataset.data[0]

- [ ] **30 sec: explore the features**

## Choose the right estimator/model object

Since the target is categorical and our goal is to predict the category from the inputs, a logistic regression estimator is
chosen.

https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression configured for one-vs-rest multi-class handling with the
# L-BFGS solver.
# NOTE(review): recent scikit-learn releases appear to deprecate the
# `multi_class` argument — confirm against the installed version before upgrading.
log_regression = LogisticRegression(multi_class='ovr',solver='lbfgs')

In [None]:
print(log_regression)

## Train

In [None]:
from sklearn.model_selection import train_test_split

# Create train and test splits. The fixed random_state keeps the split, and
# therefore the predictions, plots and scores of both the logistic-regression
# and KNN sections, identical on every Restart & Run All.
x_iris_train, x_iris_test, y_iris_train, y_iris_test = train_test_split(
    x_iris_features, y_iris_target, random_state=42
)

In [None]:
log_regression.fit(x_iris_train,y_iris_train)

In [None]:
log_regression.coef_

In [None]:
y_iris_pred = log_regression.predict(x_iris_test)

In [None]:
y_iris_pred

In [None]:
y_pred_probability = log_regression.predict_proba(x_iris_test)

In [None]:
y_pred_probability

In [None]:
import matplotlib.pyplot as plt

# Large red markers are the predictions, small green markers the true labels;
# a green dot sitting inside a red one is a correct prediction, while a
# separated pair marks a misclassified sample.
a = plt.scatter(range(y_iris_pred.size), y_iris_pred, c='red', marker='o', s=100)
b = plt.scatter(range(y_iris_test.size), y_iris_test, c='green', marker='o', s=25)
plt.legend((a, b), ('Predicted', 'Actual'))  # legend label typo fixed
plt.xlabel('test sample index')
plt.ylabel('class label')
plt.show()

In [None]:
log_regression.score(x_iris_test,y_iris_test)

# KNN Classifier

https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html 

## Moving directly to the KNN model as we already explored the iris data set.

In [None]:
# import the KNN classifier from sklearn

from sklearn.neighbors import KNeighborsClassifier
# with n_neighbors=1 the class of the single closest training sample is used as the prediction
knn = KNeighborsClassifier(n_neighbors=1)


## Train

In [None]:
knn.fit(x_iris_train,y_iris_train)

## Test 

In [None]:
knn_pred_y = knn.predict(x_iris_test)

In [None]:
knn_pred_y.shape

In [None]:
knn_pred_y

In [None]:
import matplotlib.pyplot as plt

# Large red markers are the KNN predictions, small green markers the true
# labels; a green dot inside a red one is a correct prediction.
a = plt.scatter(range(knn_pred_y.size), knn_pred_y, c='red', marker='o', s=100)
b = plt.scatter(range(y_iris_test.size), y_iris_test, c='green', marker='o', s=25)
plt.legend((a, b), ('Predicted', 'Actual'))  # legend label typo fixed
plt.xlabel('test sample index')
plt.ylabel('class label')
plt.show()

In [None]:
knn.score(x_iris_test,y_iris_test)

# K - Means Clustering

https://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html

In [None]:
# import kmeans from cluster
from sklearn.cluster import KMeans
#import make blobs dataset from sklearn to generate random toy data sets
from sklearn.datasets import make_blobs

In [None]:
# Parameters of the synthetic toy data set.
n_samples = 300
random_state = 20
# Pass the parameters declared above: the call used to hard-code n_samples=300
# and random_state=None, leaving both variables unused and producing different
# blobs on every run.
x, y = make_blobs(n_samples=n_samples, n_features=2, random_state=random_state)

In [None]:
type(x)

In [None]:
x.shape

In [None]:
y.shape

In [None]:
import matplotlib.pyplot as plt

plt.scatter(x[:,0],x[:,1])

In [None]:
kmeans_estimator = KMeans()

In [None]:
# Set the number of clusters on the estimator before it is fitted
# (change this value to 3 for the exercise further down).
kmeans_estimator.n_clusters = 2

In [None]:
kmeans_estimator.fit(x)

In [None]:
kmeans_estimator.cluster_centers_

In [None]:
# Plot the samples, then overlay every fitted cluster centre in red.
plt.scatter(x[:, 0], x[:, 1])
centers = kmeans_estimator.cluster_centers_
# Slicing the columns draws all centres in one call, so this cell works
# unchanged for any n_clusters (2, 3, ...) with no line to (un)comment.
plt.scatter(centers[:, 0], centers[:, 1], s=50, c='red')

- [ ] **30 sec : re-run kmeans with cluster size as 3** 

# PCA

In [None]:
from sklearn.decomposition import PCA

from sklearn.datasets import make_blobs

import seaborn as sns

## Load data

In [None]:
# 200 synthetic samples with 10 features each. A fixed seed (instead of
# random_state=None) keeps the correlation heatmap and the PCA results below
# reproducible across runs.
x, y = make_blobs(n_samples=200, n_features=10, random_state=0)

## Explore the Data

In [None]:
x.shape

In [None]:
y.shape

In [None]:
x_df = pd.DataFrame(x)

In [None]:
x_df.head()

In [None]:
x_df['y'] = y

In [None]:
x_df.head()

In [None]:
corr_matrix = x_df.corr().round(2)

In [None]:
sns.heatmap(data=corr_matrix, annot=True)

## Create the estimator / model

In [None]:
# Create the PCA class

pca = PCA(n_components=5)

## Train

In [None]:
pca.fit(x)

In [None]:
import matplotlib.pyplot as plt

# Cumulative share of variance explained by the first k principal components.
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)
# Plot against k = 1..n_components. Letting matplotlib default to the 0-based
# array index would place every value one component to the left of what the
# 'number of components' axis label says.
component_counts = np.arange(1, cumulative_variance.size + 1)
plt.plot(component_counts, cumulative_variance)
plt.xticks(component_counts)
plt.xlabel('number of components')
plt.ylabel('cumulative explained variance');

In [None]:
print(pca.explained_variance_ratio_)

In [None]:
first_pca = pca.components_[0]

In [None]:
print(first_pca)

In [None]:
pca.components_

# PCA with Eigen Faces

Reference : https://jakevdp.github.io/PythonDataScienceHandbook/05.09-principal-component-analysis.html 

- [ ] **Homework - Experiment with PCA application using Eigen faces**

In [None]:
from sklearn.datasets import fetch_lfw_people
# Load the "Labeled Faces in the Wild" people data set, filtered with
# min_faces_per_person=60.
# NOTE(review): this fetcher presumably downloads the data on first use, so the
# cell may be slow and need network access — confirm before running offline.
faces = fetch_lfw_people(min_faces_per_person=60)
# Names of the retained people, then the shape of the image array
print(faces.target_names)
print(faces.images.shape)

In [None]:
import matplotlib.pyplot as plt
plt.imshow(faces.images[3])

# Building a Pipeline

In [None]:
from sklearn.pipeline import Pipeline

from sklearn.linear_model import LogisticRegression

from sklearn.linear_model import LinearRegression

from sklearn.decomposition import PCA

## Chain the estimator

In [None]:
# Ordered (name, estimator) steps: reduce dimensionality first, then regress
estimator = [
    ('dim_reduction', PCA()),
    ('linear_regression', LinearRegression()),
]

In [None]:
pipeline_estimator = Pipeline(estimator)

In [None]:
pipeline_estimator

In [None]:
# View the first step 

pipeline_estimator.steps[0]

In [None]:
# view the second step
pipeline_estimator.steps[1]

In [None]:
pipeline_estimator.steps

# Model Persistence

In [None]:
from sklearn.datasets import load_iris
iris_dataset = load_iris()

In [None]:
x_iris_feature = iris_dataset.data
y_iris_target = iris_dataset.target

In [None]:
# Two unseen samples (four feature values each) to classify
x_iris_new = [
    [3, 5, 4, 1],
    [5, 3, 4, 2],
]

In [None]:
from sklearn.linear_model import LogisticRegression
# One-vs-rest logistic regression with the L-BFGS solver; this is the
# estimator that gets persisted to disk further down.
# NOTE(review): recent scikit-learn releases appear to deprecate the
# `multi_class` argument — confirm against the installed version before upgrading.
logReg = LogisticRegression(multi_class='ovr',solver='lbfgs')

In [None]:
logReg.fit(x_iris_feature,y_iris_target)

In [None]:
logReg.predict(x_iris_new)

In [None]:
# `sklearn.externals.joblib` was deprecated in scikit-learn 0.21 and removed
# in 0.23, so that import fails on current releases. joblib is a standalone
# package that scikit-learn itself depends on; import it directly.
import joblib

# Serialise the fitted estimator to disk (returns the list of files written).
joblib.dump(logReg, 'regression.pkl')

In [None]:
# Restore the estimator from the 'regression.pkl' file written by joblib.dump.
# NOTE(review): joblib files are pickle-based, so loading one can execute
# arbitrary code — only load files from a trusted source.
new_regression_estimator = joblib.load('regression.pkl')

In [None]:
type(new_regression_estimator)

In [None]:
new_regression_estimator.predict(x_iris_new)