In [None]:
from sklearn.datasets import load_iris
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

## Exploração dos dados

In [None]:
# Load the iris dataset and check what kind of container comes back.
iris = load_iris()
type(iris)

In [None]:
# Type of the feature matrix stored in the container.
type(iris.data)

In [None]:
# First five rows of the feature matrix.
iris.data[:5,]

In [None]:
# (rows, columns) of the feature matrix.
iris.data.shape

In [None]:
# Human-readable names of the feature columns.
iris.feature_names

In [None]:
# Quick tabular preview; no column labels are passed, so pandas uses its defaults.
pd.DataFrame(iris.data).head()

In [None]:
# Turn the raw feature names (e.g. 'sepal length (cm)') into identifier-style
# column names (e.g. 'sepal_length').
print(iris.feature_names)
# The unit suffix is removed by value rather than by slicing off a fixed number
# of characters, so a name without the suffix is left intact instead of truncated.
features = [name.replace(' (cm)', '').replace(' ', '_') for name in iris.feature_names]
features

In [None]:
df = pd.DataFrame(iris.data, columns=features)
df.head()

In [None]:
iris.target

In [None]:
iris.target_names

In [None]:
species = [iris.target_names[i] for i in iris.target]
species

In [None]:
df['species'] = species
df.head()

In [None]:
df.groupby('species').describe().T

In [None]:
df.groupby('species')['sepal_length', 'sepal_width', 'petal_length', 'petal_width'].mean()

In [None]:
# Minimum and maximum of each measurement per species.
# The aggregations are named by string: passing np.min / np.max emits a
# FutureWarning on recent pandas and can yield version-dependent column labels.
df.groupby('species').agg(['min', 'max'])

## Modelo KNN

In [None]:
# Features: every column except the label. Target: the label column itself.
X = df.drop(columns='species')
y = df['species']

# One shape per line: features first, then target.
print(X.shape, y.shape, sep='\n')

In [None]:
from sklearn.neighbors import KNeighborsClassifier 

In [None]:
# k-nearest-neighbours classifier configured with 3 neighbours.
knn = KNeighborsClassifier(n_neighbors=3)   

In [None]:
# Fit on the full dataset (no hold-out at this point).
knn.fit(X, y)    

In [None]:
# Scored on the same data the model was fitted on.
knn.score(X, y)

In [None]:
novo_dado = np.array([6.3,3.1,5.1,2.4]).reshape(1, 4)

In [None]:
knn.predict(novo_dado)

In [None]:
X_novo = [[6.3,3.1,5.1,2.4], [3, 5, 2, 2]]
knn.predict(X_novo)

In [None]:
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X, y)
print(knn.predict(X_novo))       
print(knn.predict_proba(X_novo)) 

In [None]:
# 5-neighbour model, fitted and scored on the same data.
knn = KNeighborsClassifier(n_neighbors=5).fit(X, y)
knn.score(X, y)

In [None]:
from sklearn.preprocessing import StandardScaler
s = StandardScaler()

# standard scalr can only take in numpy arrays, no dataframes
X_scaled = s.fit_transform(X[['petal_length', 'petal_width']].values)

In [None]:
X_scaled

In [None]:
# 3-neighbour model on the standardised petal features, scored on the data it was fitted on.
knn = KNeighborsClassifier(n_neighbors=3).fit(X_scaled, y)
knn.score(X_scaled, y)

In [None]:
scores=[]
for k in range(3,21,2):
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_scaled, y)
    scores.append(knn.score(X_scaled, y))

In [None]:
scores

In [None]:
plt.plot(range(3,21,2),scores)
plt.ylabel("Accuracy")
plt.xlabel("K")

In [None]:
# 1-nearest-neighbour model on the raw features, scored on the data it was fitted on.
knn = KNeighborsClassifier(n_neighbors=1).fit(X, y)
knn.score(X, y)

## Avaliação de modelo

### Conjunto de treino e testes

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=3)

In [None]:
print(X_train.shape)
print(y_train.shape)

In [None]:
print(X_test.shape)
print(y_test.shape)

In [None]:
# Fit on the training portion only and report accuracy on the held-out portion.
knn = KNeighborsClassifier(n_neighbors=1).fit(X_train, y_train)
knn.score(X_test, y_test)

### Cross Validation

In [None]:
from sklearn.model_selection import cross_val_score
knn = KNeighborsClassifier(n_neighbors=1)
scores = cross_val_score(knn, X, y, cv=5, scoring='accuracy')

In [None]:
print(scores)
np.mean(scores) 

In [None]:
# 5-fold cross-validated accuracy of a 5-nearest-neighbour model.
knn = KNeighborsClassifier(n_neighbors=5)
scores = cross_val_score(knn, X, y, cv=5, scoring='accuracy')
# Only the last expression of a cell is displayed, so the per-fold scores are
# printed explicitly; a bare `scores` line in the middle would show nothing.
print(scores)
np.mean(scores)

In [None]:
# Mean 5-fold accuracy for every odd k from 1 to 29.
k_range = range(1, 30, 2)
scores = []
for k in k_range:
    knn = KNeighborsClassifier(n_neighbors=k)
    scores += [cross_val_score(knn, X, y, scoring='accuracy', cv=5).mean()]
scores

In [None]:
# Mean cross-validated accuracy as a function of k.
plt.figure()
plt.plot(k_range, scores)
# Axis labels so the figure can be read on its own.
plt.ylabel("Accuracy")
# Trailing semicolon suppresses the text repr of the last call in the cell output.
plt.xlabel("K");

### Grid Search

In [None]:
from sklearn.model_selection import GridSearchCV
knn = KNeighborsClassifier()
k_range = range(1, 30, 2)
param_grid = dict(n_neighbors=k_range)
grid = GridSearchCV(knn, param_grid, cv=5, scoring='accuracy')
grid.fit(X, y)

In [None]:
print(grid.best_score_)
print(grid.best_params_)
print(grid.best_estimator_ )

## Árvore de Decisão

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Fixed seed: tree construction involves randomness, so seeding keeps results repeatable.
treeclf = DecisionTreeClassifier(random_state=3)
# Cross-validate the decision tree itself, not the `knn` estimator left over
# from the grid search above.
scores = cross_val_score(treeclf, X, y, cv=5, scoring='accuracy')
print(scores.mean())

In [None]:
treeclf.fit(X_train, y_train)
y_pred = treeclf.predict(X_test)
y_pred

In [None]:
(y_pred == y_test).mean()

In [None]:
from sklearn.tree import export_graphviz

# Write the fitted tree to a Graphviz file, labelling features with the dataset's original names.
export_graphviz(
    decision_tree=treeclf,
    out_file='test.dot',
    feature_names=iris.feature_names,
)

In [None]:
param_grid = {'max_depth': range(1,21), 'max_features': range(1,5), 'min_samples_leaf': range(1,6)}
grid = GridSearchCV(treeclf, param_grid, cv=5, scoring='accuracy')
grid.fit(X, y)

In [None]:
print(grid.best_score_)
print(grid.best_params_)
print(grid.best_estimator_ )

## Naive Bayes

In [None]:
# The features are continuous measurements, so the Gaussian variant of naive
# Bayes is used; the multinomial variant is intended for count-like features.
from sklearn.naive_bayes import GaussianNB
nb = GaussianNB()
scores = cross_val_score(nb, X, y, cv = 5, scoring='accuracy')
scores.mean()

## Pipeline

In [None]:
from sklearn.pipeline import Pipeline

# Chain scaling and k-NN into one estimator and score it with 10-fold cross-validation.
scaler = StandardScaler()
knn = KNeighborsClassifier(n_neighbors=5)
pipe = Pipeline(steps=[('scaler', scaler), ('knn', knn)])
mean = np.mean(cross_val_score(pipe, X, y, scoring='accuracy', cv=10))
mean

In [None]:
k_range = range(1, 30, 2)
param_grid = dict(knn__n_neighbors=k_range)
grid = GridSearchCV(pipe, param_grid, cv=5, scoring='accuracy')
grid.fit(X, y)

In [None]:
print(grid.best_score_)
print(grid.best_params_)
print(grid.best_estimator_ )