In [7]:
import pandas as pd
import numpy as np

# Read the iris table from a CSV file in the working directory.
iris = pd.read_csv('iris.csv')

# Split the frame by column position: the leading columns are the features,
# the column right after them is the target.
# NOTE(review): presumably four measurements followed by the species label —
# confirm against the layout of iris.csv.
n_features = 4
x = iris.iloc[:, :n_features]
y = iris.iloc[:, n_features]

Many machine learning problems are binary classification tasks. Let's see how to build a binary target `y` that marks whether each flower belongs to the setosa class.

In [8]:
# One indicator column per distinct label in y.
y_binary = pd.get_dummies(y)

# Keep only the first indicator column as the binary target.
# NOTE(review): presumably this is the "setosa" column (the resampling cells
# filter on species == "setosa") — verify with y_binary.columns.
first_label = y_binary.columns[0]
y_setosa = y_binary[first_label]

Feature scaling: rescale every feature to the range 0 to 1. scikit-learn provides several scalers, including `StandardScaler`, `Normalizer`, and `MinMaxScaler`; here we use `MinMaxScaler`.

In [9]:
from sklearn.preprocessing import MinMaxScaler

# Learn the per-column minimum and maximum from x, then apply the rescaling
# as a separate step.
scaler = MinMaxScaler()
scaler.fit(x)
x_scaled = scaler.transform(x)

# Show the rescaled feature matrix.
print(x_scaled)


[[0.22222222 0.625      0.06779661 0.04166667]
 [0.16666667 0.41666667 0.06779661 0.04166667]
 [0.11111111 0.5        0.05084746 0.04166667]
 [0.08333333 0.45833333 0.08474576 0.04166667]
 [0.19444444 0.66666667 0.06779661 0.04166667]
 [0.30555556 0.79166667 0.11864407 0.125     ]
 [0.08333333 0.58333333 0.06779661 0.08333333]
 [0.19444444 0.58333333 0.08474576 0.04166667]
 [0.02777778 0.375      0.06779661 0.04166667]
 [0.16666667 0.45833333 0.08474576 0.        ]
 [0.30555556 0.70833333 0.08474576 0.04166667]
 [0.13888889 0.58333333 0.10169492 0.04166667]
 [0.13888889 0.41666667 0.06779661 0.        ]
 [0.         0.41666667 0.01694915 0.        ]
 [0.41666667 0.83333333 0.03389831 0.04166667]
 [0.38888889 1.         0.08474576 0.125     ]
 [0.30555556 0.79166667 0.05084746 0.125     ]
 [0.22222222 0.625      0.06779661 0.08333333]
 [0.38888889 0.75       0.11864407 0.08333333]
 [0.22222222 0.75       0.08474576 0.08333333]
 [0.30555556 0.58333333 0.11864407 0.04166667]
 [0.22222222 

In [10]:
from sklearn.model_selection import train_test_split 
from sklearn.preprocessing import MinMaxScaler

# Split the RAW features first. Scaling before splitting would let the test
# rows influence the min/max used to transform the training rows (data
# leakage), which makes test-set scores optimistic.
# test_size and random_state are unchanged: 20% hold-out, fixed seed.
x_train_raw, x_test_raw, y_train, y_test = train_test_split(
    x, y_setosa, test_size=0.2, random_state=0
)

# Fit the scaler on the training rows only, then apply that same fitted
# transform to both splits. Test values may fall slightly outside [0, 1];
# that is expected and correct.
split_scaler = MinMaxScaler().fit(x_train_raw)
x_train = split_scaler.transform(x_train_raw)
x_test = split_scaler.transform(x_test_raw)

Building a random forest model

In [12]:
from sklearn.ensemble import RandomForestClassifier

# Random forest settings: 100 trees, each limited to depth 2, with a fixed
# seed so repeated runs build the same forest.
rf_params = {"n_estimators": 100, "max_depth": 2, "random_state": 0}
rf_clf = RandomForestClassifier(**rf_params)

Building a support vector machine model

In [14]:
from sklearn.svm import SVC

# Support vector classifier with a linear kernel; every other setting is
# left at the SVC default.
sv_kernel = "linear"
sv_clf = SVC(kernel=sv_kernel)

Building a K Nearest Neighbors Model

In [17]:
from sklearn import neighbors

# k-nearest-neighbours classifier that considers the 5 closest training points.
knn_k = 5
knn_clf = neighbors.KNeighborsClassifier(n_neighbors=knn_k)

One way to validate the model is through the test set.

In [18]:
from sklearn import metrics

# For each model: train on the training split, predict labels for the test
# split, and print the fraction of test labels predicted correctly.
# fit() returns the estimator itself, so fit and predict can be chained.

# Random forest
rf_pred = rf_clf.fit(x_train, y_train).predict(x_test)
print(metrics.accuracy_score(y_test, rf_pred))

# Support vector machine
sv_pred = sv_clf.fit(x_train, y_train).predict(x_test)
print(metrics.accuracy_score(y_test, sv_pred))

# K nearest neighbours
knn_pred = knn_clf.fit(x_train, y_train).predict(x_test)
print(metrics.accuracy_score(y_test, knn_pred))

1.0
1.0
1.0


Another way is k-fold cross-validation, using either `cross_val_score` or `cross_val_predict`.
Cross-validation is useful for choosing a model and its hyperparameters.

In [21]:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validation on the TRAINING split. The held-out test set must
# not be used here: cross-validating on x_test/y_test would score refits on
# the small hold-out alone and spend the data reserved for the final,
# unbiased check. cross_val_score clones sv_clf for each fold, so the
# already-fitted estimator is not modified.
scores = cross_val_score(sv_clf, x_train, y_train, cv=5)

# Report mean accuracy across folds with a +/- two-standard-deviation band.
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Accuracy: 1.00 (+/- 0.00)


Imbalanced Data

Let's say we have an imbalanced data set with 5% setosas and 95% non-setosas.
Our model can classify everything as a non-setosa and achieve 95% accuracy.
How can we fix this?

Increase setosa examples through resample with replacement.

Decrease non-setosa examples through resample without replacement.

Examine ROC curve instead of accuracy.

Different class weights in model.

In [64]:
from sklearn.utils import resample

# Partition the rows by species: setosa rows form the minority class and
# every other row forms the majority class.
is_setosa = iris["species"] == "setosa"
minority = iris[is_setosa]    # 50 entries
majority = iris[~is_setosa]   # 100 entries


In [75]:
# Upsample: grow the minority class to 100 rows by drawing 50 extra rows
# (with replacement, fixed seed) and appending them to the original rows.
extra_draw = dict(replace=True, n_samples=50, random_state=0)
upsampled = resample(minority, **extra_draw)
minority_100 = pd.concat([minority, upsampled], axis=0)

In [76]:
# Downsample: alternatively shrink the majority class to 50 rows so it
# matches the minority class. Rows are drawn WITHOUT replacement (each row is
# picked at most once), with a fixed seed.
subset_draw = dict(replace=False, n_samples=50, random_state=0)
majority_50 = resample(majority, **subset_draw)

In [77]:
# Class weighting: rebuild the linear SVM with class_weight="balanced" so the
# model itself compensates for unequal class sizes.
# Note: this rebinds sv_clf, replacing the unweighted classifier defined earlier.
from sklearn.svm import SVC

balanced_svm_params = {"kernel": "linear", "class_weight": "balanced"}
sv_clf = SVC(**balanced_svm_params)

In [78]:
# Evaluate with ROC AUC instead of accuracy.
from sklearn.metrics import roc_auc_score

# Refit k-NN on the training split and get per-class probabilities for the
# test split (one column per class).
knn_clf.fit(x_train, y_train)
y_prob = knn_clf.predict_proba(x_test)

# Score using the second probability column.
# NOTE(review): presumably this is the positive (setosa) class — confirm
# with knn_clf.classes_.
positive_scores = y_prob[:, 1]
print( roc_auc_score(y_test, positive_scores))

1.0
