# IRIS dataset: ML: Classification 101

In [1]:
# Iris dataset loader
from sklearn.datasets import load_iris
# Train/test splitting helper. It lives in sklearn.model_selection since
# scikit-learn 0.18; the old sklearn.cross_validation module was removed in 0.20,
# so fall back to it only on legacy installations.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split
# pandas for data wrangling, numpy for array manipulation
import numpy as np
import pandas as pd

In [2]:
# Load the Iris data and wrap the features and the target in pandas DataFrames.
iris = load_iris()

# Column labels: four feature measurements and a single target column.
X_col = [
    'sepal length in cm',
    'sepal width in cm',
    'petal length in cm',
    'petal width in cm',
]
y_col = ['Prediction']

X = pd.DataFrame(data=iris.data, columns=X_col)
y = pd.DataFrame(data=iris.target, columns=y_col)

# Summary statistics of the features, then the shapes of both frames.
print(X.describe())
print(X.shape, y.shape)

       sepal length in cm  sepal width in cm  petal length in cm  \
count          150.000000         150.000000          150.000000   
mean             5.843333           3.054000            3.758667   
std              0.828066           0.433594            1.764420   
min              4.300000           2.000000            1.000000   
25%              5.100000           2.800000            1.600000   
50%              5.800000           3.000000            4.350000   
75%              6.400000           3.300000            5.100000   
max              7.900000           4.400000            6.900000   

       petal width in cm  
count         150.000000  
mean            1.198667  
std             0.763161  
min             0.100000  
25%             0.300000  
50%             1.300000  
75%             1.800000  
max             2.500000  
((150, 4), (150, 1))


In [3]:
# Single hold-out split (not k-fold cross-validation): test_size=0.2 reserves
# 20% of the rows for testing, and random_state pins the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=2
)
# Flatten the single-column target frames into 1-D arrays.
y_train, y_test = np.ravel(y_train), np.ravel(y_test)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

((120, 4), (30, 4), (120,), (30,))


In [4]:
# calling various classifier algorithms
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier

## Running multiple classifiers in a `for` loop

In [5]:
# Candidate classifiers; each is fitted on the training split and scored on the
# held-out test split. Line breaks inside the brackets need no backslashes.
clfs = [
    DecisionTreeClassifier(max_depth=10, min_samples_leaf=1),
    LogisticRegression(n_jobs=-1),
    SVC(),
    GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=1, random_state=0),
    KNeighborsClassifier(n_jobs=-1),
    GaussianNB(),
]
for clf in clfs:
    clf.fit(X_train, y_train)
    # score() yields a fraction; scale it to a percentage for reporting.
    accuracy = clf.score(X_test, y_test) * 100
    # '%.2f' prints two decimals (the previous '%2f' only set a minimum field
    # width, so six decimals were shown); '%%' emits a literal percent sign.
    print('Accuracy of %r = %.2f %%\n' % (clf, accuracy))

Accuracy of DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=10,
            max_features=None, max_leaf_nodes=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best') = 93.333333 %

Accuracy of LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=-1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False) = 93.333333 %

Accuracy of SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False) = 96.666667 %

Accuracy of GradientBoostingClassifier(init=None, learning_rate=1.0, loss='deviance',
              max_depth=1, max_features=None, max_leaf_nodes=None,
    