In [123]:
import numpy as np
import pandas as pd


In [124]:
data = pd.read_csv('irisdata.csv', header=None)

In [125]:
data.shape

(150, 5)

In [126]:
data.columns = ['splength','spwdith','ptlength','ptwidth','name']

In [127]:
data.head()

Unnamed: 0,splength,spwdith,ptlength,ptwidth,name
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [128]:
def extractinputoutput(data, colname):
    outputdata = data[colname]
    inputdata = data.drop(colname, axis=1)
    return inputdata,outputdata

x, y = extractinputoutput(data, colname ='name')

In [129]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=123)

In [130]:
y_test

72     Iris-versicolor
112     Iris-virginica
132     Iris-virginica
88     Iris-versicolor
37         Iris-setosa
138     Iris-virginica
87     Iris-versicolor
42         Iris-setosa
8          Iris-setosa
90     Iris-versicolor
141     Iris-virginica
33         Iris-setosa
59     Iris-versicolor
116     Iris-virginica
135     Iris-virginica
104     Iris-virginica
36         Iris-setosa
13         Iris-setosa
63     Iris-versicolor
45         Iris-setosa
28         Iris-setosa
133     Iris-virginica
24         Iris-setosa
127     Iris-virginica
46         Iris-setosa
20         Iris-setosa
31         Iris-setosa
121     Iris-virginica
117     Iris-virginica
4          Iris-setosa
Name: name, dtype: object

In [131]:
x_train.isnull().sum()

splength    0
spwdith     0
ptlength    0
ptwidth     0
dtype: int64

In [132]:
from sklearn.preprocessing import StandardScaler

def standardizer(data):
    data_columns = data.columns  # agar nama column tidak hilang
    data_index = data.index # agar index tidak hilang
    normalize = StandardScaler()
    normalize.fit(data)
    
    normalized_data = pd.DataFrame(normalize.transform(data), index = data_index)
    normalized_data.columns = data_columns
    return normalized_data, normalize

x_train, normalize = standardizer(x_train)


In [133]:
x_train.head()

Unnamed: 0,splength,spwdith,ptlength,ptwidth
130,1.891072,-0.54903,1.323848,0.915092
119,0.161621,-1.916855,0.684916,0.374151
29,-1.444297,0.362854,-1.289966,-1.383908
0,-0.950168,1.046766,-1.406135,-1.383908
62,0.161621,-1.916855,0.104069,-0.302026


In [134]:
y_train.value_counts(normalize=True)

Iris-versicolor    0.366667
Iris-virginica     0.325000
Iris-setosa        0.308333
Name: name, dtype: float64

In [135]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, roc_auc_score, log_loss
from sklearn.externals import joblib

In [136]:
def knn_fit(x_train, y_train, scoring = 'accuracy'):
    
    knn = KNeighborsClassifier()
    
    hyperparam = {'n_neighbors' : [1,2,3,4,5,8,11,16,2132,64,128]}
    
    random_knn = RandomizedSearchCV(knn, param_distributions= hyperparam, cv = 5, n_iter = 3, scoring = scoring, n_jobs = -1, random_state =123)
    
    random_knn.fit(x_train,y_train)
    print("Best Accuracy", random_knn.best_score_) # best_score_ digunakan untuk melihat score hasil cross validation
    print("Best Param", random_knn.best_params_)
    return random_knn 
    

In [137]:
best_knn = knn_fit(x_train, y_train)

Best Accuracy 0.9666666666666667
Best Param {'n_neighbors': 8}


In [138]:
knn = KNeighborsClassifier(n_neighbors = best_knn.best_params_.get('n_neighbors'))
# fitting KNN
knn.fit(x_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=8, p=2,
           weights='uniform')

In [139]:
def rbfSVC_fit(x_train, y_train, scoring = 'accuracy'):
    rbfSVC = SVC(kernel = 'rbf', random_state=123)

    hyperparam = {'C': [1000, 333.33, 100, 33.33, 10, 3.33, 10, 3.33, 1, 0.33, 0.1, 0.033, 0.01, 0.0033, 
                        0.001, 0.00033, 0.0001], 
                  'gamma' : [10, 3.33, 1, 0.33, 0.1, 0.033, 0.01]}

    random_rbfSVC = RandomizedSearchCV(rbfSVC, param_distributions = hyperparam, cv = 3,
                                    n_iter = 6, scoring = scoring, n_jobs=-1, random_state = 123)
    
    random_rbfSVC.fit(x_train, y_train)
    
    print("Best Accuracy", random_rbfSVC.best_score_)
    print("Best Param", random_rbfSVC.best_params_)
    
    return random_rbfSVC 

In [140]:
best_rbfSVC = rbfSVC_fit(x_train, y_train) 

Best Accuracy 0.9583333333333334
Best Param {'gamma': 0.1, 'C': 1000}


In [141]:
rbfSVC = SVC(kernel = 'rbf', C = best_rbfSVC.best_params_.get('C'), 
             gamma = best_rbfSVC.best_params_.get('gamma'), random_state = 123)
rbfSVC.fit(x_train, y_train)

SVC(C=1000, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma=0.1, kernel='rbf',
  max_iter=-1, probability=False, random_state=123, shrinking=True,
  tol=0.001, verbose=False)

In [143]:
from sklearn.externals import joblib

joblib.dump(knn, 'knn.pkl')
# # joblib.dump(logreg, 'model/logreg.pkl')
# # joblib.dump(linSVC, 'model/linearSVC.pkl')
joblib.dump(rbfSVC, 'rbfSVC.pkl')
# # joblib.dump(decTree, 'model/decisionTree.pkl')
# # joblib.dump(bagging, 'model/bagging.pkl')
# # joblib.dump(randForest, 'model/randomForest.pkl')
# # joblib.dump(adaboost, 'model/adaboost.pkl')
# # joblib.dump(gradBoost, 'model/gradientBoosting.pkl')

['rbfSVC.pkl']

In [144]:
def testData(data,  standard):
    x_valid = data
    x_valid_transform = pd.DataFrame(standard.transform(x_valid), index = data.index) # standardization
    x_valid_transform.columns = x_valid.columns # samakan nama column
        
    return x_valid_transform

x_test = testData(x_test, standard = normalize)

In [145]:

x_test.head()

Unnamed: 0,splength,spwdith,ptlength,ptwidth
72,0.532218,-1.232943,0.626831,0.374151
112,1.149879,-0.093088,0.97534,1.185563
132,0.65575,-0.54903,1.033425,1.320798
88,-0.332507,-0.093088,0.162153,0.10368
37,-1.197233,0.134883,-1.34805,-1.519144


In [148]:
def testPred(x_test, y_test, classifier, compute_score = True):
    if compute_score == True:
        score = classifier.score(x_test, y_test)
        print("Accuracy", score)
        
    valid_proba = pd.DataFrame(classifier.predict_proba(x_test)) # hasil prediksi
    
    return  valid_proba, score

In [149]:
classifiers = [knn, rbfSVC]
label = ['KNN', 'RBF SVC']
for clf, i in zip(classifiers, label):
    print(i, clf.score(x_test, y_test))

KNN 0.9
RBF SVC 0.9333333333333333
