In [1]:
import csv
import numpy as np
import ast
import json
import matplotlib.pyplot as plt
from sklearn import neighbors, datasets
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import KFold

In [2]:
import warnings
# Install a blanket "ignore" filter: from here on every warning raised in this
# session is suppressed, whatever its category or originating module.
# NOTE(review): this also hides deprecation notices from the libraries used below.
warnings.filterwarnings('ignore')

# Prepare Data

In [3]:
def _read_rows(path):
    """Return the whitespace-split tokens of every line of the text file at ``path``.

    Each element of the returned list corresponds to one line of the file: the
    line is stripped of surrounding whitespace and split on whitespace.
    """
    # The context manager guarantees the handle is closed, even if reading fails
    # part-way; the previous bare open(...) calls never closed their files.
    with open(path) as handle:
        return [line.strip().split() for line in handle]


data_train_austradian = _read_rows("australian_train1.dat")
data_test_austradian = _read_rows("australian_test.dat")

In [4]:
def process_feature(observe):
    """Split one data row into its feature values and its label.

    Parameters
    ----------
    observe : sequence of str
        Whitespace-split tokens of one line of the data file. Only the first
        token is parsed; it must be a Python literal that evaluates to a
        sequence of values (e.g. ``"1,22.08,0"``) whose last element is the
        label.

    Returns
    -------
    tuple
        ``(features, label)`` where ``features`` is a list holding every parsed
        value except the last one, and ``label`` is the last parsed value.

    Raises
    ------
    IndexError
        If ``observe`` is empty (e.g. it came from a blank line).
    ValueError, SyntaxError
        If the first token is not a valid Python literal.
    """
    values = list(ast.literal_eval(observe[0]))
    # The label is the final value, so the features are everything before it.
    # The former bound, len(observe) - 2, counted whitespace tokens instead of
    # parsed values and was only correct when the line was a single token.
    return values[:-1], values[-1]

In [5]:
# Training split: turn every raw row into a (features, label) pair, then stack
# the pieces into NumPy arrays.
train_pairs = [process_feature(row) for row in data_train_austradian]
features_train = np.asarray([features for features, _ in train_pairs])  # one feature row per sample
labels_train = np.asarray([label for _, label in train_pairs])          # one label per sample

# Test split: identical treatment.
test_pairs = [process_feature(row) for row in data_test_austradian]
features_test = np.asarray([features for features, _ in test_pairs])    # (207, 14) in the recorded run
labels_test = np.asarray([label for _, label in test_pairs])            # (207,) in the recorded run
print(features_test.shape)
print(labels_test.shape)

(207, 14)
(207,)


# KNN, Naive Bayes, and Logistic Regression algorithms

In [6]:
# k-nearest-neighbours classifier: 10 neighbours, p=2
clf = neighbors.KNeighborsClassifier(p=2, n_neighbors=10)
clf.fit(features_train, labels_train)
label_predict_knn = clf.predict(features_test)              # predicted class for each test sample
label_predict_knn_prob = clf.predict_proba(features_test)   # per-class probabilities for each test sample

In [7]:
# Gaussian Naive Bayes classifier with default settings
gnb = GaussianNB()
gnb.fit(features_train, labels_train)
label_predict_gnb_prob = gnb.predict_proba(features_test)   # per-class probabilities for each test sample
label_predict_gnb = gnb.predict(features_test)              # predicted class for each test sample

In [8]:
# Logistic regression classifier (fixed random_state for repeatable fits).
# Note: this rebinds `clf`, which previously referred to the KNN classifier.
clf = LogisticRegression(random_state=0)
clf.fit(features_train, labels_train)
label_predict_lr_prob = clf.predict_proba(features_test)   # per-class probabilities for each test sample
label_predict_lr = clf.predict(features_test)              # predicted class for each test sample

# Cross Validation K Folds

In [9]:
def process_dataset(observe):
    """Parse the first token of a row and return all of its values as a list.

    ``observe`` is the whitespace-split token list of one data line; only
    ``observe[0]`` is evaluated (as a Python literal) and converted to a list.
    """
    return list(ast.literal_eval(observe[0]))

In [10]:
# Full training matrix: every parsed row (all values, label included) stacked
# into a single NumPy array.
austradian = np.asarray([process_dataset(row) for row in data_train_austradian])

In [11]:
# (name, estimator) pairs evaluated in the cross-validation loop.
models = [
    # Logistic regression with a fixed random_state
    ('LR', LogisticRegression(random_state=0)),
    # Gaussian Naive Bayes with default settings
    ('GB', GaussianNB()),
    # k-nearest neighbours: 10 neighbours, p=2
    ('KNN', neighbors.KNeighborsClassifier(n_neighbors=10, p=2)),
]

In [12]:
def model(models, train_set, label_set, validation_set):
    """Fit every estimator on the training data and collect its validation probabilities.

    Parameters
    ----------
    models : iterable of (name, estimator) pairs
        Each estimator must provide ``fit(X, y)`` and ``predict_proba(X)``.
        The name is not used here.
    train_set : features passed to ``fit``.
    label_set : labels passed to ``fit``.
    validation_set : features passed to ``predict_proba``.

    Returns
    -------
    list
        One ``predict_proba(validation_set)`` result per estimator, in the same
        order as ``models``. Empty when ``models`` is empty.

    Notes
    -----
    The estimators are fitted in place, so the objects held in ``models`` stay
    fitted on ``train_set`` after the call returns.
    """
    result_list = []
    # `estimator` rather than `model`, so the loop variable no longer shadows
    # this function's own name; the unused name is bound to `_`.
    for _, estimator in models:
        estimator.fit(train_set, label_set)
        # list.append returns None, so its result is deliberately not stored.
        result_list.append(estimator.predict_proba(validation_set))
    return result_list

In [13]:
# 10-fold cross-validation over the training matrix `austradian`.
# KFold is already imported in the notebook's first cell, so it is not
# re-imported here. Its options are passed by keyword: recent scikit-learn
# releases reject positional values for `shuffle` and `random_state`.
kfold = KFold(n_splits=10, shuffle=True, random_state=1)
final_matrix = []   # one array of stacked model probabilities per fold
for train_index, test_index in kfold.split(austradian):
    train_fold = austradian[train_index]
    test_fold = austradian[test_index]
    # Each fold keeps every column: the features plus the label in the last column.
    print('train: %s, test: %s'%(train_fold.shape, test_fold.shape))
    train_set = train_fold[:, :-1]          # features: every column except the last
    label_set = train_fold[:, -1]           # label: last column
    validation_set = test_fold[:, :-1]      # held-out features
    sample_proba_validation = model(models=models, train_set=train_set,
                                    label_set=label_set, validation_set=validation_set)
    # Place the probability columns of all models side by side; concatenating the
    # whole list works for any number of models instead of exactly three.
    result_fold = np.concatenate(sample_proba_validation, axis=1)
    final_matrix.append(result_fold)

train: (347, 15), test: (39, 15)
train: (347, 15), test: (39, 15)
train: (347, 15), test: (39, 15)
train: (347, 15), test: (39, 15)
train: (347, 15), test: (39, 15)
train: (347, 15), test: (39, 15)
train: (348, 15), test: (38, 15)
train: (348, 15), test: (38, 15)
train: (348, 15), test: (38, 15)
train: (348, 15), test: (38, 15)


In [23]:
# Display the dimensions of the training feature matrix.
features_train.shape

(386, 14)

In [24]:
# Create the training metadata