In [1]:
import csv
import numpy as np
import ast
import json
import matplotlib.pyplot as plt
from sklearn import neighbors, datasets
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import KFold

In [2]:
import warnings
warnings.filterwarnings('ignore')

# Preapre Data

In [3]:
data_train_austradian = [i.strip().split() for i in open("australian_train1.dat").readlines()] 
data_test_austradian = [i.strip().split() for i in open("australian_test.dat").readlines()]

In [4]:
def process_feature(observe):
    res = ast.literal_eval(observe[0])
    feature_observe = list(res)[:len(observe) - 2]
    label_observe = list(res)[-1]
    return feature_observe,label_observe

In [5]:
labels_train = []
features_train = []
for i, sample in enumerate(data_train_austradian):
    feature_train_sample, label_train_sample = process_feature(sample)
    features_train.append(feature_train_sample)
    labels_train.append(label_train_sample)
# Covert to array
features_train = np.asarray(features_train)    #Shape matrix 386 * 14
print("Features of Train:",features_train.shape)
labels_train = np.asarray(labels_train)        #Shape matrix 386 * 1
print("Labels of Train:",labels_train.shape)
# Process Test Data
features_test = []
labels_test = []
for i, sample in enumerate(data_test_austradian):
    feature_test_sample, label_test_data = process_feature(sample)
    features_test.append(feature_test_sample)
    labels_test.append(label_test_data)
#Covert to array
features_test = np.asarray(features_test)      #Shape matrix 207 * 14
print("Features of Test:",features_test.shape)
labels_test = np.asarray(labels_test)          #Shape matrix 207 * 1
print("Labels of Test:",labels_test.shape)

Features of Train: (386, 14)
Labels of Train: (386,)
Features of Test: (207, 14)
Labels of Test: (207,)


# KNN,Naive Bayes,LogisticRegression algorithms

In [6]:
# KNN algorithms
clf_knn = neighbors.KNeighborsClassifier(n_neighbors=10,p = 2)
clf_knn.fit(features_train,labels_train)
label_predict_knn_prob = clf_knn.predict_proba(features_test)# probability in class
label_predict_knn = clf_knn.predict(features_test) 

In [7]:
#Naive Bayes algorithm
clf_gnb = GaussianNB()
clf_gnb.fit(features_train,labels_train)
label_predict_gnb = clf_gnb.predict(features_test)
label_predict_gnb_prob = clf_gnb.predict_proba(features_test)

In [8]:
#LogisticRegression algorithms
clf_lr = LogisticRegression(random_state=0)
clf_lr.fit(features_train,labels_train)
label_predict_lr = clf_lr.predict(features_test)
label_predict_lr_prob = clf_lr.predict_proba(features_test)

# Cross Validation K Folds

In [9]:
def process_dataset(observe):
    res = ast.literal_eval(observe[0])
    return list(res)

In [10]:
austradian = []
for sample in data_train_austradian:
    austradian.append(process_dataset(sample))
austradian = np.asarray(austradian)
print("Data train =  features + label:",austradian.shape)

Data train =  features + label: (386, 15)


In [11]:
models = [ LogisticRegression(random_state=0),GaussianNB(),neighbors.KNeighborsClassifier(n_neighbors=10,p = 2)]
models

[LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                    intercept_scaling=1, l1_ratio=None, max_iter=100,
                    multi_class='auto', n_jobs=None, penalty='l2',
                    random_state=0, solver='lbfgs', tol=0.0001, verbose=0,
                    warm_start=False),
 GaussianNB(priors=None, var_smoothing=1e-09),
 KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                      metric_params=None, n_jobs=None, n_neighbors=10, p=2,
                      weights='uniform')]

In [12]:
# Create the training metadata

In [13]:
features_train.shape

(386, 14)

In [14]:
# Create variables to store metadata and target
print("========== Create variables to store metadata ===================")
meta_data = np.zeros((len(models) * 2,len(features_train)))
print("meta data features:", meta_data.shape)
meta_targets = np.zeros(len(features_train))
print("meta data labels:", meta_targets.shape)
#Create the cross-validation folds
print("========== Create the cross-validation folds ====================")
KF = KFold(n_splits=10)
meta_index = 0
for train_indices, test_indices in KF.split(features_train):
    print("Size train:",features_train[train_indices].shape)
    print("Size test:",features_train[test_indices].shape)
    for i in range(len(models)):
        learner = models[i]
        learner.fit(features_train[train_indices], labels_train[train_indices])
        predictions = learner.predict_proba(features_train[test_indices])
        meta_data[2*i][meta_index:meta_index + len(test_indices)] = predictions.T[0]
        meta_data[2*i + 1][meta_index:meta_index + len(test_indices)] = predictions.T[1]
    meta_targets[meta_index:meta_index + len(test_indices)] = labels_train[test_indices]
    meta_index += len(test_indices)
#Transpose the metadata
meta_data = meta_data.transpose()
print("Meta data:",meta_data.shape)

meta data features: (6, 386)
meta data labels: (386,)
Size train: (347, 14)
Size test: (39, 14)
Size train: (347, 14)
Size test: (39, 14)
Size train: (347, 14)
Size test: (39, 14)
Size train: (347, 14)
Size test: (39, 14)
Size train: (347, 14)
Size test: (39, 14)
Size train: (347, 14)
Size test: (39, 14)
Size train: (348, 14)
Size test: (38, 14)
Size train: (348, 14)
Size test: (38, 14)
Size train: (348, 14)
Size test: (38, 14)
Size train: (348, 14)
Size test: (38, 14)
Meta data: (386, 6)


In [15]:
meta_data.shape

(386, 6)

In [16]:
from sklearn import svm
clf_svm = svm.SVC()
clf_svm.fit(meta_data, labels_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

# Test and evaluate

In [17]:
features_test

array([[1.0000e+00, 6.5420e+03, 1.1000e+01, ..., 2.0000e+00, 2.2000e+01,
        1.0000e+00],
       [1.0000e+00, 1.8170e+03, 1.0250e+03, ..., 2.0000e+00, 3.2000e+02,
        1.4000e+01],
       [1.0000e+00, 4.8500e+02, 4.2500e+02, ..., 2.0000e+00, 2.2500e+02,
        1.0000e+00],
       ...,
       [1.0000e+00, 4.1420e+03, 5.0000e+00, ..., 2.0000e+00, 4.7000e+02,
        1.0000e+00],
       [0.0000e+00, 2.5750e+03, 5.0000e+00, ..., 2.0000e+00, 4.9100e+02,
        1.0000e+00],
       [1.0000e+00, 4.7670e+03, 2.9000e+01, ..., 2.0000e+00, 0.0000e+00,
        1.5001e+04]])

In [18]:
combining_test_matrix = np.concatenate((label_predict_knn_prob,label_predict_gnb_prob,label_predict_lr_prob),axis = 1)

def model(models, train_set, label_set, validation_set):
    result_list = []
    for model in models:
        clf = model 
        clf.fit(train_set,label_set) 
        result = clf.predict_proba(validation_set)
        result_list.append(result)
    return result_list

result_list = model(models=models,train_set=features_train,label_set=labels_train,validation_set = features_test)

In [19]:
label_preict = clf_svm.predict(combining_test_matrix)

In [20]:
target_names = ['class 1','class 2']
print(classification_report(labels_test, label_preict, target_names=target_names))

              precision    recall  f1-score   support

     class 1       0.80      0.65      0.72        81
     class 2       0.80      0.90      0.85       126

    accuracy                           0.80       207
   macro avg       0.80      0.78      0.78       207
weighted avg       0.80      0.80      0.80       207

