In [1]:
import ast
import scipy.io
import numpy as np
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn import neighbors, datasets
import statistics
import pandas as pd
import pickle

In [2]:
data_dat = ['australian.dat','bupa.dat','glass.dat']
cv_filename = ['cv_australian.mat','cv_bupa.mat','cv_glass.mat']

# KNN,Naive Bayes,LogisticRegression algorithms

In [3]:
models = [ LogisticRegression(random_state=0),GaussianNB(),neighbors.KNeighborsClassifier(n_neighbors=10,p = 2)]

In [4]:
def training_models(models,features_train,targets_train,features_test):
    meta_proba = np.zeros((len(models) * 2, features_test.shape[0]))
    for i in range(len(models)):
        learner = models[i]
        learner.fit(features_train,targets_train)
        predictions_proba = learner.predict_proba(features_test)
        meta_proba[2*i][:] = predictions_proba.T[0]
        meta_proba[2*i + 1][:] = predictions_proba.T[1]
    meta_proba = meta_proba.transpose()
    return meta_proba

# Caculate Mean and Variance

In [5]:
def combining_sum_product(matrix1, matrix2, matrix3):
    #Combining Algorithms use Sum Rules
    combining_sum_rule = matrix1 + matrix2 + matrix3
    combining_sum_rule = combining_sum_rule * 1/3   #Mean elements in matrix
    #Combining algorithms PRODUCT RULES
    combining_product_rule =  matrix1 * matrix2 * matrix3 * 1/3
    return combining_sum_rule,combining_product_rule

In [6]:
#Combining algorithms MIN RULES
#Function compare element row in matrix => Min value 
def min_element(value1, value2, value3):
    min_value = min([value1, value2, value3])
    return min_value 

def row_in_matrix(i,matrix_1, matrix_2, matrix_3):
    min_value_class1 = min_element(value1=matrix_1[i][0], value2=matrix_2[i][0], value3= matrix_3[i][0])
    min_value_class2 = min_element(value1=matrix_1[i][1], value2=matrix_2[i][1], value3= matrix_3[i][1])
    return min_value_class1, min_value_class2
    
def min_rule(matrix_1, matrix_2, matrix_3):
    combining_min_rule = np.zeros((matrix_1.shape))
    
    #Store variables to combining matrix
    for i in range(len(matrix_1)):
        combining_min_rule[i] = row_in_matrix(i, matrix_1=matrix_1,
                                         matrix_2=matrix_2,matrix_3=matrix_3)
    return combining_min_rule


In [7]:
#Combining algorithms MAX RULES
#Create variables to store combining matrix
def max_element(value1, value2, value3):
        max_value = max([value1, value2, value3])
        return max_value 
#Function compare element row in matrix => Max value 
def row_in_matrix(i,matrix_1, matrix_2, matrix_3):
        max_value_class1 = max_element(value1=matrix_1[i][0], value2=matrix_2[i][0], value3= matrix_3[i][0])
        max_value_class2 = max_element(value1=matrix_1[i][1], value2=matrix_2[i][1], value3= matrix_3[i][1])
        return max_value_class1, max_value_class2
    
def max_rule(matrix_1, matrix_2, matrix_3):
    combining_max_rule = np.zeros((matrix_1.shape))
    
    #Store variables to combining matrix
    for i in range(len(matrix_1)):
        combining_max_rule[i] = row_in_matrix(i, matrix_1=matrix_1,
                                             matrix_2=matrix_2,matrix_3=matrix_3)
    return combining_max_rule


In [8]:
def target(combining_matrix):
    targets_combining_algorithm = []
    for row in combining_matrix:
        result = 1 if row[0] > row[1] else 2
        targets_combining_algorithm.append(result)
    targets_combining_algorithm = np.asarray(targets_combining_algorithm)
    return targets_combining_algorithm
    
def error_combining_rule(target_combining_rule, target_test):
    boolen_result = []
    for i in range(len(target_test)):
        result = 1 if target_combining_rule[i] != target_test[i] else 0
        boolen_result.append(result)
    mean_combining_rule = statistics.mean(boolen_result)
#     variance_combining_rule = statistics.variance(boolen_result)
    return mean_combining_rule

In [9]:
arrErrSum = []
arrErrProd = []
arrErrMax= []
arrErrMin = []

In [10]:
def process_data_dat(filename_dat):
    samples = []
    data_set = [i.strip().split() for i in open("../data/" + filename_dat).readlines()] 
    for sample in data_set:
        res = ast.literal_eval(sample[0])
        sample = list(res)
        samples.append(sample)
    samples = np.asarray(samples)
    return samples

def process_cv_filename(cv_file):
    cv_mat = scipy.io.loadmat('../data/' + cv_file)
    return cv_mat['cv']

def split_train_test_by_id(data,cv_file, niters, nfolds):
    for i in range(niters):
        for j in range(nfolds):
            train_index = []
            cv_test = process_cv_filename(cv_file)
            test_index = cv_test[0][i*nfolds + j]
            test_index = np.concatenate(([i-1 for i in test_index]))
#             print("LOOP:",i*nfolds + j)
            train_index.append([i for i in range(len(data)) if i not in test_index]) 
            train_index = np.asarray(train_index[0])
#             print(data[train_index])
#             print(data[test_index])
            meta_proba = training_models(models,features_train=data[train_index][:,0:data.shape[1] - 1],
                               targets_train=data[train_index][:,-1], features_test=data[test_index][:,0:data.shape[1] - 1])
            predict_lr_proba = meta_proba[:,0:2]
            predict_gnb_proba = meta_proba[:,2:4]
            predict_knn_proba = meta_proba[:,4:6]
            #Caculate Mean and Variance by Sum Rules:
            combining_sum_rule,combining_product_rule = combining_sum_product(predict_lr_proba,predict_gnb_proba,
                                                                             predict_knn_proba)
            targets_combining_sum_rule = target(combining_sum_rule)
            mean_combining_sum_rule = error_combining_rule(targets_combining_sum_rule, data[test_index][:,-1] )
            arrErrSum.append(mean_combining_sum_rule)
            
            #Caculate Mean and Variance by Product Rules:
            targets_combining_product_rule = target(combining_product_rule)
            mean_combining_product_rule = error_combining_rule(targets_combining_product_rule, data[test_index][:,-1] )
            arrErrProd.append(mean_combining_product_rule)
            
            #Caculate Mean and Variance by Min Rules:
            combining_min_rule = min_rule(predict_lr_proba,predict_gnb_proba,predict_knn_proba)
            targets_combining_min_rule = target(combining_min_rule)
            mean_combining_min_rule = error_combining_rule(targets_combining_min_rule, data[test_index][:,-1] )
            arrErrMin.append(mean_combining_min_rule)
            
            #Caculate Mean and Variance by Max Rules:
            combining_max_rule = max_rule(predict_lr_proba,predict_gnb_proba,predict_knn_proba)
            targets_combining_max_rule = target(combining_max_rule)
            mean_combining_max_rule = error_combining_rule(targets_combining_max_rule, data[test_index][:,-1] )
            arrErrMax.append(mean_combining_max_rule)
#     #Caculate mean and variance Total Sum Rule
#     mean_sum = statistics.mean(arrErrSum)
#     variance_sum = statistics.variance(arrErrSum)
#     #Caculate mean and variance Total Product Rule
#     mean_product = statistics.mean(arrErrProd)
#     variance_product = statistics.variance(arrErrProd)
#     #Caculate mean and variance Total Min Rule
#     mean_min = statistics.mean(arrErrMin)
#     variance_min = statistics.variance(arrErrMin)
#     #Caculate mean and variance Total Max Rule
#     mean_max = statistics.mean(arrErrMax)
#     variance_max = statistics.variance(arrErrMax)
    
    pickle_file = {'Dataset':cv_file,'arrErrSum':arrErrSum,'arrErrProd':arrErrProd,
                   'arrErrMin':arrErrMin,'arrErrMax':arrErrMax}
#     pickle_file = {'Dataset':cv_file,'mean_sum':mean_sum,'variance_sum':variance_sum,
#                    'mean_product':mean_product,'variance_product':variance_product,
#                   'mean_min':mean_min,'variance_min':variance_min,
#                   'mean_max':mean_max,'variance_max':variance_max}
    pickle_out = open("dict_{}.pickle".format(cv_file),"wb")
    pickle.dump(pickle_file, pickle_out)
    pickle_out.close()
    
    

In [11]:
dataframe = []
for i in range(len(data_dat)):
    data = process_data_dat(data_dat[i])
    split_train_test_by_id(data,cv_filename[i],niters=3,nfolds=10)
    pickle_in = open("dict_{}.pickle".format(cv_filename[i]),"rb")
    example_dict = pickle.load(pickle_in)
    dataframe.append(example_dict)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

In [12]:
import pandas as pd
data = pd.DataFrame.from_dict(dataframe)
data

Unnamed: 0,Dataset,arrErrSum,arrErrProd,arrErrMin,arrErrMax
0,cv_australian.mat,"[0.14492753623188406, 0.3188405797101449, 0.18...","[0.14492753623188406, 0.3188405797101449, 0.18...","[0.15942028985507245, 0.3188405797101449, 0.20...","[0.15942028985507245, 0.3188405797101449, 0.20..."
1,cv_bupa.mat,"[0.14492753623188406, 0.3188405797101449, 0.18...","[0.14492753623188406, 0.3188405797101449, 0.18...","[0.15942028985507245, 0.3188405797101449, 0.20...","[0.15942028985507245, 0.3188405797101449, 0.20..."
2,cv_glass.mat,"[0.14492753623188406, 0.3188405797101449, 0.18...","[0.14492753623188406, 0.3188405797101449, 0.18...","[0.15942028985507245, 0.3188405797101449, 0.20...","[0.15942028985507245, 0.3188405797101449, 0.20..."


In [13]:
def mean_rules(cv_file,arrErrSum,arrErrProd,arrErrMin,arrErrMax):
    #Caculate mean and variance Total Sum Rule
    mean_sum = statistics.mean(arrErrSum)
    variance_sum = statistics.variance(arrErrSum)
    #Caculate mean and variance Total Product Rule
    mean_product = statistics.mean(arrErrProd)
    variance_product = statistics.variance(arrErrProd)
    #Caculate mean and variance Total Min Rule
    mean_min = statistics.mean(arrErrMin)
    variance_min = statistics.variance(arrErrMin)
    #Caculate mean and variance Total Max Rule
    mean_max = statistics.mean(arrErrMax)
    variance_max = statistics.variance(arrErrMax)
    dict_data = {'Dataset':cv_file,'mean_sum':mean_sum,'variance_sum':variance_sum,
                   'mean_product':mean_product,'variance_product':variance_product,
                  'mean_min':mean_min,'variance_min':variance_min,
                  'mean_max':mean_max,'variance_max':variance_max}
    return dict_data

In [14]:
dict_data = []
for i in range(len(cv_filename)):
    dict_data.append(mean_rules(cv_filename[i],data['arrErrSum'][i],data['arrErrProd'][i],
                               data['arrErrMin'][i],data['arrErrMax'][i]))

In [15]:
import pandas as pd
print('Bảng 1: Classification error của các fixed combining rule')
data_1 = pd.DataFrame.from_dict(dict_data)
data_1

Bảng 1: Classification error của các fixed combining rule


Unnamed: 0,Dataset,mean_sum,variance_sum,mean_product,variance_product,mean_min,variance_min,mean_max,variance_max
0,cv_australian.mat,0.211111,0.001893,0.214976,0.001957,0.218357,0.002244,0.218357,0.002244
1,cv_bupa.mat,0.254323,0.005015,0.256269,0.005176,0.263787,0.00573,0.263787,0.00573
2,cv_glass.mat,0.341963,0.021246,0.350981,0.022964,0.351447,0.022003,0.351447,0.022003


In [16]:
data['arrErrSum'][0]

[0.14492753623188406,
 0.3188405797101449,
 0.18840579710144928,
 0.17391304347826086,
 0.17391304347826086,
 0.18840579710144928,
 0.2898550724637681,
 0.15942028985507245,
 0.15942028985507245,
 0.2318840579710145,
 0.17391304347826086,
 0.2753623188405797,
 0.21739130434782608,
 0.2318840579710145,
 0.2608695652173913,
 0.2463768115942029,
 0.21739130434782608,
 0.2463768115942029,
 0.17391304347826086,
 0.18840579710144928,
 0.17391304347826086,
 0.2608695652173913,
 0.2463768115942029,
 0.17391304347826086,
 0.18840579710144928,
 0.21739130434782608,
 0.15942028985507245,
 0.21739130434782608,
 0.2318840579710145,
 0.2028985507246377]

In [34]:
def win_compare_error(array1, array2):
    win_result = []
    for i in range(len(array1)):
        win_result.append(1 if array1[i] < array2[i] else 0)
    return sum(win_result)

def equal_compare_error(array1, array2):
    equal_result = []
    for i in range(len(array1)):
        equal_result.append(1 if array1[i] = array2[i] else 0)
    return sum(equal_result)

def loss_compare_error(array1, array2):
    loss_result = []
    for i in range(len(array1)):
        loss_result.append(1 if array1[i] = array2[i] else 0)
    return sum(loss_result)

In [35]:
win_compare_error(data['arrErrSum'][0],data['arrErrProd'][0])

6

In [18]:
import scipy
#Wilcoxon
print("Wilcoxon Sum vs Product Rules:",scipy.stats.wilcoxon(data['arrErrSum'][0],data['arrErrProd'][0],zero_method='wilcox'))
print("Wilcoxon Sum vs Min Rules:",scipy.stats.wilcoxon(data['arrErrSum'][0],data['arrErrMin'][0],zero_method='wilcox'))
print("Wilcoxon Sum vs Max Rules:",scipy.stats.wilcoxon(data['arrErrSum'][0],data['arrErrMax'][0],zero_method='wilcox'))

Wilcoxon Sum vs Product Rules: WilcoxonResult(statistic=0.0, pvalue=0.023140931308743732)
Wilcoxon Sum vs Min Rules: WilcoxonResult(statistic=27.0, pvalue=0.00830538569425914)
Wilcoxon Sum vs Max Rules: WilcoxonResult(statistic=27.0, pvalue=0.00830538569425914)


