# AutoFIS code experimenting

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
# data processing
import pandas as pd
import numpy as np
import scipy.stats as sp
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import fbeta_score, make_scorer
from sklearn.model_selection import StratifiedKFold


# data visualization
import seaborn as sns
import matplotlib.pyplot as plt
# The bare 'seaborn' style name was deprecated in matplotlib 3.6 (renamed to
# 'seaborn-v0_8') and is rejected by newer releases; prefer the new name and
# fall back to the old one on installations that do not know it yet.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')

# other tools
from utils import dataframe_utils, preprocess_utils, feature_importance_utils, model_utils, preprocess_utils
from joblib import dump, load

from sklearn import datasets

## Importing benchmark dataset Iris

In [8]:
# Load the Iris benchmark and put features and label into a single frame.
# The feature columns keep their default integer labels; the label column is 'target'.
iris = datasets.load_iris()
df_iris = pd.DataFrame(iris['data']).assign(target=iris['target'])
df_iris

Unnamed: 0,0,1,2,3,target
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,2
146,6.3,2.5,5.0,1.9,2
147,6.5,3.0,5.2,2.0,2
148,6.2,3.4,5.4,2.3,2


In [9]:
# Feature matrix and label vector taken straight from the loaded dataset.
X, y = iris['data'], iris['target']

In [10]:
# Hold out 30% of the samples for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=42,
)

### Fuzzification

In [16]:
from autoFIS.autoFIS.lecture import Lecture
from autoFIS.autoFIS.fuzzification import Fuzzification
from autoFIS.autoFIS.formulation.formulation import Formulation

01 - Prepare Data (this step will be eliminated in the future)

In [17]:
# Compute the class-related parameters from the full Iris frame; the first and
# last return values are not used in this notebook and are discarded.
reader = Lecture()
_, target_class_one_hot, frequency_of_classes, count_of_classes, _ = reader.calculate_parameters(df_iris)
# X, cBin, fClasses, dictFreq, _ = reader.calculate_parameters(df_iris)
# From what I understood here, the output class also has to be one-hot encoded.

# Maybe it is worth creating a one-hot Encoder for the categorical attributes and for the output,
# and encapsulating it. Something similar to this data-reading module, but better.

Fuzzification (parameters are set by the user)

In [18]:
# -------------------------
# Fuzzification parameters
# -------------------------
# One flag per feature column (Iris has four).
# NOTE(review): 0 presumably marks a numeric attribute and 1 a categorical one — confirm.
categorical_attributes = [0, 0, 0, 0]  # <<=====
fuzzification_type = 'normal'  # "tukey", "normal"
fuzzy_sets_by_attribute = 3  # 3, 5, 7
enable_negation = False

# -------------------------
# Formulation parameters
# -------------------------
# ordem_max_premises = 2
max_size_of_premise = 2
t_norm = 'prod'  # "min", "prod"
# Area filter parameters:
criteria_area = "cardinalidade_relativa"  # "cardinalidade_relativa", "frequencia_relativa"
area_threshold = 0.05
# PCD filter parameter:
# NOTE(review): a two-element list here, while main() further down uses a scalar 1 — confirm the expected shape.
is_enable_pcd = [1, 0]
# Overlapping filter parameters:
is_enable_overlapping = [1, 1]
overlapping_threshold = 0.95

In [19]:
# Configure the fuzzifier with the user-chosen settings from the previous cell.
# NOTE(review): this is fed the full X rather than X_train — confirm that is intended.
fuzzifier = Fuzzification()
fuzzifier.build_membership_functions(
    X,
    categorical_attributes,
    fuzzification_type,
    fuzzy_sets_by_attribute,
    enable_negation,
)

Formulation

In [20]:
# Bundle the filter settings into the three arguments handed to the premise generator.
par_area = [criteria_area, area_threshold]  # area filter: [criterion, threshold]
par_over = [is_enable_overlapping, overlapping_threshold]  # overlapping filter: [enable flags, threshold]
par_pcd = is_enable_pcd  # PCD filter flags, passed through unchanged

In [21]:
# Formulation stage: generate candidate premises and drop tree levels where nothing survived.
# NOTE(review): this cell currently fails with "NameError: name 'ux_train' is not defined".
# ux_train, cbin_train, ref_attributes, premises_by_attribute, num_premises_by_attribute,
# premises_contain_negation and report are not assigned by any earlier cell shown in this
# notebook; they presumably have to come from the fuzzification stage — TODO wire them up.
f2 = Formulation(ux_train, cbin_train, ref_attributes, premises_by_attribute,
                         num_premises_by_attribute, premises_contain_negation)

# Inputs given by user
arbol = f2.generate_premises(max_size_of_premise, t_norm, par_area, par_over, par_pcd)

# One status flag per tree level: 1 when the level's first element is non-empty, else 0.
status = [0 if not i[0] else 1 for i in arbol]
sum_status = sum(status)
if sum_status != len(arbol):
    if sum_status == 0:
        # No level kept any premise, so the later stages have nothing to work with.
        raise ValueError("Error in Formulation Module. Any premise survived. "
                            "Sorry, you can not continue in the next stage."
                            "\nTry to change the configuration")
    else:
        # Keep only the levels that still hold premises (the swap leaves the
        # unfiltered tree in `arb`, which is not used afterwards).
        arb = [i for i in arbol if i[0]]
        arbol, arb = arb, arbol
print('Done with Formulation...')
# Number of classes = number of columns of the binary class matrix.
number_classes = cbin_train.shape[1]

report.append("\nFormulation:\n-----------------")
report.append("Elementos acorde a la profundidad " + str(len(arbol)) + " del arbol")
for i in range(len(arbol)):
    # Record the shape of the second element of each level.
    report.append('Profundidad ' + str(i + 1) + ': ' + str(arbol[i][1].shape))
    # print 'Profundidad ' + str(i + 1) + ': ' + str(arbol[i][1].shape)

NameError: name 'ux_train' is not defined

In [115]:
import os
from senfis.autoFIS.autoFIS.autoFIS_one_zip import cv_onezip
import timeit


# # # # # # # 
# Run autoFIS old method
# # # # # # # 

def define_parameters(pars):
    """Prepend the categorical-attribute mask to the shared parameter list.

    Parameters
    ----------
    pars : list
        Pipeline parameters without the leading categorical mask.

    Returns
    -------
    list
        ``[categorical_mask] + pars``. When no module-level
        ``categorical_mask`` is defined, ``0`` takes its place so the
        positions of the remaining parameters are preserved.
    """
    try:
        mask = categorical_mask
    except NameError:
        # A missing global raises NameError, not KeyError. The previous handler
        # could never run, referenced an undefined `database_name`, and returned
        # a bare [0] that dropped `pars` (breaking positional lookups downstream).
        print("The categorical mask was not found.\nIt was assumed that all attributes are numeric")
        # NOTE(review): 0 as "all attributes numeric" follows the original fallback — confirm
        # that the fuzzification stage accepts a scalar here.
        mask = 0
    return [mask] + list(pars)


def run_autofis(current_folder_path):
    """Run the legacy autoFIS cross-validation on every ``*_csv.zip`` file in a folder.

    A fresh ``Experimento_times.csv`` is written into ``current_folder_path``;
    one row (dataset name, elapsed seconds) is appended for every database
    that ``cv_onezip`` processes successfully.

    Parameters
    ----------
    current_folder_path : str
        Folder that holds the zipped databases and receives the timing file.
    """
    # -------------------------
    # Fuzzification parameters
    # -------------------------
    triangular_fuzzification_type = "tukey"  # "tukey", "normal"
    num_partitions_by_attribute = 3  # 3, 5, 7
    is_enable_negation = 0  # 0, 1

    # -------------------------
    # Formulation parameters
    # -------------------------
    t_norm = "prod"  # "min", "prod"
    ordem_max_premises = 2
    # Area filter parameters:
    criteria_area = "cardinalidade_relativa"  # "cardinalidade_relativa", "frequencia_relativa"
    area_threshold = 0.1
    # PCD filter parameter:
    is_enable_pcd = [1, 0]
    # Overlapping filter parameters:
    is_enable_overlapping = [1, 1]
    overlapping_threshold = 0.95

    # -------------------------
    # Association: ex Splitting
    # -------------------------
    method_association = "CD"  # "MQR", "PMQR", "CD", "PCD", "freq"

    # -------------------------
    # Aggregation
    # -------------------------
    method_aggregation = "MQR"  # "MQR", "PMQR", "intMQR", "CD", "PCD", "max"

    # Positional parameter list; define_parameters() prepends the categorical mask.
    parameters = [triangular_fuzzification_type, num_partitions_by_attribute, t_norm, is_enable_negation,
                  ordem_max_premises, criteria_area, area_threshold, is_enable_pcd,
                  is_enable_overlapping, overlapping_threshold,
                  method_association, method_aggregation]

    print(current_folder_path)

    # Every zip archive in the folder whose name ends in "_csv.zip" is a database to evaluate.
    databases = [archivo for archivo in os.listdir(current_folder_path)
                 if archivo.endswith("_csv.zip")]
    print(databases)

    # Start a new timing file with just the header line.
    times_path = os.path.join(current_folder_path, "Experimento_times.csv")
    with open(times_path, 'w') as file_times:
        file_times.write("Dataset" + ", " + "Time(s)" + '\n')

    # Evaluate each database (zip file)
    for data in databases:
        t0 = timeit.default_timer()
        try:
            parameters_database = define_parameters(parameters)
            achievement = cv_onezip(current_folder_path, data, parameters_database)
            if achievement == 0:
                raise ValueError("Problems in database: " + "<" + data + ">")
        except ValueError as e:
            print(e)
            achievement = 0
        tf = timeit.default_timer()

        if achievement:
            # data[:-16] drops the last 16 characters, i.e. the length of a
            # "-10-fold_csv.zip" suffix.
            with open(times_path, 'a') as file_times:
                file_times.write(data[:-16] + ', ' + str(tf - t0) + '\n')

In [9]:
# This file processes one database using cross-validation.
# The database (a zip file) is already split into 10 folds (CSV files) for the cross-validation.

__author__ = 'jparedes'

import os
import zipfile
from .autoFIS_one_cv import autofis_onecv
from numpy import mean, std


def cv_onezip(path_databases, zip_file_name, parameters, path_output=0):
    """Run autoFIS on every cross-validation fold stored in one zip file.

    The archive must contain 10 or 20 members, read as alternating
    train/test files. Each pair is handed to ``autofis_onecv``. On success
    a text report with the per-fold messages and aggregated metrics is
    written to ``path_output``.

    Parameters
    ----------
    path_databases : str
        Folder that contains the zip file.
    zip_file_name : str
        File name of the zipped database.
    parameters : list
        Parameter list forwarded unchanged to ``autofis_onecv``.
    path_output : str or int, optional
        Folder for the report / error file. The default ``0`` means
        "use ``path_databases``".

    Returns
    -------
    int
        ``1`` when all folds were processed and the report was written,
        ``0`` when a ``ValueError`` occurred (wrong member count or a failed fold).
    """
    if path_output == 0:
        path_output = path_databases
    zipFilePath = os.path.join(path_databases, zip_file_name)

    try:
        with zipfile.ZipFile(zipFilePath, 'r') as z:
            files_cv = z.namelist()

        number_files_zip = len(files_cv)
        if number_files_zip not in (10, 20):
            raise ValueError("This module works with a zip file to 10cv or 5cv. "
                             "For this reason, it is expected 20 or 10 files inside the zip file")
        if number_files_zip == 20:
            # Move the first two members to the end of the list.
            # NOTE(review): presumably because fold 10 is listed before fold 1 — confirm.
            a = files_cv[2:] + files_cv[0:2]
        else:  # number_files_zip == 10
            a = files_cv

        # Members alternate train, test, train, test, ...
        list_train, list_test = a[::2], a[1::2]

        msg = []
        number_cv_pairs = number_files_zip // 2
        ac_train = number_cv_pairs * [0]
        ac_test = number_cv_pairs * [0]
        auc_train = number_cv_pairs * [0]
        auc_test = number_cv_pairs * [0]

        num_rules = number_cv_pairs * [0]
        total_rule_length = number_cv_pairs * [0]

        for i in range(number_cv_pairs):
            print('Fold nº: ',i)
            train_file = list_train[i]
            test_file = list_test[i]

            message, indicators = autofis_onecv(zipFilePath, train_file, test_file, parameters)
            msg.append(message)

            if indicators[0] == 0:
                name_error = os.path.join(path_output, 'ERROR') + zip_file_name[:-13]
                with open(name_error, 'w') as fail_error:
                    fail_error.write('Error in CV:' + str(i + 1))
                    # autofis_onecv may hand back an exception object as the message;
                    # str() prevents a TypeError on concatenation.
                    fail_error.write("\n" + str(message))
                raise ValueError("Problem detected in CV " + str(i + 1))

            ac_train[i], ac_test[i] = indicators[1][0], indicators[1][1]
            auc_train[i], auc_test[i] = indicators[1][2], indicators[1][3]
            num_rules[i] = indicators[1][4][0]
            total_rule_length[i] = indicators[1][4][1]

        separator = '\n\n' + str(4 * '===============================') + '\n\n'
        filename = os.path.join(path_output, 'Report of ') + zip_file_name[:-8]
        # The context manager closes the report even if a write fails.
        with open(filename, 'w') as target:
            target.write('Parameters: ' + str(parameters))
            for i2 in range(number_cv_pairs):
                target.write(separator)
                target.write('CV-' + str(i2 + 1) + '\n')
                target.write('\n'.join(msg[i2]))

            target.write(separator)
            target.write('Accuracy training: ' + str(mean(ac_train)) + ', ' + str(std(ac_train)) + '\n')
            target.write('Accuracy testing: ' + str(mean(ac_test)) + ', ' + str(std(ac_test)) + '\n')
            target.write('AUC training: ' + str(mean(auc_train)) + ', ' + str(std(auc_train)) + '\n')
            target.write('AUC testing: ' + str(mean(auc_test)) + ', ' + str(std(auc_test)) + '\n')
            target.write('Number of rules: ' + str(mean(num_rules)) + '\n')
            target.write('Total Rule Length: ' + str(mean(total_rule_length)))

        achievement = 1

        print("win ", zip_file_name)

    except ValueError as e:
        print(e)
        achievement = 0

    return achievement


In [18]:
# This file runs autoFIS for a single cross-validation fold

import autoFIS.autoFIS.utils_autofis as toolfis
# import .utils_autofis as toolfis
from autoFIS.autoFIS.formul.autoFIS.formulation import Formulation
from autoFIS.autoFIS.association import Association
from autoFIS.autoFIS.aggregation import Aggregation
from autoFIS.autoFIS.decisions import Decisions
from autoFIS.autoFIS.evaluation import Evaluation


def autofis_onecv(file_zip, file_train, file_test, parameters):
    """Run the autoFIS pipeline on one cross-validation fold.

    Stages, in order: lecture + fuzzification, formulation, association,
    aggregation, decision and evaluation.

    Parameters
    ----------
    file_zip : str
        Path of the zip archive that holds the fold files.
    file_train, file_test : str
        Names of the train and test members inside ``file_zip``.
    parameters : list
        Positional configuration. This function reads index 3 (t-norm),
        5 (maximum premise size), 11 (association method) and
        12 (aggregation method); the whole list is also forwarded to the
        ``toolfis`` helpers.

    Returns
    -------
    tuple
        ``(report, metricas)``. On success ``report`` is a list of text
        lines and ``metricas`` is
        ``[1, [acc_train, acc_test, auc_train, auc_test, metrics_test[4]]]``.
        When a stage raises ``ValueError``, ``report`` is the error message
        as a string and ``metricas`` is ``[0, <explanation>]``.
    """
    # General parameters
    t_norm = parameters[3]
    max_size_of_premise = parameters[5]
    association_method = parameters[11]
    aggregation_method = parameters[12]

    # Formulation parameters:
    par_area, par_over, par_pcd = toolfis.get_formulation_parameters(parameters)

    # 1. Lecture & Fuzzification
    out1 = toolfis.lecture_fuz_one_cv(file_zip, file_train, file_test, parameters)
    ux_train, cbin_train = out1[0]
    ux_test, cbin_test = out1[1]
    num_premises_by_attribute, premises_by_attribute, ref_attributes, premises_contain_negation = out1[2]
    freq_classes = out1[3]

    report = []  # To save our results

    try:
        # 3. Formulation
        f2 = Formulation(ux_train, cbin_train, ref_attributes, premises_by_attribute,
                         num_premises_by_attribute, premises_contain_negation)
        # Inputs given by user
        arbol = f2.gen_ARB(max_size_of_premise, t_norm, par_area, par_over, par_pcd)

        # Keep only the tree levels whose first element is non-empty.
        surviving_levels = [level for level in arbol if level[0]]
        if len(surviving_levels) != len(arbol):
            if not surviving_levels:
                raise ValueError("Error in Formulation Module. Any premise survived. "
                                 "Sorry, you can not continue in the next stage."
                                 "\nTry to change the configuration")
            arbol = surviving_levels
        print('Done with Formulation...')
        # Number of classes = number of columns of the binary class matrix.
        number_classes = cbin_train.shape[1]

        report.append("\nFormulation:\n-----------------")
        report.append("Elementos acorde a la profundidad " + str(len(arbol)) + " del arbol")
        for depth, level in enumerate(arbol):
            report.append('Profundidad ' + str(depth + 1) + ': ' + str(level[1].shape))

        # 4. Association: ex-Division
        f3 = Association(arbol, cbin_train)
        premises_ux_by_class = f3.division(association_method)

        # Every class must have received at least one premise.
        status = [0 if not i[0] else 1 for i in premises_ux_by_class]
        if sum(status) != number_classes:
            raise ValueError("Error in Division Module. Some classes did not get premises. "
                             "Sorry, you can not continue in the next stage."
                             "\nTry to change the configuration")
        print('Done with Association...')

        # 5. Aggregation:
        f4 = Aggregation(premises_ux_by_class, cbin_train)
        output_aggregation = f4.aggregation(aggregation_method)

        premises_weights_names = output_aggregation[0]
        estimation_classes = output_aggregation[1]

        status = [0 if not i[0] else 1 for i in premises_weights_names]
        if sum(status) != number_classes:
            raise ValueError("Error in Aggregation Module. Some classes did not get premises. "
                             "Sorry, you can not continue in the next stage."
                             "\nTry to change the configuration")
        print('Done with Aggregation...')

        final_premises_classes = []
        report.append("\n\nPremises:\n=========")
        for i in range(len(premises_weights_names)):
            report.append("Premises of Class " + str(i) + ": " + str(premises_weights_names[i][0]))
            final_premises_classes.append(premises_weights_names[i][0])
            report.append("weights_" + str(i) + ": " + str(premises_weights_names[i][1].T))

        # 6. Decision:
        f5 = Decisions(estimation_classes, freq_classes)
        train_bin_prediction = f5.dec_max_pert()
        print('Done with Decision...')

        # 7. Evaluation
        f6 = Evaluation(premises_weights_names, final_premises_classes, freq_classes)
        metrics_train = f6.eval_train(cbin_train, train_bin_prediction)
        metrics_test = f6.eval_test(cbin_test, ux_test, t_norm)
        print('Done with Evaluation...')

        report.append("\nEvaluation Training:\n---------------------------")
        report.append("Accuracy on train dataset: " + str(metrics_train[0]))
        report.append("AUC in train dataset: " + str(metrics_train[1]))
        report.append("Recall: " + str(metrics_train[3]))
        report.append('Confusion matrix:\n' + str(metrics_train[2]))

        report.append("\nEvaluation Testing:\n---------------------------")
        report.append("Accuracy on test dataset: " + str(metrics_test[0]))
        report.append("AUC in test dataset: " + str(metrics_test[1]))
        report.append("Recall: " + str(metrics_test[3]))
        report.append("Confusion matrix:\n" + str(metrics_test[2]))

        # Metrics to eval: accuracy_test, auc_test,
        #                  [num_regras, total_rule_length, tamano_medio_das_regras]]
        metricas = [1, [metrics_train[0], metrics_test[0], metrics_train[1], metrics_test[1], metrics_test[4]]]

    except ValueError as e:
        print(e)
        # Hand back the message text, not the exception object, so callers can
        # concatenate it or write it to a file directly.
        report = str(e)
        metricas = [0, "No se termino el proceso, se detuvo en algun etapa"]

    return report, metricas


def main(filezip_name=None, train_file="saheart-10-7tra.csv", test_file="saheart-10-7tst.csv"):
    """Run a single-fold autoFIS demo and print its metrics.

    Parameters
    ----------
    filezip_name : str, optional
        Path of the zipped database. When omitted, the author's original
        local Windows path to ``saheart-10-fold_csv.zip`` is used.
    train_file, test_file : str, optional
        Names of the train / test members inside the zip file.
    """
    if filezip_name is None:
        # Raw string: the original literal mixed escaped and unescaped backslashes and
        # contained the invalid escape sequence "\P". The resulting value is unchanged.
        filezip_name = r"D:\Jorg\Projects\autoFIS\test\datas" + '\\' + 'saheart-10-fold_csv.zip'

    # -------------------------
    # Fuzzification parameters
    # -------------------------
    categorical_bool_attributes = [0, 0, 0, 0, 1, 0, 0, 0, 0]
    triangular_fuzzification_type = "normal"  # "tukey", "normal"
    num_partitions_by_attribute = 3
    t_norm = "min"  # "min", "prod"
    is_enable_negation = 0  # 0, 1

    # -------------------------
    # Formulation parameters
    # -------------------------
    ordem_max_premises = 2
    # Area filter parameters:
    criteria_area = "cardinalidade_relativa"  # "cardinalidade_relativa", "frequencia_relativa"
    area_threshold = 0.05
    # PCD filter parameter:
    # NOTE(review): scalar flags here, while run_autofis uses the lists [1, 0] / [1, 1] — confirm
    # which shape the Formulation stage expects.
    is_enable_pcd = 1
    # Overlapping filter parameters:
    is_enable_overlapping = 1
    overlapping_threshold = 0.95

    # -------------------------
    # Association
    # -------------------------
    method_association = "MQR"  # "MQR", "PMQR", "CD", "PCD", "freq_max"

    # -------------------------
    # Aggregation
    # -------------------------
    method_aggregation = "MQR"  # "MQR", "PMQR", "CD", "PCD", "freq_max"

    # Positional parameter list in the order autofis_onecv indexes it.
    parameters = [categorical_bool_attributes, triangular_fuzzification_type,
                  num_partitions_by_attribute, t_norm, is_enable_negation,
                  ordem_max_premises, criteria_area, area_threshold, is_enable_pcd,
                  is_enable_overlapping, overlapping_threshold,
                  method_association, method_aggregation]

    result_1cv = autofis_onecv(filezip_name, train_file, test_file, parameters)
    # Only the metrics part of the (report, metrics) pair is printed.
    print(result_1cv[1])


# Run the single-fold demo only when this code is executed directly
# (module name '__main__'), not when it is imported.
if __name__ == '__main__':
    main()

In [33]:
# Helper methods

def get_formulation_parameters(parameters):
    """Pick the formulation filter settings out of the positional parameter list.

    Reads positions 6 to 10 of ``parameters`` and returns a tuple
    ``(par_area, par_over, par_pcd)`` where ``par_area`` is
    ``[parameters[6], parameters[7]]``, ``par_pcd`` is ``parameters[8]``
    and ``par_over`` is ``[parameters[9], parameters[10]]``.
    """
    # Area filter: criterion followed by its threshold.
    par_area = [parameters[6], parameters[7]]
    # PCD filter switch, passed through untouched.
    par_pcd = parameters[8]
    # Overlapping filter: enable flag(s) followed by the threshold.
    par_over = [parameters[9], parameters[10]]
    return par_area, par_over, par_pcd


def lecture_fuz_one_cv(zipFilePath, file_train, file_test, parameters):
    """Read one cross-validation fold from a zip file and fuzzify it.

    ``parameters[0:3]`` supply the categorical mask, the fuzzification type
    and the number of partitions; ``parameters[4]`` switches negation on
    when it equals 1.

    Returns a 4-tuple:
    ``([ux_train, cbin_train], [ux_test, cbin_test], attribute_info, freq_classes)``
    where ``attribute_info`` is
    ``[sizes, premises_by_attribute, ref_attributes, premises_contain_negation]``
    as exposed by the fuzzifier.
    """
    # Unpack the settings this stage needs.
    cat_bool, fz_type, fz_number_partition = parameters[0:3]
    negation_flag = parameters[4]

    # Lecture: info_data() yields
    # [x, y_Bin, Freq_Class, Dic_Labels, Dic_Class, Index_Train]
    data_reader = Lecture()
    data_reader.read_1cv(zipFilePath, file_train, file_test)
    info = data_reader.info_data()
    x, y_bin, freq_classes, index_train = info[0], info[1], info[2], info[5]

    # Fuzzification works on a copy so the matrix returned by the reader stays untouched.
    fuzzifier = Fuzzification(x.copy(), cat_bool)
    fuzzifier.build_uX(fz_type, fz_number_partition)
    if negation_flag == 1:
        fuzzifier.add_negation()
    print('Successful FZ')

    # Rows before index_train form the training partition, the rest the test partition.
    fuz_train = [fuzzifier.uX[:index_train, :]]
    fuz_test = [fuzzifier.uX[index_train:, :]]
    fuz_train.append(y_bin[:index_train, :])
    fuz_test.append(y_bin[index_train:, :])

    # Information about the fuzzification of each attribute.
    attributes_information = [
        fuzzifier.num_of_premises_by_attribute,  # e.g. [3, 2, 3]
        fuzzifier.attribute_premises,  # e.g. [(0,1,2),(3,4),(5,6,7)]
        fuzzifier.ref_attributes,  # e.g. [0, 1, 2]
        fuzzifier.indexes_premises_contain_negation,
    ]
    return fuz_train, fuz_test, attributes_information, freq_classes