In [1]:
import pandas as pd
import matplotlib.pyplot as plt
pd.options.display.float_format = "{:,.2f}".format

from experiments import *

import sys

sys.path.insert(1, '../fuzzylearn/')

from fuzzylearn import *
from fuzzylearn.fuzzifiers import LinearFuzzifier, CrispFuzzifier,ExponentialFuzzifier,QuantileLinearPiecewiseFuzzifier, QuantileConstantPiecewiseFuzzifier
from fuzzylearn.kernel import GaussianKernel, LinearKernel, HyperbolicKernel, PolynomialKernel, HomogeneousPolynomialKernel
from fuzzylearn import solve_optimization_gurobi

from sklearn.preprocessing import StandardScaler, MinMaxScaler, MaxAbsScaler, QuantileTransformer, RobustScaler, PowerTransformer, Normalizer
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV, cross_val_score

In [2]:
import datetime as dt

# Load the spreadsheet and use the VERBALE column as the row index.
dataset = pd.read_excel("dataset/DATABASE_UNITO_modificato.xlsx").set_index("VERBALE")

# Replace every DATA value with the number of whole days elapsed since 1970-01-01.
dataset["DATA"] = dataset["DATA"].apply(
    lambda day: (day - dt.datetime(1970, 1, 1)).days
)

dataset.head()

Unnamed: 0_level_0,DATA,SESSO,ANNI,PESO,ALTEZZA,BMI,Mezzo,Testa:Neurocranio,Testa:Splancnocranio,Testa:Telencefalo,...,II raggio sx.1,III raggio sx.1,IV raggio sx.1,V raggio sx.1,Art. coxo-femorale dx,Art. coxo-femorale sx,Rotula o Ginocchio dx,Rotula o Ginocchio sx,Caviglia dx,Caviglia sx
VERBALE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
85567,10893,0,81,84.0,1.75,27.43,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
85829,10970,1,69,69.0,1.62,26.29,1,4,4,4,...,0,0,0,0,0,0,0,0,0,0
85977,11026,1,71,67.0,1.55,27.89,1,2,0,1,...,0,0,0,0,0,0,0,0,0,0
86220,11122,1,54,60.0,1.59,23.73,1,4,0,0,...,0,0,0,0,0,0,0,0,0,0
86247,11130,1,78,69.0,1.67,24.74,1,2,0,0,...,0,0,0,0,0,0,0,0,0,0


In [8]:
# Columns 7..11 are named "<prefix>:<region>"; show only the part after the colon.
print([name.split(':')[1] for name in dataset.columns[7:12]])

['Neurocranio', 'Splancnocranio', 'Telencefalo', 'Cervelletto', 'Tronco encefalico']


## Consistenza dei dati

In [6]:
def check_column(predicate, column_name, values):
    """Print ``column_name`` if at least one value fails ``predicate``.

    Parameters
    ----------
    predicate : callable
        Called with a single value; a falsy result marks the value as invalid.
    column_name : str
        Name printed when an invalid value is found.
    values : iterable
        Values to validate; evaluation stops at the first invalid one.

    Returns
    -------
    None
        Nothing is printed when every value passes (or ``values`` is empty).
    """
    for value in values:
        if not predicate(value):
            print(column_name)
            return

### Lesioni e Totali

In [7]:
# Every column from position 32 onwards is validated in the next cell.
lesioni_da_controllare = dataset.columns[32:].tolist()

In [8]:
# A column is reported when it holds a value outside 0..4,
# unless its name contains "tot" (case-insensitive).
for column_name in lesioni_da_controllare:
    check_column(
        lambda score: 0 <= score <= 4 or "tot" in column_name.lower(),
        column_name,
        dataset[column_name].values,
    )

I costa dx
II costa dx
III costa dx
IV costa dx
V costa dx
VI costa dx
VII costa dx
VIII costa dx
IX costa dx
X costa dx
XI costa dx
XII costa dx
I costa sx
II costa sx
III costa sx
IV costa sx
V costa sx
VI costa sx
VII costa sx
VIII costa sx
IX costa sx
X costa sx
XI costa sx
XII costa sx


Le uniche colonne di lesioni con valori fuori dall'intervallo 0–4 sono quelle relative alle coste, che in realtà contengono dei totali.

### Unicità indice

In [9]:
# True when no index label appears more than once.
dataset.index.is_unique

True

### Sesso, Mezzo

In [10]:
# SESSO is expected to contain only 0 or 1; the column name is printed otherwise.
check_column(lambda flag: flag in (0, 1), 'SESSO', dataset['SESSO'].values)

In [11]:
# Mezzo is expected to contain only 0 or 1; the column name is printed otherwise.
check_column(lambda flag: flag in (0, 1), 'Mezzo', dataset['Mezzo'].values)

### Anagrafica

In [12]:
# Columns at positions 2..5 must be strictly positive; offending column names are printed.
for col in dataset.columns[2:6]:
    check_column(lambda measure: 0 < measure, col, dataset[col].values)

## Esperimenti

In [13]:
import logging

# Logger used by the experiment cells below.
f_logger = logging.getLogger(__name__)
f_logger.setLevel(logging.INFO)

# logging.getLogger returns the same object on every call, so re-running this
# cell would attach one more FileHandler each time and every message would be
# written to the log file repeatedly. Drop file handlers left by a previous run.
for stale_handler in list(f_logger.handlers):
    if isinstance(stale_handler, logging.FileHandler):
        f_logger.removeHandler(stale_handler)
        stale_handler.close()

# File handler: records at INFO level and above go to the experiment log file.
file = logging.FileHandler('esperimenti_all_columns.log')
file.setLevel(logging.INFO)

# Each record is written as "<timestamp> -<message>".
formatter = logging.Formatter('%(asctime)s -%(message)s')
file.setFormatter(formatter)

# Attach the file handler to the logger.
f_logger.addHandler(file)

In [14]:
import json

# Disabled on purpose: the statements below are wrapped in a string literal, so
# nothing is executed. If run, they would overwrite esperimenti_all_columns.json
# with an empty JSON object. Being the last expression of the cell, the literal
# itself is echoed as the cell output.
"""
with open('esperimenti_all_columns.json', "w") as write_file:
    json.dump({}, write_file)
"""

'\nwith open(\'esperimenti_all_columns.json\', "w") as write_file:\n    json.dump({}, write_file)\n'

In [15]:
# mu1: the raw Mezzo values; mu0: their complement (1 where Mezzo is falsy, 0 elsewhere).
mu1 = dataset['Mezzo'].values
mu0 = np.asarray([0 if flag else 1 for flag in mu1])

In [16]:
# First 66 columns (from position 7 on) whose name does not contain "tot".
columns_lesioni = [
    name for name in dataset.columns[7:] if 'tot' not in name.lower()
][:66]

In [17]:
# Fuzzifier classes compared in the experiments below.
fuzzifiers_class = [LinearFuzzifier,ExponentialFuzzifier, CrispFuzzifier, QuantileConstantPiecewiseFuzzifier, QuantileLinearPiecewiseFuzzifier]

# Display name for each fuzzifier class.
# The QuantileConstantPiecewiseFuzzifier entry used to start with a stray
# space, which made its label inconsistent with the other entries.
fuzzifiers_to_string = {LinearFuzzifier: "LinearFuzzifier",
                        ExponentialFuzzifier:"ExponentialFuzzifier",
                        CrispFuzzifier: "CrispFuzzifier",
                        QuantileConstantPiecewiseFuzzifier: "QuantileConstantPiecewiseFuzzifier",
                        QuantileLinearPiecewiseFuzzifier: "QuantileLinearPiecewiseFuzzifier"}

In [18]:
# Gaussian kernel widths: from 0.1 up to (but excluding) 1, in steps of 0.1.
sigmas = np.arange(.1, 1, .1)

# Hyper-parameter grid: a single value for c, and as kernels the linear one
# followed by one Gaussian kernel per sigma.
params_grid = dict(
    c=[0.021544346900318846],
    k=[LinearKernel(), *map(GaussianKernel, sigmas)],
)

In [19]:
# For each PCA size, build the fuzzifier comparison table on columns_lesioni
# with mu1 as membership values, then save it to the JSON file that is also
# passed to the experiment function as file_json.
# NOTE(review): the two positional 3s are presumably fold counts — confirm
# against incidenti_fuzzifier_table.
for n_components, label in (
    (20, 'LesioniAllNoCoste20'),
    (40, 'LesioniAllNoCoste40'),
    (50, 'LesioniAllNoCoste50'),
    (66, 'LesioniAllNoCoste66'),
):
    dim_red = PCA(n_components=n_components)
    tnc = incidenti_fuzzifier_table(
        dataset,
        [columns_lesioni],
        fuzzifiers_class,
        FuzzyInductor,
        mu1,
        params_grid,
        3,
        3,
        logger=f_logger,
        dim_reduction=dim_red,
        file_json='json_result/fuzzifiers_all.json',
        labels=[label],
    )
    tnc.to_json('json_result/fuzzifiers_all.json')

In [21]:
# Same PCA sweep as for mu1, but with mu0 as membership values; results go to
# the separate fuzzifiers_all0.json file.
# NOTE(review): the two positional 3s are presumably fold counts — confirm
# against incidenti_fuzzifier_table.
for n_components, label in (
    (20, 'LesioniAllNoCoste20_0'),
    (40, 'LesioniAllNoCoste40_0'),
    (50, 'LesioniAllNoCoste50_0'),
    (66, 'LesioniAllNoCoste66_0'),
):
    dim_red = PCA(n_components=n_components)
    tnc0 = incidenti_fuzzifier_table(
        dataset,
        [columns_lesioni],
        fuzzifiers_class,
        FuzzyInductor,
        mu0,
        params_grid,
        3,
        3,
        logger=f_logger,
        dim_reduction=dim_red,
        file_json='json_result/fuzzifiers_all0.json',
        labels=[label],
    )
    tnc0.to_json('json_result/fuzzifiers_all0.json')

In [23]:
# All columns from position 7 on whose name contains neither "tot" nor "costa".
columns_lesioni_all = [
    name
    for name in dataset.columns[7:]
    if 'tot' not in name.lower() and 'costa' not in name.lower()
]

In [41]:
# Run the fuzzifier comparison on columns_lesioni_all with mu1 and a
# 130-component PCA, then save the table to fuzzifiers_all.json.
# NOTE(review): the two positional 3s are presumably fold counts — confirm
# against incidenti_fuzzifier_table.
for n_components, label in ((130, 'LesioniAll130'),):
    dim_red = PCA(n_components=n_components)
    tnc = incidenti_fuzzifier_table(
        dataset,
        [columns_lesioni_all],
        fuzzifiers_class,
        FuzzyInductor,
        mu1,
        params_grid,
        3,
        3,
        logger=f_logger,
        dim_reduction=dim_red,
        file_json='json_result/fuzzifiers_all.json',
        labels=[label],
    )
    tnc.to_json('json_result/fuzzifiers_all.json')

In [38]:
# Reload the saved mu1 results table from disk and display it.
tnc = pd.read_json('json_result/fuzzifiers_all.json')
tnc

Unnamed: 0,LesioniAllNoCoste20,LesioniAllNoCoste40,LesioniAllNoCoste50,LesioniAllNoCoste66,LesioniAll50,LesioniAll100,LesioniAll130
QuantileConstantPiecewiseFuzzifier,-0.61,-0.33,-0.62,-0.62,-0.33,-0.33,-0.33
CrispFuzzifier,-0.46,-0.45,-0.46,-0.46,-0.46,-0.46,-0.46
ExponentialFuzzifier,-0.45,-0.45,-0.45,-0.45,-0.46,-0.46,0.46
LinearFuzzifier,-0.57,-0.45,-0.54,-0.54,-0.46,-0.46,-0.46
QuantileLinearPiecewiseFuzzifier,-0.61,-0.33,-0.62,-0.62,-0.33,-0.33,-0.33


In [40]:
# Run the fuzzifier comparison on columns_lesioni_all with mu0 as membership
# values, one PCA size at a time, saving each table to fuzzifiers_all0.json.
# The result is kept in tnc0 (as in the other mu0 cell) so that tnc, which
# holds the mu1 table, is not silently overwritten with mu0 results.
# NOTE(review): the two positional 3s are presumably fold counts — confirm
# against incidenti_fuzzifier_table.
for c,l in zip([50,100,130],['LesioniAll50_0','LesioniAll100_0','LesioniAll130_0']):
    dim_red = PCA(n_components=c)
    tnc0 = incidenti_fuzzifier_table(dataset,[columns_lesioni_all],fuzzifiers_class,FuzzyInductor,mu0,params_grid,3,3,logger=f_logger,dim_reduction=dim_red,file_json='json_result/fuzzifiers_all0.json',labels=[l])
    tnc0.to_json('json_result/fuzzifiers_all0.json')

In [39]:
# Reload the saved mu0 results table from disk and display it.
tnc0 = pd.read_json('json_result/fuzzifiers_all0.json')
tnc0

Unnamed: 0,LesioniAllNoCoste20_0,LesioniAllNoCoste40_0,LesioniAllNoCoste50_0,LesioniAllNoCoste66_0,LesioniAll50_0,LesioniAll100_0,LesioniAll130_0
QuantileConstantPiecewiseFuzzifier,-0.43,-0.42,-0.42,-0.42,-0.29,-0.29,-0.29
CrispFuzzifier,-0.54,-0.54,-0.54,-0.54,-0.48,-0.54,-0.54
ExponentialFuzzifier,-0.29,-0.28,-0.27,-0.27,-0.51,-0.54,-0.54
LinearFuzzifier,-0.3,-0.29,-0.28,-0.28,-0.49,-0.54,-0.54
QuantileLinearPiecewiseFuzzifier,-0.43,-0.42,-0.42,-0.42,-0.29,-0.29,-0.29


## Defuzzification

In [24]:
# Preprocessing objects for this section: a standard scaler and a PCA that
# keeps 40 components.
scaling = StandardScaler()
dim_red = PCA(n_components=40)

In [25]:
# Values of the selected lesion columns as a NumPy array.
values = dataset[columns_lesioni].values
# Scale the values, then reduce their dimensionality with dim_red.
values_std = scaling.fit_transform(values)
# NOTE(review): the "_20d" suffix is misleading — this notebook creates dim_red
# as PCA(n_components=40), so the projection has 40 components when the cells
# are run in order. The name is kept because later cells refer to it.
values_20d = dim_red.fit_transform(values_std)

# NOTE(review): presumably the class labels handed to best_estimator_holdout,
# in the same order as the estimators in bes — confirm.
classes = (1,0)

In [26]:
# Two FuzzyInductor configurations sharing the same c, collected in bes.
# NOTE(review): presumably be1 models class 1 and be0 class 0 — confirm.
be1 = FuzzyInductor(
    fuzzifier=QuantileConstantPiecewiseFuzzifier,
    c=0.021544346900318846,
    k=GaussianKernel(.4),
)
be0 = FuzzyInductor(
    fuzzifier=ExponentialFuzzifier,
    c=0.021544346900318846,
    k=LinearKernel(),
)
bes = [be1, be0]

In [23]:
# Empty table that accumulates one row of errors per hold-out run.
# Re-running this cell discards the rows collected so far.
results_df = pd.DataFrame()

In [28]:
# Value passed to best_estimator_holdout and stored in the 'train%' column.
# NOTE(review): presumably the fraction of data used for training — confirm.
ptrain = 0.7

In [29]:
# Hold-out evaluation of the estimators in bes on the PCA-projected values.
# NOTE(review): the meaning of the trailing 3 is not visible here — confirm
# against best_estimator_holdout.
perf_train, perf_test = best_estimator_holdout(
    bes,
    dataset.index,
    values_20d,
    mu1,
    classes,
    ptrain,
    classify,
    3,
)

100%|██████████| 100/100 [00:23<00:00,  4.42it/s]
100%|██████████| 100/100 [00:22<00:00,  4.49it/s]
100%|██████████| 100/100 [00:23<00:00,  4.50it/s]
100%|██████████| 100/100 [00:22<00:00,  4.35it/s]
100%|██████████| 100/100 [00:22<00:00,  4.48it/s]
100%|██████████| 100/100 [00:34<00:00,  3.61it/s]


In [30]:
# Add one row with the errors of the latest hold-out run.
# DataFrame.append was removed in pandas 2.0, so the row is wrapped in a
# one-row DataFrame and concatenated instead.
results_df = pd.concat(
    [
        results_df,
        pd.DataFrame([{'train_err': perf_train, 'test_err': perf_test, 'train%': ptrain}]),
    ],
    ignore_index=True,
)

In [31]:
# Display the accumulated results (one row per executed hold-out run).
results_df

Unnamed: 0,test_err,train%,train_err
0,0.33,0.99,0.42
1,0.46,0.7,0.4
