In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import json
pd.options.display.float_format = "{:,.2f}".format

from experiments import *
from defuzzification import *

import sys

sys.path.insert(1, '../fuzzylearn/')

from fuzzylearn import *
from fuzzylearn.fuzzifiers import LinearFuzzifier, CrispFuzzifier,ExponentialFuzzifier,QuantileLinearPiecewiseFuzzifier, QuantileConstantPiecewiseFuzzifier
from fuzzylearn.kernel import GaussianKernel, LinearKernel, HyperbolicKernel, PolynomialKernel, HomogeneousPolynomialKernel
from fuzzylearn import solve_optimization_gurobi

from sklearn.preprocessing import StandardScaler, MinMaxScaler, MaxAbsScaler, QuantileTransformer, RobustScaler, PowerTransformer, Normalizer
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from umap import UMAP
from sklearn.model_selection import GridSearchCV, cross_val_score

import datetime as dt

  import numba.targets


In [2]:
import logging

# create logger
f_logger = logging.getLogger(__name__)
f_logger.setLevel(logging.INFO)

# create console handler and set level to debug
file = logging.FileHandler('esperimenti_dxsx.log')
file.setLevel(logging.INFO)

# create formatter
formatter = logging.Formatter('%(asctime)s -%(message)s')

# add formatter to ch
file.setFormatter(formatter)

# add ch to logger
f_logger.addHandler(file)

filejson = 'json_result/exp_dxsxunique.json'

In [3]:
dataset = pd.read_excel("dataset/DATABASE_UNITO_modificato.xlsx")
dataset = dataset.set_index("VERBALE")
dataset.DATA = dataset.DATA.apply(lambda d: (d - dt.datetime(1970,1,1)).days)
dataset.head()

Unnamed: 0_level_0,DATA,SESSO,ANNI,PESO,ALTEZZA,BMI,Mezzo,Testa:Neurocranio,Testa:Splancnocranio,Testa:Telencefalo,...,II raggio sx.1,III raggio sx.1,IV raggio sx.1,V raggio sx.1,Art. coxo-femorale dx,Art. coxo-femorale sx,Rotula o Ginocchio dx,Rotula o Ginocchio sx,Caviglia dx,Caviglia sx
VERBALE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
85567,10893,0,81,84.0,1.75,27.43,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
85829,10970,1,69,69.0,1.62,26.29,1,4,4,4,...,0,0,0,0,0,0,0,0,0,0
85977,11026,1,71,67.0,1.55,27.89,1,2,0,1,...,0,0,0,0,0,0,0,0,0,0
86220,11122,1,54,60.0,1.59,23.73,1,4,0,0,...,0,0,0,0,0,0,0,0,0,0
86247,11130,1,78,69.0,1.67,24.74,1,2,0,0,...,0,0,0,0,0,0,0,0,0,0


In [4]:
mu1 = dataset['Mezzo']
mu0 = [0 if mu else 1 for mu in mu1]

ordini = [.01,.1,1,10,100]

param_grid = {
    'c' : ordini,
    'k' : [LinearKernel()] + [HyperbolicKernel(1,o) for o in ordini] + [GaussianKernel(s) for s in ordini]
}

# Suddivisione feature

In [5]:
anagrafica = list(dataset.columns)[:6]

featuresx = [f for f in list(dataset.columns) if 'sx' in f]
featuredx = [f for f in list(dataset.columns) if 'dx' in f]

totali = [f for f in list(dataset.columns) if 'tot' in f.lower() and f != 'Totale']

featureuq = [f for f in list(dataset.columns)\
                  if f not in featuresx \
                  and f not in featuredx \
                 and f not in anagrafica \
                 and f not in totali \
                 and f != 'Mezzo']

### Alcuni Controlli

In [6]:
len(featuresx) == len(featuredx)

True

In [7]:
len(anagrafica) + len(featuresx) + len(featuredx) + len(totali) + len(featureuq) == len(dataset.columns)-1

True

### Feature selection anagrafica

In [8]:
anagsel = [f for f in anagrafica if f != 'DATA' and f != 'BMI']

# Esperimenti

### Totali

totali semplici

In [9]:
l = 'Totali'
ftp = incidenti_fuzzifier_table(dataset,[totali],fuzzifiers_class,FuzzyInductor,\
                                mu1,param_grid,3,3,logger=f_logger,\
                                dim_reduction=None,scaling=StandardScaler(),\
                                file_json=filejson,labels=[l])
ftp.to_json(filejson)

totali PCA

In [10]:
lab = ['TotaliPCA5','TotaliPCA10','TotaliPCA15']
for c,l in zip([5,10,15],lab):
    dim_red = PCA(n_components=c)
    ftp = incidenti_fuzzifier_table(dataset,[totali],fuzzifiers_class,FuzzyInductor,\
                                mu1,param_grid,3,3,logger=f_logger,\
                                dim_reduction=dim_red,scaling=StandardScaler(),\
                                file_json=filejson,labels=[l])
    ftp.to_json(filejson)

totali + anagrafica

In [11]:
l = 'TotaliAnagrafica'
totanag = anagsel + totali
ftp = incidenti_fuzzifier_table(dataset,[totanag],fuzzifiers_class,FuzzyInductor,\
                                mu1,param_grid,3,3,logger=f_logger,\
                                dim_reduction=None,scaling=StandardScaler(),\
                                file_json=filejson,labels=[l])

Insieme fuzzy mezzo leggero

In [12]:
l = 'TotaliAnagrafica_0'
ftp = incidenti_fuzzifier_table(dataset,[totanag],fuzzifiers_class,FuzzyInductor,\
                                mu0,param_grid,3,3,logger=f_logger,\
                                dim_reduction=None,scaling=StandardScaler(),\
                                file_json=filejson,labels=[l])

## Feature uniche

semplici

In [13]:
l = 'Uniche_0'
ftp = incidenti_fuzzifier_table(dataset,[featureuq],fuzzifiers_class,FuzzyInductor,\
                                mu0,param_grid,3,3,logger=f_logger,\
                                dim_reduction=None,scaling=StandardScaler(),\
                                file_json=filejson,labels=[l])

PCA

In [14]:
labelsPCA = ['UnichePCA10_0','UnichePCA30_0','UnichePCA50_0']
for c,l in zip([10,30,50],labelsPCA):
    dim_red = PCA(n_components=c)
    ftp = incidenti_fuzzifier_table(dataset,[featureuq],fuzzifiers_class,FuzzyInductor,\
                                mu0,param_grid,3,3,logger=f_logger,\
                                dim_reduction=dim_red,scaling=StandardScaler(),\
                                file_json=filejson,labels=[l])

anagrafica

In [15]:
l = 'UnicheAnagrafica_0'
uqanag = anagsel + featureuq
ftp = incidenti_fuzzifier_table(dataset,[uqanag],fuzzifiers_class,FuzzyInductor,\
                                mu0,param_grid,3,3,logger=f_logger,\
                                dim_reduction=None,scaling=StandardScaler(),\
                                file_json=filejson,labels=[l])

## dx

In [16]:
coste = [f for f in featuredx if 'costa' in f]
cranio = [f for f in featuredx if 'cranica' in f or 'Splancnoc' in f]
lin = [f for f in featuredx if 'Linea' in f]
linee = lin[0:7]
metacarpo = [f for f in featuredx if 'metacarpo' in f]
rag = [f for f in featuredx if 'raggio' in f]
raggio = rag[0:5]
metatarso = [f for f in featuredx if 'metatarso' in f]
uniquedx = [f for f in featuredx 
            if f not in coste 
            and f not in cranio
           and f not in lin
           and f not in metacarpo
           and f not in rag
           and f not in metatarso]

In [17]:
len(featuredx) \
== \
(len(coste) + len(cranio) + len(lin) + len(metacarpo) + len(rag) + len(metatarso) + len(uniquedx))

True

Nessuno ha subito danni al metatarso

In [18]:
from functools import reduce,partial

In [19]:
dfdx = dataset[uniquedx]
for columns,label in zip([coste,cranio,linee,metacarpo,raggio],['Coste','Cranio','Linee','Metacarpo','Raggio']):
    values = list(map(lambda l: reduce(lambda x,y : x+y,l),dataset[columns].values.tolist()))
    dfdx[label] = values

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


semplici

In [20]:
l = 'Dx_0'
ftp = incidenti_fuzzifier_table(dfdx,[list(dfdx.columns)],fuzzifiers_class,FuzzyInductor,\
                                mu0,param_grid,3,3,logger=f_logger,\
                                dim_reduction=None,scaling=StandardScaler(),\
                                file_json=filejson,labels=[l])

PCA

In [21]:
dxPCA = ['DxPCA5_0','DxPCA15_0','DxPCA25_0']
for c,l in zip([5,15,25],dxPCA):
    dim_red = PCA(n_components=c)
    ftp = incidenti_fuzzifier_table(dfdx,[list(dfdx.columns)],fuzzifiers_class,FuzzyInductor,\
                                mu0,param_grid,3,3,logger=f_logger,\
                                dim_reduction=dim_red,scaling=StandardScaler(),\
                                file_json=filejson,labels=[l])

anagrafica

In [22]:
dfdxanag = dataset[anagsel].join(dfdx)

l = 'DxAnagrafica_0'
ftp = incidenti_fuzzifier_table(dfdxanag,[list(dfdxanag.columns)],fuzzifiers_class,FuzzyInductor,\
                                mu0,param_grid,3,3,logger=f_logger,\
                                dim_reduction=None,scaling=StandardScaler(),\
                                file_json=filejson,labels=[l])

## sx

In [23]:
coste = [f for f in featuresx if 'costa' in f]
cranio = [f for f in featuresx if 'cranica' in f or 'Splancnoc' in f]
lin = [f for f in featuresx if 'Linea' in f]
linee = lin[0:7]
metacarpo = [f for f in featuresx if 'metacarpo' in f]
rag = [f for f in featuresx if 'raggio' in f]
raggio = rag[0:5]
metatarso = [f for f in featuresx if 'metatarso' in f]
uniquesx = [f for f in featuresx 
            if f not in coste 
            and f not in cranio
           and f not in lin
           and f not in metacarpo
           and f not in rag
           and f not in metatarso]

In [24]:
len(featuresx) \
== \
(len(coste) + len(cranio) + len(lin) + len(metacarpo) + len(rag) + len(metatarso) + len(uniquesx))

True

In [25]:
dfsx = dataset[uniquesx]
for columns,label in zip([coste,cranio,linee,metacarpo,raggio],['Coste','Cranio','Linee','Metacarpo','Raggio']):
    values = list(map(lambda l: reduce(lambda x,y : x+y,l),dataset[columns].values.tolist()))
    dfsx[label] = values

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


In [26]:
l = 'Sx_0'
ftp = incidenti_fuzzifier_table(dfsx,[list(dfsx.columns)],fuzzifiers_class,FuzzyInductor,\
                                mu0,param_grid,3,3,logger=f_logger,\
                                dim_reduction=None,scaling=StandardScaler(),\
                                file_json=filejson,labels=[l])

In [27]:
dfsxanag = dataset[anagsel].join(dfsx)

l = 'SxAnagrafica_0'
ftp = incidenti_fuzzifier_table(dfsxanag,[list(dfsxanag.columns)],fuzzifiers_class,FuzzyInductor,\
                                mu0,param_grid,3,3,logger=f_logger,\
                                dim_reduction=None,scaling=StandardScaler(),\
                                file_json=filejson,labels=[l])

In [28]:
ftp

Unnamed: 0,Totali,TotaliPCA5,TotaliPCA10,TotaliPCA15,TotaliAnagrafica,TotaliAnagrafica_0,Uniche_0,UnichePCA10_0,UnichePCA30_0,UnichePCA50_0,UnicheAnagrafica_0,Dx_0,DxPCA5_0,DxPCA15_0,DxPCA25_0,DxAnagrafica_0,Sx_0,SxAnagrafica_0
QuantileConstantPiecewiseFuzzifier,-0.42,-0.38,-0.41,-0.41,-0.38,-0.38,-0.3,-0.28,-0.31,-0.32,-0.31,-0.3,-0.36,-0.37,-0.37,-0.29,-0.36,-0.32
CrispFuzzifier,-0.42,-0.41,-0.42,-0.42,-0.35,-0.54,-0.34,-0.33,-0.32,-0.34,-0.38,-0.39,-0.41,-0.43,-0.39,-0.38,-0.43,-0.45
ExponentialFuzzifier,-0.29,-0.29,-0.29,-0.29,-0.29,-0.26,-0.25,-0.25,-0.25,-0.25,-0.25,-0.31,-0.32,-0.3,-0.31,-0.3,-0.29,-0.29
LinearFuzzifier,-0.3,-0.29,-0.3,-0.3,-0.29,-0.26,-0.25,-0.25,-0.25,-0.25,-0.24,-0.28,-0.28,-0.3,-0.28,-0.27,-0.29,-0.29
QuantileLinearPiecewiseFuzzifier,-0.42,-0.38,-0.4,-0.4,-0.39,-0.36,-0.31,-0.28,-0.3,-0.3,-0.32,-0.31,-0.34,-0.37,-0.36,-0.32,-0.34,-0.33


# Defuzzificazione

In [33]:
be1 = FuzzyInductor(c=.1,fuzzifier=ExponentialFuzzifier,k=HyperbolicKernel(1,.01))
be0 = FuzzyInductor(c=.1,fuzzifier=LinearFuzzifier,k=HyperbolicKernel(1,1))
bes = [be1,be0]

classes = (1,0)

scaling = StandardScaler()

variables = uqanag

values = dataset[variables].values
values_std = scaling.fit_transform(values)
values_20d = values_std


In [34]:
results_df = pd.DataFrame()

In [40]:
perf_train, perf_test =best_estimator_holdout(bes,dataset.index,values_20d, mu1.values,
                                              classes,0.7,classify,3)

100%|██████████| 100/100 [00:22<00:00,  4.38it/s]
100%|██████████| 100/100 [00:21<00:00,  4.59it/s]
100%|██████████| 100/100 [00:21<00:00,  4.57it/s]
100%|██████████| 100/100 [00:22<00:00,  4.51it/s]
100%|██████████| 100/100 [00:22<00:00,  4.52it/s]
100%|██████████| 100/100 [00:21<00:00,  4.56it/s]


In [42]:
results_df = results_df.append({'esperimento': 'UnicheAnagrafica','defuzz': 'max','train_err': perf_train,\
                               'test_err': perf_test},ignore_index=True)

In [48]:
partial(alpha_cut,3,0)

functools.partial(<function alpha_cut at 0x7f5a4b41fc80>, 3, 0)

In [49]:
alpha = np.arange(0,1.1,.1)
for a in alpha:
    perf_train, perf_test = best_estimator_holdout(bes,dataset.index,values_20d, mu1.values,
                                              classes,0.7,partial(alpha_cut,a,0),5)
    results_df = results_df.append({'esperimento': 'UnicheAnagrafica',\
                                    'defuzz': 'alpha_cut({})'.format(a),\
                                   'train_err': perf_train,\
                                   'test_err': perf_test},\
                                  ignore_index=True)

100%|██████████| 100/100 [00:21<00:00,  4.60it/s]
100%|██████████| 100/100 [00:21<00:00,  4.63it/s]
100%|██████████| 100/100 [00:21<00:00,  4.55it/s]
100%|██████████| 100/100 [00:21<00:00,  4.76it/s]
100%|██████████| 100/100 [00:20<00:00,  4.81it/s]
100%|██████████| 100/100 [00:21<00:00,  4.69it/s]
100%|██████████| 100/100 [00:21<00:00,  4.76it/s]
100%|██████████| 100/100 [00:24<00:00,  4.15it/s]
100%|██████████| 100/100 [00:32<00:00,  3.03it/s]
100%|██████████| 100/100 [00:24<00:00,  4.01it/s]
100%|██████████| 100/100 [00:22<00:00,  4.54it/s]
100%|██████████| 100/100 [00:20<00:00,  4.81it/s]
100%|██████████| 100/100 [00:20<00:00,  4.84it/s]
100%|██████████| 100/100 [00:20<00:00,  4.83it/s]
100%|██████████| 100/100 [00:20<00:00,  5.00it/s]
100%|██████████| 100/100 [00:20<00:00,  4.88it/s]
100%|██████████| 100/100 [00:20<00:00,  4.84it/s]
100%|██████████| 100/100 [00:19<00:00,  5.07it/s]
100%|██████████| 100/100 [00:20<00:00,  4.82it/s]
100%|██████████| 100/100 [00:19<00:00,  5.02it/s]


In [50]:
results_df

Unnamed: 0,defuzz,esperimento,test_err,train_err
0,max,UnicheAnagrafica,0.33,0.27
1,alpha_cut(0.0),UnicheAnagrafica,0.42,0.48
2,alpha_cut(0.1),UnicheAnagrafica,0.47,0.45
3,alpha_cut(0.2),UnicheAnagrafica,0.41,0.43
4,alpha_cut(0.30000000000000004),UnicheAnagrafica,0.42,0.33
5,alpha_cut(0.4),UnicheAnagrafica,0.35,0.34
6,alpha_cut(0.5),UnicheAnagrafica,0.34,0.35
7,alpha_cut(0.6000000000000001),UnicheAnagrafica,0.38,0.36
8,alpha_cut(0.7000000000000001),UnicheAnagrafica,0.46,0.43
9,alpha_cut(0.8),UnicheAnagrafica,0.63,0.5
