In [1]:
import pandas as pd
from IPython.display import display, HTML
import numpy as np
import os
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from pathlib import Path

plt.rcParams['figure.figsize'] = [10, 5]

config_dirs = open("prefixo_dados.txt").readlines() 
DIR_PREFIXO = Path(config_dirs[0].strip())


DIRETORIO_PRINCIPAL =  DIR_PREFIXO / "Projeto_PA" / "Projeto_PA_validado"
DIRETORIO_DATASET = DIR_PREFIXO / "Projeto_PA"



In [2]:
# Datset com todos os campos: metadados, medidas e estatísticas
df = pd.read_csv(DIRETORIO_PRINCIPAL / "padrao_doenca" / "classificados_por_padroes_de_doenca_estatistica.csv", sep=";")

#display(HTML(df.to_html()))

In [3]:
# drop all averages, standard deviations, and classes (types of diseases)
df_medidas_doencas = df.iloc[:,1:172]
#display(HTML(df_medidas_doencas.to_html()))

In [4]:
df_medidas_doencas_sem_nan = df_medidas_doencas.fillna(0)

In [5]:
print(df_medidas_doencas_sem_nan.columns)

Index(['data_nascimento', 'genero', 'idade', 'peso', 'altura', 'data_exame',
       'sist 9:00', 'sist 9:15', 'sist 9:30', 'sist 9:45',
       ...
       'diast 6:45', 'diast 7:00', 'diast 7:15', 'diast 7:30', 'diast 7:45',
       'diast 8:00', 'diast 8:15', 'diast 8:30', 'diast 8:45', 'whitecoat'],
      dtype='object', length=171)


In [6]:
# transform Gender M and F to binary 
# cleanup_nums = {"Morning Surge": {"NaN": False}}

# df_medidas_doencas_sem_nan.replace(cleanup_nums, inplace=True)
#display(HTML(df_medidas_doencas_sem_nan.to_html()))

In [7]:
# transform Gender M and F to binary 
cleanup_nums = {"genero": {"M": True, "F": False}}

df_medidas_doencas_sem_nan.replace(cleanup_nums, inplace=True)
df_medidas_doencas_sem_nan = df_medidas_doencas_sem_nan.astype(int)

ValueError: invalid literal for int() with base 10: '1923-10-05'

In [None]:
df_X = df_medidas_doencas_sem_nan.iloc[:,:161]
#display(HTML(df_X.to_html()))

In [None]:
df_Y = df_medidas_doencas_sem_nan.iloc[:,161:]
#display(HTML(df_Y.to_html()))

In [None]:
# split between train and test with test_size of 0.3 (30%)
X_train, X_test, y_train, y_test = train_test_split(df_X, df_Y, test_size=0.3, random_state=42)

print(X_train.shape[0])
display(X_train.head())

print(X_test.shape[0])
display(X_test.head())

In [None]:
print(np.unique(df_Y.values, axis=0).shape[0])
display(np.unique(df_Y.values, axis=0))

In [None]:
# print(np.unique(np.argmax(df_Y.values, axis=1)))
listm = np.unique(df_Y.values, axis=0).tolist()
print(listm)
print(listm[0])

print([''.join(str(e) for e in i) for i in listm])

print([int(''.join(str(e) for e in i),2) for i in listm])

In [None]:
print(np.unique([int(''.join(str(e) for e in i),2) for i in df_Y.values.tolist()], return_counts=True))

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.metrics import f1_score
# from sklearn.tree import export_graphviz
# from sklearn.externals.six import StringIO  
import warnings

In [None]:
warnings.filterwarnings('ignore')

Ms = np.arange(2,10)
mean_acc = list()

for k in Ms:
    
    #create classifier
    drugTree = DecisionTreeClassifier(criterion="entropy", random_state=123, max_depth=k)
    
    #create pipeline with scaler and classifier
    pipeline = Pipeline([('scaler', StandardScaler()), ('estimator', drugTree)])
    
    # perform cv=10 with F1_scored weighted
    scores = cross_val_score(pipeline, X_train, y_train, cv = 5, scoring='f1_weighted')
    mean_acc.append(np.mean(scores))

# mean_acc

In [None]:
plt.plot(Ms,mean_acc,'g')
plt.ylabel('F1 score')
plt.xlabel('max depth')
plt.tight_layout()
plt.show()

In [None]:
print( "The best accuracy was with", max(mean_acc), "with max_depth=", Ms[np.argmax(mean_acc)]) 

In [None]:
scaler = StandardScaler()
scaler.fit(X_train)
X_scaled = scaler.transform(X_train)

In [None]:
best_drugTree = DecisionTreeClassifier(criterion="entropy", random_state=123, max_depth = Ms[np.argmax(mean_acc)] )
best_drugTree.fit(X_scaled,y_train)

In [None]:
predTree = best_drugTree.predict(X_scaled)

In [None]:
print('Train Accuracy: ', accuracy_score(y_train, predTree))
print('Train F1_score: ', f1_score(y_train, predTree, average='weighted'))

In [None]:
test_X_scaled= scaler.transform(X_test)
# test_X_scaled[0:5]

In [None]:
test_predTree  = best_drugTree.predict(test_X_scaled)
#print(test_predTree)

print('Test Accuracy: ', accuracy_score(y_test, test_predTree))

# print('Test jaccard similarity score: ', jaccard_similarity_score(y_test, test_predTree))
print('Test F1_score: ', f1_score(y_test, test_predTree, average='weighted'))

# multi-output classifier - DecisionTreeClassifier

In [None]:
#from sklearn.multioutput import MultiOutputClassifier

In [None]:
#best_tree = DecisionTreeClassifier(criterion="entropy", random_state=123, max_depth = Ms[np.argmax(mean_acc)] )
#clf = MultiOutputClassifier(best_tree).fit(X_scaled, y_train)

In [None]:
#test_predTree_m  = clf.predict(test_X_scaled)
#print(test_predTree)

# print('Test Accuracy: ', accuracy_score(y_test, test_predTree_m))

# # print('Test jaccard similarity score: ', jaccard_similarity_score(y_test, test_predTree))
# print('Test F1_score: ', f1_score(y_test, test_predTree_m, average='weighted'))

# multi-output classifier - RandomForestClassifier

In [None]:
# warnings.filterwarnings('ignore')

# Ms = np.arange(2,20)
# mean_acc = list()

# for k in Ms:
    
#     #create classifier
#     clf_forest = RandomForestClassifier(criterion="entropy", random_state=123, max_depth=k)
    
#     #create pipeline with scaler and classifier
#     pipeline = Pipeline([('scaler', StandardScaler()), ('estimator', clf_forest)])
    
#     # perform cv=10 with F1_scored weighted
#     scores = cross_val_score(pipeline, X_train, y_train, cv = 5, scoring='f1_weighted')
#     mean_acc.append(np.mean(scores))

# mean_acc

In [None]:
# plt.plot(Ms,mean_acc,'g')
# plt.ylabel('F1 score')
# plt.xlabel('Max leaf nodes')
# plt.tight_layout()
# plt.show()

In [None]:
# print("The best accuracy was with", max(mean_acc), "with max_depth=", Ms[np.argmax(mean_acc)]) 

In [None]:
# best_forest = RandomForestClassifier(criterion="entropy", random_state=123, max_depth = Ms[np.argmax(mean_acc)] )
# best_forest.fit(X_scaled,y_train)

In [None]:
# test_pred_m  = best_forest.predict(test_X_scaled)
# print(test_predTree)

# print('Test Accuracy: ', accuracy_score(y_test, test_pred_m))

# # print('Test jaccard similarity score: ', jaccard_similarity_score(y_test, test_predTree))
# print('Test F1_score: ', f1_score(y_test, test_pred_m, average='weighted'))

# classifier per each class - DecisionTreeClassifier

In [None]:
n_classes = y_train.shape[1]
classes = y_train.columns.values.tolist()
print(classes)
print(n_classes)
print(y_train.iloc[:,0].values)

In [None]:
for i in range(n_classes):
    print('disease:'+str(classes[i]))
    y = y_train.iloc[:,i].values
    print('ones: ',(y == 1).sum())
    print('zeros:',(y == 0).sum())
    print('------')

In [None]:
for i in range(n_classes):
    print('disease:'+str(classes[i]))
    y = y_test.iloc[:,i].values
    print('ones: ',(y == 1).sum())
    print('zeros:',(y == 0).sum())
    print('------')

In [None]:
mean_acc_estimators = list()
Ms = np.arange(2,100)

for i in range(n_classes):
    mean_acc = list()
    y = y_train.iloc[:,i].values
    for k in Ms:

        #create classifier
        tree = DecisionTreeClassifier(criterion="entropy", random_state=123, max_depth=k)

        #create pipeline with scaler and classifier
        pipeline = Pipeline([('scaler', StandardScaler()), ('estimator', tree)])

        # perform cv=10 with F1_scored weighted
        scores = cross_val_score(pipeline, X_train, y, cv = 5, scoring='f1_weighted')
        mean_acc.append(np.mean(scores))
    mean_acc_estimators.append(mean_acc)
# mean_acc

In [None]:
plt.figure(figsize=(20,10))
for i in range(len(mean_acc_estimators)):
    plt.plot(Ms, mean_acc_estimators[i], label=classes[i])   
plt.ylabel('F1 score')
plt.xlabel('Max depth')
plt.legend(loc="upper left")
plt.tight_layout()
plt.show()

In [None]:
for i in range(n_classes):
    print( "The best accuracy was with: "+str(i), max(mean_acc_estimators[i]), "with max_depth=", Ms[np.argmax(mean_acc_estimators[i])]) 

In [None]:
estimators = list()

for i in range(n_classes):
    best_Tree = DecisionTreeClassifier(criterion="entropy", random_state=123, max_depth = Ms[np.argmax(mean_acc_estimators[i])] )
    y = y_train.iloc[:,i].values
    best_Tree.fit(X_scaled,y)
    estimators.append(best_Tree)

In [None]:
for i in range(n_classes):

    test_pred_m  = estimators[i].predict(test_X_scaled)
    print('Test Accuracy: '+str(classes[i])+' -- ', accuracy_score(y_test.iloc[:,i], test_pred_m))

    # print('Test jaccard similarity score: ', jaccard_similarity_score(y_test, test_predTree))
    print('Test F1_score: '+str(classes[i])+' -- ', f1_score(y_test.iloc[:,i], test_pred_m, average='weighted'))
    print('------------------------')

# classifier per each class - Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
mean_acc_estimators_LR = list()
Cks = np.linspace(1e-3,100, 50)

for i in range(n_classes):
    mean_acc = list()
    y = y_train.iloc[:,i].values

    for Ck in Cks:

        #create pipeline with scaler and classifier
        LR = LogisticRegression(C=Ck, solver='liblinear')
        pipeline = Pipeline([('scaler', StandardScaler()), ('estimator', LR)])

        # perform cv using F1_score weighted
        scores = cross_val_score(pipeline, X_train, y, cv = 5, scoring='f1_weighted')
        mean_acc.append(np.mean(scores))
    mean_acc_estimators_LR.append(mean_acc)    

In [None]:
print(len(mean_acc_estimators_LR))
print(Cks)

In [None]:
plt.figure(figsize=(20,10))
for i in range(len(mean_acc_estimators_LR)):
    plt.plot(Cks, mean_acc_estimators_LR[i], label=classes[i])    
plt.ylabel('F1 score')
plt.xlabel('Regularization parameter (C)')
plt.legend(loc="upper left")
plt.tight_layout()
plt.show()

In [None]:
for i in range(n_classes):
    print( "The best accuracy was with: "+str(classes[i]), max(mean_acc_estimators_LR[i]), "with C=", Cks[np.argmax(mean_acc_estimators_LR[i])])

In [None]:
# estimators_LR = list()

# for i in range(n_classes):
#     best_LR = LogisticRegression(C=Cks[np.argmax(mean_acc_estimators_LR[i])], solver='liblinear')
#     y = y_train.iloc[:,i].values
#     best_LR.fit(X_scaled,y)
#     estimators_LR.append(best_LR)

In [None]:
# for i in range(n_classes):

#     test_pred_m  = estimators_LR[i].predict(test_X_scaled)
#     print('Test Accuracy: '+str(classes[i])+' -- ', accuracy_score(y_test.iloc[:,i], test_pred_m))

#     # print('Test jaccard similarity score: ', jaccard_similarity_score(y_test, test_predTree))
#     print('Test F1_score: '+str(classes[i])+' -- ', f1_score(y_test.iloc[:,i], test_pred_m, average='weighted'))
#     print('------------------------')

# classifier per each class - Random Forest

In [None]:
mean_acc_estimators_RF = list()
Ns = np.arange(2,50)

for i in range(n_classes):
    mean_acc = list()
    y = y_train.iloc[:,i].values

    for n in Ns:

        #create pipeline with scaler and classifier
        RF = RandomForestClassifier(n_estimators=n, criterion="entropy", random_state=123)
        pipeline = Pipeline([('scaler', StandardScaler()), ('estimator', RF)])

        # perform cv using F1_score weighted
        scores = cross_val_score(pipeline, X_train, y, cv = 5, scoring='f1_weighted')
        mean_acc.append(np.mean(scores))
    mean_acc_estimators_RF.append(mean_acc)    

In [None]:
plt.figure(figsize=(20,10))
for i in range(len(mean_acc_estimators_RF)):
    plt.plot(Ns, mean_acc_estimators_RF[i], label=classes[i])    
plt.ylabel('F1 score')
plt.xlabel('number of estimators (N)')
plt.legend(loc="upper left")
plt.tight_layout()
plt.show()

In [None]:
for i in range(n_classes):
    print( "The best accuracy was with: "+str(classes[i]), max(mean_acc_estimators_RF[i]), "with N=", Ns[np.argmax(mean_acc_estimators_RF[i])])

In [None]:
estimators_RF = list()

for i in range(n_classes):
    best_RF = RandomForestClassifier(criterion="entropy", random_state=123, max_depth=Ns[np.argmax(mean_acc_estimators_RF[i])])
    y = y_train.iloc[:,i].values
    best_RF.fit(X_scaled,y)
    estimators_RF.append(best_RF)

In [None]:
test_pred = list()

for i in range(n_classes):

    test_pred_m  = estimators_RF[i].predict(test_X_scaled)
    print('Test Accuracy: '+str(classes[i])+' -- ', accuracy_score(y_test.iloc[:,i], test_pred_m))
    test_pred.append(test_pred_m.reshape(-1,1))
    

    # print('Test jaccard similarity score: ', jaccard_similarity_score(y_test, test_predTree))
    print('Test F1_score: '+str(classes[i])+' -- ', f1_score(y_test.iloc[:,i], test_pred_m, average='weighted'))
    print('------------------------')

In [None]:
print(test_pred[0])
len(test_pred[0])

In [None]:
test_predf = np.concatenate(tuple(test_pred), axis=1)
print(test_predf.shape)

In [None]:
test_predf

In [None]:
print('Test Accuracy: ', accuracy_score(y_test, test_predf))
    
# print('Test jaccard similarity score: ', jaccard_similarity_score(y_test, test_predTree))
print('Test F1_score: ', f1_score(y_test, test_predf, average='weighted'))