In [0]:
# @title Preparation
#Drive Link materiale: https://drive.google.com/drive/folders/10T3u5qfomOPUeXfHPEzeA3GRiT-ZqgBk?usp=sharing
!pip install tensorflow-gpu==1.15
!pip install scikit-learn==0.20.1

!pip install -q keras-bert keras-rectified-adam
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [0]:
import tensorflow as tf
import keras
from keras_radam import RAdam
from keras_bert import get_custom_objects
import numpy as np
from keras_bert import Tokenizer
import pandas as pd
import tensorflow.keras.backend as K
import sys
from sklearn.metrics import classification_report
from google.colab import drive

Using TensorFlow backend.


In [0]:
# @title Constants
# Seed NumPy's global RNG so NumPy-based randomness is repeatable.
np.random.seed(42)
# Fixed token-sequence length; passed to _pad() for every encoded text.
SEQ_LEN = 128
# NOTE(review): BATCH_SIZE and EPOCHS are not referenced by any active code in
# the visible cells (the commented-out fit call uses literal values instead).
BATCH_SIZE = 8
EPOCHS = 5
# Learning rate handed to the RAdam optimizer when the model is compiled.
LR = 1e-5

In [0]:
# @title Environment
import os
# Google Drive folder holding the pretrained checkpoint and the other project files.
pretrained_path = '/content/drive/My Drive/codiesp/'
# Paths of the three checkpoint artefacts, all resolved under pretrained_path.
config_path = os.path.join(pretrained_path, 'config.json')
checkpoint_path = os.path.join(pretrained_path, 'model.ckpt-2000000')
vocab_path = os.path.join(pretrained_path, 'vocab.txt')

In [0]:
# @title Load Basic Model
import sys

!test -d bert_repo || git clone https://github.com/google-research/bert bert_repo
# Make the cloned BERT sources importable; the guard keeps re-runs of this
# cell from adding the same entry twice.
if 'bert_repo' not in sys.path:
  sys.path.append('bert_repo')

# import python modules defined by BERT
from run_classifier import *
import modeling
import optimization
import tokenization

import codecs
from keras_bert import load_trained_model_from_checkpoint

# Build a token -> id lookup from the vocabulary file: each stripped line is
# assigned the dictionary size at the moment it is stored.
token_dict = {}
with codecs.open(vocab_path, 'r', 'utf8') as vocab_file:
    for raw_line in vocab_file:
        token_dict[raw_line.strip()] = len(token_dict)





In [0]:
# @title Load Data
!pip install category_encoders==1.3.0
import joblib
import pandas as pd
from keras import Sequential
from keras_preprocessing.sequence import pad_sequences
from keras_preprocessing.text import Tokenizer
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split
import pickle
import numpy as np
import random
from keras.layers import Input
import keras
from keras.layers import Conv1D , Embedding
from keras.layers import Dropout
from keras.layers import MaxPool1D
from keras.layers import LSTM
from keras.layers import Dense
from keras.layers import GlobalMaxPool1D
from keras.layers import Bidirectional
import category_encoders as ce

from keras.callbacks import ModelCheckpoint
from keras_self_attention import SeqSelfAttention

# Characters deleted by remove_symbol: punctuation, brackets, the ordinal
# indicator, the percent sign and every ASCII digit.
_SYMBOLS_TO_DROP = str.maketrans("", "", ",.;:_+ª-<>!?()[]'0123456789%")


def remove_symbol(s):
    """Delete punctuation and digits from ``s``, then trim and lower-case it.

    Args:
        s: Input string.

    Returns:
        ``s`` with every character of the drop table removed, surrounding
        whitespace stripped, and all letters lower-cased.
    """
    return s.translate(_SYMBOLS_TO_DROP).strip().lower()

# Stop-word removal
def clar_text(text, stop_words=None):
    """Lower-case ``text``, drop stop words and re-join the remaining tokens.

    The text is split on single spaces. Every token equal to a stop word is
    discarded, and the survivors are concatenated with each token prefixed by
    one space (so a non-empty result starts with a space).

    No symbol stripping happens here: the previous version called
    ``remove_symbol`` but discarded its result, so that dead call was removed.

    Args:
        text: Value to clean; it is converted with ``str()`` first.
        stop_words: Optional iterable of stop words. When omitted, the
            module-level ``stop_word`` collection is used, as before.
            Entries are assumed to be hashable strings.

    Returns:
        The filtered text as a single string.
    """
    if stop_words is None:
        stop_words = stop_word
    # A set gives O(1) membership tests instead of repeated list.remove() scans.
    blocked = set(stop_words)
    kept = [tok for tok in str(text).lower().split(" ") if tok not in blocked]
    return "".join(" " + tok for tok in kept)

def _pad(input_ids, max_seq_len):
    x = []
    input_ids = input_ids[:min(len(input_ids), max_seq_len - 2)]
    input_ids = input_ids + [0] * (max_seq_len - len(input_ids))
    return np.array(input_ids)

# Load the dataset and keep only the label ('Code') and text ('Desc') columns.
df = pd.read_csv('/content/drive/My Drive/codiesp/Train_with_emptyclass.csv')
df = df[['Code', 'Desc']]
# df = df[pd.notnull(df['desc'])]
#print(df.head(10))
print(df.shape)

# Replace the index with a plain 0..n-1 range.
df.index = range(df.shape[0])
# Total number of space-separated words over all descriptions.
# NOTE(review): the original note here said "about 211456 words", but the recorded run printed 1600976.
print("Parole: " + str(df['Desc'].apply(lambda x: len(x.split(' '))).sum()))

# Strip symbols ONLY (no stemming and no stop-word removal at this point).
df['Desc'] = df['Desc'].apply(remove_symbol)

# Load the stop-word list. The context manager closes the file handle, which
# the previous bare open() call leaked.
# NOTE(review): pickle.load can execute arbitrary code; only load this file
# from a trusted location.
with open("/content/drive/My Drive/codiesp/stop_word.pck", "rb") as file_stopw:
    stop_word = pickle.load(file_stopw)

# Remove the stop words from every (already symbol-free) description.
df['Desc'] = df['Desc'].apply(clar_text)

# Train/test split: 70% train, 30% test, with a fixed random_state.
train, test = train_test_split(df, test_size=0.3, random_state=42)

# Prepare the class encoder, fitted on every code in the full data frame.
# `labels` holds whatever le.fit() returns.
le = ce.OneHotEncoder(return_df=False, impute_missing=False, handle_unknown="ignore")
labels = le.fit(list(df['Code']))
mapa = [0,1]

# Binary label map: index 0 -> 0, index 1 -> 1.
# NOTE(review): a later cell reassigns labels_map from a joblib file.
labels_map = [0,1]
#i = 0
#for a in mapa:
#    labels_map.append(a)
#print(labels_map)

# Tokenization
# Initialise the tokenizer from the checkpoint vocabulary (lower-casing on).
tokenizer = tokenization.FullTokenizer(vocab_path, do_lower_case=True)


def _encode_texts(texts):
  """Turn an iterable of texts into a list of fixed-length token-id arrays.

  Each text is tokenized, wrapped in "[CLS]" ... "[SEP]", converted to ids
  and passed through _pad() with SEQ_LEN. This replaces two copy-pasted
  loops (train and test) that performed exactly these steps.

  Args:
    texts: Iterable of strings.

  Returns:
    A list with one NumPy array of length SEQ_LEN per text.
  """
  encoded = []
  for text in texts:
    tokens = ["[CLS]"] + tokenizer.tokenize(text) + ["[SEP]"]
    token_ids = tokenizer.convert_tokens_to_ids(tokens)
    encoded.append(_pad(token_ids, SEQ_LEN))
  return encoded


indices_train = _encode_texts(train['Desc'])
indices_test = _encode_texts(test['Desc'])

# Each model input is a pair: the token ids and an all-zero array of the
# same shape as the second input.
indices_train = [indices_train, np.zeros_like(indices_train)]
indices_test = [indices_test, np.zeros_like(indices_test)]

# Binary targets for the first stage: 0 for the 'emp' code, 1 for any other code.
train_labels = train['Code']
train_labes_indexes = [0 if label == 'emp' else 1 for label in train_labels]


(49299, 2)
Parole: 1600976



In [0]:
!pip install joblib
import joblib

# Load the class-index -> label map from Drive; the evaluation cell indexes it
# with the arg-max of each prediction row. This replaces the placeholder
# labels_map assigned in the data-loading cell.
labels_map = joblib.load('/content/drive/My Drive/codiesp/labels_map_bert_only_01.joblib')



In [0]:
# Load the pretrained checkpoint as a trainable Keras model.
# seq_len is taken from SEQ_LEN (previously a duplicated literal 128) so the
# model's input length always matches the length _pad() produces.
bert = load_trained_model_from_checkpoint(
    config_file=config_path,
    checkpoint_file=checkpoint_path,
    training=True,
    trainable=True,
    seq_len=SEQ_LEN
)

Instructions for updating:
If using Keras pass *_constraint arguments to layers.


In [0]:
# @title Build Custom Model

# Use only the first two inputs of the loaded model.
inputs = bert.inputs[:2]
# Branch off the output of the layer named 'NSP-Dense'.
dense = bert.get_layer('NSP-Dense').output
# Classification head: a 1000-unit tanh layer followed by a 2-way softmax.
dense1 = keras.layers.Dense(units=1000, activation='tanh') (dense)
outputs = keras.layers.Dense(units=2, activation='softmax')(dense1)
modelk = keras.models.Model(inputs, outputs)
# The sparse loss and metric take integer class indices as targets.
modelk.compile(
    RAdam(lr=LR),
    loss='sparse_categorical_crossentropy',
    metrics=['sparse_categorical_accuracy'],
)

In [0]:
# @title Initialize Variables
# Initialise only the variables that TensorFlow still reports as
# uninitialised, leaving already-initialised ones untouched.
sess = K.get_session()
uninitialized_variables = {
    name.decode('ascii') for name in sess.run(tf.report_uninitialized_variables())
}
init_op = tf.variables_initializer([
    var for var in tf.global_variables()
    if var.name.split(':')[0] in uninitialized_variables
])
sess.run(init_op)

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


In [0]:
# @title Fit

#filepath="/content/drive/My Drive/codiesp/bert_only_01.{epoch:05d}-{val_loss:.5f}.hdf5"
#checkpoint = ModelCheckpoint(filepath, monitor='val_sparse_categorical_accuracy', verbose=1, save_best_only=True, mode='max')

#    checkpoint
#callbacks_list = [
#]

#modelk.fit(
#    indices_train,
#    train_labes_indexes,
#    epochs=50,
#    batch_size=32,
#    validation_split = 0.10,
#    callbacks=callbacks_list
#)

In [0]:
# Restore previously saved weights instead of training in this session.
# NOTE(review): the file name matches the checkpoint template in the commented-out
# Fit cell ("bert_only_01.{epoch:05d}-{val_loss:.5f}.hdf5"), i.e. presumably
# epoch 3 with val_loss 0.14749 - confirm.
modelk.load_weights('/content/drive/My Drive/codiesp/bert_only_01.00003-0.14749.hdf5')

In [0]:
# @title Predict
# Run the binary model on the encoded test inputs.
predicts = modelk.predict(indices_test, verbose=True)
# Show the first prediction row (two softmax outputs) as a sanity check.
print(predicts[0])

[0.06539832 0.93460166]


In [0]:
# Map every prediction row to a label via the index of its largest score.
res_encoded = [labels_map[row.argmax()] for row in predicts]

print(res_encoded)

# Ground truth for the binary step: 0 for the 'emp' code, 1 for anything else.
test_labels = test['Code']
test_labes_indexes = [0 if label == 'emp' else 1 for label in test_labels]

print('Testing accuracy %s' % accuracy_score(test_labes_indexes, res_encoded))
print('Testing F1 score: {}'.format(f1_score(test_labes_indexes, res_encoded, average='weighted')))


[1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

In [0]:
from sklearn.metrics import classification_report

In [0]:
print(classification_report(test_labes_indexes, res_encoded,digits=5))

              precision    recall  f1-score   support

           0    0.97183   0.96224   0.96701     12263
           1    0.82515   0.86466   0.84444      2527

   micro avg    0.94557   0.94557   0.94557     14790
   macro avg    0.89849   0.91345   0.90573     14790
weighted avg    0.94677   0.94557   0.94607     14790



# Cap_Classifier

In [0]:
# Tokenization and stop-word removal
def tokenize_text(text, stop_words=None):
    """Lower-case ``text``, split it on single spaces and drop stop words.

    Args:
        text: Value to tokenize; it is converted with ``str()`` first.
        stop_words: Optional iterable of stop words. When omitted, the
            module-level ``stop_word`` collection is used, as before.
            Entries are assumed to be hashable strings.

    Returns:
        The list of remaining tokens, in their original order.
    """
    if stop_words is None:
        stop_words = stop_word
    # One pass with set lookups replaces the repeated list.remove() scans.
    blocked = set(stop_words)
    return [tok for tok in str(text).lower().split(" ") if tok not in blocked]

In [0]:
# Extract the test samples that the binary step flagged as containing a pathology.
res_encoded_original = res_encoded.copy()

test_desc = np.array(test['Desc'])
test_bal = np.array(test['Code'])

# Positions (within the test set) whose binary prediction is 1.
flagged_positions = [idx for idx, item in enumerate(res_encoded) if item == 1]
left_to_classify = [test_desc[idx] for idx in flagged_positions]
left_to_classify_labels = [test_bal[idx] for idx in flagged_positions]

print(left_to_classify)

['  meses tratamiento análisis control mostró bilirrubina total  got  gpt  ggt  fa  pudiendo comenzar tratamiento quimioterapia segunda línea docetaxel elevación psa reaparición dolor columna dorsal hombro derecho', ' gestante  años grupo sanguíneo positivo antecedentes personales meningitis  años legrado embarazo molar gpa primera gestación  fumadora  cigarrillos/día', ' si bien podía discernir silueta renal superficie abollonada presentaba numerosas formaciones quísticas contenido seroso corte dichos quistes mostraban tamaño heterogéneo siendo mayores situados nivel cortical dando riñón aspecto esponja', ' acude urgencias «pérdida distorsión visión» ojo izquierdo oi varios días evolución', ' varón  años encontrado accidentado vía pública trauma cráneoencefálico fracturas extremidades superior inferior derechas costales enfisema subcutáneo intenso hematuria leve', ' hallazgos servicio cirugía vascular decide plantear tratamiento quirúrgico aneurisma', ' paciente  años exfumador antece

In [0]:
from sklearn.svm import SVC
from gensim.models import Doc2Vec
import warnings
warnings.filterwarnings("ignore")

left_to_classify_res = None

# Tokenisation does not depend on the model index, so it is done once here
# rather than once per model inside the loop.
tokenized_left_to_classify = [tokenize_text(xi) for xi in left_to_classify]

# One (n_samples, 1) column of scores per model pair.
score_columns = []
for i in range(0, 21):
    # Doc2Vec model i turns each token list into a document vector.
    model_dmm = Doc2Vec.load('/content/drive/My Drive/codiesp/models_BinCaps/'+'Model_DMM_BinClassCap' + str(i) + '.bin')
    dvc_vectors = np.array([
        model_dmm.infer_vector(tokens, steps=20) for tokens in tokenized_left_to_classify
    ])
    # Classifier i scores those vectors; only the log-probability at index 1 is kept.
    model = joblib.load('/content/drive/My Drive/codiesp/models_BinCaps/'+'Model_SVM_BinClassCap' + str(i) + '.bin')
    res = model.predict_log_proba(dvc_vectors)
    arr_res = np.array([[row[1]] for row in res])
    print(arr_res)
    score_columns.append(arr_res)

# Join the columns once: one row per sample, one column per model.
left_to_classify_res = np.concatenate(score_columns, axis=1)

print(left_to_classify_res)




[[-2.28386915]
 [-2.19852583]
 [-3.39471547]
 ...
 [-1.52255989]
 [-0.85613687]
 [-1.5916113 ]]
[[-6.94849047]
 [-0.74697768]
 [-0.80694967]
 ...
 [-5.87503612]
 [-3.19568734]
 [-8.91644705]]
[[-3.11720684]
 [-4.28103297]
 [-1.73447225]
 ...
 [-5.62987084]
 [-5.16291711]
 [-3.02359855]]
[[-1.64270981]
 [-1.52813535]
 [-2.90226799]
 ...
 [-4.44937641]
 [-1.34039783]
 [-2.37567661]]
[[-3.41869099]
 [-2.04281213]
 [-5.60649755]
 ...
 [-5.58108216]
 [-8.32319992]
 [-2.98177063]]
[[-3.1276761 ]
 [-2.63917141]
 [-2.62586585]
 ...
 [-5.97480304]
 [-3.32567869]
 [-3.05161533]]
[[-3.26790652]
 [-5.31397643]
 [-3.36844966]
 ...
 [-1.16645473]
 [-4.61452487]
 [-4.08808425]]
[[-5.73851722]
 [-6.145331  ]
 [-5.63490818]
 ...
 [-6.24058057]
 [-5.90198973]
 [-5.78261341]]
[[-3.03150003]
 [-2.95797392]
 [-2.40566521]
 ...
 [-6.21006608]
 [-4.05967073]
 [-2.48114954]]
[[-3.91970923]
 [-3.4236162 ]
 [-4.29307779]
 ...
 [-1.97311648]
 [-4.32016104]
 [-4.03103304]]
[[-1.66655746]
 [-3.92583463]
 [-1.36237

In [0]:
left_to_classify_res[0]

array([-2.28386915, -6.94849047, -3.11720684, -1.64270981, -3.41869099,
       -3.1276761 , -3.26790652, -5.73851722, -3.03150003, -3.91970923,
       -1.66655746, -4.02340629, -3.56657598, -1.37706503, -4.60422078,
       -7.04165891, -3.4108411 , -2.91590986, -3.43528895, -4.84837392,
       -3.28775479])

In [0]:
# Three dictionaries with 21 keys each ('c0'..'c20', 'p0'..'p20',
# 'r0'..'r20'), every key starting with its own empty list.
c = {'c' + str(i): [] for i in range(0, 21)}
p = {'p' + str(i): [] for i in range(0, 21)}
r = {'r' + str(i): [] for i in range(0, 21)}

print(c)
print(p)
print(r)


{'c0': [], 'c1': [], 'c2': [], 'c3': [], 'c4': [], 'c5': [], 'c6': [], 'c7': [], 'c8': [], 'c9': [], 'c10': [], 'c11': [], 'c12': [], 'c13': [], 'c14': [], 'c15': [], 'c16': [], 'c17': [], 'c18': [], 'c19': [], 'c20': []}
{'p0': [], 'p1': [], 'p2': [], 'p3': [], 'p4': [], 'p5': [], 'p6': [], 'p7': [], 'p8': [], 'p9': [], 'p10': [], 'p11': [], 'p12': [], 'p13': [], 'p14': [], 'p15': [], 'p16': [], 'p17': [], 'p18': [], 'p19': [], 'p20': []}
{'r0': [], 'r1': [], 'r2': [], 'r3': [], 'r4': [], 'r5': [], 'r6': [], 'r7': [], 'r8': [], 'r9': [], 'r10': [], 'r11': [], 'r12': [], 'r13': [], 'r14': [], 'r15': [], 'r16': [], 'r17': [], 'r18': [], 'r19': [], 'r20': []}


In [0]:
# Route every flagged sample to the model with the highest score: the text
# goes into c['c<index>'] and its position k into p['p<index>'].
# Re-running this cell without re-creating c and p appends the entries again.
for k, it in enumerate(left_to_classify_res):
  index = it.argmax()
  c['c'+str(index)].append(left_to_classify[k])
  p['p'+str(index)].append(k)

print(c)
print(p)

{'c0': [' antecedentes destaca trata paciente exfumador hace  años hipertenso tratamiento enalapril dislipémico tratamiento simvastatina', ' cultivo pus obtenido acto operatorio drenaje absceso muslo crecen dos cepas staphilococo coagulasa negativo identificado staphylococcus epidermidis streptococcus alfa hemolítico', ' biopsia dio resultado adenocarcinoma', ' ag bacterianos orina negativo neumococo legionella', '  meses diagnóstico repitió tac mostró imágenes compatibles metástasis hepáticas obligó iniciar quimioterapia paliativa etopósido actualidad continúa recibiendo', ' obtienen serologías múltiples agentes toxoplasma rubeola lúes herpes simple   virus varicelazóster mycoplasma adenovirus parotiditis parvovirus sarampión coxsackie virus echo borrelia negativos', ' episodios realizó angiografía mesentérica apreciándose extravasación nivel fístula arteriovenosa realizando embolización coils tras presentó cuadro abdomen agudo persistencia sangrado realizándose laparotomía encontrand

# Classification of pathologies with their numbers


In [0]:
t = joblib.load("/content/drive/My Drive/codiesp/lstm_fasttext__tokenizer_total.vec")
embedding_matrix = joblib.load('/content/drive/My Drive/codiesp/lstm_fasttext__embedding_matrix_medical_total.vec')

# Loop-invariant settings, computed once instead of on every iteration.
max_length = 64
vocab_size = len(t.word_index) + 1


def _build_chapter_model(n_classes):
  """Build and compile the BiLSTM + self-attention + Conv1D classifier.

  The layer stack is unchanged from the previous inline version; only the
  width of the final softmax layer varies between the 21 models.

  Args:
    n_classes: Number of output classes (width of the softmax layer).

  Returns:
    A compiled Keras model taking integer sequences of length max_length.
  """
  # `seq_input` replaces the old variable `input`, which shadowed the builtin.
  seq_input = Input(shape=(max_length,))
  m = Embedding(vocab_size, 300, weights=[embedding_matrix], input_length=max_length, trainable=False) (seq_input)
  bi = Bidirectional(LSTM(64, activation ='tanh', return_sequences = True, dropout=0.3)) (m)

  aa = SeqSelfAttention(attention_activation='tanh') (bi)
  aa = Conv1D(128,5, activation ='relu' ) (aa)
  aa = MaxPool1D(2) (aa)
  aa = Dropout(0.2) (aa)

  # Stack the convolutional branch and the raw BiLSTM output along the time axis.
  added = keras.layers.Concatenate(axis=1)([aa,bi])

  ff = GlobalMaxPool1D() (added)
  ff = Dense(4000)(ff)
  ff = Dropout(0.3) (ff)
  ff = Dense(n_classes, activation='softmax') (ff)

  model = keras.models.Model(inputs=[seq_input], outputs=[ff])
  model.compile (loss='categorical_crossentropy' , optimizer='adam' , metrics=[ 'accuracy'] )
  return model


for z in range(0,21):
  to_classify = c['c'+str(z)]

  if not to_classify:
    # No sample was routed to model z: skip loading and predicting.
    res_encoded_ins = []
  else:
    # Encode and pad the texts.
    encoded_left_to_classify = t.texts_to_sequences(to_classify)
    padded_left_to_classify = pad_sequences(encoded_left_to_classify, maxlen=max_length, padding='post')

    le = joblib.load("/content/drive/My Drive/codiesp/modelsLE/"+str(z)+"_le.joblib")
    # `mapping` replaces the old variable `map`, which shadowed the builtin.
    # It is iterated as pairs, and the first element of each pair is the label.
    mapping = le.category_mapping[0]['mapping']

    modelz = _build_chapter_model(len(mapping))
    modelz.load_weights('/content/drive/My Drive/codiesp/modelsCaps/'+str(z)+'.hdf5')
    # Predict.
    res = modelz.predict(padded_left_to_classify)

    # Kept in its own name so the binary `labels_map` used by the earlier
    # evaluation cell is no longer overwritten.
    chapter_labels = [label for label, _ in mapping]
    res_encoded_ins = [chapter_labels[row.argmax()] for row in res]

  print(res_encoded_ins)
  r['r'+str(z)] = res_encoded_ins

print(r)




['b18.2', 'b95.7', 'b45.2', 'a49.1', 'a41.9', 'b06.9', 'a41.9', 'a63.0', 'a15.9', 'b59', 'b95.62', 'b17.9', 'b59', 'a41.9', 'b67.90', 'b37.81', 'b19.20', 'a01.4', 'b67.90', 'b02.29', 'b02.29', 'a41.9', 'b59', 'a15.0', 'b17.9', 'b59', 'b02.29', 'a15.9', 'b18.2', 'b95.61', 'a41.9', 'b37.9', 'b18.2', 'b96.20', 'b18.2', 'a15.4', 'a41.9', 'b45.2', 'a41.02', 'b25.9', 'a41.9', 'b00.9', 'a63.0', 'b30.0', 'b25.9', 'a41.02', 'b37.81', 'a63.0', 'a01.4', 'a15.9', 'b37.0', 'a41.9', 'b06.9', 'b95.62', 'a41.9', 'b59', 'a41.02', 'b25.9', 'a15.4', 'b67.99', 'a18.4', 'b96.20', 'a41.02', 'b19.10', 'b18.2', 'b19.20', 'a63.0', 'b18.2', 'b58.9', 'b96.20', 'a41.9', 'b95.62', 'a41.9', 'b99.9', 'a63.0', 'a41.9', 'a41.9', 'b95.4', 'b59', 'b59', 'a41.9', 'b95.61', 'b97.0', 'a41.9', 'a41.02', 'b06.9', 'a63.0', 'a18.12', 'b37.9', 'a41.9', 'b19.20', 'b01.9', 'a41.9', 'b25.9', 'b02.29', 'a01.4', 'a41.9', 'b58.9', 'a63.0', 'b44.9', 'a63.0', 'a41.02', 'b59', 'b02.29', 'b25.9', 'b37.9', 'a63.0', 'a41.9', 'b96.3', 'b99

In [0]:
#joblib.dump(res_encoded_original,'/content/drive/My Drive/codiesp/task1/res_encoded.joblib')
#joblib.dump(c,'/content/drive/My Drive/codiesp/task1/c.joblib')
#joblib.dump(p,'/content/drive/My Drive/codiesp/task1/p.joblib')
#joblib.dump(r,'/content/drive/My Drive/codiesp/task1/r.joblib')

In [0]:
#import joblib
#res_encoded = joblib.load('/content/drive/My Drive/codiesp/task1/res_encoded.joblib')
#c = joblib.load('/content/drive/My Drive/codiesp/task1/c.joblib')
#p= joblib.load('/content/drive/My Drive/codiesp/task1/p.joblib')
#r = joblib.load('/content/drive/My Drive/codiesp/task1/r.joblib')

# Merge back the results


In [0]:
print(r)
from collections import OrderedDict

# Build one lookup: position within left_to_classify (as a string) -> predicted code.
dic = {}
for i in range(0,21):
  pos = p['p'+str(i)]
  res = np.array(r['r'+str(i)])
  dic.update({str(pos[k]): res[k] for k in range(0, len(res))})

print(dic)
# OrderedDict keeps insertion order; it does not sort by position.
od = OrderedDict(dic.items())
print(od)

{'r0': ['b18.2', 'b95.7', 'b45.2', 'a49.1', 'a41.9', 'b06.9', 'a41.9', 'a63.0', 'a15.9', 'b59', 'b95.62', 'b17.9', 'b59', 'a41.9', 'b67.90', 'b37.81', 'b19.20', 'a01.4', 'b67.90', 'b02.29', 'b02.29', 'a41.9', 'b59', 'a15.0', 'b17.9', 'b59', 'b02.29', 'a15.9', 'b18.2', 'b95.61', 'a41.9', 'b37.9', 'b18.2', 'b96.20', 'b18.2', 'a15.4', 'a41.9', 'b45.2', 'a41.02', 'b25.9', 'a41.9', 'b00.9', 'a63.0', 'b30.0', 'b25.9', 'a41.02', 'b37.81', 'a63.0', 'a01.4', 'a15.9', 'b37.0', 'a41.9', 'b06.9', 'b95.62', 'a41.9', 'b59', 'a41.02', 'b25.9', 'a15.4', 'b67.99', 'a18.4', 'b96.20', 'a41.02', 'b19.10', 'b18.2', 'b19.20', 'a63.0', 'b18.2', 'b58.9', 'b96.20', 'a41.9', 'b95.62', 'a41.9', 'b99.9', 'a63.0', 'a41.9', 'a41.9', 'b95.4', 'b59', 'b59', 'a41.9', 'b95.61', 'b97.0', 'a41.9', 'a41.02', 'b06.9', 'a63.0', 'a18.12', 'b37.9', 'a41.9', 'b19.20', 'b01.9', 'a41.9', 'b25.9', 'b02.29', 'a01.4', 'a41.9', 'b58.9', 'a63.0', 'b44.9', 'a63.0', 'a41.02', 'b59', 'b02.29', 'b25.9', 'b37.9', 'a63.0', 'a41.9', 'b96.3'

In [0]:
# Rebuild the full-length prediction list: 'emp' where the binary step
# predicted 0, otherwise the next predicted code from `od`.
final_res = []

i = 0  # counts the flagged samples consumed so far; the key into `od`
for item in res_encoded_original:
  if item != 0:
    final_res.append(od[str(i)])
    i += 1
    continue
  final_res.append('emp')


In [0]:
# End-to-end evaluation: true codes (test_bal) against the merged predictions (final_res).
# NOTE(review): the recorded report lists upper-case codes (e.g. 'B96.20') while the
# printed predictions are lower-case (e.g. 'b18.2'). Codes that differ only by case
# are scored as different classes here - confirm whether labels should be
# case-normalised before scoring.
print('Testing accuracy %s' % accuracy_score(test_bal, final_res))
print('Testing F1 score: {}'.format(f1_score(test_bal, final_res, average='weighted')))
print(classification_report(test_bal, final_res,digits=5))

Testing accuracy 0.8286004056795132
Testing F1 score: 0.827720672458949
              precision    recall  f1-score   support

      B96.20    0.00000   0.00000   0.00000         1
       D40.8    0.00000   0.00000   0.00000         0
      D41.02    0.00000   0.00000   0.00000         2
      D44.10    0.00000   0.00000   0.00000         0
       D44.2    0.00000   0.00000   0.00000         1
       D49.0    0.00000   0.00000   0.00000         1
     D49.511    0.00000   0.00000   0.00000         1
     D49.519    0.00000   0.00000   0.00000         2
      E83.51    0.00000   0.00000   0.00000         1
      H35.89    0.00000   0.00000   0.00000         1
     H44.002    0.00000   0.00000   0.00000         2
      H55.01    0.00000   0.00000   0.00000         1
       I35.8    0.00000   0.00000   0.00000         1
       K76.9    0.00000   0.00000   0.00000         1
     M84.30X    0.00000   0.00000   0.00000         1
      M85.80    0.00000   0.00000   0.00000         1
      N48

In [0]:
print(final_res)

['n28.9', 'emp', 'emp', 'r40.4', 'emp', 'emp', 'emp', 'emp', 'emp', 'emp', 'emp', 'emp', 'd49.7', 'emp', 'emp', 'emp', 'emp', 'emp', 'emp', 'r55', 'emp', 'n23', 'emp', 'emp', 'emp', 'emp', 'l02.211', 'emp', 'emp', 'emp', 'emp', 'emp', 'emp', 'r06.00', 'emp', 'emp', 'd49.0', 'emp', 'emp', 'r06.89', 'emp', 'emp', 'emp', 'emp', 'emp', 'emp', 'emp', 'emp', 'emp', 'emp', 'emp', 'emp', 'emp', 'emp', 'emp', 'emp', 'emp', 'emp', 'emp', 'emp', 'emp', 'emp', 'emp', 'emp', 'emp', 'emp', 'emp', 'emp', 'emp', 'emp', 'emp', 'emp', 'emp', 'emp', 'emp', 'emp', 'emp', 'emp', 'emp', 'c67.7', 'emp', 'emp', 'emp', 'emp', 'emp', 'emp', 'emp', 'emp', 'emp', 'emp', 'l92.9', 'emp', 'emp', 'emp', 'emp', 'emp', 'emp', 'd49.0', 'emp', 'emp', 'b18.2', 'emp', 'emp', 'n26.9', 'emp', 'emp', 'emp', 'emp', 'emp', 'emp', 'b95.7', 'emp', 'emp', 'emp', 'emp', 'emp', 'emp', 'b45.2', 'emp', 'emp', 'emp', 'emp', 'emp', 'emp', 'emp', 'r59.0', 'r27.0', 'emp', 'emp', 'emp', 'emp', 'emp', 'l03.115', 'r60.9', 'emp', 'emp', 'a49.