In [1]:
!pip install tensorflow-gpu==1.15
# @title Preparation
!pip install -q keras-bert keras-rectified-adam
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [2]:
import tensorflow as tf
import keras
from keras_radam import RAdam
from keras_bert import get_custom_objects
import numpy as np
from tqdm import tqdm
from keras_bert import Tokenizer
import pandas as pd
import tensorflow.keras.backend as K
import sys
from sklearn.metrics import classification_report
from google.colab import drive

Using TensorFlow backend.


In [0]:
# @title Constants
np.random.seed(42)
SEQ_LEN = 128
BATCH_SIZE = 8
EPOCHS = 5
LR = 1e-5

In [0]:
# @title Environment
import os
pretrained_path = '/content/drive/My Drive/codiesp/'
config_path = os.path.join(pretrained_path, 'config.json')
checkpoint_path = os.path.join(pretrained_path, 'model.ckpt-2000000')
vocab_path = os.path.join(pretrained_path, 'vocab.txt')

In [5]:
# @title Load Basic Model
import sys

!test -d bert_repo || git clone https://github.com/google-research/bert bert_repo
if not 'bert_repo' in sys.path:
  sys.path += ['bert_repo']

# import python modules defined by BERT
from run_classifier import *
import modeling
import optimization
import tokenization

import codecs
from keras_bert import load_trained_model_from_checkpoint

token_dict = {}
with codecs.open(vocab_path, 'r', 'utf8') as reader:
    for line in reader:
        token = line.strip()
        token_dict[token] = len(token_dict)





In [6]:
# @title Load Data
!pip install category_encoders==1.3.0
import joblib
import pandas as pd
from keras import Sequential
from keras_preprocessing.sequence import pad_sequences
from keras_preprocessing.text import Tokenizer
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split
import pickle
import numpy as np
import random
from keras.layers import Input
import keras
from keras.layers import Conv1D , Embedding
from keras.layers import Dropout
from keras.layers import MaxPool1D
from keras.layers import LSTM
from keras.layers import Dense
from keras.layers import GlobalMaxPool1D
from keras.layers import Bidirectional
import category_encoders as ce

from keras.callbacks import ModelCheckpoint
from keras_self_attention import SeqSelfAttention

def remove_symbol(s):
    s = s.replace(",", "")
    s = s.replace(".", "")
    s = s.replace(";", "")
    s = s.replace(":", "")
    s = s.replace("_", "")
    s = s.replace("+", "")
    s = s.replace("ª", "")
    s = s.replace("-", "")
    s = s.replace("<", "")
    s = s.replace(">", "")
    s = s.replace("!", "")
    s = s.replace("?", "")
    s = s.replace("(", "")
    s = s.replace(")", "")
    s = s.replace("[", "")
    s = s.replace("]", "")
    s = s.replace("'", "")
    s = s.replace("0", "")
    s = s.replace("1", "")
    s = s.replace("2", "")
    s = s.replace("3", "")
    s = s.replace("4", "")
    s = s.replace("5", "")
    s = s.replace("6", "")
    s = s.replace("7", "")
    s = s.replace("8", "")
    s = s.replace("9", "")
    s = s.replace("%", "")
    s = s.strip()
    s = s.lower()
    return s

# Rimozione delle stopword
def clar_text(text):
    t = remove_symbol(str(text).strip().lower())
    tokens = list(str(text).lower().split(" "))
    for z in range(0, len(stop_word)):
        if stop_word[z] in tokens:
            while stop_word[z] in tokens:
                tokens.remove(str(stop_word[z]))

    tt = ""
    for it in tokens:
      tt = tt +" "+it
    return tt

def _pad(input_ids, max_seq_len):
    x = []
    input_ids = input_ids[:min(len(input_ids), max_seq_len - 2)]
    input_ids = input_ids + [0] * (max_seq_len - len(input_ids))
    return np.array(input_ids)

#LOADING DATASET
df = pd.read_csv('/content/drive/My Drive/codiesp/Train_with_emptyclass.csv')
df = df[['Code', 'Desc']]
# df = df[pd.notnull(df['desc'])]
#print(df.head(10))
print(df.shape)

df.index = range(df.shape[0])
print("Parole: " + str(df['Desc'].apply(lambda x: len(x.split(' '))).sum()))  # ci sono circa 211456 parole

# rimozione SOLO dei simboli (nessuno stemming e nessuna rimozione delle stopword)
df['Desc'] = df['Desc'].apply(remove_symbol)
#print(df.head(10))

# Acquisizione delle stop word
file_stopw = open("/content/drive/My Drive/codiesp/stop_word.pck", "rb")
stop_word = pickle.load(file_stopw)
df['Desc'] = df['Desc'].apply(clar_text)

#suddivisione train_test
train, test = train_test_split(df, test_size=0.3, random_state=42)

#prepare class encoder
le = ce.OneHotEncoder(return_df=False, impute_missing=False, handle_unknown="ignore")
labels = le.fit(list(df['Code']))
mapa = [0,1]

labels_map = [0,1]
#i = 0
#for a in mapa:
#    labels_map.append(a)
#print(labels_map)

#Tokenization
#Inizialize the tokenizer
tokenizer = tokenization.FullTokenizer(vocab_path, do_lower_case=True)
indices_train = []
indices_test = []

for text in train['Desc']:
  tk = tokenizer.tokenize(text)
  tokens = ["[CLS]"] + tk + ["[SEP]"]
  token_ids = tokenizer.convert_tokens_to_ids(tokens)
  token_ids = _pad(token_ids,SEQ_LEN)
  indices_train.append(token_ids)

for text in test['Desc']:
  tk = tokenizer.tokenize(text)
  tokens = ["[CLS]"] + tk + ["[SEP]"]
  token_ids = tokenizer.convert_tokens_to_ids(tokens)
  token_ids = _pad(token_ids,SEQ_LEN)
  indices_test.append(token_ids)

indices_train = [indices_train, np.zeros_like(indices_train)]
indices_test= [indices_test, np.zeros_like(indices_test)]

train_labels = train['Code']
train_labes_indexes = []
for label in train_labels:
  if(label =='emp'):
    train_labes_indexes.append(0)
  else:
    train_labes_indexes.append(1)


(49299, 2)
Parole: 1600976



In [7]:
!pip install joblib
import joblib

labels_map = joblib.load('/content/drive/My Drive/codiesp/labels_map_bert_only_01.joblib')



In [8]:
bert = load_trained_model_from_checkpoint(
    config_file=config_path,
    checkpoint_file=checkpoint_path,
    training=True,
    trainable=True,
    seq_len=128
)

Instructions for updating:
If using Keras pass *_constraint arguments to layers.


In [0]:
# @title Build Custom Model

inputs = bert.inputs[:2]
dense = bert.get_layer('NSP-Dense').output
dense1 = keras.layers.Dense(units=1000, activation='tanh') (dense)
outputs = keras.layers.Dense(units=2, activation='softmax')(dense1)
modelk = keras.models.Model(inputs, outputs)
modelk.compile(
    RAdam(lr=LR),
    loss='sparse_categorical_crossentropy',
    metrics=['sparse_categorical_accuracy'],
)

In [10]:
# @title Initialize Variables
sess = K.get_session()
uninitialized_variables = set([i.decode('ascii') for i in sess.run(tf.report_uninitialized_variables())])
init_op = tf.variables_initializer(
    [v for v in tf.global_variables() if v.name.split(':')[0] in uninitialized_variables]
)
sess.run(init_op)

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


In [0]:
# @title Fit

#filepath="/content/drive/My Drive/codiesp/bert_only_01.{epoch:05d}-{val_loss:.5f}.hdf5"
#checkpoint = ModelCheckpoint(filepath, monitor='val_sparse_categorical_accuracy', verbose=1, save_best_only=True, mode='max')

#    checkpoint
#callbacks_list = [
#]

#modelk.fit(
#    indices_train,
#    train_labes_indexes,
#    epochs=50,
#    batch_size=32,
#    validation_split = 0.10,
#    callbacks=callbacks_list
#)

In [0]:
modelk.load_weights('/content/drive/My Drive/codiesp/bert_only_01.00003-0.14749.hdf5')

In [13]:
# @title Predict
predicts = modelk.predict(indices_test, verbose=True)
print(predicts[0])

[0.06539832 0.93460166]


In [14]:
res_encoded = []
for a in predicts:
    val = a.argmax()
    res_encoded.append(labels_map[val])

print(res_encoded)
#res_labels = le.inverse_transform(DataFrame.res_encoded)

test_labels = test['Code']
test_labes_indexes = []
for label in test_labels:
  if label == 'emp':
    test_labes_indexes.append(0)
  else:
    test_labes_indexes.append(1)



print('Testing accuracy %s' % accuracy_score(test_labes_indexes, res_encoded))
print('Testing F1 score: {}'.format(f1_score(test_labes_indexes, res_encoded, average='weighted')))


[1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

In [15]:
!pip install scikit-learn
from sklearn.metrics import classification_report



In [16]:
print(classification_report(test_labes_indexes, res_encoded,digits=5))

              precision    recall  f1-score   support

           0    0.97183   0.96224   0.96701     12263
           1    0.82515   0.86466   0.84444      2527

    accuracy                        0.94557     14790
   macro avg    0.89849   0.91345   0.90573     14790
weighted avg    0.94677   0.94557   0.94607     14790



#CLASSIFICATION pathologies with their numbers


In [17]:
#Estraggo elementi di cui si crede ci sia la patologia
left_to_classify = []
left_to_classify_labels = []
res_encoded_original = res_encoded.copy()

test_desc = np.array(test['Desc'])
test_bal = np.array(test['Code'])


i = 0
for item in res_encoded:
  if item == 1:
    left_to_classify.append(test_desc[i])
    left_to_classify_labels.append(test_bal[i])
  i = i+1

print(left_to_classify)


['  meses tratamiento análisis control mostró bilirrubina total  got  gpt  ggt  fa  pudiendo comenzar tratamiento quimioterapia segunda línea docetaxel elevación psa reaparición dolor columna dorsal hombro derecho', ' gestante  años grupo sanguíneo positivo antecedentes personales meningitis  años legrado embarazo molar gpa primera gestación  fumadora  cigarrillos/día', ' si bien podía discernir silueta renal superficie abollonada presentaba numerosas formaciones quísticas contenido seroso corte dichos quistes mostraban tamaño heterogéneo siendo mayores situados nivel cortical dando riñón aspecto esponja', ' acude urgencias «pérdida distorsión visión» ojo izquierdo oi varios días evolución', ' varón  años encontrado accidentado vía pública trauma cráneoencefálico fracturas extremidades superior inferior derechas costales enfisema subcutáneo intenso hematuria leve', ' hallazgos servicio cirugía vascular decide plantear tratamiento quirúrgico aneurisma', ' paciente  años exfumador antece

In [0]:
# Tokenizzazione e rimozione delle stopword
def tokenize_text(text):
    tokens = list(str(text).lower().split(" "))
    for z in range(0, len(stop_word)):
        if stop_word[z] in tokens:
            while stop_word[z] in tokens:
                tokens.remove(str(stop_word[z]))
    return tokens

In [0]:
# prepare tokenizer
#t = Tokenizer()
#t.fit_on_texts(df['Desc'])
#vocab_size = len(t.word_index) + 1

t = joblib.load("/content/drive/My Drive/codiesp/lstm_fasttext__tokenizer.vec")

In [20]:
#padding text
encoded_left_to_classify = t.texts_to_sequences(left_to_classify)

max_length = 64
padded_left_to_classify = pad_sequences(encoded_left_to_classify, maxlen=max_length, padding='post')
print(padded_left_to_classify[76])

[  15   45   20   75 6184 6185 6186 6187  408  695  513  255 7423 1611
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0]


In [0]:
#le = ce.OneHotEncoder(return_df=False, impute_missing=False, handle_unknown="ignore")
#labels = le.fit(list(df['Code']))
le = joblib.load("/content/drive/My Drive/codiesp/lstm_fasttext__label_encoder_le.vec")

In [0]:
# get the vectors
#file = open('/content/drive/My Drive/codiesp/embeddings-l-model_es.vec')

# create a weight matrix for words in training docs
#count = 0
embedding_matrix = joblib.load('/content/drive/My Drive/codiesp/lstm_fasttext__embedding_matrix_medical.vec')
#np.zeros((vocab_size, 300))
#vocab_and_vectors = {}
#arrValues = []
#z = 0
#for line in file:
#    if(z != 0):
#        values = line.split()
#        word = values[0]
#        vector = np.asarray(values[1:], dtype='float32')
#        vocab_and_vectors[word] = vector
#        arrValues.append(vector)
#    else:
#        z= z+1

##for word, i in t.word_index.items():
#    try:
#        embedding_vector = vocab_and_vectors.get(word)#
#
#        if embedding_vector is None:
#            count = count + 1
#            index = random.randint(0, 1000)
#            embedding_vector = arrValues[index]
#    except:
#        count = count+1
#        index = random.randint(0, 1000)
#        embedding_vector = arrValues[index]

#    if embedding_vector is not None:
#        embedding_matrix[i] = embedding_vector
#print(count)

In [23]:
# define the model
vocab_size = len(t.word_index) + 1
input = Input(shape=(64,))
m = Embedding(vocab_size, 300, weights=[embedding_matrix], input_length=64, trainable=False) (input)
bi = Bidirectional(LSTM(64, activation ='tanh', return_sequences = True, dropout=0.3)) (m)

aa = SeqSelfAttention(attention_activation='tanh') (bi)
aa = Conv1D(128,5, activation ='relu' ) (aa)
aa = MaxPool1D(2) (aa)
aa = Dropout(0.2) (aa)

added = keras.layers.Concatenate(axis=1)([aa,bi])

ff = GlobalMaxPool1D() (added)
ff = Dense(4000)(ff)
ff = Dropout(0.3) (ff)
ff =Dense(1788, activation='softmax') (ff)

model = keras.models.Model(inputs=[input], outputs=[ff])

model.summary(line_length=100)
model.compile (loss='categorical_crossentropy' , optimizer='adam' , metrics=[ 'accuracy'] )
model.load_weights('/content/drive/My Drive/codiesp/LSTM_CNN_ATT_Fasttext_final_03052020.h5')



Model: "model_3"
____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
input_1 (InputLayer)             (None, 64)            0                                            
____________________________________________________________________________________________________
embedding_1 (Embedding)          (None, 64, 300)       3275400     input_1[0][0]                    
____________________________________________________________________________________________________
bidirectional_1 (Bidirectional)  (None, 64, 128)       186880      embedding_1[0][0]                
____________________________________________________________________________________________________
seq_self_attention_1 (SeqSelfAtt (None, 64, 128)       8257        bidirectional_1[0][0]            
_________________________________________________________________________

In [24]:
#predict
res = model.predict(padded_left_to_classify)

map = le.category_mapping[0]['mapping']
print(map)
print(res_encoded)

labels_map = []
i = 0
for a,b in map:
    labels_map.append(a)

res_encoded = []
for a in res:
    val = a.argmax()
    res_encoded.append(labels_map[val])

print(left_to_classify_labels)

[('z20.818', 1), ('m25.50', 2), ('a23.9', 3), ('i87.8', 4), ('e04.9', 5), ('r50.9', 6), ('n44.8', 7), ('r60.9', 8), ('n45.3', 9), ('r52', 10), ('i83.90', 11), ('r58', 12), ('d30.3', 13), ('d49.59', 14), ('n50.9', 15), ('r63.4', 16), ('r19.00', 17), ('n50.89', 18), ('c63.10', 19), ('k74.60', 20), ('g82.20', 21), ('i85.00', 22), ('n32.9', 23), ('r20.1', 24), ('z96.0', 25), ('g04.1', 26), ('c22.0', 27), ('k76.6', 28), ('k70.30', 29), ('z98.85', 30), ('i85.01', 31), ('r74.0', 32), ('b33.3', 33), ('r53.1', 34), ('i10', 35), ('r53.81', 36), ('g71.8', 37), ('g82.21', 38), ('r18.8', 39), ('n32.89', 40), ('n32.0', 41), ('k40.90', 42), ('n13.30', 43), ('n28.89', 44), ('c78.7', 45), ('c77.9', 46), ('k56.60', 47), ('n13.9', 48), ('n13.8', 49), ('r59.9', 50), ('k31.7', 51), ('c64.9', 52), ('c16.9', 53), ('n28.9', 54), ('r18.0', 55), ('c80.0', 56), ('k59.00', 57), ('d64.9', 58), ('q61.3', 59), ('d50.9', 60), ('r63.0', 61), ('r10.2', 62), ('n80.9', 63), ('n85.8', 64), ('k44.9', 65), ('r31.9', 66), ('

In [25]:
print('Testing accuracy %s' % accuracy_score(left_to_classify_labels, res_encoded))
print('Testing F1 score: {}'.format(f1_score(left_to_classify_labels, res_encoded, average='weighted')))

from sklearn.metrics import classification_report
#print(classification_report(left_to_classify_labels, res_encoded,digits=5))

Testing accuracy 0.36027190332326287
Testing F1 score: 0.31052821458782615


#Merge back the results


In [0]:
final_res = []

i = 0
for item in res_encoded_original:
  if item == 0:
    final_res.append('emp')
  else:
    final_res.append(res_encoded[i])
    i = i+1


In [27]:
print('Testing accuracy %s' % accuracy_score(test_bal, final_res))
print('Testing F1 score: {}'.format(f1_score(test_bal, final_res, average='weighted')))
print(classification_report(test_bal, final_res,digits=5))

Testing accuracy 0.8623394185260311
Testing F1 score: 0.8601870502236996
              precision    recall  f1-score   support

     0wjg4zz    0.00000   0.00000   0.00000         0
      8550/3    0.00000   0.00000   0.00000         0
      B96.20    0.00000   0.00000   0.00000         1
      D41.02    0.00000   0.00000   0.00000         2
      D44.10    0.00000   0.00000   0.00000         0
       D44.2    0.00000   0.00000   0.00000         1
       D49.0    0.00000   0.00000   0.00000         1
     D49.511    0.00000   0.00000   0.00000         1
     D49.519    0.00000   0.00000   0.00000         2
      E83.51    0.00000   0.00000   0.00000         1
      H35.89    0.00000   0.00000   0.00000         1
     H44.002    0.00000   0.00000   0.00000         2
      H55.01    0.00000   0.00000   0.00000         1
       I35.8    0.00000   0.00000   0.00000         1
         K30    0.00000   0.00000   0.00000         0
       K76.9    0.00000   0.00000   0.00000         1
       L

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
