In [30]:
import numpy as np
import pandas as pd
np.random.seed(0)

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, StratifiedKFold  # , cross_val_score
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score, precision_recall_fscore_support, confusion_matrix  # , classification_report
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import confusion_matrix

import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import unicodedata

import tqdm
import pickle



In [2]:
data = pd.read_csv('./data/preClasificacion.csv')

In [3]:
data.head()

Unnamed: 0.1,Unnamed: 0,idGenero,edad_grupos,idEstadoCivil,Analisis,Medicion1_PA_Sistolica,Medicion2_PA_Diastolica,Medicion3_Frecuencia_cardiaca,Medicion4_Frec_Respiratoria,Medicion5_Temperatura,...,Medicion8_IMC,motivoPaciente,enfermedadActual,Zona2_Cabeza_Cuello,Zona3_Cardio_Pulmonar,codigoDiagnostico,Concepto,id,original,dataset
0,0,1,1-67,9,HIPERTENCION,138,82,64,19,36.3,...,28.73,CONTROL,PACIENTE DE 60 AÑOS DE EDAD CON DIAGNOSTICOS ...,Sin alteraciones.,RSCS RIMTICOS SIN SOPLOS RSRS MURMULLO EVSICUL...,I10X,PACIENTE DE 60 AÑOS DE EDAD CON DIAGNOSTICOS H...,1,True,train
1,1,2,78-84,1,HIPERTENCION,187,64,73,17,36.2,...,30.42,""" TENEMOS EL CONTROL """,PACIENTE DE 76 AÑOS CON DIAGNOSTICO: 1. HIP...,Sin alteraciones.,RSRS SIN AGREGADOS RSCS RITMICOS SIN SOPLOS,I10X,PACIENTE EN PROGRAMA CRONICO; CURSA CON HTA EN...,2,True,train
2,8,2,1-67,3,HIPERTENCION,118,75,90,19,36.2,...,26.02,CONTROL CRÓNICOS CAPITAL SALUD NATURAL LA CAP...,PACIENTE FEMENINA DE 54 AÑOS CON DIAGNÓSTICOS ...,SIN CIANOSIS,RUIDOS CARDIOVASCULAR RUIDOS RESPIRATORIOS CO...,I10X,PACIENTE FEMENINA CON DIAGNÓSTICOS ANOTADOS S...,9,True,train
3,9,2,>84,1,HIPERTENCION,158,59,58,15,36.0,...,24.89,CONTROL PROGRAMA CRONICOS,PACIENTE DE 84 AÑOS CON DIAGNOSTICOS DE: 1. E...,NO INGURGITACION YUGULAR.,RSCS RITMICOS SIN SOPLOS. RSRS MV CONSERVADOS ...,I10X,PACIENTE ADULTA MAYOR CON DIAGNOSTICOS ANOTAD...,10,True,train
4,11,2,1-67,9,HIPERTENCION,112,76,95,20,36.6,...,34.19,CONTROL,PACIENTE DE 42 AÑOS DE EDAD CON DIAGNOSTICO DE...,Sin alteraciones.,RSCS RITMICOS SIN SOPLOS RSRS MURMULLO EVSICUL...,I10X,PACIENTE DE 42 AÑOS DE EDAD CON DIAGNOSTICO DE...,12,True,train


In [4]:
def normunicode_data(string):
    '''
     Normalise (normalize) unicode data in Python to remove umlauts, accents etc.
    '''
    #print(strin)
    return unicodedata.normalize('NFKD', string).encode('ASCII', 'ignore').decode("utf-8").lower()

In [5]:
textFields = ['motivoPaciente','enfermedadActual','Zona2_Cabeza_Cuello','Zona3_Cardio_Pulmonar','Concepto']

In [6]:
for tf in tqdm.tqdm(textFields):
    # quita acentos, dieresis, etc
    data[tf] = data[tf].apply(lambda row: normunicode_data(row))
    # quita numeros
    data[tf] = data[tf].str.replace('\d+', '')
    # quita puntuacion: ,.:;"
    data[tf] = data[tf].str.replace('[^\w\s]','')



100%|██████████| 5/5 [00:15<00:00,  2.26s/it]


In [7]:
train = data[data.dataset=='train']
test = data[data.dataset=='test']

In [8]:
train['motivoPaciente'].shape

(72426,)

In [28]:

vectorizers  = {}
for tf in textFields:
    print('processing',tf)
    
    colToAnalyse = train[tf]
    
        
    best_params = {'v__max_features': 2000 if tf=='enfermedadActual' else 1000 , 'v__ngram_range': (1, 3)}

    
    vectorizer = TfidfVectorizer(
        ngram_range=best_params['v__ngram_range'], 
        max_features=best_params['v__max_features'])
    
    print('calculating tf-idf matrix...')
    X = vectorizer.fit_transform(train[tf])
    
    print('converting matrix to dataframe...')
    featdf = pd.DataFrame(X.toarray(),columns=[tf+ " " + x for x in vectorizer.get_feature_names()])
    
    print('saving dataframe...')
    featdf.to_csv('./data/uncleanTfIdf/'+tf+'.csv')
    
    vectorizers[tf] = vectorizer
    
    # free up memory 
    del featdf
    del X
    
    
    
    
    

processing motivoPaciente
calculating tf-idf matrix...
converting matrix to dataframe...
saving dataframe...
processing enfermedadActual
calculating tf-idf matrix...
converting matrix to dataframe...
saving dataframe...
processing Zona2_Cabeza_Cuello
calculating tf-idf matrix...
converting matrix to dataframe...
saving dataframe...
processing Zona3_Cardio_Pulmonar
calculating tf-idf matrix...
converting matrix to dataframe...
saving dataframe...
processing Concepto
calculating tf-idf matrix...
converting matrix to dataframe...
saving dataframe...


In [32]:
# todo guardar vectorizers
with open('./vectorizers/nonCleanWords.pickle', 'wb') as handle:
    pickle.dump(vectorizers, handle, protocol=pickle.HIGHEST_PROTOCOL)
    
# to read
# with open('filename.pickle', 'rb') as handle:
#     b = pickle.load(handle)

In [41]:
for tf in textFields:
    newDf = pd.read_csv('./data/uncleanTfIdf/'+tf+'.csv')
    newDf = newDf.drop(['Unnamed: 0'],axis=1)
    train = pd.concat([train, newDf], axis=1, sort=False)
    del newDf

MemoryError: 

In [49]:
vectorizer.transform(testField.iloc[:2]).toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])