In [75]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import os
import calendar
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
from keras.models import load_model
import h5py
import pickle
import gensim
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn import metrics
from gensim.models import KeyedVectors
from gensim.models import FastText
from gensim.models.wrappers import FastText
from gensim.models import Word2Vec
from keras.models import load_model
from enum import Enum
import itertools
from collections import defaultdict
import matplotlib.pyplot as plt

from pylab import rcParams
rcParams['figure.figsize'] = 20, 7
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

In [76]:
df = pd.read_csv('HospitalErrors.csv', encoding='utf8', dtype=object)

In [77]:
df.head(2)

Unnamed: 0,ProviderNumber,HospitalName,Address1,City,State,ZipCode,CountyName,PhoneNumber,HospitalType,HospitalOwner,EmergencyService,Condition,MeasureCode,MeasureName,Score,Sample,Stateavg
0,10021,dale medical center,126 hospital ave,ozark,al,36360,dale,3347742601,acute care hospitals,government - hospital district or authority,yxs,surgical infection prevention,scip-inf-6,surgery patients needing hair removed from the...,100%,94 patients,al_scip-inf-6:0
1,10035,cullman regional medical center,1912 alabama highway 157,cullman,al,35058,cullman,2567x72000,acute care hospitals,government - hospital district or authority,yes,surgical infection prevention,scip-vte-1,surgery patients whose doctors ordered treatme...,95%,298 patients,al_scip-vte-1:0


In [78]:
len(df)
sum(df.isnull().values.ravel())

18

In [79]:
dfEmpty = df[df.isnull().any(axis=1)]

In [80]:
dfEmpty = dfEmpty.apply(lambda x: x.astype(str).str.lower())

In [81]:
df_Nan = df
df_Nan = df_Nan[~df_Nan.isin(dfEmpty)].dropna()
df_Nan.to_csv("HospitalErrorsWithoutNan.csv", index=False)

In [82]:
path = "HospitalWord2Vec.w2v"
word2vecModel = KeyedVectors.load(path)

In [83]:
mapping = dict(enumerate(dfEmpty.columns.astype('category').categories))

In [84]:
attributeMapping = {}
for i,idx in enumerate(dfEmpty.columns):
    attributeMapping[idx] = i

In [85]:
inverseMapping = {y:x for x,y in attributeMapping.items()}

## Load Attribute Classifier

In [86]:
model = load_model('HospitalMultiAttributeClassifier.h5')

In [87]:
with open('HospitalMultiAttributeClassifierTokenizer.pickle', 'rb') as handle:
    tokenizer = pickle.load(handle)

## Repair pipeline

In [88]:
def impute(word2vecModel, model, tokenizer, rowWithMissingValue, missingType, topN=10):
    """
    Returns the closest match for the missing attribute value
    """
    output = dict()
    for value in rowWithMissingValue:
        try:
            results = word2vecModel.wv.most_similar(value, topn=topN)
            for match, confidence in results:
                # Predicted type to be equal to the missing value
                if predictAttribute(model, tokenizer, match) == missingType:
                    if match in output and confidence <= output[match]:
                        continue
                    else:
                        output[match] = confidence            
        except KeyError:
            continue
    return output.items()

In [89]:
def predictAttribute(model, tokenizer, value):
    """
    Classifies value parameter as its corresponding attribute
    """
    value = [value]
    sequences = tokenizer.texts_to_sequences(value)
    testData = pad_sequences(sequences, maxlen=184)
    predictions = model.predict(testData)
    return attributeMapping[mapping[np.argmax(predictions[0])]]

In [90]:
nullRows = dfEmpty.values.tolist()
rows = []
#columnNames = ['ProviderNumber', 'HospitalName', 'Address1', 'City', 'ZipCode', 'CountyName', 'PhoneNumber','HospitalOwner','Condition','Sample']
columnNames = dfEmpty.columns.tolist()
for row in nullRows:
    missingAttribute = None
    removedNan = []
    for i in range(len(row)):
        if str(row[i]) != 'nan':
            removedNan.append(row[i])
        else:
            missingAttribute = attributeMapping[columnNames[i]]
    if missingAttribute is not None:
        rows.append({tuple(removedNan):missingAttribute})
    else:
        print(row)

## Imputation

In [91]:
def getImputedValue(missingRow, attributeType):
    results = impute(word2vecModel, model, tokenizer, missingRow, attributeType, 100)
    return sorted(results, key=lambda x: x[1], reverse=True)[0] if results else None

## Build verification pipeline

In [92]:
dfClean = pd.read_csv('clean_hospital_dataset_hc.csv', encoding='utf8', dtype='object')
dfClean.replace(np.nan,'empty',inplace = True)

In [93]:
dfUnique = dfClean.drop_duplicates()

In [94]:
queryMapping = {}
attributeNames = dfClean.columns.tolist()
for i in attributeNames:
    for j in attributeNames:
        if i!=j:
            if i in queryMapping:
                queryMapping[i] = queryMapping.get(i) + ' and  ' + str(j + '=="{}"')
            else:
                queryMapping[i] = str(j + '=="{}"') 

In [95]:
correct = 0
inCorrect = 0
incorrectPredictions = []

In [96]:
count = 0
for row in rows:
    missingRow = list(row.keys())[0]
    attribute = list(row.values())[0]
    missingRow = [i.strip() for i in missingRow]
    # Run the query
    query = (queryMapping[inverseMapping[attribute]]).format(*missingRow)
    outputDf = dfClean.query(query)
    actual = outputDf.head(1)[inverseMapping[attribute]].to_string(index=False)
    if actual == 'empty':
        continue
    predicted = getImputedValue(missingRow, attribute) 
    if predicted and actual == predicted[0]:
        correct += 1
    else:
        inCorrect += 1
        incorrectPredictions.append((actual, predicted[0] if predicted else None)) 

## Read data
Details here: https://fasttext.cc/

## For truth validation read clean dataset

In [97]:
df_truth = pd.read_csv('clean_hospital_dataset_hc.csv',dtype=object, encoding='utf8', index_col=False)

## Read dirty dataset from imputation

In [98]:
df_dirty = pd.read_csv('HospitalErrorsWithoutNan.csv',dtype=object, encoding='utf8', index_col=False)

In [99]:
df_truth.head(2)

Unnamed: 0,ProviderNumber,HospitalName,Address1,City,State,ZipCode,CountyName,PhoneNumber,HospitalType,HospitalOwner,EmergencyService,Condition,MeasureCode,MeasureName,Score,Sample,Stateavg
0,10018,callahan eye foundation hospital,1720 university blvd,birmingham,al,35233,jefferson,2053258100,acute care hospitals,voluntary non-profit - private,yes,surgical infection prevention,scip-card-2,surgery patients who were taking heart drugs c...,empty,empty,al_scip-card-2
1,10018,callahan eye foundation hospital,1720 university blvd,birmingham,al,35233,jefferson,2053258100,acute care hospitals,voluntary non-profit - private,yes,surgical infection prevention,scip-inf-1,surgery patients who were given an antibiotic ...,empty,empty,al_scip-inf-1


In [100]:
df_dirty.head(2)

Unnamed: 0,ProviderNumber,HospitalName,Address1,City,State,ZipCode,CountyName,PhoneNumber,HospitalType,HospitalOwner,EmergencyService,Condition,MeasureCode,MeasureName,Score,Sample,Stateavg
0,10021,dale medical center,126 hospital ave,ozark,al,36360,dale,3347742601,acute care hospitals,government - hospital district or authority,yxs,surgical infection prevention,scip-inf-6,surgery patients needing hair removed from the...,100%,94 patients,al_scip-inf-6:0
1,10035,cullman regional medical center,1912 alabama highway 157,cullman,al,35058,cullman,2567x72000,acute care hospitals,government - hospital district or authority,yes,surgical infection prevention,scip-vte-1,surgery patients whose doctors ordered treatme...,95%,298 patients,al_scip-vte-1:0


## Preprocessing

In [101]:
combined_hosp = df_truth.values.tolist()
combined_dirty = df_dirty.values.tolist()

In [102]:
columns = df_truth.columns.values

In [103]:
uniqueValues = {}
for i in df_truth.columns:
    uniqueValues[i] = set(df_truth[i].tolist())  

In [104]:
columns

array(['ProviderNumber', 'HospitalName', 'Address1', 'City', 'State',
       'ZipCode', 'CountyName', 'PhoneNumber', 'HospitalType',
       'HospitalOwner', 'EmergencyService', 'Condition', 'MeasureCode',
       'MeasureName', 'Score', 'Sample', 'Stateavg'], dtype=object)

## Model

In [105]:
path = 'HospitalFastText.w2v'
fastTextModel = KeyedVectors.load(path)

In [106]:
model = load_model('HospitalMultiAttributeClassifier.h5')
with open('HospitalMultiAttributeClassifierTokenizer.pickle', 'rb') as handle:
    tokenizer = pickle.load(handle)

In [107]:
mapping = dict(enumerate(sorted(df_truth.columns.values)))

In [108]:
mapping

{0: 'Address1',
 1: 'City',
 2: 'Condition',
 3: 'CountyName',
 4: 'EmergencyService',
 5: 'HospitalName',
 6: 'HospitalOwner',
 7: 'HospitalType',
 8: 'MeasureCode',
 9: 'MeasureName',
 10: 'PhoneNumber',
 11: 'ProviderNumber',
 12: 'Sample',
 13: 'Score',
 14: 'State',
 15: 'Stateavg',
 16: 'ZipCode'}

In [109]:
attributeMapping = {}
for i,idx in enumerate(df_truth.columns):
    attributeMapping[i] = idx

## Correction Pipeline

In [110]:
def predictAttribute(model, tokenizer, value):
    """
    Classifies value parameter as its corresponding attribute
    """
    value = [value]
    sequences = tokenizer.texts_to_sequences(value)
    testData = pad_sequences(sequences, maxlen=184)
    predictions = model.predict(testData)
    return mapping[np.argmax(predictions[0])]

In [111]:
def correctCell(fastTextModel, model, tokenizer, row, topN=10):
    cellValues = {}
    output = dict()
    isMistake = False
    for cellIndex in range(len(row)):
        currentCellValue = row[cellIndex]
        if not currentCellValue in uniqueValues[attributeMapping[cellIndex]]:
            isMistake = True
            cellValues['mistakeDetected'] = currentCellValue
            try:
                predictions = fastTextModel.most_similar(currentCellValue, topn=topN)
                for match, confidence in predictions:
                # Predicted type to be equal to the missing value
                    if predictAttribute(model, tokenizer, match) == columns[cellIndex]:
                        if match in output and confidence <= output[match]:
                            continue
                        else:
                            output[match] = confidence
            except:
                possibleValues = uniqueValues[attributeMapping[cellIndex]]
                maxScore = 0.0
                for i in possibleValues:
                    str1 = set(currentCellValue)
                    str2 = set(i)
                    score = float(len(str1 & str2)) / len(str1 | str2)
                    if maxScore < score:
                        maxScore = score
                        output[i] = score
    
    if isMistake:
        results = output.items()
        predictedValue = sorted(results, key=lambda x: x[1], reverse=True)[0] if results else None
        cellValues['predictedValue'] = predictedValue[0] if predictedValue else None
        return cellValues
    else:
        return None

## Verification pipeline

In [112]:
dfUnique = df_truth.drop_duplicates()

In [113]:
len(dfUnique)

1000

In [114]:
queryMapping = {}
attributeNames = df_truth.columns.tolist()
for i in attributeNames:
    for j in attributeNames:
        if i!=j:
            if i in queryMapping:
                queryMapping[i] = queryMapping.get(i) + ' and  ' + str(j + '=="{}"')
            else:
                queryMapping[i] = str(j + '=="{}"') 

In [115]:
correct = 0
inCorrect = 0
incorrectPredictions = []

In [119]:
for row in combined_dirty:
    output = correctCell(fastTextModel, model, tokenizer, row, 15)
    print(output)
    if output:
        detectedError = output['mistakeDetected']
        predictedValue = output['predictedValue']
    
        tempRow = row[:]
        errorIndex = row.index(detectedError)
        tempRow.remove(detectedError)

        # Run the query
        #print(tempRow)
        query = (queryMapping[columns[errorIndex]]).format(*tempRow)
        outputDf = dfUnique.query(query)
        print(query)
        actual = outputDf.head(1).values.tolist()[0][errorIndex]

        if actual == predictedValue:
            correct += 1
        else:
            inCorrect += 1
            incorrectPredictions.append((actual, detectedError, predictedValue, errorIndex))

{'mistakeDetected': 'al_scip-inf-6:0', 'predictedValue': 'yes'}
ProviderNumber=="10021" and  HospitalName=="dale medical center" and  Address1=="126 hospital ave" and  City=="ozark" and  State=="al" and  ZipCode=="36360" and  CountyName=="dale" and  PhoneNumber=="3347742601" and  HospitalType=="acute care hospitals" and  HospitalOwner=="government - hospital district or authority" and  EmergencyService=="yxs" and  Condition=="surgical infection prevention" and  MeasureCode=="scip-inf-6" and  MeasureName=="surgery patients needing hair removed from the surgical area before surgery who had hair removed using a safer method (electric clippers or hair removal cream �c not a razor)" and  Score=="100%" and  Sample=="94 patients"


IndexError: list index out of range

## Results

In [None]:
correct

In [None]:
inCorrect

In [None]:
incorrectPredictions

In [None]:
accuracy = correct / (correct + inCorrect) * 1.0

In [None]:
accuracy

In [None]:
errorAnalysis = defaultdict(int)
for actual, detectedError, predictedValue, errorIndex  in incorrectPredictions:
    errorAnalysis[errorIndex] += 1

In [None]:
plt.bar(range(len(errorAnalysis)), list(errorAnalysis.values()), align='center')
plt.xticks(range(len(errorAnalysis)), list(errorAnalysis.keys()))
plt.show()

In [None]:
inverseMapping