In [1]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import os
import calendar
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
from keras.models import load_model
import h5py
import pickle
import gensim
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn import metrics
from gensim.models import KeyedVectors
from gensim.models import FastText
from gensim.models.wrappers import FastText
from gensim.models import Word2Vec
from keras.models import load_model
from enum import Enum
import itertools
from collections import defaultdict
import matplotlib.pyplot as plt

from pylab import rcParams
rcParams['figure.figsize'] = 20, 7
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

Using TensorFlow backend.


In [2]:
df = pd.read_csv('AdultErrors.csv', encoding='utf8', dtype=object)

In [3]:
df.head(2)

Unnamed: 0,age,workclass,education,educationnum,maritalstatus,occupation,relationship,race,sex,hoursperweek,country,income
0,47,private,doctorate,16,married-civ-spouse,prof-specialty,husband,white,male,60,,>50k:0
1,27,private,hs-grad,9,married-civ-spouse,craft-repair,husband,white,mxle,40,united-states,<=50k:0


In [4]:
len(df)
sum(df.isnull().values.ravel())

3766

In [5]:
dfEmpty = df[df.isnull().any(axis=1)]

In [6]:
dfEmpty = dfEmpty.apply(lambda x: x.astype(str).str.lower())

In [7]:
df_Nan = df
df_Nan = df_Nan[~df_Nan.isin(dfEmpty)].dropna()
df_Nan.to_csv("AdultErrorsWithoutNan.csv", index=False)

In [8]:
path = "AdultWord2Vec.w2v"
word2vecModel = KeyedVectors.load(path)

In [9]:
mapping = dict(enumerate(dfEmpty.columns.astype('category').categories))

In [10]:
attributeMapping = {}
for i,idx in enumerate(dfEmpty.columns):
    attributeMapping[idx] = i

In [11]:
inverseMapping = {y:x for x,y in attributeMapping.items()}

## Load Attribute Classifier

In [12]:
model = load_model('AdultMultiAttributeClassifier.h5')

In [13]:
with open('AdultMultiAttributeClassifierTokenizer.pickle', 'rb') as handle:
    tokenizer = pickle.load(handle)

## Repair pipeline

In [14]:
def impute(word2vecModel, model, tokenizer, rowWithMissingValue, missingType, topN=10):
    """
    Returns the closest match for the missing attribute value
    """
    output = dict()
    for value in rowWithMissingValue:
        try:
            results = word2vecModel.wv.most_similar(value, topn=topN)
            for match, confidence in results:
                # Predicted type to be equal to the missing value
                if predictAttribute(model, tokenizer, match) == missingType:
                    if match in output and confidence <= output[match]:
                        continue
                    else:
                        output[match] = confidence            
        except KeyError:
            continue
    return output.items()

In [15]:
def predictAttribute(model, tokenizer, value):
    """
    Classifies value parameter as its corresponding attribute
    """
    value = [value]
    sequences = tokenizer.texts_to_sequences(value)
    testData = pad_sequences(sequences, maxlen=26)
    predictions = model.predict(testData)
    return attributeMapping[mapping[np.argmax(predictions[0])]]

In [16]:
nullRows = dfEmpty.values.tolist()
rows = []
#columnNames = ['ProviderNumber', 'HospitalName', 'Address1', 'City', 'ZipCode', 'CountyName', 'PhoneNumber','HospitalOwner','Condition','Sample']
columnNames = dfEmpty.columns.tolist()
for row in nullRows:
    missingAttribute = None
    removedNan = []
    for i in range(len(row)):
        if str(row[i]) != 'nan':
            removedNan.append(row[i])
        else:
            missingAttribute = attributeMapping[columnNames[i]]
    if missingAttribute is not None:
        rows.append({tuple(removedNan):missingAttribute})
    else:
        print(row)

## Imputation

In [17]:
def getImputedValue(missingRow, attributeType):
    results = impute(word2vecModel, model, tokenizer, missingRow, attributeType, 100)
    return sorted(results, key=lambda x: x[1], reverse=True)[0] if results else None

## Build verification pipeline

In [45]:
dfClean = pd.read_csv('clean_adult_dataset_hc.csv', encoding='utf8', dtype='object')

In [46]:
dfUnique = dfClean.drop_duplicates()

In [47]:
queryMapping = {}
attributeNames = dfClean.columns.tolist()
for i in attributeNames:
    for j in attributeNames:
        if i!=j:
            if i in queryMapping:
                queryMapping[i] = queryMapping.get(i) + ' and  ' + str(j + '=="{}"')
            else:
                queryMapping[i] = str(j + '=="{}"') 

In [48]:
correct = 0
inCorrect = 0
incorrectPredictions = []

In [None]:
count = 0
for row in rows:
    missingRow = list(row.keys())[0]
    attribute = list(row.values())[0]
    missingRow = [i.strip() for i in missingRow]
    # Run the query
    query = (queryMapping[inverseMapping[attribute]]).format(*missingRow)
    outputDf = dfClean.query(query)
    actual = outputDf.head(1)[inverseMapping[attribute]].to_string(index=False)
    if actual == 'empty':
        continue
    predicted = getImputedValue(missingRow, attribute) 
    if predicted and actual == predicted[0]:
        correct += 1
    else:
        inCorrect += 1
        incorrectPredictions.append((actual, predicted[0] if predicted else None)) 

## Read data
Details here: https://fasttext.cc/

## For truth validation read clean dataset

In [None]:
df_truth = pd.read_csv('clean_adult_dataset_hc.csv',dtype=object, encoding='utf8', index_col=False)

## Read dirty dataset from imputation

In [None]:
df_dirty = pd.read_csv('AdultErrorsWithoutNan.csv',dtype=object, encoding='utf8', index_col=False)

In [None]:
df_truth.head(2)

In [None]:
df_dirty.head(2)

## Preprocessing

In [None]:
combined_hosp = df_truth.values.tolist()
combined_dirty = df_dirty.values.tolist()

In [None]:
columns = df_truth.columns.values

In [None]:
uniqueValues = {}
for i in df_truth.columns:
    uniqueValues[i] = set(df_truth[i].tolist())  

In [None]:
columns

## Model

In [None]:
path = 'AdultFastText.w2v'
fastTextModel = KeyedVectors.load(path)

In [None]:
model = load_model('AdultMultiAttributeClassifier.h5')
with open('AdultMultiAttributeClassifierTokenizer.pickle', 'rb') as handle:
    tokenizer = pickle.load(handle)

In [None]:
mapping = dict(enumerate(sorted(df_truth.columns.values)))

In [None]:
mapping

In [None]:
attributeMapping = {}
for i,idx in enumerate(df_truth.columns):
    attributeMapping[i] = idx

## Correction Pipeline

In [None]:
def predictAttribute(model, tokenizer, value):
    """
    Classifies value parameter as its corresponding attribute
    """
    value = [value]
    sequences = tokenizer.texts_to_sequences(value)
    testData = pad_sequences(sequences, maxlen=26)
    predictions = model.predict(testData)
    return mapping[np.argmax(predictions[0])]

In [None]:
def correctCell(fastTextModel, model, tokenizer, row, topN=10):
    cellValues = {}
    output = dict()
    isMistake = False
    for cellIndex in range(len(row)):
        currentCellValue = row[cellIndex]
        if not currentCellValue in uniqueValues[attributeMapping[cellIndex]]:
            isMistake = True
            cellValues['mistakeDetected'] = currentCellValue
            try:
                predictions = fastTextModel.most_similar(currentCellValue, topn=topN)
                for match, confidence in predictions:
                # Predicted type to be equal to the missing value
                    if predictAttribute(model, tokenizer, match) == columns[cellIndex]:
                        if match in output and confidence <= output[match]:
                            continue
                        else:
                            output[match] = confidence
            except:
                possibleValues = uniqueValues[attributeMapping[cellIndex]]
                maxScore = 0.0
                for i in possibleValues:
                    str1 = set(currentCellValue)
                    str2 = set(i)
                    score = float(len(str1 & str2)) / len(str1 | str2)
                    if maxScore < score:
                        maxScore = score
                        output[i] = score
    
    if isMistake:
        results = output.items()
        predictedValue = sorted(results, key=lambda x: x[1], reverse=True)[0] if results else None
        cellValues['predictedValue'] = predictedValue[0] if predictedValue else None
        return cellValues
    else:
        return None

## Verification pipeline

In [None]:
dfUnique = df_truth.drop_duplicates()

In [None]:
len(dfUnique)

In [None]:
queryMapping = {}
attributeNames = df_truth.columns.tolist()
for i in attributeNames:
    for j in attributeNames:
        if i!=j:
            if i in queryMapping:
                queryMapping[i] = queryMapping.get(i) + ' and  ' + str(j + '=="{}"')
            else:
                queryMapping[i] = str(j + '=="{}"') 

In [None]:
correct = 0
inCorrect = 0
incorrectPredictions = []

In [None]:
for row in combined_dirty:
    output = correctCell(fastTextModel, model, tokenizer, row, 15)
    if output:
        detectedError = output['mistakeDetected']
        predictedValue = output['predictedValue']
    
        tempRow = row[:]
        errorIndex = row.index(detectedError)
        tempRow.remove(detectedError)

        query = (queryMapping[columns[errorIndex]]).format(*tempRow)
        outputDf = dfUnique.query(query)
        actual = outputDf.head(1).values.tolist()[0][errorIndex]
        
        if actual == predictedValue:
            correct += 1
        else:
            inCorrect += 1
            incorrectPredictions.append((actual, detectedError, predictedValue, errorIndex))

## Results

In [None]:
incorrectPredictions

In [None]:
precision = correct / (correct + inCorrect) * 1.0

In [None]:
precision

In [None]:
errorAnalysis = defaultdict(int)
for actual, detectedError, predictedValue, errorIndex  in incorrectPredictions:
    errorAnalysis[errorIndex] += 1

In [None]:
plt.bar(range(len(errorAnalysis)), list(errorAnalysis.values()), align='center')
plt.xticks(range(len(errorAnalysis)), list(errorAnalysis.keys()))
plt.show()

In [None]:
inverseMapping