In [1]:
# Relative path of the plugin jar; it is handed to Spark through the `spark.jars` setting
# when the session is built.
package_jar = (
    '../target/spark-data-repair-plugin_2.12_spark3.1_0.1.0-EXPERIMENTAL-with-dependencies.jar'
)

In [2]:
import numpy as np
import pandas as pd
from pyspark.sql import *
from pyspark.sql.types import *
from pyspark.sql import functions as f

# Build (or reuse) a Hive-enabled Spark session with the repair plugin jar attached.
# The driver heap is requested via `spark.driver.memory`; the key has to be spelled
# exactly like this, because the builder accepts any key and a misspelled one would
# simply leave the driver memory at its default.
spark = (
    SparkSession.builder
    .config('spark.jars', package_jar)
    .config('spark.driver.memory', '8g')
    .enableHiveSupport()
    .getOrCreate()
)

import warnings

# Suppress user warning messages raised on the Python side
warnings.simplefilter(action="ignore", category=UserWarning)

# Suppress `WARN` messages on the JVM side by raising the log level to `ERROR`
spark.sparkContext.setLogLevel("ERROR")

In [3]:
from repair.api import Scavenger

# Show the version string reported by the repair API as the cell output.
scavenger = Scavenger()
scavenger.version()

'0.1.0-spark3.1-EXPERIMENTAL'

In [4]:
# Read the CSV (first line is the header) and expose it as the temp view `hospital`.
hospital_reader = spark.read.option("header", True)
hospital_df = hospital_reader.csv("../testdata/hospital.csv")
hospital_df.createOrReplaceTempView("hospital")

# Print the schema of the registered view.
spark.table('hospital').printSchema()

root
 |-- tid: string (nullable = true)
 |-- ProviderNumber: string (nullable = true)
 |-- HospitalName: string (nullable = true)
 |-- Address1: string (nullable = true)
 |-- Address2: string (nullable = true)
 |-- Address3: string (nullable = true)
 |-- City: string (nullable = true)
 |-- State: string (nullable = true)
 |-- ZipCode: string (nullable = true)
 |-- CountyName: string (nullable = true)
 |-- PhoneNumber: string (nullable = true)
 |-- HospitalType: string (nullable = true)
 |-- HospitalOwner: string (nullable = true)
 |-- EmergencyService: string (nullable = true)
 |-- Condition: string (nullable = true)
 |-- MeasureCode: string (nullable = true)
 |-- MeasureName: string (nullable = true)
 |-- Score: string (nullable = true)
 |-- Sample: string (nullable = true)
 |-- Stateavg: string (nullable = true)



In [5]:
import altair as alt

# Pull the `hospital` view into pandas so that Altair can plot it.
pdf = spark.table('hospital').toPandas()
cols = ['ProviderNumber', 'HospitalName', 'Address1', 'City', 'State', 'ZipCode', 'CountyName', 'PhoneNumber', 'HospitalType', 'HospitalOwner', 'EmergencyService', 'Condition', 'MeasureCode', 'MeasureName', 'Score', 'Sample', 'Stateavg']

# One bar chart per column: distinct values on the x axis, row counts ("freq") on the y axis.
charts = [
    alt.Chart(pdf)
    .mark_bar()
    .encode(x=alt.X(c), y=alt.Y('count()', axis=alt.Axis(title='freq')))
    .properties(width=300, height=300)
    for c in cols
]

# Lay all charts out side by side as the cell output.
alt.hconcat(*charts)

In [6]:
# Read the CSV (first line is the header) and expose it as the temp view `hospital_error_cells`.
error_cells_reader = spark.read.option("header", True)
hospital_error_cells_df = error_cells_reader.csv("../bin/testdata/hospital_error_cells.csv")
hospital_error_cells_df.createOrReplaceTempView("hospital_error_cells")

# Print the schema of the registered view.
spark.table('hospital_error_cells').printSchema()

root
 |-- tid: string (nullable = true)
 |-- attribute: string (nullable = true)
 |-- correct_val: string (nullable = true)



In [7]:
from repair.detectors import NullErrorDetector, ConstraintErrorDetector

# Error detectors handed to the repair model below.
constraint_detector = ConstraintErrorDetector(constraint_path="../testdata/hospital_constraints.txt")
null_detector = NullErrorDetector()
error_detectors = [constraint_detector, null_detector]

from repair.model import RepairModel

# Model configured for the `hospital` view, using `tid` as the row id.
model = (
    RepairModel()
    .setTableName('hospital')
    .setRowId('tid')
    .setDiscreteThreshold(100)
)

# NOTE(review): the meaning of the positional arguments 20 and 1000 is defined by
# `_detect_errors` and is not visible here -- confirm against the library.
detecting_model = model.setErrorDetectors(error_detectors)
noisy_cells_df, noisy_columns = detecting_model._detect_errors('hospital', 20, 1000)

In [8]:
import altair as alt

pdf = noisy_cells_df.toPandas()

# Plot at most 3000 detected cells (presumably to keep the chart data small enough for
# Altair -- confirm). `DataFrame.sample` raises a ValueError when `n` exceeds the number
# of rows, so the sample size is capped at the frame length; the fixed seed makes the
# chart reproducible across runs.
sampled_pdf = pdf.sample(n=min(len(pdf), 3000), random_state=42)

# Number of noisy cells ("freq") per attribute.
alt.Chart(sampled_pdf).mark_bar().encode(
    x=alt.X('attribute'),
    y=alt.Y('count()', axis=alt.Axis(title='freq')),
).properties(width=400, height=400)

In [9]:
# Discretize the `hospital` table; the call yields three values that are unpacked below.
discretize_result = model._discretize_attrs('hospital')
discretized_table, discretized_columns, distinct_stats = discretize_result

# Show the discretized column names as the cell output.
discretized_columns

['ProviderNumber',
 'HospitalName',
 'Address1',
 'City',
 'State',
 'ZipCode',
 'CountyName',
 'PhoneNumber',
 'HospitalType',
 'HospitalOwner',
 'EmergencyService',
 'Condition',
 'MeasureCode',
 'MeasureName',
 'Score',
 'Stateavg']

In [10]:
# Noisy columns that are also discretized, kept in the order of `noisy_columns`.
target_columns = [c for c in noisy_columns if c in discretized_columns]
target_columns

In [11]:
# NOTE(review): the empty list and the trailing 1000 are passed positionally; their meaning
# is defined by `_analyze_error_cell_domain` and is not visible here -- confirm.
domain_analysis = model._analyze_error_cell_domain(
    noisy_cells_df,
    discretized_table,
    [],
    target_columns,
    discretized_columns,
    1000,
)
cell_domain, pairwise_stats = domain_analysis

In [12]:
import altair as alt

charts = []

# One bar chart per target attribute, plotting the `cor` value of each paired entry.
for target, cols in pairwise_stats.items():
    # Two-column frame: the paired entries under the target's name, and their `cor` as float.
    pdf = pd.DataFrame(cols, columns=[target, 'cor'])
    pdf['cor'] = pdf['cor'].astype('float')

    bars = alt.Chart(pdf).mark_bar()
    encoded = bars.encode(x=alt.X(target), y=alt.Y('cor'))
    charts.append(encoded.properties(width=200, height=200))

# Lay all charts out side by side as the cell output.
alt.hconcat(*charts)

In [13]:
# NOTE(review): 1000 and 20 are positional tuning values whose meaning is defined by
# `_extract_error_cells` and is not visible here -- confirm.
extracted_cells = model._extract_error_cells(noisy_cells_df, cell_domain, 1000, 20)
error_cells_df, weak_labeled_cells_df_opt = extracted_cells

In [None]:
# Step 1: prepare the base cells for the target columns of `hospital`.
# NOTE(review): 1000 and 20 are positional values whose meaning is not visible here -- confirm.
repair_base_cells_df = model._prepare_repair_base_cells(
    'hospital', noisy_cells_df, target_columns, 1000, 20)

# Step 2: apply the weakly-labeled cells to the prepared base; the result is what the
# following cell plots.
repair_base_df = model._repair_attrs(weak_labeled_cells_df_opt, repair_base_cells_df)

In [None]:
import altair as alt

# Pull the repaired base table into pandas so that Altair can plot it.
pdf = repair_base_df.toPandas()
cols = ['ProviderNumber', 'HospitalName', 'Address1', 'City', 'State', 'ZipCode', 'CountyName', 'PhoneNumber', 'HospitalType', 'HospitalOwner', 'EmergencyService', 'Condition', 'MeasureCode', 'MeasureName', 'Score', 'Sample', 'Stateavg']

# One bar chart per column: distinct values on the x axis, row counts ("freq") on the y axis.
charts = [
    alt.Chart(pdf)
    .mark_bar()
    .encode(x=alt.X(c), y=alt.Y('count()', axis=alt.Axis(title='freq')))
    .properties(width=300, height=300)
    for c in cols
]

# Lay all charts out side by side as the cell output.
alt.hconcat(*charts)