In [1]:
# Relative path to the repair plugin assembly JAR (built with its dependencies).
# The session setup cell passes this value to Spark through the `spark.jars` setting.
package_jar = '../target/spark-data-repair-plugin_2.12_spark3.2_0.1.0-EXPERIMENTAL-with-dependencies.jar'

In [2]:
import numpy as np
import pandas as pd
from pyspark.sql import *
from pyspark.sql.types import *
from pyspark.sql import functions as f

# Build (or reuse) a Hive-enabled Spark session with the repair plugin JAR attached.
# The memory key must be spelled `spark.driver.memory`: Spark does not reject unknown
# `spark.*` keys, so a misspelled key leaves the driver at its default heap size.
# NOTE(review): the driver-memory setting can only take effect when this call is the
# one that launches the JVM (i.e. on a fresh kernel) — confirm in the Spark UI.
spark = SparkSession.builder \
    .config('spark.jars', package_jar) \
    .config('spark.driver.memory', '8g') \
    .enableHiveSupport() \
    .getOrCreate()

# Suppresses user warning messages in Python
import warnings
warnings.simplefilter("ignore", UserWarning)

# Suppresses `WARN` messages in JVM
spark.sparkContext.setLogLevel("ERROR")

NOTE: SPARK_PREPEND_CLASSES is set, placing locally compiled Spark classes ahead of assembly.
21/11/22 23:56:26 WARN Utils: Your hostname, maropus-MacBook-Pro.local resolves to a loopback address: 127.0.0.1; using 192.168.3.4 instead (on interface en0)
21/11/22 23:56:26 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
21/11/22 23:56:27 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
21/11/22 23:56:28 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [3]:
from repair.api import Delphi

# Instantiate the plugin entry point, then show its version string as the cell result.
delphi = Delphi()
delphi.version()

'0.1.0-spark3.2-EXPERIMENTAL'

In [4]:
# Read the Adult sample CSV (its first line is a header row) and register it
# as the temporary view `adult` so the following cells can refer to it by name.
adult_df = spark.read.option("header", True).csv("../testdata/adult.csv")
adult_df.createOrReplaceTempView("adult")

# Print the schema of the registered view.
spark.table('adult').printSchema()

root
 |-- tid: string (nullable = true)
 |-- Age: string (nullable = true)
 |-- Education: string (nullable = true)
 |-- Occupation: string (nullable = true)
 |-- Relationship: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Country: string (nullable = true)
 |-- Income: string (nullable = true)



In [5]:
import altair as alt

# Collect the whole table into pandas so Altair can plot it.
pdf = spark.table('adult').toPandas()

# One frequency bar chart per column, skipping the `tid` column.
charts = [
    alt.Chart(pdf)
    .mark_bar()
    .encode(x=alt.X(col), y=alt.Y('count()', axis=alt.Axis(title='freq')))
    .properties(width=300, height=300)
    for col in pdf.columns
    if col != 'tid'
]

# Lay the charts out side by side as the cell result.
alt.hconcat(*charts)

In [9]:
from repair.detectors import NullErrorDetector, ConstraintErrorDetector
from repair.model import RepairModel

# Detectors applied to the table: constraint checks loaded from a file, plus a NULL check.
constraint_detector = ConstraintErrorDetector(constraint_path="../testdata/adult_constraints.txt")
null_detector = NullErrorDetector()
error_detectors = [constraint_detector, null_detector]

# Configure the model for the `adult` view, with `tid` as the row identifier.
model = RepairModel().setTableName('adult').setRowId('tid')

# Run detection; the call returns the noisy cells and the names of the columns that contain them.
detection_result = model.setErrorDetectors(error_detectors)._detect_errors('adult')
noisy_cells_df, noisy_columns = detection_result

                                                                                

In [10]:
import altair as alt

# Bring the detected noisy cells to pandas and count them per attribute.
pdf = noisy_cells_df.toPandas()
freq_axis = alt.Axis(title='freq')
noisy_chart = alt.Chart(pdf).mark_bar().encode(
    x=alt.X('attribute'),
    y=alt.Y('count()', axis=freq_axis),
)
noisy_chart.properties(width=400, height=400)

In [11]:
# Discretize the attributes of `adult`; the call returns three values, unpacked below.
discretize_result = model._discretize_attrs('adult')
discretized_table, discretized_columns, distinct_stats = discretize_result

# Show which columns were discretized.
discretized_columns

['Age', 'Education', 'Occupation', 'Relationship', 'Sex', 'Country', 'Income']

In [12]:
# Keep the noisy columns that are also discretized, preserving the order of `noisy_columns`.
target_columns = [col for col in noisy_columns if col in discretized_columns]
target_columns

['Income', 'Age', 'Relationship', 'Sex']

In [17]:
import altair as alt

charts = []

for target, cols in pairwise_stats.items():
    pdf = pd.DataFrame(cols, columns=[target, 'cor'])
    pdf['cor'] = pdf['cor'].astype('float')
    charts.append(alt.Chart(pdf).mark_bar().encode(x=alt.X(target), y=alt.Y('cor')).properties(width=200, height=200))
    
alt.hconcat(*charts)

In [24]:
error_cells_df, pairwise_stats = model._extract_error_cells('adult', noisy_cells_df, discretized_table, discretized_columns, [], target_columns)

                                                                                

In [26]:
# Build the base table for repair from the `adult` view, the extracted error cells and
# the target columns.
# NOTE(review): `_prepare_repair_base_cells` is a private RepairModel method; presumably
# it blanks out the error cells of the target columns — confirm against the implementation.
repair_base_df = model._prepare_repair_base_cells('adult', error_cells_df, target_columns)

                                                                                

In [27]:
import altair as alt

# Collect the repair base table into pandas so Altair can plot it.
pdf = repair_base_df.toPandas()

# One frequency bar chart per column, skipping the `tid` column.
charts = [
    alt.Chart(pdf)
    .mark_bar()
    .encode(x=alt.X(col), y=alt.Y('count()', axis=alt.Axis(title='freq')))
    .properties(width=300, height=300)
    for col in pdf.columns
    if col != 'tid'
]

# Lay the charts out side by side as the cell result.
alt.hconcat(*charts)

In [28]:
# Attribute analysed in the following cells: it is removed from the feature frame `X`
# and used as the label series `y`.
target = 'Sex'

In [29]:
# Work only with complete rows of the repair base table.
pdf = repair_base_df.toPandas().dropna()

# Label: the target attribute. Features: everything except the row id and the target.
# Both get a fresh 0..n-1 index so they stay aligned after the rows were dropped.
y = pdf[target].reset_index(drop=True)
X = pdf.drop(columns=['tid', target]).reset_index(drop=True)

In [30]:
import category_encoders as ce
# Replace the categorical values in `X` with integer codes.
# NOTE(review): `handle_unknown='impute'` is the category_encoders 1.x spelling; the 2.x
# documentation lists 'error', 'return_nan' and 'value' — confirm against the installed version.
se = ce.OrdinalEncoder(handle_unknown='impute')
# `X` is rebound to the encoded frame, so the original string values are no longer in `X`.
X = se.fit_transform(X)
# Display the encoded frame as the cell result.
X

Unnamed: 0,Age,Education,Occupation,Relationship,Country,Income
0,1,1,1,1,1,1
1,2,1,2,2,1,1
2,1,2,3,1,1,1
3,1,3,4,3,1,1
4,3,1,5,2,1,1
5,2,3,6,1,1,1
6,2,4,4,1,1,1
7,4,3,1,3,1,1
8,1,5,2,4,1,1
9,4,1,3,2,1,1


In [31]:
import altair as alt

# Put the encoded features and the label back into one frame for plotting.
pdf = pd.concat([X, y], axis=1)
feature_names = X.columns.tolist()

# Scatter-plot matrix: every feature against every other one, coloured by the target label.
x_channel = alt.X(alt.repeat("column"), type='quantitative')
y_channel = alt.Y(alt.repeat("row"), type='quantitative')
scatter = alt.Chart(pdf).mark_circle().encode(
    x_channel,
    y_channel,
    color=f'{target}:N'
).properties(width=200, height=200)

scatter.repeat(row=feature_names, column=feature_names)

In [32]:
# t-SNE is one of the non-linear embedding methods provided by scikit-learn.
from sklearn.manifold import TSNE

# Project the encoded features onto two dimensions; the seed is fixed for repeatable output.
tsne = TSNE(n_components=2, random_state=0)
_X = tsne.fit_transform(X)
# Show the KL divergence reported by the fitted estimator as the cell result.
tsne.kl_divergence_

0.3068719804286957

In [33]:
import altair as alt

# Build the plotting frame under its own name instead of rebinding `_X`: `_X` stays the
# raw 2-D embedding array, so this cell can be re-run without `_X[:, 0]` failing on a
# DataFrame.
tsne_df = pd.DataFrame({'tSNE-X': _X[:, 0], 'tSNE-Y': _X[:, 1], target: y})

# Interactive scatter of the embedding, coloured by the target label.
alt.Chart(tsne_df).mark_point().encode(x='tSNE-X', y='tSNE-Y', color=f'{target}:N').properties(width=600, height=400).interactive()

In [None]:
from sklearn.ensemble import RandomForestClassifier
from boruta import BorutaPy

# Fixed seed so the forests, and therefore the printed scores, are the same on every run.
SEED = 0


def _new_forest():
    """Return a fresh, unfitted random forest with the settings shared by every step below."""
    return RandomForestClassifier(n_jobs=-1, max_depth=5, random_state=SEED)


# Baseline: fit on all features.
# NOTE: `score` is evaluated on the same data the model was fitted on, so the numbers
# printed in this cell are training accuracy, not a held-out estimate.
rf = _new_forest()
rf.fit(X, y)
print('SCORE with ALL Features: %1.2f\n' % rf.score(X, y))

# Boruta feature selection driven by a random forest; it is fitted on plain arrays.
rf = _new_forest()
fs = BorutaPy(rf, n_estimators='auto', random_state=SEED)
fs.fit(X.values, y.values)

# `support_` is used as a mask over the columns of `X`.
selected = fs.support_
print('Selected Features: %s' % ','.join(X.columns[selected]))

# Refit on the selected features only and report the (training) score again.
X_selected = X[X.columns[selected]]
rf = _new_forest()
rf.fit(X_selected, y)
print('SCORE with selected Features: %1.2f' % rf.score(X_selected, y))