In [1]:
package_jar = '../target/spark-data-repair-plugin_2.12_spark3.1_0.1.0-EXPERIMENTAL-with-dependencies.jar'

In [2]:
import numpy as np
import pandas as pd
from pyspark.sql import *
from pyspark.sql.types import *
from pyspark.sql import functions as f

spark = SparkSession.builder \
    .config('spark.jars', package_jar) \
    .config('spark.deriver.memory', '8g') \
    .enableHiveSupport() \
    .getOrCreate()

# Suppresses user warinig messages in Python
import warnings
warnings.simplefilter("ignore", UserWarning)

# Suppresses `WARN` messages in JVM
spark.sparkContext.setLogLevel("ERROR")

In [3]:
from repair.api import Scavenger
Scavenger().version()

'0.1.0-spark3.1-EXPERIMENTAL'

In [4]:
spark.read.option("header", True).csv("../testdata/adult.csv").createOrReplaceTempView("adult")
spark.table('adult').printSchema()

root
 |-- tid: string (nullable = true)
 |-- Age: string (nullable = true)
 |-- Education: string (nullable = true)
 |-- Occupation: string (nullable = true)
 |-- Relationship: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Country: string (nullable = true)
 |-- Income: string (nullable = true)



In [6]:
import altair as alt

charts = []
pdf = spark.table('adult').toPandas()
cols = ['Age', 'Education', 'Occupation', 'Relationship', 'Sex', 'Country', 'Income']

for c in cols:
    charts.append(alt.Chart(pdf).mark_bar().encode(x=alt.X(c), y=alt.Y('count()', axis=alt.Axis(title='freq'))).properties(width=300, height=300))

alt.hconcat(*charts)

In [9]:
from repair.detectors import NullErrorDetector, ConstraintErrorDetector
error_detectors = [ 
    ConstraintErrorDetector(constraint_path="../testdata/adult_constraints.txt"),
    NullErrorDetector()
]

from repair.model import RepairModel
model = RepairModel().setTableName('adult').setRowId('tid').setDiscreteThreshold(100) 
noisy_cells_df, noisy_columns = model.setErrorDetectors(error_detectors)._detect_errors('adult', 8, 20)

In [11]:
import altair as alt

pdf = noisy_cells_df.toPandas()
alt.Chart(pdf).mark_bar().encode(x=alt.X('attribute'), y=alt.Y('count()', axis=alt.Axis(title='freq'))).properties(width=400, height=400)

In [12]:
discretized_table, discretized_columns, distinct_stats = model._discretize_attrs('adult')
discretized_columns

['Age', 'Education', 'Occupation', 'Relationship', 'Sex', 'Country', 'Income']

In [14]:
target_columns = list(filter(lambda c: c in discretized_columns, noisy_columns))
target_columns

['Income', 'Age', 'Relationship', 'Sex']

In [15]:
cell_domain, pairwise_stats = model._analyze_error_cell_domain(noisy_cells_df, discretized_table, [], target_columns, discretized_columns, 20)

In [16]:
import altair as alt

charts = []

for target, cols in pairwise_stats.items():
    pdf = pd.DataFrame(cols, columns=[target, 'cor'])
    pdf['cor'] = pdf['cor'].astype('float')
    charts.append(alt.Chart(pdf).mark_bar().encode(x=alt.X(target), y=alt.Y('cor')).properties(width=200, height=200))
    
alt.hconcat(*charts)

In [17]:
error_cells_df, weak_labeled_cells_df_opt = model._extract_error_cells(noisy_cells_df, cell_domain, 20, 8)

In [20]:
repair_base_df = model._prepare_repair_base_cells('adult', noisy_cells_df, target_columns, 20, 8)
repair_base_df = model._repair_attrs(weak_labeled_cells_df_opt, repair_base_df)

In [22]:
import altair as alt

charts = []
pdf = repair_base_df.toPandas()
cols = ['Age', 'Education', 'Occupation', 'Relationship', 'Sex', 'Country', 'Income']

for c in cols:
    charts.append(alt.Chart(pdf).mark_bar().encode(x=alt.X(c), y=alt.Y('count()', axis=alt.Axis(title='freq'))).properties(width=300, height=300))

alt.hconcat(*charts)

In [97]:
target = 'Sex'

In [112]:
pdf = repair_base_df.toPandas()
# pdf = pdf[pdf[target].notna()]
pdf = pdf.dropna()
X = pdf.drop(['tid', target], axis=1).reset_index(drop=True)
y = pdf[target].reset_index(drop=True)

In [113]:
import category_encoders as ce
se = ce.OrdinalEncoder(handle_unknown='impute')
X = se.fit_transform(X)

In [114]:
X.describe()

Unnamed: 0,Age,Education,Occupation,Relationship,Country,Income
count,13.0,13.0,13.0,13.0,13.0,13.0
mean,1.846154,2.769231,3.538462,1.923077,1.076923,1.076923
std,1.143544,1.589227,1.853617,1.037749,0.27735,0.27735
min,1.0,1.0,1.0,1.0,1.0,1.0
25%,1.0,1.0,2.0,1.0,1.0,1.0
50%,1.0,3.0,3.0,2.0,1.0,1.0
75%,2.0,3.0,5.0,3.0,1.0,1.0
max,4.0,6.0,7.0,4.0,2.0,2.0


In [115]:
from sklearn import preprocessing

mm = preprocessing.MinMaxScaler()
norm_data = mm.fit_transform(X)
X = pd.DataFrame(norm_data, columns=X.columns)

In [116]:
pdf = pd.concat([X, y], axis=1)

In [117]:
import altair as alt

alt.Chart(pdf).mark_circle().encode(
    alt.X(alt.repeat("column"), type='quantitative'),
    alt.Y(alt.repeat("row"), type='quantitative'),
    color=f'{target}:N'
).properties(width=200, height=200).repeat(row=X.columns.tolist(), column=X.columns.tolist())

In [118]:
import altair as alt
from sklearn.decomposition import PCA

pca = PCA(n_components=2)
_X = pca.fit_transform(X)
_X = pd.DataFrame({'PCA-X': _X[:, 0], 'PCA-Y': _X[:, 1], target: y})
alt.Chart(_X).mark_point().encode(x='PCA-X', y='PCA-Y', color=f'{target}:N').properties(width=600, height=400).interactive()

In [122]:
# One of non-linear embedding in sklearn
from sklearn.manifold import TSNE

tsne = TSNE(n_components=2, random_state=0)
_X = tsne.fit_transform(X)
tsne.kl_divergence_

0.3068719804286957

In [123]:
import altair as alt

_X = pd.DataFrame({'tSNE-X': _X[:, 0], 'tSNE-Y': _X[:, 1], target: y})
alt.Chart(_X).mark_point().encode(x='tSNE-X', y='tSNE-Y', color=f'{target}:N').properties(width=600, height=400).interactive()

In [135]:
from sklearn.ensemble import RandomForestRegressor
from boruta import BorutaPy

_y = y.replace(dict(map(lambda v: (v[1], v[0]), enumerate(y.unique()))))

rf = RandomForestRegressor(n_jobs=-1, max_depth=5)
rf.fit(X, _y)
print('SCORE with ALL Features: %1.2f\n' % rf.score(X, _y))

rf = RandomForestRegressor(n_jobs=-1, max_depth=5)
fs = BorutaPy(rf, n_estimators='auto', random_state=0)
fs.fit(X.values, _y.values)

selected = fs.support_
print('Selected Features: %s' % ','.join(X.columns[selected]))

X_selected = X[X.columns[selected]]
rf = RandomForestRegressor(n_jobs=-1, max_depth=5)
rf.fit(X_selected, _y)
print('SCORE with selected Features: %1.2f' % rf.score(X_selected, _y))

SCORE with ALL Features: 0.82

[False False False False False False]
Selected Features: 


ValueError: at least one array or dtype is required