In [None]:
%autoreload 2

In [None]:
import logging

logging.basicConfig()

# Switch on DEBUG output for the loggers named after the project modules that
# are imported further down (presumably each module logs under its own name —
# confirm against the module sources).
for _logger_name in (
    "blocking",
    "sql",
    "gammas",
    "expectation_step",
    "maximisation_step",
    "comparison_evaluation",
):
    logging.getLogger(_logger_name).setLevel("DEBUG")

# Logger.setLevel() returns None, so `log` must be bound to the logger object
# itself (not to the result of setLevel) for later `log.debug(...)` calls to work.
log = logging.getLogger(__name__)
log.setLevel("DEBUG")

In [None]:
from pyspark.context import SparkContext, SparkConf
from pyspark.sql import SparkSession, Window
from pyspark.sql.types import StructType
import pyspark.sql.functions as f

# WARNING:
# These config options are appropriate only if you're running Spark locally!!!
conf = SparkConf()
conf.set('spark.driver.memory', '8g')
conf.set("spark.sql.shuffle.partitions", "8")

# getOrCreate() hands back the already-running context when there is one, so a
# single call is enough; a second call would only rebind `sc` to the same object.
sc = SparkContext.getOrCreate(conf=conf)
spark = SparkSession(sc)

In [None]:
import sys 
# Add the relative "sparklink" directory to the module search path so the
# imports below can be resolved. The path is relative, so this presumably relies
# on the notebook's working directory containing "sparklink" — TODO confirm.
sys.path.append("sparklink")
# NOTE(review): star-imports pull every public name of each module into the
# notebook namespace, so a later module can silently rebind a name from an
# earlier one (or from a previous cell).
from sql import *
from blocking import *
from gammas import *
from params import *
from expectation_step import *
from maximisation_step import *
from iterate import *
from comparison_evaluation import *


In [None]:
# Read the sample parquet file and drop rows that are exact duplicates,
# then print the first three rows as a sanity check.
df = spark.read.parquet("data/fake_100.parquet").dropDuplicates()
df.show(3)

In [None]:
# SQL join conditions between the left (l) and right (r) copies of the data.
# Each rule requires equality on a pair of columns.
blocking_rules = [
    # first_name paired with each of dob / city / email
    'l.first_name = r.first_name  and  l.dob = r.dob',
    'l.first_name = r.first_name  and  l.city = r.city',
    'l.first_name = r.first_name  and  l.email = r.email',
    # surname paired with each of dob / city / email
    'l.surname = r.surname  and  l.dob = r.dob',
    'l.surname = r.surname  and  l.city = r.city',
    'l.surname = r.surname  and  l.email = r.email',
    # dob paired with city / email
    'l.dob = r.dob  and  l.city = r.city',
    'l.dob = r.dob  and  l.email = r.email',
    # city paired with email
    'l.city = r.city  and  l.email = r.email',
]

In [None]:
# Build df_comparison, the table that every later cell works from.
# NOTE(review): the block_using_rules result is overwritten on the very next
# line, so only the cartestian_block output is ever used downstream and
# `blocking_rules` has no effect on it — confirm which of the two was intended.
df_comparison = block_using_rules(df, blocking_rules, spark=spark)
df_comparison = cartestian_block(df, spark=spark)

In [None]:

from gammas import *

# One entry per compared column, giving its number of comparison levels.
# Only first_name supplies an explicit case expression.
gamma_settings = {
    "first_name": {
        "levels": 3,
        "case_expression": gammas_case_statement_3_levels("first_name", 0),
    },
    "surname": {"levels": 3},
    "dob": {"levels": 2},
    "city": {"levels": 2},
    "email": {"levels": 2},
}

df_gammas = add_gammas(df_comparison, gamma_settings, spark, include_orig_cols=False)
params = Params(gamma_settings, starting_lambda=0.027)

# Starting pi values for levels 0, 1, 2, applied identically to gamma_0 and
# gamma_1 (presumably the two three-level columns, first_name and surname —
# confirm against Params). The last level is written as "1 minus the others"
# so the three values sum to one.
_initial_pi = (
    ("non_match", (0.57, 0.42, 1-0.57-0.42)),
    ("match", (0.25, 0.4, 1-0.25-0.4)),
)
for _gamma_col in ("gamma_0", "gamma_1"):
    for _match_status, _level_values in _initial_pi:
        for _level, _value in enumerate(_level_values):
            params.set_pi_value(_gamma_col, _level, _match_status, _value)



In [None]:
# Scratch cell that appears to walk through one parameter update by hand.
# NOTE(review): `df_e` is first assigned by the `iterate(...)` cell further down
# the notebook, so on a fresh kernel this cell raises NameError unless that
# cell has been run first.
sql = sql_gen_intermediate_pi_aggregate(params)

# Log the query before running it so it is visible even if execution fails.
# The "sql" logger is addressed by name (it is set to DEBUG in the logging
# setup cell) instead of relying on the notebook-level `log` variable.
logging.getLogger("sql").debug(format_sql(sql))

df_intermediate = spark.sql(sql)
# createOrReplaceTempView replaces registerTempTable, deprecated since Spark 2.0.
df_intermediate.createOrReplaceTempView("df_intermediate")
df_intermediate.persist()

new_lambda = get_new_lambda(df_e, spark)
pi_df_collected = get_new_pi_df(df_e, spark, params)

# Last expression: shown as the cell output.
pi_df_collected


In [None]:
# gamma_cols = "gamma_1"
# sql = f"""
# select {gamma_cols}, sum(match_probability) as expected_num_matches, sum(1- match_probability) as expected_num_non_matches, count(*) as num_rows
# from df_e
# group by {gamma_cols}

# """
# spark.sql(sql).show()

In [None]:
# spark.sql("select * from df_intermediate").show()

In [None]:

# Call iterate() on df_gammas with num_iterations=20 and keep the returned
# DataFrame as df_e; later cells read its match_probability column.
df_e = iterate(df_gammas, spark, params, num_iterations=20)

In [None]:
# Call get_real_params on the comparison and gamma tables; any return value is
# shown as the cell output. NOTE(review): the helper comes from one of the
# star-imported modules — check there for what it computes.
get_real_params(df_comparison, df_gammas, spark, params)

In [None]:
# Display the lambda iteration chart produced by params.
params.lambda_iteration_chart()

In [None]:
# Display the pi iteration chart produced by params.
params.pi_iteration_chart()

In [None]:
# NOTE(review): same call as the lambda_iteration_chart cell two cells above,
# with no change to params in between.
params.lambda_iteration_chart()

In [None]:
# NOTE(review): same call as the pi_iteration_chart cell two cells above,
# with no change to params in between.
params.pi_iteration_chart()

In [None]:
# Expose both frames to Spark SQL. createOrReplaceTempView replaces
# registerTempTable, which has been deprecated since Spark 2.0.
df_e.createOrReplaceTempView('df_e')
df_comparison.createOrReplaceTempView('df_comparison')

# Attach each pair's match_probability to the original comparison columns,
# joining on the left/right unique ids.
sql = """
select e.match_probability, c.*
from df_e as e
left join df_comparison as c
on e.unique_id_l = c.unique_id_l
and
e.unique_id_r = c.unique_id_r

"""
# The original chained an argument-less .filter(), which raises TypeError
# because a condition is required. No condition was intended, so it is dropped.
# Show up to 100 rows from a roughly 10% random sample.
spark.sql(sql).sample(0.1).show(100)

In [None]:
# Display params; the cell output is its notebook representation.
params

In [None]:
# Print the first rows of df_comparison.
df_comparison.show()

In [None]:
## Compute real values
# label is the integer cast of "group_l equals group_r": 1 when the two
# records carry the same group value, 0 when they differ.
_group_left = df_comparison["group_l"]
_group_right = df_comparison["group_r"]
df_labelled = df_comparison.withColumn(
    "label",
    (_group_left == _group_right).cast("integer"),
)
df_labelled.show()

In [None]:
# createOrReplaceTempView replaces registerTempTable, deprecated since Spark 2.0.
df_labelled.createOrReplaceTempView("df_labelled")
# Sum of the label column divided by the total row count.
sql = """
select sum(label)/count(*)
from df_labelled
"""
# Single-row, single-column result: pull the scalar out for display.
spark.sql(sql).collect()[0][0]

In [None]:
# Print the first rows of df_e.
df_e.show()

In [None]:
# Expose both frames to Spark SQL. createOrReplaceTempView replaces
# registerTempTable, which has been deprecated since Spark 2.0.
df_e.createOrReplaceTempView('df_e')
df_comparison.createOrReplaceTempView('df_comparison')

# Want match probability, gammas, label
gamma_select_expr = ", ".join(f"e.{c}" for c in params.gamma_cols)

# label is 1 when the left and right records share the same group, else 0.
sql = f"""
select {gamma_select_expr}, e.match_probability,
cast(c.group_l == c.group_r as int) as label
from df_e as e
left join df_comparison as c
on e.unique_id_l = c.unique_id_l
and
e.unique_id_r = c.unique_id_r

"""
gamma_e_label = spark.sql(sql)
gamma_e_label.show()

In [None]:
# createOrReplaceTempView replaces registerTempTable, deprecated since Spark 2.0.
gamma_e_label.createOrReplaceTempView("gamma_e_label")
# NOTE(review): gamma_avg_expr is not referenced by any visible cell.
gamma_avg_expr = ", ".join([f"avg({c})" for c in  params.gamma_cols])

# Row counts per label, used as denominators in the query below.
number_matches = gamma_e_label.filter(gamma_e_label["label"]==1).count()
number_non_matches = gamma_e_label.filter(gamma_e_label["label"]==0).count()

# NOTE(review): both ratios are computed for every (label, gamma_0) group, so
# proportion_match is only meaningful on label = 1 rows and
# proportion_non_match only on label = 0 rows.
sql = f"""

select label, gamma_0, count(gamma_0)/{number_matches}  as proportion_match, count(gamma_0)/{number_non_matches}  as proportion_non_match
from gamma_e_label
group by label, gamma_0
"""
spark.sql(sql).show()

In [None]:
# Re-write this to work in the same way as df_intermediate in the max step

# Ideally, the column names would come out the same as they do from the maximisation step.  Then we could treat this as an 'iteration' and pass it into params.


# createOrReplaceTempView replaces registerTempTable, deprecated since Spark 2.0.
gamma_e_label.createOrReplaceTempView("gamma_e_label")

# Number of rows for each (label, gamma_0) combination. The query has no
# placeholders, so a plain string is used rather than an f-string.
sql = """
select label, gamma_0, count(*) as count_records
from gamma_e_label
group by label, gamma_0
"""
intermediate = spark.sql(sql)

intermediate.createOrReplaceTempView("intermediate")

intermediate.show()

In [None]:
# For each row of the `intermediate` view, divide its count_records by the
# total count_records over all rows with the same label, giving the share of
# that row within its label.
sql = """
select *, count_records/sum(count_records) over (partition by label) as proportion
from intermediate
"""
spark.sql(sql).show()


In [None]:
# Expose both frames to Spark SQL. createOrReplaceTempView replaces
# registerTempTable, which has been deprecated since Spark 2.0.
df_e.createOrReplaceTempView('df_e')
df_comparison.createOrReplaceTempView('df_comparison')

# Want match probability, gammas, label
gamma_select_expr = ", ".join(f"e.{c}" for c in params.gamma_cols)

# Here the 0/1 "same group" indicator is emitted under the column name
# match_probability, in place of the match_probability column of df_e.
# NOTE(review): this rebinds gamma_e_label to a frame without a `label` column.
sql = f"""
select {gamma_select_expr}, 
cast(c.group_l == c.group_r as int) as match_probability
from df_e as e
left join df_comparison as c
on e.unique_id_l = c.unique_id_l
and
e.unique_id_r = c.unique_id_r

"""
gamma_e_label = spark.sql(sql)
gamma_e_label.show()

In [None]:
# Show up to 100 rows whose match_probability exceeds 0.95. When gamma_e_label
# comes from the cell directly above, match_probability is a 0/1 indicator, so
# this keeps the rows where it equals 1.
gamma_e_label.filter(gamma_e_label.match_probability>0.95).show(100)

In [None]:
# Display params; the cell output is its notebook representation.
params