In [None]:
import pandas as pd

import splink.comparison_library as cl
from splink import DuckDBAPI, block_on
from splink import Linker
from splink import SettingsCreator
from splink.blocking_analysis import n_largest_blocks
from splink.blocking_analysis import (
    cumulative_comparisons_to_be_scored_from_blocking_rules_chart,
)
from splink.exploratory import profile_columns

## Import Datasets
These datasets were created from the presentation-layer models.

In [None]:
# Read zip codes as text so that leading zeros are not lost to numeric parsing.
csv_dtypes = {"zip_code": str}

df_dbusa = pd.read_csv(
    "/content/dbusa_presentation_202410302126.csv",
    dtype=csv_dtypes,
)
df_reonomy = pd.read_csv(
    "/content/reonomy_presentation_202410302125.csv",
    dtype=csv_dtypes,
)

## EDA
Exploratory data analysis to find the largest natural clusters (blocks) in the two datasets and to compare the outcomes of different blocking rules.

In [None]:
"""
Find the 10 largest natural clusters (blocks) across the two datasets.
"""

# DuckDB backend used by the exploratory-analysis calls. It is defined here
# so the cell runs on a fresh kernel instead of relying on leftover state.
db_api = DuckDBAPI()

# Blocking rule under inspection. The four keys mirror the key_0..key_3
# columns of the recorded output (state, county, city, zip code).
# NOTE(review): reconstructed from that output -- confirm it matches the rule
# originally used.
br = block_on("state", "county", "city", "zip_code")

result = n_largest_blocks(
    table_or_tables=[df_dbusa, df_reonomy],
    blocking_rule=br,
    link_type="link_only",
    db_api=db_api,
    n_largest=10,
)

# Last expression of the cell: rendered as a rich table.
result.as_pandas_dataframe()

Unnamed: 0,key_0,key_1,key_2,key_3,count_l,count_r,block_count
0,CO,denver,denver,80202,182,231,42042
1,IL,cook,chicago,60611,76,85,6460
2,WA,king,bellevue,98004,246,16,3936
3,AZ,maricopa,mesa,85206,118,31,3658
4,NV,clark,henderson,89052,224,16,3584
5,NY,new york,new york,10019,179,17,3043
6,IN,lake,munster,46321,66,44,2904
7,IN,marion,indianapolis,46260,145,19,2755
8,NY,new york,new york,10013,133,19,2527
9,AZ,maricopa,scottsdale,85260,88,25,2200


In [None]:
"""
Measure the impact of different blocking rules on the number of
pairwise comparisons.
"""

# The chart function is imported in the notebook's top import cell, so every
# dependency is declared in one place.

# Candidate rules, evaluated in list order by the cumulative chart.
# NOTE(review): every pair matched by the second rule is also matched by the
# first (it only adds "county"), so in a cumulative count it probably adds no
# new comparisons -- confirm whether that ordering is intended.
blocking_rules_for_analysis = [
    block_on("state", "city", "address", "zip_code"),
    block_on("state", "county", "city", "address", "zip_code"),
    block_on("state", "county", "city", "zip_code"),
    block_on("state", "county", "city"),
    block_on("state", "county"),
]

cumulative_comparisons_to_be_scored_from_blocking_rules_chart(
    table_or_tables=[df_dbusa, df_reonomy],
    blocking_rules=blocking_rules_for_analysis,
    db_api=db_api,
    link_type="link_only",
    unique_id_column_name="property_id",
)

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

In [None]:
"""
Profile the combinations of location columns found in the two datasets.
"""

# SQL expression concatenating the three location columns into one value.
location_expressions = ["state || city || zip_code"]
tables_to_profile = [df_dbusa, df_reonomy]

profile_columns(
    tables_to_profile,
    column_expressions=location_expressions,
    db_api=db_api,
)

## Configuring the linker Settings
The settings consist of:
- Blocking rule: specifies which pairwise comparisons to generate. By blocking on the state property, we can start discovering similarity in the other properties.
- Comparisons: the properties that the linkage model compares. I chose the Jaro-Winkler algorithm because it is good for comparing short strings such as words and names.

The blocking rule is chosen based on the nature of the dataset: after many iterations, I found that blocking on state only gives the smallest number of clusters.

In [None]:


# Model configuration: link records across the two datasets, generate
# candidate pairs by blocking on state, and compare the remaining location
# columns with Jaro-Winkler similarity at the library's default thresholds.
settings = SettingsCreator(
    link_type="link_only",
    unique_id_column_name="property_id",
    blocking_rules_to_generate_predictions=[block_on("state")],
    retain_intermediate_calculation_columns=True,
    comparisons=[
        cl.JaroWinklerAtThresholds(column_name)
        for column_name in ("county", "city", "address", "zip_code")
    ],
)


In [None]:
"""
Define the linker with a DuckDB backend.
"""
# The linker gets its own DuckDBAPI instance, created here.
input_tables = [df_dbusa, df_reonomy]
linker = Linker(input_tables, settings, db_api=DuckDBAPI())

In [None]:
"""
Train the u parameters using random sampling.
"""
# A fixed seed makes the random sample, and therefore the estimated u
# probabilities, repeatable across kernel restarts.
U_SAMPLING_SEED = 42

linker.training.estimate_u_using_random_sampling(
    max_pairs=1e6,
    seed=U_SAMPLING_SEED,
)

INFO:splink.internals.estimate_u:----- Estimating u probabilities using random sampling -----
INFO:splink.internals.estimate_u:
Estimated u probabilities using random sampling
INFO:splink.internals.settings:
Your model is not yet fully trained. Missing estimates for:
    - county (no m values are trained).
    - city (no m values are trained).
    - address (no m values are trained).
    - zip_code (no m values are trained).


In [None]:
"""
Train the m parameters with expectation maximisation, blocking on state only.
"""
# NOTE(review): despite the "state_county" in the variable name, the training
# blocking rule below uses "state" alone.
em_blocking_rule = block_on("state")
training_session_state_county = (
    linker.training.estimate_parameters_using_expectation_maximisation(
        em_blocking_rule
    )
)

INFO:splink.internals.em_training_session:
----- Starting EM training session -----

INFO:splink.internals.em_training_session:Estimating the m probabilities of the model by blocking on:
l."state" = r."state"

Parameter estimates will be made for the following comparison(s):
    - county
    - city
    - address
    - zip_code

Parameter estimates cannot be made for the following comparison(s) since they are used in the blocking rules: 


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

INFO:splink.internals.expectation_maximisation:


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

INFO:splink.internals.expectation_maximisation:Iteration 1: Largest change in params was -0.947 in the m_probability of address, level `Exact match on address`


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

INFO:splink.internals.expectation_maximisation:Iteration 2: Largest change in params was -0.346 in the m_probability of city, level `Exact match on city`


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

INFO:splink.internals.expectation_maximisation:Iteration 3: Largest change in params was 0.166 in the m_probability of city, level `All other comparisons`


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

INFO:splink.internals.expectation_maximisation:Iteration 4: Largest change in params was -0.0531 in the m_probability of county, level `Exact match on county`


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

INFO:splink.internals.expectation_maximisation:Iteration 5: Largest change in params was -0.0855 in the m_probability of county, level `Exact match on county`


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

INFO:splink.internals.expectation_maximisation:Iteration 6: Largest change in params was -0.11 in the m_probability of county, level `Exact match on county`


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

INFO:splink.internals.expectation_maximisation:Iteration 7: Largest change in params was -0.124 in the m_probability of county, level `Exact match on county`


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

INFO:splink.internals.expectation_maximisation:Iteration 8: Largest change in params was 0.123 in the m_probability of county, level `All other comparisons`


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

INFO:splink.internals.expectation_maximisation:Iteration 9: Largest change in params was 0.0884 in the m_probability of county, level `All other comparisons`


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

INFO:splink.internals.expectation_maximisation:Iteration 10: Largest change in params was 0.0463 in the m_probability of county, level `All other comparisons`


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

INFO:splink.internals.expectation_maximisation:Iteration 11: Largest change in params was 0.0201 in the m_probability of county, level `All other comparisons`


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

INFO:splink.internals.expectation_maximisation:Iteration 12: Largest change in params was 0.0109 in the m_probability of zip_code, level `All other comparisons`


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

INFO:splink.internals.expectation_maximisation:Iteration 13: Largest change in params was 0.00989 in the m_probability of zip_code, level `All other comparisons`


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

INFO:splink.internals.expectation_maximisation:Iteration 14: Largest change in params was 0.00817 in the m_probability of zip_code, level `All other comparisons`


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

INFO:splink.internals.expectation_maximisation:Iteration 15: Largest change in params was 0.0065 in the m_probability of zip_code, level `All other comparisons`


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

INFO:splink.internals.expectation_maximisation:Iteration 16: Largest change in params was 0.00511 in the m_probability of zip_code, level `All other comparisons`


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

INFO:splink.internals.expectation_maximisation:Iteration 17: Largest change in params was 0.004 in the m_probability of zip_code, level `All other comparisons`


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

INFO:splink.internals.expectation_maximisation:Iteration 18: Largest change in params was 0.00314 in the m_probability of zip_code, level `All other comparisons`


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

INFO:splink.internals.expectation_maximisation:Iteration 19: Largest change in params was 0.00247 in the m_probability of zip_code, level `All other comparisons`


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

INFO:splink.internals.expectation_maximisation:Iteration 20: Largest change in params was 0.00194 in the m_probability of zip_code, level `All other comparisons`


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

INFO:splink.internals.expectation_maximisation:Iteration 21: Largest change in params was 0.00153 in the m_probability of zip_code, level `All other comparisons`


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

INFO:splink.internals.expectation_maximisation:Iteration 22: Largest change in params was 0.00121 in the m_probability of zip_code, level `All other comparisons`


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

INFO:splink.internals.expectation_maximisation:Iteration 23: Largest change in params was 0.000957 in the m_probability of zip_code, level `All other comparisons`


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

INFO:splink.internals.expectation_maximisation:Iteration 24: Largest change in params was 0.000758 in the m_probability of zip_code, level `All other comparisons`


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

INFO:splink.internals.expectation_maximisation:Iteration 25: Largest change in params was 0.000601 in the m_probability of zip_code, level `All other comparisons`
INFO:splink.internals.expectation_maximisation:
EM converged after 25 iterations
INFO:splink.internals.settings:
Your model is fully trained. All comparisons have at least one estimate for their m and u values


### u and m parameters graphs for every property comparison

In [None]:
"""
Match-weights chart.

Author's reading: the model easily found u and m parameters for every
property except zip_code, because some records did not match under the
Jaro-Winkler algorithm.
"""

match_weights = linker.visualisations.match_weights_chart()
match_weights

In [None]:
# Chart of the trained m and u parameters for each comparison.
m_u_parameters = linker.visualisations.m_u_parameters_chart()
m_u_parameters

## Saving the model for later analysis

In [None]:
# Write the trained model to disk, replacing any existing file of that name.
# NOTE(review): this rebinds `settings` (previously the SettingsCreator) to
# whatever save_model_to_json returns -- presumably the settings dict; confirm
# that the reuse of the name is intended.
model_json_path = "state_county_blocking_model.json"
settings = linker.misc.save_model_to_json(model_json_path, overwrite=True)