In [2]:
from pathlib import Path

ROOT = Path.cwd()

DATA_DIR = ROOT / "parquet"
MLDS_DIR = ROOT / "ml-datasets"
OUTPUT_DIR = ROOT / "output"
LOGS = ROOT / "logs"

LOGS.mkdir(parents=True, exist_ok=True)
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

In [3]:
from PyDI.io import load_parquet

metacritic = load_parquet(
    DATA_DIR / "df_metacritic.parquet",
    name="metacritic"
)

playtime = load_parquet(
    DATA_DIR / "df_playtime.parquet",
    name="playtime"
)

vgsales = load_parquet(
  DATA_DIR / "df_videogamesales.parquet",
  name="videogamesales"
)

In [4]:
metacritic.head()

Unnamed: 0,title,platform,release_year,developer,genres,critic_score,user_score,esrb_rating
0,Portal 2,Xbox 360,2011,Valve Software,"[Action, Shooter, First-Person, Sci-Fi, Arcade]",95.0,8.8,E10+
1,Metal Gear Solid V: The Phantom Pain,Xbox One,2015,Konami,"[Modern, Action Adventure, Open-World]",95.0,7.5,M
2,Red Dead Redemption,Xbox 360,2010,Rockstar Games,"[Action, Action Adventure, Shooter, Historic, ...",95.0,9.0,M
3,Portal 2,PC,2011,Valve Software,"[Action, Shooter, First-Person, Sci-Fi, Arcade]",95.0,9.1,E10+
4,The Last of Us Remastered,PlayStation 4,2014,SCEA,"[Action Adventure, General, Modern]",95.0,9.2,M


In [5]:
playtime.head()

Unnamed: 0,title,platform,release_year,developer,publisher,genres,main_story_hour,main_plus_sides_hour,completionist_hour
0,688(I) Hunter/Killer,PC,1997.0,Sonalysts,Electronic Arts,[Simulation],10.62,35.37,15.83
1,'Splosion Man,Xbox 360,2009.0,Twisted Pixel Games,Microsoft Games Studios,[Action],7.6,9.23,18.77
2,.hack//G.U. Vol. 1: Rebirth,Nintendo Switch,2006.0,CyberConnect2,Bandai,"[Action, Role-Playing]",18.95,24.5,37.4
3,.hack//G.U. Vol. 1: Rebirth,PC,2006.0,CyberConnect2,Bandai,"[Action, Role-Playing]",18.95,24.5,37.4
4,.hack//G.U. Vol. 1: Rebirth,PlayStation 2,2006.0,CyberConnect2,Bandai,"[Action, Role-Playing]",18.95,24.5,37.4


In [6]:
vgsales.head()

Unnamed: 0,title,platform,release_year,publisher,genres,na_sales_mil,eu_sales_mil,jp_sales_mil,other_sales_mil,global_sales_mil
0,Wii Sports,Wii,2006.0,Nintendo,[Sports],41.49,29.02,3.77,8.46,82.74
1,Super Mario Bros.,NES,1985.0,Nintendo,[Platform],29.08,3.58,6.81,0.77,40.24
2,Mario Kart Wii,Wii,2008.0,Nintendo,[Racing],15.85,12.88,3.79,3.31,35.82
3,Wii Sports Resort,Wii,2009.0,Nintendo,[Sports],15.75,11.01,3.28,2.96,33.0
4,Pokemon Red/Pokemon Blue,GB,1996.0,Nintendo,[Role-Playing],11.27,8.89,10.22,1.0,31.37


In [7]:
vgsales.platform.unique()

array(['Wii', 'NES', 'GB', 'Nintendo DS', 'Xbox 360', 'PlayStation 3',
       'PlayStation 2', 'Super Nintendo', 'Game Boy Advance',
       'Nintendo 3DS', 'PlayStation 4', 'N64', 'PS', 'XB', 'PC', '2600',
       'PSP', 'Xbox One', 'Nintendo GameCube', 'WiiU', 'GEN', 'DC',
       'PlayStation Vita', 'SAT', 'SCD', 'WS', 'NG', 'TG16', '3DO', 'GG',
       'PCFX'], dtype=object)

In [8]:
import pandas as pd

def add_row_id(df: pd.DataFrame, prefix: str, start: int = 1, colname: str = "id"):
    seq = range(start, start + len(df))
    ids = [f"{prefix}_{i}" for i in seq]
    out = df.copy()
    out.insert(0, colname, ids)
    return out

In [9]:
metacritic = add_row_id(metacritic, "metacritic")
playtime = add_row_id(playtime, "playtime")
vgsales = add_row_id(vgsales, "sales")

In [10]:
import logging


logging.basicConfig(
    level=logging.INFO,
    format='[%(levelname)-5s] %(name)s - %(message)s',
    handlers=[
          logging.FileHandler(LOGS / 'pydi.log'),
          logging.StreamHandler()
      ],
    force=True
)

### Dataset Summary

In [11]:
from PyDI.profiling import DataProfiler


datasets = [metacritic, playtime, vgsales]
names = ["metacritic", "playtime", "videogamesales"]

# Initialize the DataProfiler
profiler = DataProfiler()

for df, name in zip(datasets, names):
    profile = profiler.summary(df)

display(profile)

metacritic:
  Rows: 14,666
  Columns: 9
  Total nulls: 2,862
  Null percentage: 2.2%
  Null counts per column:
    critic_score: 7 (0.0%)
    user_score: 805 (5.5%)
    esrb_rating: 2,050 (14.0%)

playtime:
  Rows: 97,231
  Columns: 10
  Total nulls: 116,314
  Null percentage: 12.0%
  Null counts per column:
    platform: 785 (0.8%)
    release_year: 12,431 (12.8%)
    developer: 12,234 (12.6%)
    publisher: 14,518 (14.9%)
    main_story_hour: 13,493 (13.9%)
    main_plus_sides_hour: 34,489 (35.5%)
    completionist_hour: 28,364 (29.2%)

videogamesales:
  Rows: 16,598
  Columns: 11
  Total nulls: 329
  Null percentage: 0.2%
  Null counts per column:
    release_year: 271 (1.6%)
    publisher: 58 (0.3%)



{'rows': 16598,
 'columns': 11,
 'nulls_total': 329,
 'nulls_per_column': {'id': 0,
  'title': 0,
  'platform': 0,
  'release_year': 271,
  'publisher': 58,
  'genres': 0,
  'na_sales_mil': 0,
  'eu_sales_mil': 0,
  'jp_sales_mil': 0,
  'other_sales_mil': 0,
  'global_sales_mil': 0},
 'dtypes': {'id': 'object',
  'title': 'object',
  'platform': 'object',
  'release_year': 'float64',
  'publisher': 'object',
  'genres': 'object',
  'na_sales_mil': 'float64',
  'eu_sales_mil': 'float64',
  'jp_sales_mil': 'float64',
  'other_sales_mil': 'float64',
  'global_sales_mil': 'float64'}}

### Attribute Coverage

In [12]:
coverage = profiler.analyze_coverage(
    datasets=datasets,
    include_samples=True,
    sample_count=3
)

print("ðŸ“Š Attribute coverage across datasets:")
display(coverage)

# Identify attributes suitable for entity matching
print("\nðŸ”— Attributes suitable for entity matching:")
matching_attrs = coverage[coverage['datasets_with_attribute'] >= 2]['attribute'].tolist()
print(f"Attributes available in 2+ datasets: {matching_attrs}")

[INFO ] PyDI.fusion.analysis - Analyzed 18 attributes across 3 datasets


ðŸ“Š Attribute coverage across datasets:


Unnamed: 0,attribute,metacritic_count,metacritic_pct,metacritic_coverage,metacritic_samples,playtime_count,playtime_pct,playtime_coverage,playtime_samples,videogamesales_count,videogamesales_pct,videogamesales_coverage,videogamesales_samples,avg_coverage,max_coverage,datasets_with_attribute
0,completionist_hour,0/0,0%,0.0,,68867/97231,70.8%,0.708282,"[15.83, 18.77, 37.4]",0/0,0%,0.0,,0.236094,0.708282,1
1,critic_score,14659/14666,100.0%,0.999523,"[95.0, 95.0, 95.0]",0/0,0%,0.0,,0/0,0%,0.0,,0.333174,0.999523,1
2,developer,14666/14666,100.0%,1.0,"['Valve Software', 'Konami', 'Rockstar Games']",84997/97231,87.4%,0.874176,"['Sonalysts', 'Twisted Pixel Games', 'CyberCon...",0/0,0%,0.0,,0.624725,1.0,2
3,esrb_rating,12616/14666,86.0%,0.860221,"['E10+', 'M', 'M']",0/0,0%,0.0,,0/0,0%,0.0,,0.28674,0.860221,1
4,eu_sales_mil,0/0,0%,0.0,,0/0,0%,0.0,,16598/16598,100.0%,1.0,"[29.02, 3.58, 12.88]",0.333333,1.0,1
5,genres,14666/14666,100.0%,1.0,"[array(['Action', 'Shooter', 'First-Person', '...",97231/97231,100.0%,1.0,"[array(['Simulation'], dtype=object), array(['...",16598/16598,100.0%,1.0,"[array(['Sports'], dtype=object), array(['Plat...",1.0,1.0,3
6,global_sales_mil,0/0,0%,0.0,,0/0,0%,0.0,,16598/16598,100.0%,1.0,"[82.74, 40.24, 35.82]",0.333333,1.0,1
7,id,14666/14666,100.0%,1.0,"['metacritic_1', 'metacritic_2', 'metacritic_3']",97231/97231,100.0%,1.0,"['playtime_1', 'playtime_2', 'playtime_3']",16598/16598,100.0%,1.0,"['sales_1', 'sales_2', 'sales_3']",1.0,1.0,3
8,jp_sales_mil,0/0,0%,0.0,,0/0,0%,0.0,,16598/16598,100.0%,1.0,"[3.77, 6.81, 3.79]",0.333333,1.0,1
9,main_plus_sides_hour,0/0,0%,0.0,,62742/97231,64.5%,0.645288,"[35.37, 9.23, 24.5]",0/0,0%,0.0,,0.215096,0.645288,1



ðŸ”— Attributes suitable for entity matching:
Attributes available in 2+ datasets: ['developer', 'genres', 'id', 'platform', 'publisher', 'release_year', 'title']


### Entity Matching

In [13]:
BLOCK_EVAL_DIR = OUTPUT_DIR / "blocking_evaluation"
CORR_DIR = OUTPUT_DIR / "correspondences"

BLOCK_EVAL_DIR.mkdir(parents=True, exist_ok=True)
CORR_DIR.mkdir(parents=True, exist_ok=True)

In [14]:
from PyDI.entitymatching import (StandardBlocker,
                                 SortedNeighbourhoodBlocker,
                                 TokenBlocker,
                                 EmbeddingBlocker,
                                 RuleBasedMatcher,
                                 StringComparator,
                                 NumericComparator,
                                 EntityMatchingEvaluator)

  from .autonotebook import tqdm as notebook_tqdm


In [15]:
train_p2m = load_parquet(
    MLDS_DIR / "train_PM.parquet",
    name="train_playtime_metacritic",
    add_index=False
)

test_p2m = load_parquet(
    MLDS_DIR / "test_PM.parquet",
    name="test_playtime_metacritic",
    add_index=False
)

train_p2s = load_parquet(
    MLDS_DIR / "train_PS.parquet",
    name="train_playtime_sales",
    add_index=False
)

test_p2s = load_parquet(
    MLDS_DIR / "test_PS.parquet",
    name="test_playtime_sales",
    add_index=False
)

In [16]:
train_p2m.sample(5)

Unnamed: 0,id_left,id_right,label
82,playtime_33112,metacritic_6825,0
723,playtime_1035,metacritic_5598,1
397,playtime_6528,metacritic_7788,0
588,playtime_48158,metacritic_5961,1
304,playtime_41882,metacritic_10614,0


In [17]:
train_p2m = train_p2m.rename(columns={"id_left": "id1", "id_right": "id2"})
test_p2m = test_p2m.rename(columns={"id_left": "id1", "id_right": "id2"})
train_p2s = train_p2s.rename(columns={"id_left": "id1", "id_right": "id2"})
test_p2s = test_p2s.rename(columns={"id_left": "id1", "id_right": "id2"})

### Blocking
#### Playtime -> Metacritic

In [18]:
st_blocker_p2m = StandardBlocker(
    playtime, metacritic,
    on=['title','developer'],
    batch_size=1000,
    output_dir=BLOCK_EVAL_DIR,
    id_column='id'
)

standard_candidates_p2m = st_blocker_p2m.materialize()

sn_blocker_p2m = SortedNeighbourhoodBlocker(
    playtime, metacritic,
    key='title',
    window=20,
    batch_size=750,
    output_dir=BLOCK_EVAL_DIR,
    id_column='id'
)

sn_candidates_p2m = sn_blocker_p2m.materialize()


token_blocker_p2m = TokenBlocker(
    playtime, metacritic,
    column='title',
    batch_size=500,
    output_dir=BLOCK_EVAL_DIR,
    id_column='id',
    ngram_size=3,
    ngram_type='character'
)
#token_candidates_p2m = token_blocker_p2m.materialize()



embedding_blocker_p2m = EmbeddingBlocker(
    playtime, metacritic,
    text_cols=['title', 'developer'],
    model="sentence-transformers/all-MiniLM-L6-v2",
    index_backend="sklearn",
    top_k=10,
    batch_size=500,
    output_dir=BLOCK_EVAL_DIR,
    id_column='id'
)

#embedding_candidates_p2m = embedding_blocker_p2m.materialize()

[INFO ] PyDI.entitymatching.blocking.standard.StandardBlocker - created 48978 blocking keys for first dataset
[INFO ] PyDI.entitymatching.blocking.standard.StandardBlocker - created 10405 blocking keys for second dataset
[INFO ] PyDI.entitymatching.blocking.standard.StandardBlocker - created 1476 blocks from blocking keys
[INFO ] PyDI.entitymatching.blocking.standard.StandardBlocker - Debug results written to file: /Users/abd/Developer/wdi-project/output/blocking_evaluation/debugResultsBlocking_StandardBlocker.csv
[INFO ] PyDI.entitymatching.blocking.sorted_neighbourhood.SortedNeighbourhoodBlocker - created sorted neighbourhood with window size 20
[INFO ] PyDI.entitymatching.blocking.sorted_neighbourhood.SortedNeighbourhoodBlocker - created 1 sorted sequence from 111897 records
[INFO ] PyDI.entitymatching.blocking.sorted_neighbourhood.SortedNeighbourhoodBlocker - Debug results written to file: /Users/abd/Developer/wdi-project/output/blocking_evaluation/debugResultsBlocking_SortedNeighb

#### Playtime -> Sales

In [19]:
st_blocker_p2s = StandardBlocker(
    playtime, vgsales,
    on=['title','publisher'],
    batch_size=1000,
    output_dir=BLOCK_EVAL_DIR,
    id_column='id'
)

standard_candidates_p2s = st_blocker_p2s.materialize()

sn_blocker_p2s = SortedNeighbourhoodBlocker(
    playtime, vgsales,
    key='title',
    window=20,
    batch_size=750,
    output_dir=BLOCK_EVAL_DIR,
    id_column='id'
)

sn_candidates_p2s = sn_blocker_p2s.materialize()


token_blocker_p2s = TokenBlocker(
    playtime, vgsales,
    column='title',
    batch_size=500,
    output_dir=BLOCK_EVAL_DIR,
    id_column='id',
    ngram_size=3,
    ngram_type='character'
)
#token_candidates_p2s = token_blocker_p2s.materialize()



embedding_blocker_p2s = EmbeddingBlocker(
    playtime, vgsales,
    text_cols=['title', 'publisher'],
    model="sentence-transformers/all-MiniLM-L6-v2",
    index_backend="sklearn",
    top_k=10,
    batch_size=500,
    output_dir=BLOCK_EVAL_DIR,
    id_column='id'
)

#embedding_candidates_p2s = embedding_blocker_p2s.materialize()

[INFO ] PyDI.entitymatching.blocking.standard.StandardBlocker - created 48978 blocking keys for first dataset
[INFO ] PyDI.entitymatching.blocking.standard.StandardBlocker - created 11917 blocking keys for second dataset
[INFO ] PyDI.entitymatching.blocking.standard.StandardBlocker - created 2731 blocks from blocking keys
[INFO ] PyDI.entitymatching.blocking.standard.StandardBlocker - Debug results written to file: /Users/abd/Developer/wdi-project/output/blocking_evaluation/debugResultsBlocking_StandardBlocker.csv
[INFO ] PyDI.entitymatching.blocking.sorted_neighbourhood.SortedNeighbourhoodBlocker - created sorted neighbourhood with window size 20
[INFO ] PyDI.entitymatching.blocking.sorted_neighbourhood.SortedNeighbourhoodBlocker - created 1 sorted sequence from 113829 records
[INFO ] PyDI.entitymatching.blocking.sorted_neighbourhood.SortedNeighbourhoodBlocker - Debug results written to file: /Users/abd/Developer/wdi-project/output/blocking_evaluation/debugResultsBlocking_SortedNeighb

### Evaluate Blocking
#### Playtime -> Metacritic

In [20]:
evaluator = EntityMatchingEvaluator()

p2m_blocking_candidates = {
    'StandardBlocking': [standard_candidates_p2m, st_blocker_p2m],
    'SortedNeighbourhoodBlocker': [sn_candidates_p2m, sn_blocker_p2m],
    #'TokenBlocking': [token_candidates_p2m, token_blocker_p2m],
    #'EmbeddingBlocking': [embedding_candidates_p2m, embedding_blocker_p2m]
}

In [21]:
p2m_results = []
for method_name, candidates in p2m_blocking_candidates.items():
    result = evaluator.evaluate_blocking(candidates[0],
                                         test_p2m,
                                         candidates[1],
                                         out_dir=BLOCK_EVAL_DIR)
    result['method'] = method_name
    result['dataset'] = 'p2m'
    p2m_results.append(result)

p2m_best = max(p2m_results, key=lambda x: (x['pair_completeness'], x['reduction_ratio']))
print(f"Best blocking for p2m: {p2m_best['method']} (PC: {p2m_best['pair_completeness']:.3f}, RR: {p2m_best['reduction_ratio']:.3f})")

[INFO ] root -   Pair Completeness: 0.106
[INFO ] root -   Pair Quality:      0.001
[INFO ] root -   Reduction Ratio:   0.999995
[INFO ] root -   True Matches Found: 10/94
[INFO ] root - Blocking evaluation complete!
[INFO ] root -   Pair Completeness: 1.000
[INFO ] root -   Pair Quality:      0.000
[INFO ] root -   Reduction Ratio:   0.999668
[INFO ] root -   True Matches Found: 94/94
[INFO ] root - Blocking evaluation complete!


Best blocking for p2m: SortedNeighbourhoodBlocker (PC: 1.000, RR: 1.000)


#### Playtime -> Sales

In [22]:
p2s_blocking_candidates = {
    'StandardBlocking': [standard_candidates_p2s, st_blocker_p2s],
    'SortedNeighbourhoodBlocker': [sn_candidates_p2s, sn_blocker_p2s],
    #'TokenBlocking': [token_candidates_p2s, token_blocker_p2s],
    #'EmbeddingBlocking': [embedding_candidates_p2s, embedding_blocker_p2s]
}

In [23]:
p2s_results = []
for method_name, candidates in p2s_blocking_candidates.items():
    result = evaluator.evaluate_blocking(candidates[0],
                                         test_p2s,
                                         candidates[1],
                                         out_dir=BLOCK_EVAL_DIR)
    result['method'] = method_name
    result['dataset'] = 'p2m'
    p2s_results.append(result)

p2s_best = max(p2s_results, key=lambda x: (x['pair_completeness'], x['reduction_ratio']))
print(f"Best blocking for p2m: {p2s_best['method']} (PC: {p2s_best['pair_completeness']:.3f}, RR: {p2s_best['reduction_ratio']:.3f})")

[INFO ] root -   Pair Completeness: 0.253
[INFO ] root -   Pair Quality:      0.002
[INFO ] root -   Reduction Ratio:   0.999991
[INFO ] root -   True Matches Found: 24/95
[INFO ] root - Blocking evaluation complete!
[INFO ] root -   Pair Completeness: 0.874
[INFO ] root -   Pair Quality:      0.000
[INFO ] root -   Reduction Ratio:   0.999691
[INFO ] root -   True Matches Found: 83/95
[INFO ] root - Blocking evaluation complete!


Best blocking for p2m: SortedNeighbourhoodBlocker (PC: 0.874, RR: 1.000)


### Rule Based Matcher

In [24]:
comparators = [
    StringComparator(
        column='title',
        similarity_function='jaccard',
        preprocess=str.lower
    ),

    StringComparator(
        column='platform',
        similarity_function='jaccard',
        preprocess=str.lower
    ),
    
    NumericComparator(
        column='release_year',
        max_difference=1
    ),
    
    StringComparator(
        column='genres',
        similarity_function='jaccard',
        preprocess=str.lower,
        list_strategy='concatenate'
    )
]

In [25]:
matcher = RuleBasedMatcher()

correspondences_p2m = matcher.match(
    df_left=playtime,
    df_right=metacritic, 
    candidates=sn_blocker_p2m,
    comparators=comparators,
    weights=[0.4, 0.2, 0.2, 0.2],
    threshold=0.7,
    id_column='id'
)

correspondences_p2s = matcher.match(
    df_left=playtime,
    df_right=vgsales, 
    candidates=sn_blocker_p2s,
    comparators=comparators,
    weights=[0.5, 0.1, 0.2, 0.2],
    threshold=0.7,
    id_column='id'
)

[INFO ] PyDI.entitymatching.rule_based.RuleBasedMatcher - Starting Entity Matching
[INFO ] PyDI.entitymatching.rule_based.RuleBasedMatcher - Blocking 97231 x 14666 elements
[INFO ] PyDI.entitymatching.rule_based.RuleBasedMatcher - Matching 97231 x 14666 elements after 0:00:0.211; 473866 blocked pairs (reduction ratio: 0.9996676932859451)
[INFO ] PyDI.entitymatching.rule_based.RuleBasedMatcher - Entity Matching finished after 0:00:49.010; found 13409 correspondences.
[INFO ] PyDI.entitymatching.rule_based.RuleBasedMatcher - Starting Entity Matching
[INFO ] PyDI.entitymatching.rule_based.RuleBasedMatcher - Blocking 97231 x 16598 elements
[INFO ] PyDI.entitymatching.rule_based.RuleBasedMatcher - Matching 97231 x 16598 elements after 0:00:0.199; 498960 blocked pairs (reduction ratio: 0.999690824395644)
[INFO ] PyDI.entitymatching.rule_based.RuleBasedMatcher - Entity Matching finished after 0:00:51.772; found 24966 correspondences.


### RB Matching Evaluation
#### Playtime -> Metacritic

In [26]:
debug_output_dir = OUTPUT_DIR / "debug_results_entity_matching"
debug_output_dir.mkdir(parents=True, exist_ok=True)

eval_results_p2m = EntityMatchingEvaluator.evaluate_matching(
    correspondences=correspondences_p2m,
    test_pairs=test_p2m,
    out_dir=debug_output_dir
)

display(eval_results_p2m)

[INFO ] root - Confusion Matrix:
[INFO ] root -   True Positives:  88
[INFO ] root -   True Negatives:  105
[INFO ] root -   False Positives: 0
[INFO ] root -   False Negatives: 6
[INFO ] root - Performance Metrics:
[INFO ] root -   Accuracy:  0.970
[INFO ] root -   Precision: 1.000
[INFO ] root -   Recall:    0.936
[INFO ] root -   F1-Score:  0.967


{'precision': 1.0,
 'recall': 0.9361702127659575,
 'f1': 0.967032967032967,
 'accuracy': 0.9698492462311558,
 'true_positives': 88,
 'false_positives': 0,
 'false_negatives': 6,
 'true_negatives': 105,
 'threshold_used': 0.0,
 'total_correspondences': 13409,
 'filtered_correspondences': 13409,
 'evaluation_timestamp': '2025-11-20T19:56:00.530564',
 'output_files': ['/Users/abd/Developer/wdi-project/output/debug_results_entity_matching/matching_evaluation_summary.json',
  '/Users/abd/Developer/wdi-project/output/debug_results_entity_matching/matching_detailed_results.csv']}

#### Playtime -> Sales

In [27]:
eval_results_p2s = EntityMatchingEvaluator.evaluate_matching(
    correspondences=correspondences_p2s,
    test_pairs=test_p2s,
    out_dir=debug_output_dir
)

display(eval_results_p2s)

[INFO ] root - Confusion Matrix:
[INFO ] root -   True Positives:  69
[INFO ] root -   True Negatives:  104
[INFO ] root -   False Positives: 0
[INFO ] root -   False Negatives: 26
[INFO ] root - Performance Metrics:
[INFO ] root -   Accuracy:  0.869
[INFO ] root -   Precision: 1.000
[INFO ] root -   Recall:    0.726
[INFO ] root -   F1-Score:  0.841


{'precision': 1.0,
 'recall': 0.7263157894736842,
 'f1': 0.8414634146341463,
 'accuracy': 0.8693467336683417,
 'true_positives': 69,
 'false_positives': 0,
 'false_negatives': 26,
 'true_negatives': 104,
 'threshold_used': 0.0,
 'total_correspondences': 24966,
 'filtered_correspondences': 24966,
 'evaluation_timestamp': '2025-11-20T19:56:06.440611',
 'output_files': ['/Users/abd/Developer/wdi-project/output/debug_results_entity_matching/matching_evaluation_summary.json',
  '/Users/abd/Developer/wdi-project/output/debug_results_entity_matching/matching_detailed_results.csv']}

### ML Based Matcher

In [55]:
from PyDI.entitymatching import FeatureExtractor

comparators = [

    StringComparator(column='title', similarity_function='jaccard', preprocess=str.lower),
    StringComparator(column='title', similarity_function='cosine', preprocess=str.lower),
    StringComparator(column='title', similarity_function='jaro', preprocess=str.lower),
    StringComparator(column='title', similarity_function='jaro_winkler', preprocess=str.lower),
    

    StringComparator(column='platform', similarity_function='jaccard', preprocess=str.lower),
    StringComparator(column='platform', similarity_function='cosine', preprocess=str.lower),
    StringComparator(column='platform', similarity_function='jaro', preprocess=str.lower),

    NumericComparator(column='release_year',max_difference=1),
    StringComparator(column='genres', similarity_function='jaccard', preprocess=str.lower, list_strategy='concatenate'),
    StringComparator(column='genres', similarity_function='jaccard', preprocess=str.lower, list_strategy='best_match')

]

feature_extractor = FeatureExtractor(comparators)


train_features_p2m = feature_extractor.create_features(
    playtime, metacritic, train_p2m[['id1', 'id2']], labels=train_p2m['label'], id_column='id'
)

train_features_p2s = feature_extractor.create_features(
    playtime, vgsales, train_p2s[['id1', 'id2']], labels=train_p2s['label'], id_column='id'
)

feature_columns_p2m = [col for col in train_features_p2m.columns if col not in ['id1', 'id2', 'label']]
X_train_p2m = train_features_p2m[feature_columns_p2m]
y_train_p2m = train_features_p2m['label']

feature_columns_p2s = [col for col in train_features_p2s.columns if col not in ['id1', 'id2', 'label']]
X_train_p2s = train_features_p2s[feature_columns_p2s]
y_train_p2s = train_features_p2s['label']

training_datasets = [(X_train_p2m, y_train_p2m),(X_train_p2s, y_train_p2s)]

[INFO ] root - Label distribution: 375 positive, 420 negative
[INFO ] root - Label distribution: 378 positive, 418 negative


In [56]:
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import make_scorer, f1_score

# classifiers
classifiers = {
    'RandomForestClassifier': RandomForestClassifier(random_state=42),
    'GradientBoostingClassifier': GradientBoostingClassifier(random_state=42),
    'SVC': SVC(probability=True, random_state=42),
    'LogisticRegression': LogisticRegression(max_iter=1000, random_state=42)
}

# Define parameter grids
param_grids = {
    'RandomForestClassifier': {
        'n_estimators': [50, 100, 200],
        'max_depth': [None, 10, 20],
        'class_weight': ['balanced', None],
        'min_samples_split': [2, 5]
    },
    'GradientBoostingClassifier': {
        'n_estimators': [100, 200],
        'learning_rate': [0.05, 0.1],
        'max_depth': [3, 5]
    },
    'SVC': {
        'C': [0.1, 1, 10],
        'kernel': ['linear', 'rbf'],
        'gamma': ['scale', 'auto']
    },
    'LogisticRegression': {
        'C': [0.01, 0.1, 1, 10],
        'penalty': ['l2'],
        'solver': ['lbfgs', 'liblinear']
    }
}


scorer = make_scorer(f1_score)
cv_folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

best_models = []


for data in training_datasets:
    grid_search_results = {}
    best_overall_score = -1
    best_overall_model = None
    best_model_name = None

    for name, model in classifiers.items():
        print(f"Running GridSearchCV for {name}...")
        
        grid = GridSearchCV(
            estimator=model,
            param_grid=param_grids[name],
            scoring=scorer,
            cv=cv_folds,
            n_jobs=-1,
            verbose=0
        )
        
        grid.fit(data[0], data[1])
        
        grid_search_results[name] = {
            'grid_search': grid,
            'best_score': grid.best_score_,
            'best_params': grid.best_params_,
            'best_estimator': grid.best_estimator_
        }

        if grid.best_score_ > best_overall_score:
            best_overall_model = grid.best_estimator_

    best_models.append(best_overall_model)

Running GridSearchCV for RandomForestClassifier...
Running GridSearchCV for GradientBoostingClassifier...
Running GridSearchCV for SVC...
Running GridSearchCV for LogisticRegression...
Running GridSearchCV for RandomForestClassifier...
Running GridSearchCV for GradientBoostingClassifier...
Running GridSearchCV for SVC...
Running GridSearchCV for LogisticRegression...


In [57]:
display(best_models)

[LogisticRegression(C=1, max_iter=1000, random_state=42),
 LogisticRegression(C=10, max_iter=1000, random_state=42)]

In [58]:
from PyDI.entitymatching import MLBasedMatcher

ml_matcher = MLBasedMatcher(feature_extractor)

correspondences_p2m = ml_matcher.match(
    playtime, metacritic,
    candidates=sn_blocker_p2m,
    id_column='id',
    trained_classifier=best_models[0]
)

correspondences_p2s = ml_matcher.match(
    playtime, vgsales,
    candidates=sn_blocker_p2s,
    id_column='id',
    trained_classifier=best_models[1]
)

[INFO ] PyDI.entitymatching.ml_based.MLBasedMatcher - Starting Entity Matching
[INFO ] PyDI.entitymatching.ml_based.MLBasedMatcher - Blocking 97231 x 14666 elements
[INFO ] PyDI.entitymatching.ml_based.MLBasedMatcher - Matching 97231 x 14666 elements after 0:00:0.519; 473866 blocked pairs (reduction ratio: 0.9996676932859451)
[INFO ] PyDI.entitymatching.ml_based.MLBasedMatcher - Entity Matching finished after 0:00:116.500; found 92177 correspondences.
[INFO ] PyDI.entitymatching.ml_based.MLBasedMatcher - Starting Entity Matching
[INFO ] PyDI.entitymatching.ml_based.MLBasedMatcher - Blocking 97231 x 16598 elements
[INFO ] PyDI.entitymatching.ml_based.MLBasedMatcher - Matching 97231 x 16598 elements after 0:00:0.230; 498960 blocked pairs (reduction ratio: 0.999690824395644)
[INFO ] PyDI.entitymatching.ml_based.MLBasedMatcher - Entity Matching finished after 0:00:104.537; found 186259 correspondences.


### ML Matching Evaluation
#### Playtime -> Metacritic

In [59]:
eval_p2m = EntityMatchingEvaluator.evaluate_matching(
                correspondences=correspondences_p2m,
                test_pairs=test_p2m,
                out_dir=BLOCK_EVAL_DIR,
            )

display(eval_p2m)

[INFO ] root - Confusion Matrix:
[INFO ] root -   True Positives:  92
[INFO ] root -   True Negatives:  105
[INFO ] root -   False Positives: 0
[INFO ] root -   False Negatives: 2
[INFO ] root - Performance Metrics:
[INFO ] root -   Accuracy:  0.990
[INFO ] root -   Precision: 1.000
[INFO ] root -   Recall:    0.979
[INFO ] root -   F1-Score:  0.989


{'precision': 1.0,
 'recall': 0.9787234042553191,
 'f1': 0.989247311827957,
 'accuracy': 0.9899497487437185,
 'true_positives': 92,
 'false_positives': 0,
 'false_negatives': 2,
 'true_negatives': 105,
 'threshold_used': 0.0,
 'total_correspondences': 92177,
 'filtered_correspondences': 92177,
 'evaluation_timestamp': '2025-11-20T22:26:10.903571',
 'output_files': ['/Users/abd/Developer/wdi-project/output/blocking_evaluation/matching_evaluation_summary.json',
  '/Users/abd/Developer/wdi-project/output/blocking_evaluation/matching_detailed_results.csv']}

#### Playtime -> Sales

In [60]:
eval_p2s = EntityMatchingEvaluator.evaluate_matching(
                correspondences=correspondences_p2s,
                test_pairs=test_p2s,
                out_dir=BLOCK_EVAL_DIR,
            )

display(eval_p2s)

[INFO ] root - Confusion Matrix:
[INFO ] root -   True Positives:  82
[INFO ] root -   True Negatives:  104
[INFO ] root -   False Positives: 0
[INFO ] root -   False Negatives: 13
[INFO ] root - Performance Metrics:
[INFO ] root -   Accuracy:  0.935
[INFO ] root -   Precision: 1.000
[INFO ] root -   Recall:    0.863
[INFO ] root -   F1-Score:  0.927


{'precision': 1.0,
 'recall': 0.8631578947368421,
 'f1': 0.9265536723163842,
 'accuracy': 0.9346733668341709,
 'true_positives': 82,
 'false_positives': 0,
 'false_negatives': 13,
 'true_negatives': 104,
 'threshold_used': 0.0,
 'total_correspondences': 186259,
 'filtered_correspondences': 186259,
 'evaluation_timestamp': '2025-11-20T22:26:37.645403',
 'output_files': ['/Users/abd/Developer/wdi-project/output/blocking_evaluation/matching_evaluation_summary.json',
  '/Users/abd/Developer/wdi-project/output/blocking_evaluation/matching_detailed_results.csv']}

### Data Fusion

In [61]:
import numpy as np

datasets = [playtime, metacritic, vgsales]

for i, df in enumerate(datasets):
    df["genres"] = df["genres"].apply(
        lambda x: x.tolist() if isinstance(x, np.ndarray) else x
    )
    datasets[i] = df

In [62]:
from PyDI.fusion import (DataFusionStrategy,
                         DataFusionEngine,
                         longest_string,
                         union,
                         prefer_higher_trust,
                         voting)



playtime.attrs["trust_score"] = 3
metacritic.attrs["trust_score"] = 2
vgsales.attrs["trust_score"] = 1

# merge rule based correspondences
all_ml_correspondences = pd.concat([correspondences_p2m, correspondences_p2s], ignore_index=True)

# define data fusion strategy
strategy = DataFusionStrategy('video_games_fusion_strategy')

strategy.add_attribute_fuser('title', longest_string)
strategy.add_attribute_fuser('developer', prefer_higher_trust, trust_key="trust_score")
strategy.add_attribute_fuser('publisher', prefer_higher_trust, trust_key="trust_score")
strategy.add_attribute_fuser('release_year', prefer_higher_trust, trust_key="trust_score")
strategy.add_attribute_fuser('platform', voting)
strategy.add_attribute_fuser('genres', union)

# run fusion
engine = DataFusionEngine(strategy, debug=True, debug_format='json',
                          debug_file="output/data_fusion/debug_fusion_ml_standard_blocker.jsonl")

# fuse rule based matches
ml_fused_sn_blocker = engine.run(
    datasets=datasets,
    correspondences=all_ml_correspondences,
    id_column="id",
    include_singletons=False,
)

[INFO ] PyDI.fusion.strategy - Registered fuser for attribute 'title' using rule 'longest_string'
[INFO ] PyDI.fusion.strategy - Registered fuser for attribute 'developer' using rule 'prefer_higher_trust'
[INFO ] PyDI.fusion.strategy - Registered fuser for attribute 'publisher' using rule 'prefer_higher_trust'
[INFO ] PyDI.fusion.strategy - Registered fuser for attribute 'release_year' using rule 'prefer_higher_trust'
[INFO ] PyDI.fusion.strategy - Registered fuser for attribute 'platform' using rule 'voting'
[INFO ] PyDI.fusion.strategy - Registered fuser for attribute 'genres' using rule 'union'
[INFO ] PyDI.fusion.engine - Fusion debug logging enabled; refer to output/data_fusion/debug_fusion_ml_standard_blocker.jsonl for detailed traces.
[INFO ] PyDI.fusion.engine - Starting data fusion with strategy 'video_games_fusion_strategy'
[INFO ] PyDI.fusion.engine - *    Loading correspondences    *
[INFO ] PyDI.fusion.engine - Correspondence ID coverage: matched 75468 of 75468 unique IDs


In [63]:
print(f'Fused rows: {len(ml_fused_sn_blocker):,}')

Fused rows: 5,719


In [64]:
ml_fused_sn_blocker.sample(5)

Unnamed: 0,_id,_fusion_sources,_fusion_source_datasets,genres,main_story_hour,publisher,critic_score,platform,esrb_rating,completionist_hour,...,title,developer,release_year,_fusion_confidence,_fusion_metadata,na_sales_mil,jp_sales_mil,eu_sales_mil,other_sales_mil,global_sales_mil
1403,playtime_58227,"[playtime_58227, playtime_58226, metacritic_1349]","[playtime, playtime, metacritic]","[2D, Action, Platform, Platformer]",9.78,Capcom,78.0,Nintendo Switch,E10+,14.53,...,Shinsekai: Into the Depths,Capcom,2020.0,0.541667,"{'genres_rule': 'union', 'genres_sources': ['m...",,,,,
1127,metacritic_376,"[metacritic_376, playtime_45713, playtime_4571...","[metacritic, playtime, playtime, playtime, pla...","[Action Adventure, Adventure, General, Real-Ti...",29.1,Failbetter Games,87.0,PC,,129.83,...,Sunless Skies,Failbetter Games,2017.0,0.445833,"{'genres_rule': 'union', 'genres_sources': ['m...",,,,,
4629,metacritic_7653,"[metacritic_7653, playtime_53070, playtime_530...","[metacritic, playtime, playtime, playtime]","[2D, Action, Platform, Platformer, Puzzle, Side]",3.22,Graffiti Games,70.0,Switch,E,31.43,...,Joggernauts,Space Mace,2018.0,0.458333,"{'genres_rule': 'union', 'genres_sources': ['m...",,,,,
2178,playtime_32342,"[playtime_32342, sales_15455, metacritic_11377...","[playtime, videogamesales, metacritic, metacri...","[Action, Arcade, First-Person, Historic, Shooter]",4.9,2K Games,61.0,PC,M,8.92,...,Vietcong (2003),Pterodon Illusion Softworks,2005.0,0.598039,"{'publisher_rule': 'prefer_higher_trust', 'pub...",0.0,0.0,0.02,0.0,0.02
2445,metacritic_4343,"[metacritic_4343, metacritic_3418, playtime_40...","[metacritic, metacritic, playtime, videogamesa...","[Action, Action RPG, Role-Playing, Third-Person]",37.28,Nihon Falcom,85.0,PC,T,71.32,...,Ys VI: The Ark of Napishtim,Nihon Falcom,2017.0,0.469583,"{'publisher_rule': 'prefer_higher_trust', 'pub...",0.0,0.09,0.0,0.0,0.09


In [65]:
ml_fused_sn_blocker.to_parquet(OUTPUT_DIR / "data_fusion" / "fused_dataset.parquet", index=False)