In [334]:
from pathlib import Path
import pandas as pd
pd.set_option("display.max_columns", None) 

ROOT = Path.cwd()

DATA_DIR = ROOT / "parquet"
MLDS_DIR = ROOT / "ml-datasets"
OUTPUT_DIR = ROOT / "output"
LOGS = ROOT / "logs"

LOGS.mkdir(parents=True, exist_ok=True)
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

In [None]:
from PyDI.io import load_parquet
import re, unicodedata

metacritic = load_parquet(
    DATA_DIR / "df_metacritic.parquet",
    name="metacritic"
)

playtime = load_parquet(
    DATA_DIR / "df_playtime.parquet",
    name="playtime"
)

vgsales = load_parquet(
  DATA_DIR / "df_videogamesales.parquet",
  name="videogamesales"

)



def normalize_title(text):
    if not isinstance(text, str):
        return ""
    # remove accents/diacritics (Ã± -> n)
    text = unicodedata.normalize("NFKD", text).encode("ASCII", "ignore").decode("ASCII")
    cleaned = re.sub(r"[^a-zA-Z0-9]+", " ", text)
    return cleaned.lower().strip()

metacritic["title_norm"] = metacritic["title"].apply(normalize_title)
playtime["title_norm"] = playtime["title"].apply(normalize_title)
vgsales["title_norm"] = vgsales["title"].apply(normalize_title)

In [336]:
import logging


logging.basicConfig(
    level=logging.INFO,
    format='[%(levelname)-5s] %(name)s - %(message)s',
    handlers=[
          logging.FileHandler(LOGS / 'pydi.log'),
          logging.StreamHandler()
      ],
    force=True
)

### Dataset Summary

In [337]:
from PyDI.profiling import DataProfiler


datasets = [metacritic, playtime, vgsales]
names = ["metacritic", "playtime", "videogamesales"]

# Initialize the DataProfiler
profiler = DataProfiler()

for df, name in zip(datasets, names):
    profile = profiler.summary(df)

display(profile)

metacritic:
  Rows: 14,666
  Columns: 10
  Total nulls: 2,862
  Null percentage: 2.0%
  Null counts per column:
    critic_score: 7 (0.0%)
    user_score: 805 (5.5%)
    esrb_rating: 2,050 (14.0%)

playtime:
  Rows: 84,461
  Columns: 11
  Total nulls: 81,864
  Null percentage: 8.8%
  Null counts per column:
    developer: 9,904 (11.7%)
    publisher: 11,090 (13.1%)
    main_story_hour: 11,934 (14.1%)
    main_plus_sides_hour: 27,144 (32.1%)
    completionist_hour: 21,792 (25.8%)

videogamesales:
  Rows: 16,327
  Columns: 12
  Total nulls: 36
  Null percentage: 0.0%
  Null counts per column:
    publisher: 36 (0.2%)



{'rows': 16327,
 'columns': 12,
 'nulls_total': 36,
 'nulls_per_column': {'id': 0,
  'title': 0,
  'platform': 0,
  'release_year': 0,
  'publisher': 36,
  'genres': 0,
  'na_sales_mil': 0,
  'eu_sales_mil': 0,
  'jp_sales_mil': 0,
  'other_sales_mil': 0,
  'global_sales_mil': 0,
  'title_norm': 0},
 'dtypes': {'id': 'object',
  'title': 'object',
  'platform': 'object',
  'release_year': 'float64',
  'publisher': 'object',
  'genres': 'object',
  'na_sales_mil': 'float64',
  'eu_sales_mil': 'float64',
  'jp_sales_mil': 'float64',
  'other_sales_mil': 'float64',
  'global_sales_mil': 'float64',
  'title_norm': 'object'}}

### Attribute Coverage

In [338]:
coverage = profiler.analyze_coverage(
    datasets=datasets,
    include_samples=True,
    sample_count=3
)

print("ðŸ“Š Attribute coverage across datasets:")
display(coverage)

# Identify attributes suitable for entity matching
print("\nðŸ”— Attributes suitable for entity matching:")
matching_attrs = coverage[coverage['datasets_with_attribute'] >= 2]['attribute'].tolist()
print(f"Attributes available in 2+ datasets: {matching_attrs}")

[INFO ] PyDI.fusion.analysis - Analyzed 19 attributes across 3 datasets


ðŸ“Š Attribute coverage across datasets:


Unnamed: 0,attribute,metacritic_count,metacritic_pct,metacritic_coverage,metacritic_samples,playtime_count,playtime_pct,playtime_coverage,playtime_samples,videogamesales_count,videogamesales_pct,videogamesales_coverage,videogamesales_samples,avg_coverage,max_coverage,datasets_with_attribute
0,completionist_hour,0/0,0%,0.0,,62669/84461,74.2%,0.741987,"[15.83, 18.77, 37.4]",0/0,0%,0.0,,0.247329,0.741987,1
1,critic_score,14659/14666,100.0%,0.999523,"[95.0, 95.0, 95.0]",0/0,0%,0.0,,0/0,0%,0.0,,0.333174,0.999523,1
2,developer,14666/14666,100.0%,1.0,"['Valve Software', 'Konami', 'Rockstar Games']",74557/84461,88.3%,0.882739,"['Sonalysts', 'Twisted Pixel Games', 'CyberCon...",0/0,0%,0.0,,0.62758,1.0,2
3,esrb_rating,12616/14666,86.0%,0.860221,"['E10+', 'M', 'M']",0/0,0%,0.0,,0/0,0%,0.0,,0.28674,0.860221,1
4,eu_sales_mil,0/0,0%,0.0,,0/0,0%,0.0,,16327/16327,100.0%,1.0,"[29.02, 3.58, 12.88]",0.333333,1.0,1
5,genres,14666/14666,100.0%,1.0,"[array(['Action', 'Shooter', 'First-Person', '...",84461/84461,100.0%,1.0,"[array(['Simulation'], dtype=object), array(['...",16327/16327,100.0%,1.0,"[array(['Sports'], dtype=object), array(['Plat...",1.0,1.0,3
6,global_sales_mil,0/0,0%,0.0,,0/0,0%,0.0,,16327/16327,100.0%,1.0,"[82.74, 40.24, 35.82]",0.333333,1.0,1
7,id,14666/14666,100.0%,1.0,"['metacritic_1', 'metacritic_2', 'metacritic_3']",84461/84461,100.0%,1.0,"['playtime_1', 'playtime_2', 'playtime_3']",16327/16327,100.0%,1.0,"['sales_1', 'sales_2', 'sales_3']",1.0,1.0,3
8,jp_sales_mil,0/0,0%,0.0,,0/0,0%,0.0,,16327/16327,100.0%,1.0,"[3.77, 6.81, 3.79]",0.333333,1.0,1
9,main_plus_sides_hour,0/0,0%,0.0,,57317/84461,67.9%,0.678621,"[35.37, 9.23, 24.5]",0/0,0%,0.0,,0.226207,0.678621,1



ðŸ”— Attributes suitable for entity matching:
Attributes available in 2+ datasets: ['developer', 'genres', 'id', 'platform', 'publisher', 'release_year', 'title', 'title_norm']


### Entity Matching

In [339]:
BLOCK_EVAL_DIR = OUTPUT_DIR / "blocking_evaluation"
CORR_DIR = OUTPUT_DIR / "correspondences"

BLOCK_EVAL_DIR.mkdir(parents=True, exist_ok=True)
CORR_DIR.mkdir(parents=True, exist_ok=True)

In [340]:
from PyDI.entitymatching import (StandardBlocker,
                                 SortedNeighbourhoodBlocker,
                                 TokenBlocker,
                                 EmbeddingBlocker,
                                 RuleBasedMatcher,
                                 StringComparator,
                                 NumericComparator,
                                 EntityMatchingEvaluator)

In [341]:
train_p2m = load_parquet(
    MLDS_DIR / "train_PM.parquet",
    name="train_playtime_metacritic",
    add_index=False
)

test_p2m = load_parquet(
    MLDS_DIR / "test_PM.parquet",
    name="test_playtime_metacritic",
    add_index=False
)

train_p2s = load_parquet(
    MLDS_DIR / "train_PS.parquet",
    name="train_playtime_sales",
    add_index=False
)

test_p2s = load_parquet(
    MLDS_DIR / "test_PS.parquet",
    name="test_playtime_sales",
    add_index=False
)
train_p2m = train_p2m.rename(columns={"id_left": "id1", "id_right": "id2"})
test_p2m = test_p2m.rename(columns={"id_left": "id1", "id_right": "id2"})
train_p2s = train_p2s.rename(columns={"id_left": "id1", "id_right": "id2"})
test_p2s = test_p2s.rename(columns={"id_left": "id1", "id_right": "id2"})

### Blocking
#### Playtime -> Metacritic

In [342]:
st_blocker_p2m = StandardBlocker(
    playtime, metacritic,
    on=['title_norm'],
    batch_size=1000,
    output_dir=BLOCK_EVAL_DIR,
    id_column='id'
)

standard_candidates_p2m = st_blocker_p2m.materialize()

sn_blocker_p2m = SortedNeighbourhoodBlocker(
    playtime, metacritic,
    key='title_norm',
    window=20,
    batch_size=750,
    output_dir=BLOCK_EVAL_DIR,
    id_column='id'
)

sn_candidates_p2m = sn_blocker_p2m.materialize()



embedding_blocker_p2m = EmbeddingBlocker(
    playtime, metacritic,
    text_cols=['title_norm', 'platform','release_year'],
    model="sentence-transformers/all-MiniLM-L6-v2",
    index_backend="sklearn",
    top_k=10,
    batch_size=1000,
    threshold=0.7,
    output_dir=BLOCK_EVAL_DIR,
    id_column='id'
)

embedding_candidates_p2m = embedding_blocker_p2m.materialize()

[INFO ] PyDI.entitymatching.blocking.standard.StandardBlocker - created 42000 blocking keys for first dataset
[INFO ] PyDI.entitymatching.blocking.standard.StandardBlocker - created 9863 blocking keys for second dataset
[INFO ] PyDI.entitymatching.blocking.standard.StandardBlocker - created 6769 blocks from blocking keys
[INFO ] PyDI.entitymatching.blocking.standard.StandardBlocker - Debug results written to file: /Users/onurcanmemis/Desktop/web-data-integration-team-project/output/blocking_evaluation/debugResultsBlocking_StandardBlocker.csv
[INFO ] PyDI.entitymatching.blocking.sorted_neighbourhood.SortedNeighbourhoodBlocker - created sorted neighbourhood with window size 20
[INFO ] PyDI.entitymatching.blocking.sorted_neighbourhood.SortedNeighbourhoodBlocker - created 1 sorted sequence from 99127 records
[INFO ] PyDI.entitymatching.blocking.sorted_neighbourhood.SortedNeighbourhoodBlocker - Debug results written to file: /Users/onurcanmemis/Desktop/web-data-integration-team-project/outp

#### Playtime -> Sales

In [343]:
st_blocker_p2s = StandardBlocker(
    playtime, vgsales,
    on=['title_norm'],
    batch_size=1000,
    output_dir=BLOCK_EVAL_DIR,
    id_column='id'
)

standard_candidates_p2s = st_blocker_p2s.materialize()

sn_blocker_p2s = SortedNeighbourhoodBlocker(
    playtime, vgsales,
    key='title_norm',
    window=20,
    batch_size=750,
    output_dir=BLOCK_EVAL_DIR,
    id_column='id'
)

sn_candidates_p2s = sn_blocker_p2s.materialize()



embedding_blocker_p2s = EmbeddingBlocker(
    playtime, vgsales,
    text_cols=['title_norm', 'platform','release_year'],
    model="sentence-transformers/all-MiniLM-L6-v2",
    index_backend="sklearn",
    top_k=10,
    threshold=0.7,
    batch_size=1000,
    output_dir=BLOCK_EVAL_DIR,
    id_column='id'
)

embedding_candidates_p2s = embedding_blocker_p2s.materialize()

[INFO ] PyDI.entitymatching.blocking.standard.StandardBlocker - created 42000 blocking keys for first dataset
[INFO ] PyDI.entitymatching.blocking.standard.StandardBlocker - created 11332 blocking keys for second dataset
[INFO ] PyDI.entitymatching.blocking.standard.StandardBlocker - created 5720 blocks from blocking keys
[INFO ] PyDI.entitymatching.blocking.standard.StandardBlocker - Debug results written to file: /Users/onurcanmemis/Desktop/web-data-integration-team-project/output/blocking_evaluation/debugResultsBlocking_StandardBlocker.csv
[INFO ] PyDI.entitymatching.blocking.sorted_neighbourhood.SortedNeighbourhoodBlocker - created sorted neighbourhood with window size 20
[INFO ] PyDI.entitymatching.blocking.sorted_neighbourhood.SortedNeighbourhoodBlocker - created 1 sorted sequence from 100788 records
[INFO ] PyDI.entitymatching.blocking.sorted_neighbourhood.SortedNeighbourhoodBlocker - Debug results written to file: /Users/onurcanmemis/Desktop/web-data-integration-team-project/ou

### Evaluate Blocking
#### Playtime -> Metacritic

In [344]:
evaluator = EntityMatchingEvaluator()

p2m_blocking_candidates = {
    'StandardBlocking': [standard_candidates_p2m, st_blocker_p2m],
    'SortedNeighbourhoodBlocker': [sn_candidates_p2m, sn_blocker_p2m],
    'EmbeddingBlocking': [embedding_candidates_p2m, embedding_blocker_p2m]
}

In [345]:
p2m_results = []
for method_name, candidates in p2m_blocking_candidates.items():
    result = evaluator.evaluate_blocking(candidates[0],
                                         train_p2m,
                                         candidates[1],
                                         out_dir=BLOCK_EVAL_DIR)
    result['method'] = method_name
    result['dataset'] = 'p2m'
    p2m_results.append(result)

p2m_best = max(p2m_results, key=lambda x: (x['pair_completeness'], x['reduction_ratio']))
print(f"Best blocking for p2m: {p2m_best['method']} (PC: {p2m_best['pair_completeness']:.3f}, RR: {p2m_best['reduction_ratio']:.3f})")

[INFO ] root -   Pair Completeness: 0.947
[INFO ] root -   Pair Quality:      0.010
[INFO ] root -   Reduction Ratio:   1.000
[INFO ] root -   True Matches Found: 355/375
[INFO ] root - Blocking evaluation complete!
[INFO ] root -   Pair Completeness: 0.989
[INFO ] root -   Pair Quality:      0.001
[INFO ] root -   Reduction Ratio:   1.000
[INFO ] root -   True Matches Found: 371/375
[INFO ] root - Blocking evaluation complete!
[INFO ] root -   Pair Completeness: 0.995
[INFO ] root -   Pair Quality:      0.002
[INFO ] root -   Reduction Ratio:   1.000
[INFO ] root -   True Matches Found: 373/375
[INFO ] root - Blocking evaluation complete!


Best blocking for p2m: EmbeddingBlocking (PC: 0.995, RR: 1.000)


#### Playtime -> Sales

In [346]:
p2s_blocking_candidates = {
    'StandardBlocking': [standard_candidates_p2s, st_blocker_p2s],
    'SortedNeighbourhoodBlocker': [sn_candidates_p2s, sn_blocker_p2s],
    #'TokenBlocking': [token_candidates_p2s, token_blocker_p2s],
    'EmbeddingBlocking': [embedding_candidates_p2s, embedding_blocker_p2s]
}

In [347]:
p2s_results = []
for method_name, candidates in p2s_blocking_candidates.items():
    result = evaluator.evaluate_blocking(candidates[0],
                                         train_p2s,
                                         candidates[1],
                                         out_dir=BLOCK_EVAL_DIR)
    result['method'] = method_name
    result['dataset'] = 'p2m'
    p2s_results.append(result)

p2s_best = max(p2s_results, key=lambda x: (x['pair_completeness'], x['reduction_ratio']))
print(f"Best blocking for p2s: {p2s_best['method']} (PC: {p2s_best['pair_completeness']:.3f}, RR: {p2s_best['reduction_ratio']:.3f})")

[INFO ] root -   Pair Completeness: 0.749
[INFO ] root -   Pair Quality:      0.009
[INFO ] root -   Reduction Ratio:   1.000
[INFO ] root -   True Matches Found: 283/378
[INFO ] root - Blocking evaluation complete!
[INFO ] root -   Pair Completeness: 0.884
[INFO ] root -   Pair Quality:      0.001
[INFO ] root -   Reduction Ratio:   1.000
[INFO ] root -   True Matches Found: 334/378
[INFO ] root - Blocking evaluation complete!
[INFO ] root -   Pair Completeness: 0.955
[INFO ] root -   Pair Quality:      0.002
[INFO ] root -   Reduction Ratio:   1.000
[INFO ] root -   True Matches Found: 361/378
[INFO ] root - Blocking evaluation complete!


Best blocking for p2s: EmbeddingBlocking (PC: 0.955, RR: 1.000)


### Rule Based Matcher

In [348]:
common_comparators= [   
    StringComparator(
        column='title_norm',
        similarity_function='cosine'
    ),
    StringComparator(
        column='platform',
        similarity_function='identity',
        preprocess=str.lower
    ),
    NumericComparator(
    column="release_year",
    method="within_range",
    max_difference=0,   # exact match only
)]
comparators_p2m = common_comparators + [
    StringComparator(
        column='developer',
        similarity_function='cosine',
        preprocess=str.lower
    )
]
comparators_p2s= common_comparators + [
    StringComparator(
        column='publisher',
        similarity_function='jaccard',
        preprocess=str.lower
    )
]

In [349]:
matcher = RuleBasedMatcher()

correspondences_p2m ,debug_p2m= matcher.match(
    df_left=playtime,
    df_right=metacritic, 
    candidates=embedding_candidates_p2m,
    comparators=comparators_p2m,
    weights=[0.5, 0.2, 0.2, 0.1],
    threshold=0.80,
    id_column='id',
    debug=True
)

correspondences_p2s,debug_p2s = matcher.match(
    df_left=playtime,
    df_right=vgsales, 
    candidates=embedding_candidates_p2s,
    comparators=comparators_p2s,
    weights=[0.5, 0.2, 0.2, 0.1],
    threshold=0.80,
    id_column='id',
    debug=True
)

[INFO ] PyDI.entitymatching.rule_based.RuleBasedMatcher - Starting Entity Matching
[INFO ] PyDI.entitymatching.rule_based.RuleBasedMatcher - Blocking 84461 x 14666 elements
[INFO ] PyDI.entitymatching.rule_based.RuleBasedMatcher - Matching 84461 x 14666 elements after 0:00:0.035; 152428 blocked pairs (reduction ratio: 0.999876945683758)
[INFO ] PyDI.entitymatching.rule_based.RuleBasedMatcher - Entity Matching finished after 0:00:17.222; found 9094 correspondences.
[INFO ] PyDI.entitymatching.rule_based.RuleBasedMatcher - Starting Entity Matching
[INFO ] PyDI.entitymatching.rule_based.RuleBasedMatcher - Blocking 84461 x 16327 elements
[INFO ] PyDI.entitymatching.rule_based.RuleBasedMatcher - Matching 84461 x 16327 elements after 0:00:0.022; 152151 blocked pairs (reduction ratio: 0.9998896652794864)
[INFO ] PyDI.entitymatching.rule_based.RuleBasedMatcher - Entity Matching finished after 0:00:16.464; found 6617 correspondences.


In [350]:
debug_p2s.iloc[500:510]

Unnamed: 0,id1,id2,comparator_name,record1_value,record2_value,record1_preprocessed,record2_preprocessed,similarity,postprocessed_similarity
500,playtime_24,sales_8777,"StringComparator(title_norm, cosine)",007 racing,enthusia professional racing,007 racing,enthusia professional racing,0.408248,0.408248
501,playtime_24,sales_8777,"StringComparator(platform, identity)",PlayStation,PlayStation 2,playstation,playstation 2,0.0,0.0
502,playtime_24,sales_8777,"NumericComparator(release_year, within_range)",2000.0,2005.0,2000.0,2005.0,0.0,0.0
503,playtime_24,sales_8777,"StringComparator(publisher, jaccard)",Electronic Arts,Konami Digital Entertainment,electronic arts,konami digital entertainment,0.0,0.0
504,playtime_24,sales_10008,"StringComparator(title_norm, cosine)",007 racing,f1 racing championship,007 racing,f1 racing championship,0.408248,0.408248
505,playtime_24,sales_10008,"StringComparator(platform, identity)",PlayStation,PS,playstation,ps,0.0,0.0
506,playtime_24,sales_10008,"NumericComparator(release_year, within_range)",2000.0,2000.0,2000.0,2000.0,1.0,1.0
507,playtime_24,sales_10008,"StringComparator(publisher, jaccard)",Electronic Arts,Video System,electronic arts,video system,0.0,0.0
508,playtime_24,sales_2928,"StringComparator(title_norm, cosine)",007 racing,nascar 2001,007 racing,nascar 2001,0.0,0.0
509,playtime_24,sales_2928,"StringComparator(platform, identity)",PlayStation,PlayStation 2,playstation,playstation 2,0.0,0.0


In [351]:
cluster_analysis_dir = OUTPUT_DIR / "cluster_analysis"
cluster_analysis_dir.mkdir(parents=True, exist_ok=True)
cluster_distribution_p2m = EntityMatchingEvaluator.create_cluster_size_distribution(
    correspondences=correspondences_p2m,
    out_dir=cluster_analysis_dir
)
cluster_distribution_p2s = EntityMatchingEvaluator.create_cluster_size_distribution(
    correspondences=correspondences_p2s,
    out_dir=cluster_analysis_dir
)

[INFO ] root - Cluster Size Distribution of 8731 clusters:
[INFO ] root - 	Cluster Size	| Frequency	| Percentage
[INFO ] root - 	â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€
[INFO ] root - 		2	|	8497	|	97.32%
[INFO ] root - 		3	|	169	|	1.94%
[INFO ] root - 		4	|	42	|	0.48%
[INFO ] root - 		5	|	20	|	0.23%
[INFO ] root - 		6	|	1	|	0.01%
[INFO ] root - 		7	|	2	|	0.02%
[INFO ] root - Cluster size distribution written to /Users/onurcanmemis/Desktop/web-data-integration-team-project/output/cluster_analysis/cluster_size_distribution.csv
[INFO ] root - Cluster Size Distribution of 6232 clusters:
[INFO ] root - 	Cluster Size	| Frequency	| Percentage
[INFO ] root - 	â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€
[INFO ] root - 		2	|	5999	|	96.26%
[INFO ] root - 		3	|	169	|	2.71%
[INFO ] ro

### RB Matching Evaluation
#### Playtime -> Metacritic

In [352]:
debug_output_dir = OUTPUT_DIR / "debug_results_entity_matching"
debug_output_dir.mkdir(parents=True, exist_ok=True)

eval_results_p2m = EntityMatchingEvaluator.evaluate_matching(
    correspondences=correspondences_p2m,
    test_pairs=test_p2m,
    out_dir=debug_output_dir
)

display(eval_results_p2m)

[INFO ] root - Confusion Matrix:
[INFO ] root -   True Positives:  90
[INFO ] root -   True Negatives:  104
[INFO ] root -   False Positives: 1
[INFO ] root -   False Negatives: 4
[INFO ] root - Performance Metrics:
[INFO ] root -   Accuracy:  0.975
[INFO ] root -   Precision: 0.989
[INFO ] root -   Recall:    0.957
[INFO ] root -   F1-Score:  0.973


{'precision': 0.989010989010989,
 'recall': 0.9574468085106383,
 'f1': 0.972972972972973,
 'accuracy': 0.9748743718592965,
 'true_positives': 90,
 'false_positives': 1,
 'false_negatives': 4,
 'true_negatives': 104,
 'threshold_used': 0.0,
 'total_correspondences': 9094,
 'filtered_correspondences': 9094,
 'evaluation_timestamp': '2025-11-23T00:40:24.555816',
 'output_files': ['/Users/onurcanmemis/Desktop/web-data-integration-team-project/output/debug_results_entity_matching/matching_evaluation_summary.json',
  '/Users/onurcanmemis/Desktop/web-data-integration-team-project/output/debug_results_entity_matching/matching_detailed_results.csv']}

#### Playtime -> Sales

In [353]:
eval_results_p2s = EntityMatchingEvaluator.evaluate_matching(
    correspondences=correspondences_p2s,
    test_pairs=test_p2s,
    out_dir=debug_output_dir
)

display(eval_results_p2s)

[INFO ] root - Confusion Matrix:
[INFO ] root -   True Positives:  84
[INFO ] root -   True Negatives:  104
[INFO ] root -   False Positives: 0
[INFO ] root -   False Negatives: 11
[INFO ] root - Performance Metrics:
[INFO ] root -   Accuracy:  0.945
[INFO ] root -   Precision: 1.000
[INFO ] root -   Recall:    0.884
[INFO ] root -   F1-Score:  0.939


{'precision': 1.0,
 'recall': 0.8842105263157894,
 'f1': 0.9385474860335195,
 'accuracy': 0.9447236180904522,
 'true_positives': 84,
 'false_positives': 0,
 'false_negatives': 11,
 'true_negatives': 104,
 'threshold_used': 0.0,
 'total_correspondences': 6617,
 'filtered_correspondences': 6617,
 'evaluation_timestamp': '2025-11-23T00:40:24.903536',
 'output_files': ['/Users/onurcanmemis/Desktop/web-data-integration-team-project/output/debug_results_entity_matching/matching_evaluation_summary.json',
  '/Users/onurcanmemis/Desktop/web-data-integration-team-project/output/debug_results_entity_matching/matching_detailed_results.csv']}

### ML Based Matcher

In [354]:
from PyDI.entitymatching import FeatureExtractor

comparators_ml = [
    StringComparator(column='title_norm', similarity_function='cosine'),

    StringComparator(column='platform', similarity_function='identity', preprocess=str.lower),

    NumericComparator(column="release_year",method="within_range",max_difference=0),   # exact match only

]

feature_extractor = FeatureExtractor(comparators_ml)

train_features_p2m = feature_extractor.create_features(
    playtime, metacritic, train_p2m[['id1', 'id2']], labels=train_p2m['label'], id_column='id'
)

train_features_p2s = feature_extractor.create_features(
    playtime, vgsales, train_p2s[['id1', 'id2']], labels=train_p2s['label'], id_column='id'
)

feature_columns_p2m = [col for col in train_features_p2m.columns if col not in ['id1', 'id2', 'label']]
X_train_p2m = train_features_p2m[feature_columns_p2m]
y_train_p2m = train_features_p2m['label']

feature_columns_p2s = [col for col in train_features_p2s.columns if col not in ['id1', 'id2', 'label']]
X_train_p2s = train_features_p2s[feature_columns_p2s]
y_train_p2s = train_features_p2s['label']

training_datasets = [(X_train_p2m, y_train_p2m),(X_train_p2s, y_train_p2s)]

[INFO ] root - Label distribution: 375 positive, 420 negative
[INFO ] root - Label distribution: 378 positive, 418 negative


In [355]:
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import make_scorer, f1_score
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"
# classifiers
classifiers = {
    'RandomForestClassifier': RandomForestClassifier(random_state=42),
    'GradientBoostingClassifier': GradientBoostingClassifier(random_state=42),
    'SVC': SVC(probability=True, random_state=42),
    'LogisticRegression': LogisticRegression(max_iter=1000, random_state=42)
}

# parameter grids
param_grids = {
    'RandomForestClassifier': {
        'n_estimators': [50, 100, 200, 500],
        'max_depth': [None, 10, 20],
        'class_weight': ['balanced', None],
        'min_samples_split': [2, 5]
    },
    'GradientBoostingClassifier': {
        'n_estimators': [100, 200],
        'learning_rate': [0.05, 0.1],
        'max_depth': [3, 5]
    },
    'SVC': {
        'C': [0.1, 1, 10],
        'kernel': ['linear', 'rbf'],
        'gamma': ['scale', 'auto']
    },
    'LogisticRegression': {
        'C': [0.01, 0.1, 1, 10],
        'penalty': ['l2'],
        'solver': ['lbfgs', 'liblinear']
    }
}

scorer = make_scorer(f1_score)
cv_folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)


best_models = []  # one best model per dataset

for (X_train, y_train) in training_datasets:
    grid_search_results = {}
    best_overall_score = -1
    best_overall_model = None
    best_model_name = None

    for name, model in classifiers.items():
        print(f"Running GridSearchCV for {name}...")
        
        grid = GridSearchCV(
            estimator=model,
            param_grid=param_grids[name],
            scoring=scorer,
            cv=cv_folds,
            n_jobs=-1,
            verbose=0
        )
        
        grid.fit(X_train, y_train)
        print(
            f"{name}: best F1 = {grid.best_score_:.4f} "
            f"with params {grid.best_params_}"
        )
        
        grid_search_results[name] = {
            'grid_search': grid,
            'best_score': grid.best_score_,
            'best_params': grid.best_params_,
            'best_estimator': grid.best_estimator_
        }

        if grid.best_score_ > best_overall_score:
            best_overall_score = grid.best_score_   
            best_overall_model = grid.best_estimator_
            best_model_name = name

    print(f"Best model for this dataset: {best_model_name} with F1={best_overall_score:.4f}")
    best_models.append(best_overall_model)


Running GridSearchCV for RandomForestClassifier...
RandomForestClassifier: best F1 = 0.9801 with params {'class_weight': 'balanced', 'max_depth': None, 'min_samples_split': 5, 'n_estimators': 50}
Running GridSearchCV for GradientBoostingClassifier...
GradientBoostingClassifier: best F1 = 0.9762 with params {'learning_rate': 0.05, 'max_depth': 3, 'n_estimators': 100}
Running GridSearchCV for SVC...
SVC: best F1 = 0.9867 with params {'C': 0.1, 'gamma': 'scale', 'kernel': 'rbf'}
Running GridSearchCV for LogisticRegression...
LogisticRegression: best F1 = 0.9867 with params {'C': 10, 'penalty': 'l2', 'solver': 'lbfgs'}
Best model for this dataset: SVC with F1=0.9867
Running GridSearchCV for RandomForestClassifier...
RandomForestClassifier: best F1 = 0.9681 with params {'class_weight': None, 'max_depth': None, 'min_samples_split': 2, 'n_estimators': 200}
Running GridSearchCV for GradientBoostingClassifier...
GradientBoostingClassifier: best F1 = 0.9669 with params {'learning_rate': 0.05, 'm

In [356]:
display(best_models)

[SVC(C=0.1, probability=True, random_state=42),
 RandomForestClassifier(n_estimators=200, random_state=42)]

In [357]:
from PyDI.entitymatching import MLBasedMatcher

ml_matcher = MLBasedMatcher(feature_extractor)


correspondences_p2m = ml_matcher.match(
    playtime, metacritic,
    candidates=sn_blocker_p2m,
    id_column='id',
    trained_classifier=best_models[0],
    use_probabilities=True,
    threshold=0.8
)

correspondences_p2s = ml_matcher.match(
    playtime, vgsales,
    candidates=sn_blocker_p2s,
    id_column='id',
    trained_classifier=best_models[1],
    use_probabilities=True,
    threshold=0.8
)

[INFO ] PyDI.entitymatching.ml_based.MLBasedMatcher - Starting Entity Matching
[INFO ] PyDI.entitymatching.ml_based.MLBasedMatcher - Blocking 84461 x 14666 elements
[INFO ] PyDI.entitymatching.ml_based.MLBasedMatcher - Matching 84461 x 14666 elements after 0:00:0.249; 466456 blocked pairs (reduction ratio: 0.999623432544303)
[INFO ] PyDI.entitymatching.ml_based.MLBasedMatcher - Entity Matching finished after 0:00:64.477; found 8866 correspondences.
[INFO ] PyDI.entitymatching.ml_based.MLBasedMatcher - Starting Entity Matching
[INFO ] PyDI.entitymatching.ml_based.MLBasedMatcher - Blocking 84461 x 16327 elements
[INFO ] PyDI.entitymatching.ml_based.MLBasedMatcher - Matching 84461 x 16327 elements after 0:00:0.205; 482744 blocked pairs (reduction ratio: 0.9996499305011494)
[INFO ] PyDI.entitymatching.ml_based.MLBasedMatcher - Entity Matching finished after 0:00:67.888; found 125226 correspondences.


In [358]:
cluster_analysis_dir = OUTPUT_DIR / "cluster_analysis"
cluster_analysis_dir.mkdir(parents=True, exist_ok=True)
cluster_distribution_p2m = EntityMatchingEvaluator.create_cluster_size_distribution(
    correspondences=correspondences_p2m,
    out_dir=cluster_analysis_dir
)
cluster_distribution_p2s = EntityMatchingEvaluator.create_cluster_size_distribution(
    correspondences=correspondences_p2s,
    out_dir=cluster_analysis_dir
)

[INFO ] root - Cluster Size Distribution of 8561 clusters:
[INFO ] root - 	Cluster Size	| Frequency	| Percentage
[INFO ] root - 	â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€
[INFO ] root - 		2	|	8357	|	97.62%
[INFO ] root - 		3	|	156	|	1.82%
[INFO ] root - 		4	|	31	|	0.36%
[INFO ] root - 		5	|	15	|	0.18%
[INFO ] root - 		6	|	2	|	0.02%
[INFO ] root - Cluster size distribution written to /Users/onurcanmemis/Desktop/web-data-integration-team-project/output/cluster_analysis/cluster_size_distribution.csv
[INFO ] root - Cluster Size Distribution of 3540 clusters:
[INFO ] root - 	Cluster Size	| Frequency	| Percentage
[INFO ] root - 	â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€â”€
[INFO ] root - 		2	|	803	|	22.68%
[INFO ] root - 		3	|	354	|	10.00%
[INFO ] root - 		4	|	334	|	9.44%
[INFO ] 

### ML Matching Evaluation
#### Playtime -> Metacritic

In [359]:
eval_p2m = EntityMatchingEvaluator.evaluate_matching(
                correspondences=correspondences_p2m,
                test_pairs=test_p2m,
                out_dir=BLOCK_EVAL_DIR,
            )

display(eval_p2m)

[INFO ] root - Confusion Matrix:
[INFO ] root -   True Positives:  91
[INFO ] root -   True Negatives:  105
[INFO ] root -   False Positives: 0
[INFO ] root -   False Negatives: 3
[INFO ] root - Performance Metrics:
[INFO ] root -   Accuracy:  0.985
[INFO ] root -   Precision: 1.000
[INFO ] root -   Recall:    0.968
[INFO ] root -   F1-Score:  0.984


{'precision': 1.0,
 'recall': 0.9680851063829787,
 'f1': 0.9837837837837838,
 'accuracy': 0.9849246231155779,
 'true_positives': 91,
 'false_positives': 0,
 'false_negatives': 3,
 'true_negatives': 105,
 'threshold_used': 0.0,
 'total_correspondences': 8866,
 'filtered_correspondences': 8866,
 'evaluation_timestamp': '2025-11-23T00:43:04.001415',
 'output_files': ['/Users/onurcanmemis/Desktop/web-data-integration-team-project/output/blocking_evaluation/matching_evaluation_summary.json',
  '/Users/onurcanmemis/Desktop/web-data-integration-team-project/output/blocking_evaluation/matching_detailed_results.csv']}

#### Playtime -> Sales

In [360]:
eval_p2s = EntityMatchingEvaluator.evaluate_matching(
                correspondences=correspondences_p2s,
                test_pairs=test_p2s,
                out_dir=BLOCK_EVAL_DIR,
            )

display(eval_p2s)

[INFO ] root - Confusion Matrix:
[INFO ] root -   True Positives:  82
[INFO ] root -   True Negatives:  104
[INFO ] root -   False Positives: 0
[INFO ] root -   False Negatives: 13
[INFO ] root - Performance Metrics:
[INFO ] root -   Accuracy:  0.935
[INFO ] root -   Precision: 1.000
[INFO ] root -   Recall:    0.863
[INFO ] root -   F1-Score:  0.927


{'precision': 1.0,
 'recall': 0.8631578947368421,
 'f1': 0.9265536723163842,
 'accuracy': 0.9346733668341709,
 'true_positives': 82,
 'false_positives': 0,
 'false_negatives': 13,
 'true_negatives': 104,
 'threshold_used': 0.0,
 'total_correspondences': 125226,
 'filtered_correspondences': 125226,
 'evaluation_timestamp': '2025-11-23T00:43:08.034878',
 'output_files': ['/Users/onurcanmemis/Desktop/web-data-integration-team-project/output/blocking_evaluation/matching_evaluation_summary.json',
  '/Users/onurcanmemis/Desktop/web-data-integration-team-project/output/blocking_evaluation/matching_detailed_results.csv']}