In [39]:
from pathlib import Path
import pandas as pd
pd.set_option("display.max_columns", None) 

ROOT = Path.cwd()

DATA_DIR = ROOT / "parquet"
MLDS_DIR = ROOT / "ml-datasets"
OUTPUT_DIR = ROOT / "output"
LOGS = ROOT / "logs"
BLOCK_EVAL_DIR = OUTPUT_DIR / "blocking_evaluation"
CORR_DIR = OUTPUT_DIR / "correspondences"

BLOCK_EVAL_DIR.mkdir(parents=True, exist_ok=True)
CORR_DIR.mkdir(parents=True, exist_ok=True)

LOGS.mkdir(parents=True, exist_ok=True)
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

In [40]:
from PyDI.io import load_parquet
import re, unicodedata



metacritic = load_parquet(
    DATA_DIR / "df_metacritic.parquet",
    name="metacritic"
)

playtime = load_parquet(
    DATA_DIR / "df_playtime.parquet",
    name="playtime"
)

vgsales = load_parquet(
  DATA_DIR / "df_videogamesales.parquet",
  name="videogamesales"

)

In [41]:
def normalize_title(text):
    if not isinstance(text, str):
        return ""
    # remove accents/diacritics (Ã± -> n)
    text = unicodedata.normalize("NFKD", text).encode("ASCII", "ignore").decode("ASCII")
    cleaned = re.sub(r"[^a-zA-Z0-9]+", " ", text)
    return cleaned.lower().strip()


metacritic["title_norm"] = metacritic["title"].apply(normalize_title)
playtime["title_norm"] = playtime["title"].apply(normalize_title)
vgsales["title_norm"] = vgsales["title"].apply(normalize_title)

In [42]:
from PyDI.entitymatching import (StandardBlocker,
                                 SortedNeighbourhoodBlocker,
                                 TokenBlocker,
                                 EmbeddingBlocker,
                                 RuleBasedMatcher,
                                 StringComparator,
                                 NumericComparator,
                                 EntityMatchingEvaluator)

In [43]:
train_p2m = load_parquet(
    MLDS_DIR / "train_PM.parquet",
    name="train_playtime_metacritic",
    add_index=False
)

test_p2m = load_parquet(
    MLDS_DIR / "test_PM.parquet",
    name="test_playtime_metacritic",
    add_index=False
)

train_p2s = load_parquet(
    MLDS_DIR / "train_PS.parquet",
    name="train_playtime_sales",
    add_index=False
)

test_p2s = load_parquet(
    MLDS_DIR / "test_PS.parquet",
    name="test_playtime_sales",
    add_index=False
)
train_p2m = train_p2m.rename(columns={"id_left": "id1", "id_right": "id2"})
test_p2m = test_p2m.rename(columns={"id_left": "id1", "id_right": "id2"})
train_p2s = train_p2s.rename(columns={"id_left": "id1", "id_right": "id2"})
test_p2s = test_p2s.rename(columns={"id_left": "id1", "id_right": "id2"})

In [44]:
st_blocker_p2m = StandardBlocker(
    playtime, metacritic,
    on=['title','developer'],
    batch_size=1000,
    output_dir=BLOCK_EVAL_DIR,
    id_column='id'
)

standard_candidates_p2m = st_blocker_p2m.materialize()

sn_blocker_p2m = SortedNeighbourhoodBlocker(
    playtime, metacritic,
    key='title',
    window=20,
    batch_size=750,
    output_dir=BLOCK_EVAL_DIR,
    id_column='id'
)

sn_candidates_p2m = sn_blocker_p2m.materialize()


embedding_blocker_p2m = EmbeddingBlocker(
    playtime, metacritic,
    text_cols=['title', 'platform','release_year'],
    model="sentence-transformers/all-MiniLM-L6-v2",
    index_backend="sklearn",
    top_k=10,
    batch_size=500,
    threshold=0.8,
    output_dir=BLOCK_EVAL_DIR,
    id_column='id'
)

embedding_candidates_p2m = embedding_blocker_p2m.materialize()

st_blocker_p2s = StandardBlocker(
    playtime, vgsales,
    on=['title','publisher'],
    batch_size=1000,
    output_dir=BLOCK_EVAL_DIR,
    id_column='id'
)

standard_candidates_p2s = st_blocker_p2s.materialize()

sn_blocker_p2s = SortedNeighbourhoodBlocker(
    playtime, vgsales,
    key='title',
    window=20,
    batch_size=750,
    output_dir=BLOCK_EVAL_DIR,
    id_column='id'
)

sn_candidates_p2s = sn_blocker_p2s.materialize()

embedding_blocker_p2s = EmbeddingBlocker(
    playtime, vgsales,
    text_cols=['title', 'platform','release_year'],
    model="sentence-transformers/all-MiniLM-L6-v2",
    index_backend="sklearn",
    top_k=10,
    threshold=0.8,
    batch_size=500,
    output_dir=BLOCK_EVAL_DIR,
    id_column='id'
)

embedding_candidates_p2s = embedding_blocker_p2s.materialize()

In [45]:
from PyDI.entitymatching import FeatureExtractor

comparators_ml = [
    StringComparator(column='title_norm', similarity_function='cosine'),
    StringComparator(column='title_norm', similarity_function='jaro_winkler'),

    StringComparator(column='platform', similarity_function='identity', preprocess=str.lower),

    NumericComparator(column="release_year",method="within_range",max_difference=0),   # exact match only

]

feature_extractor = FeatureExtractor(comparators_ml)

train_features_p2m = feature_extractor.create_features(
    playtime, metacritic, train_p2m[['id1', 'id2']], labels=train_p2m['label'], id_column='id'
)

train_features_p2s = feature_extractor.create_features(
    playtime, vgsales, train_p2s[['id1', 'id2']], labels=train_p2s['label'], id_column='id'
)

feature_columns_p2m = [col for col in train_features_p2m.columns if col not in ['id1', 'id2', 'label']]
X_train_p2m = train_features_p2m[feature_columns_p2m]
y_train_p2m = train_features_p2m['label']

feature_columns_p2s = [col for col in train_features_p2s.columns if col not in ['id1', 'id2', 'label']]
X_train_p2s = train_features_p2s[feature_columns_p2s]
y_train_p2s = train_features_p2s['label']

training_datasets = [(X_train_p2m, y_train_p2m),(X_train_p2s, y_train_p2s)]

In [46]:
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import make_scorer, f1_score
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"
# classifiers
classifiers = {
    'RandomForestClassifier': RandomForestClassifier(random_state=42),
    'GradientBoostingClassifier': GradientBoostingClassifier(random_state=42),
    'SVC': SVC(probability=True, random_state=42),
    'LogisticRegression': LogisticRegression(max_iter=1000, random_state=42)
}

# parameter grids
param_grids = {
    'RandomForestClassifier': {
        'n_estimators': [50, 100, 200, 500],
        'max_depth': [None, 10, 20],
        'class_weight': ['balanced', None],
        'min_samples_split': [2, 5]
    },
    'GradientBoostingClassifier': {
        'n_estimators': [100, 200],
        'learning_rate': [0.05, 0.1],
        'max_depth': [3, 5]
    },
    'SVC': {
        'C': [0.1, 1, 10],
        'kernel': ['linear', 'rbf'],
        'gamma': ['scale', 'auto']
    },
    'LogisticRegression': {
        'C': [0.01, 0.1, 1, 10],
        'penalty': ['l2'],
        'solver': ['lbfgs', 'liblinear']
    }
}

scorer = make_scorer(f1_score)
cv_folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)


best_models = []  # one best model per dataset

for (X_train, y_train) in training_datasets:
    grid_search_results = {}
    best_overall_score = -1
    best_overall_model = None
    best_model_name = None

    for name, model in classifiers.items():
        print(f"Running GridSearchCV for {name}...")
        
        grid = GridSearchCV(
            estimator=model,
            param_grid=param_grids[name],
            scoring=scorer,
            cv=cv_folds,
            n_jobs=-1,
            verbose=0
        )
        
        grid.fit(X_train, y_train)
        print(
            f"{name}: best F1 = {grid.best_score_:.4f} "
            f"with params {grid.best_params_}"
        )
        
        grid_search_results[name] = {
            'grid_search': grid,
            'best_score': grid.best_score_,
            'best_params': grid.best_params_,
            'best_estimator': grid.best_estimator_
        }

        if grid.best_score_ > best_overall_score:
            best_overall_score = grid.best_score_   
            best_overall_model = grid.best_estimator_
            best_model_name = name

    print(f"Best model for this dataset: {best_model_name} with F1={best_overall_score:.4f}")
    best_models.append(best_overall_model)


Running GridSearchCV for RandomForestClassifier...
RandomForestClassifier: best F1 = 0.9854 with params {'class_weight': 'balanced', 'max_depth': None, 'min_samples_split': 5, 'n_estimators': 200}
Running GridSearchCV for GradientBoostingClassifier...
GradientBoostingClassifier: best F1 = 0.9828 with params {'learning_rate': 0.05, 'max_depth': 3, 'n_estimators': 100}
Running GridSearchCV for SVC...
SVC: best F1 = 0.9881 with params {'C': 1, 'gamma': 'auto', 'kernel': 'rbf'}
Running GridSearchCV for LogisticRegression...
LogisticRegression: best F1 = 0.9869 with params {'C': 0.01, 'penalty': 'l2', 'solver': 'lbfgs'}
Best model for this dataset: SVC with F1=0.9881
Running GridSearchCV for RandomForestClassifier...
RandomForestClassifier: best F1 = 0.9646 with params {'class_weight': 'balanced', 'max_depth': None, 'min_samples_split': 5, 'n_estimators': 50}
Running GridSearchCV for GradientBoostingClassifier...
GradientBoostingClassifier: best F1 = 0.9646 with params {'learning_rate': 0.0

In [55]:
from PyDI.entitymatching import MLBasedMatcher

ml_matcher = MLBasedMatcher(feature_extractor)


correspondences_p2m = ml_matcher.match(
    playtime, metacritic,
    candidates=embedding_candidates_p2m,
    id_column='id',
    trained_classifier=best_models[0],
    use_probabilities=True,
    threshold=0.85
)

correspondences_p2s = ml_matcher.match(
    playtime, vgsales,
    candidates=embedding_candidates_p2s,
    id_column='id',
    trained_classifier=best_models[1],
    use_probabilities=True,
    threshold=0.85
)

In [56]:
cluster_analysis_dir = OUTPUT_DIR / "cluster_analysis"
cluster_analysis_dir.mkdir(parents=True, exist_ok=True)
cluster_distribution_p2m = EntityMatchingEvaluator.create_cluster_size_distribution(
    correspondences=correspondences_p2m,
    out_dir=cluster_analysis_dir
)
cluster_distribution_p2s = EntityMatchingEvaluator.create_cluster_size_distribution(
    correspondences=correspondences_p2s,
    out_dir=cluster_analysis_dir
)
print(f"\nðŸ“Š Cluster Size Distribution Results (Playtime -> Metacritic):")
display(cluster_distribution_p2m)
print(f"\nðŸ“Š Cluster Size Distribution Results (Playtime -> VGSales):")
display(cluster_distribution_p2s)


ðŸ“Š Cluster Size Distribution Results (Playtime -> Metacritic):


Unnamed: 0,cluster_size,frequency,percentage
0,2,3581,49.122085
1,3,941,12.908093
2,4,791,10.85048
3,5,515,7.064472
4,6,499,6.844993
5,7,272,3.731139
6,8,220,3.017833
7,9,129,1.769547
8,10,70,0.960219
9,11,69,0.946502



ðŸ“Š Cluster Size Distribution Results (Playtime -> VGSales):


Unnamed: 0,cluster_size,frequency,percentage
0,2,5915,92.508602
1,3,329,5.145449
2,4,93,1.454489
3,5,31,0.48483
4,6,12,0.187676
5,7,7,0.109478
6,8,5,0.078198
7,9,2,0.031279


In [57]:
from PyDI.entitymatching import MaximumBipartiteMatching, StableMatching

# We are using Maxmimum Bipartite Matching to refine results to 1:1 matches
clusterer = MaximumBipartiteMatching()
mbm_correspondences_p2m = clusterer.cluster(correspondences_p2m)
mbm_correspondences_p2s = clusterer.cluster(correspondences_p2s)
cluster_distribution_p2m = EntityMatchingEvaluator.create_cluster_size_distribution(
    correspondences=mbm_correspondences_p2m
)
cluster_distribution_p2s = EntityMatchingEvaluator.create_cluster_size_distribution(
    correspondences=mbm_correspondences_p2s
)
print(f"\nðŸ“Š Cluster Size Distribution Results (Playtime -> Metacritic):")
display(cluster_distribution_p2m)
print(f"\nðŸ“Š Cluster Size Distribution Results (Playtime -> VGSales):")
display(cluster_distribution_p2s)


ðŸ“Š Cluster Size Distribution Results (Playtime -> Metacritic):


Unnamed: 0,cluster_size,frequency,percentage
0,2,11059,100.0



ðŸ“Š Cluster Size Distribution Results (Playtime -> VGSales):


Unnamed: 0,cluster_size,frequency,percentage
0,2,6549,100.0


In [63]:
import numpy as np
all_correspondences = pd.concat([mbm_correspondences_p2m, mbm_correspondences_p2s], ignore_index=True)


datasets = [playtime, metacritic, vgsales]

for i, df in enumerate(datasets):
    df["genres"] = df["genres"].apply(
        lambda x: x.tolist() if isinstance(x, np.ndarray) else x
    )
    datasets[i] = df

len(all_correspondences)

17608

In [64]:
golden_fusion_dataset=pd.read_csv(MLDS_DIR / "Golden_Fusion_Dataset.csv")
golden_fusion_dataset.head(2)

Unnamed: 0,id,title_provenance,title,platform_provenance,platform,release_year_provenance,release_year,developer_provenance,developer,publisher_provenance,publisher,genres_provenance,genres,global_sales_mil_provenance,global_sales_mil,main_story_hours_provenance,main_story_hour,main_plus_sides_hour_provenance,main_plus_sides_hour,completionist_hour_provenance,completionist_hour,critic_score_provenance,critic_score,esrb_rating_provenance,esrb_rating
0,playtime_21+sales_5232+metacritic_14639,metacritic_14639,007 Legends,metacritic_14639,PlayStation 3,metacritic_14639,2012,playtime_21,Eurocom,playtime_21,Activision,playtime_21+sales_5232+metacritic_14639,"['First-Person', 'Shooter', 'Action', 'Modern'...",sales_5232,0.36,playtime_21,6.7,playtime_21,7.95,playtime_21,9.58,metacritic_14639,41,metacritic_14639,T
1,playtime_4075+sales_841+metacritic_261,metacritic_261,Dead Space,metacritic_261,PlayStation 3,metacritic_261,2008,playtime_4075,Visceral Games,playtime_4075,Electronic Arts,playtime_4075+sales_841+metacritic_261,"['Horror', 'Shooter', 'Survival', 'Third-Perso...",sales_841,2.02,playtime_4075,11.07,playtime_4075,13.18,playtime_4075,20.42,metacritic_261,88,metacritic_261,M


In [65]:
from PyDI.fusion import (DataFusionStrategy,
                         DataFusionEngine,
                         longest_string,
                         union,
                         prefer_higher_trust,
                         voting)

metacritic.attrs["trust_score"] = 3
playtime.attrs["trust_score"] = 2
vgsales.attrs["trust_score"] = 1

# merge rule based correspondences
all_ml_correspondences = pd.concat([correspondences_p2m, correspondences_p2s], ignore_index=True)

# define data fusion strategy
strategy = DataFusionStrategy('video_games_fusion_strategy')

strategy.add_attribute_fuser('title', prefer_higher_trust, trust_key="trust_score")
strategy.add_attribute_fuser('platform', voting)
strategy.add_attribute_fuser('release_year', voting)
strategy.add_attribute_fuser('genres', union)
strategy.add_attribute_fuser('developer', prefer_higher_trust, trust_key="trust_score")
strategy.add_attribute_fuser('publisher', prefer_higher_trust, trust_key="trust_score")
strategy.add_attribute_fuser('critic_score', prefer_higher_trust, trust_key="trust_score")
strategy.add_attribute_fuser('main_story_hour', prefer_higher_trust, trust_key="trust_score")
strategy.add_attribute_fuser('global_sales_mil', prefer_higher_trust, trust_key="trust_score")


# run fusion
engine = DataFusionEngine(strategy, debug=True, debug_format='json',
                          debug_file="output/data_fusion/debug_fusion_ml_standard_blocker.jsonl")

fused_dataset = engine.run(
    datasets=datasets,
    correspondences=all_correspondences,
    id_column="id",
    include_singletons=False,
)
def collect_ids_ordered(meta, prefixes=("playtime_", "sales_", "metacritic_")):
    buckets = {p: [] for p in prefixes}

    def add(s):
        if isinstance(s, str):
            for p in prefixes:
                if s.startswith(p) and s not in buckets[p]:
                    buckets[p].append(s)
                    break

    def walk(obj):
        if isinstance(obj, dict):
            for v in obj.values():
                walk(v)
        elif isinstance(obj, list):
            for v in obj:
                walk(v)
        else:
            add(obj)

    walk(meta)
    ordered = []
    for p in prefixes:
        ordered.extend(buckets[p])
    return "+".join(ordered)

# Apply to a DataFrame column of metadata dicts
fused_dataset["fusion_id"] = fused_dataset["_fusion_metadata"].apply(collect_ids_ordered)
fused_dataset["_id"]= fused_dataset["fusion_id"]
fused_dataset.drop(columns=["fusion_id"], inplace=True)

print(f'Fused rows: {len(fused_dataset):,}')
fused_dataset.sample(2)

Fused rows: 14,895


Unnamed: 0,_id,_fusion_group_id,_fusion_sources,completionist_hour,main_plus_sides_hour,critic_score,publisher,global_sales_mil,main_story_hour,jp_sales_mil,release_year,user_score,developer,platform,na_sales_mil,title_norm,title,genres,esrb_rating,other_sales_mil,eu_sales_mil,id,_fusion_confidence,_fusion_metadata
2522,playtime_62830+metacritic_14318,group_2522,"[metacritic, playtime]",17.97,9.77,58.0,2K Sports,,8.8,,2020.0,5.2,2K Games,Xbox One,,wwe 2k battlegrounds,WWE 2K Battlegrounds,"[Combat, Fighting, Individual, Sports, Wrestling]",T,,,metacritic_14318,0.769231,"{'release_year_rule': 'voting', 'release_year_..."
11792,playtime_47724+metacritic_3017,group_11792,"[metacritic, playtime]",6.32,32.0,81.0,Rockstar Games,,7.6,,2019.0,7.0,Rockstar Games,PlayStation 4,,l a noire the vr case files,L.A. Noire: The VR Case Files,"[3D, Action, Adventure, First-Person, Virtual ...",M,,,metacritic_3017,0.769231,"{'release_year_rule': 'voting', 'release_year_..."


In [66]:
from PyDI.fusion import tokenized_match, boolean_match,numeric_tolerance_match,set_equality_match
from PyDI.fusion import DataFusionEvaluator
import numpy as np
import re, ast, numpy as np, pandas as pd


def categories_set_equal(a, b) -> bool:
    """Return True if a and b contain the same unique categories (order/type agnostic)."""
    def to_set(x):
        def items(v):
            # missing
            if v is None or (isinstance(v, float) and np.isnan(v)): return []
            # numpy array â†’ recurse over elements
            if isinstance(v, np.ndarray): 
                out=[]; [out.extend(items(e)) for e in v.flatten()]; return out
            # python containers â†’ recurse over elements
            if isinstance(v, (list, tuple, set)):
                out=[]; [out.extend(items(e)) for e in v]; return out
            # scalar/string: try parse stringified list; else split by delimiters
            s = str(v).strip()
            if s == "" or s.lower() in {"nan","none"}: return []
            try:
                parsed = ast.literal_eval(s)
                if isinstance(parsed, (list, tuple, set)): return items(parsed)
            except Exception:
                pass
            return [p.strip() for p in re.split(r"[|,;/]", s) if p.strip()]
        return {it.lower() for it in items(x)}
    return to_set(a) == to_set(b)

strategy.add_evaluation_function("title", tokenized_match)
strategy.add_evaluation_function("platform", tokenized_match)
strategy.add_evaluation_function("release_year", tokenized_match)
strategy.add_evaluation_function("genres", categories_set_equal)
strategy.add_evaluation_function("developer", tokenized_match)
strategy.add_evaluation_function("publisher", tokenized_match)
strategy.add_evaluation_function("critic_score", numeric_tolerance_match)
strategy.add_evaluation_function("main_story_hour", numeric_tolerance_match)
strategy.add_evaluation_function("global_sales_mil", numeric_tolerance_match)


# Create evaluator with our fusion strategy
evaluator = DataFusionEvaluator(strategy, debug=True, debug_file=OUTPUT_DIR / "data_fusion" / "debug_fusion_eval.jsonl", debug_format="json")

# Evaluate the fused results against the gold standard
print("Evaluating fusion results against gold standard...")
evaluation_results = evaluator.evaluate(
    fused_df=fused_dataset,
    fused_id_column='_id',
    gold_df=golden_fusion_dataset,
    gold_id_column='id',
)

# Display evaluation metrics
print("\nFusion Evaluation Results:")
print("=" * 40)
for metric, value in evaluation_results.items():
    if isinstance(value, float):
        print(f"  {metric}: {value:.3f}")
    else:
        print(f"  {metric}: {value}")
        
print(f"\nOverall Accuracy: {evaluation_results.get('overall_accuracy', 0):.1%}")



Evaluating fusion results against gold standard...

Fusion Evaluation Results:
  overall_accuracy: 0.426
  macro_accuracy: 0.428
  num_evaluated_records: 22
  num_evaluated_attributes: 12
  total_evaluations: 258
  total_correct: 110
  main_story_hour_accuracy: 0.524
  main_story_hour_count: 21
  release_year_accuracy: 0.000
  release_year_count: 22
  completionist_hour_accuracy: 0.500
  completionist_hour_count: 18
  main_plus_sides_hour_accuracy: 0.524
  main_plus_sides_hour_count: 21
  critic_score_accuracy: 0.500
  critic_score_count: 22
  publisher_accuracy: 0.455
  publisher_count: 22
  title_accuracy: 0.455
  title_count: 22
  genres_accuracy: 0.500
  genres_count: 22
  esrb_rating_accuracy: 0.500
  esrb_rating_count: 22
  developer_accuracy: 0.182
  developer_count: 22
  platform_accuracy: 0.500
  platform_count: 22
  global_sales_mil_accuracy: 0.500
  global_sales_mil_count: 22

Overall Accuracy: 42.6%
