# Passive Learning 
- by Python Record Linkage toolkit


## Preprocessing

In [12]:
import pandas as pd
import recordlinkage as rl
from recordlinkage.compare import Exact, String, Numeric, Date
from sklearn.ensemble import RandomForestClassifier
import numpy as np

pd.set_option('display.max_columns', None)     # Show all columns
pd.set_option('display.max_colwidth', None)    # Show full column content
pd.set_option('display.width', None)           # No width limit
pd.set_option('display.max_rows', None)        # Show all rows


# Load the CSV file
birth_path = 'linkage data/_1/clean/birth_records.csv'
birth_data = pd.read_csv(birth_path)

death_path = 'linkage data/_1/clean/death_records.csv'
death_data = pd.read_csv(death_path)

marriage_path = 'linkage data/_1/clean/marriage_records.csv'
marriage_data = pd.read_csv(marriage_path)

# Display the data
birth_data.head()  # Display the first few rows of the dataset
death_data.head()
marriage_data.head()

Unnamed: 0,ID,day,month,year,place of marriage,forename of groom,surname of groom,occupation of groom,marital status of groom,age of groom,address of groom,forename of bride,surname of bride,occupation of bride,marital status of bride,age of bride,address of bride,groom's father's forename,groom's father's surname,groom's father's occupation,if groom's father deceased,groom's mother's forename,groom's mother's maiden surname,if groom's mother deceased,bride's father's forename,bride's father's surname,bride's father's occupation,if bride's father deceased,bride's mother's forename,bride's mother's maiden surname,if bride's mother deceased,notes1,gdeath,bdeath,GROOM_IDENTITY,BRIDE_IDENTITY,GROOM_MOTHER_IDENTITY,GROOM_FATHER_IDENTITY,BRIDE_MOTHER_IDENTITY,BRIDE_FATHER_IDENTITY,GROOM_BIRTH_RECORD_IDENTITY,BRIDE_BIRTH_RECORD_IDENTITY,GROOM_FATHER_BIRTH_RECORD_IDENTITY,GROOM_MOTHER_BIRTH_RECORD_IDENTITY,BRIDE_FATHER_BIRTH_RECORD_IDENTITY,BRIDE_MOTHER_BIRTH_RECORD_IDENTITY
0,2051,13,MAY,1970,,Ethan,Navarro,IRON MOULDER,B,22,"1 Braeview, South Broomage, Stenhousemuir",Helen,Lopez,HAWKER,S,31,"1 Braeview, South Broomage, Stenhousemuir",William,Navarro,WEST INDIA MERCHANT,,Clara,Olsson,,James,Lopez,BRACE MAKER,,Sarah,Andersson,,SYNTHETIC DATA PRODUCED USING VALIPOP,6254,5938,6254,5938,6253,6252,5214,4935,6254,5938,6252,6253,4935,5214
1,2052,16,JANUARY,1970,Eden House Banff,Chas,Harutyunyan,BOOT & SHOE MAKER,B,25,"481 Low Road, Highland",Ann,Dumitru,HOUSE SERVT,S,21,"58 Douglas Square, Scottish Borders",Peter,Harutyunyan,MERCHANT,,Ann,Hernandez,,John,Dumitru,OFFICER OF INLAND REVNUE (EXCISE),,Isabella,Larsson,,SYNTHETIC DATA PRODUCED USING VALIPOP,5933,6048,5933,6048,5054,4711,5296,5205,5933,6048,4711,5054,5205,5296
2,2053,18,MAY,1970,Dumfries and Galloway,George,Harutyunyan,UPHOLSTERER APPRENTICE,B,28,"1 Newlands Lane North, Cove Bay, Aberdeen City",Kate,Babic,DOMESTIC SERVANT,S,23,"1 Blackpark View, Stranraer, Dumfries and Galloway",Peter,Harutyunyan,COAL MINER,,Catherin,Nguyen,,Robert,Babic,COAL MINER,,Agnes,Visser,,SYNTHETIC DATA PRODUCED USING VALIPOP,5857,6044,5857,6044,4908,4803,5186,4393,5857,6044,4803,4908,4393,5186
3,2054,20,FEBRUARY,1970,Hownam Grange Scottish Borders,Angus,Dumitru,SCHOLAR,B,18,"58 Douglas Square, Scottish Borders",Margaret,Wozniak,SCHOLAR,S,19,"1592 Castle Road, South Ayrshire",John,Dumitru,OFFICER OF INLAND REVNUE (EXCISE),,Isabella,Larsson,,Andrew,Wozniak,LATH SPLITTER,,Margaret,Szabo,,SYNTHETIC DATA PRODUCED USING VALIPOP,6169,6118,6169,6118,5296,5205,4870,4817,6169,6118,5205,5296,4817,4870
4,2055,15,OCTOBER,1970,Allanbank Lauder,Lukas,Kuznetsov,SCHOLAR,B,20,"1 Springbank Court, Parkhead, Glasgow City",Helen,Mohan,SEWER AT WHITE SEAM,S,21,"1 Balgonie Woods, Ferguslie Park, Paisley",Tim,Kuznetsov,,,Anna,Gonzalez,,Josef,Mohan,FISHERMAN,,Elena,Svensson,,SYNTHETIC DATA PRODUCED USING VALIPOP,6567,6120,6567,6120,6566,6565,5650,5649,6567,6120,6565,6566,5649,5650


In [13]:

print(f"Birth records: {len(birth_data)} rows")
print(f"Death records: {len(death_data)} rows") 
print(f"Marriage records: {len(marriage_data)} rows")

# Display basic info about each dataset
print("\n=== Birth Data Columns ===")
print(birth_data.columns.tolist())
print("\n=== Death Data Columns ===")
print(death_data.columns.tolist())
print("\n=== Marriage Data Columns ===")
print(marriage_data.columns.tolist())

Birth records: 2508 rows
Death records: 1832 rows
Marriage records: 711 rows

=== Birth Data Columns ===
['ID', 'family', 'marriage', "child's forname(s)", "child's surname", 'birth day', 'birth month', 'birth year', 'address', 'sex', "father's forename", "father's surname", "father's occupation", "mother's forename", "mother's maiden surname", "mother's occupation", "day of parents' marriage", "month of parents' marriage", "year of parents' marriage", "place of parent's marriage", 'illegit', 'notes', 'Death', 'CHILD_IDENTITY', 'MOTHER_IDENTITY', 'FATHER_IDENTITY', 'DEATH_RECORD_IDENTITY', 'PARENT_MARRIAGE_RECORD_IDENTITY', 'FATHER_BIRTH_RECORD_IDENTITY', 'MOTHER_BIRTH_RECORD_IDENTITY', 'MARRIAGE_RECORD_IDENTITY1', 'MARRIAGE_RECORD_IDENTITY2', 'MARRIAGE_RECORD_IDENTITY3', 'MARRIAGE_RECORD_IDENTITY4', 'MARRIAGE_RECORD_IDENTITY5']

=== Death Data Columns ===
['ID', 'forename(s) of deceased', 'surname of deceased', 'occupation', 'marital status', 'sex', 'name of spouse', "spouse's occ", '

### Preprocessing data function

In [14]:
def preprocess_data(df):
    """Clean and standardize data for better matching"""
    df_clean = df.copy()
    
    # Convert string columns to lowercase and strip whitespace
    string_cols = df_clean.select_dtypes(include=['object']).columns
    for col in string_cols:
        df_clean[col] = df_clean[col].astype(str).str.lower().str.strip()
        # Replace 'nan' string with actual NaN
        df_clean[col] = df_clean[col].replace('nan', np.nan)
    
    return df_clean

# Preprocess all datasets
birth_clean = preprocess_data(birth_data)
death_clean = preprocess_data(death_data)
marriage_clean = preprocess_data(marriage_data)

## Indexing and Comparing
- link 2 datasets

### Birth-Death linkage

In [None]:
# 1. BIRTH-DEATH LINKAGE

def link_birth_death(birth_df, death_df):
    print("\n=== LINKING BIRTH AND DEATH RECORDS ===")
    
    # Indexation step - block on surname for efficiency
    indexer = rl.Index()
    indexer.block(left_on="child's surname", right_on="surname of deceased")
    
    candidate_links = indexer.index(birth_df, death_df)
    print(f"Candidate pairs: {len(candidate_links)}")
    
    # Comparison step using low-level syntax
    comparer = rl.Compare([
        String("child's forname(s)", "forename(s) of deceased", 
               method='jarowinkler', threshold=0.8, label='forename_similarity'),
        
        Exact("child's surname", "surname of deceased", 
              label='surname_match'),
        
        String("father's forename", "father's forename", 
               method='jarowinkler', threshold=0.8, label='father_forename_similarity'),
        
        Exact("father's surname", "father's surname", 
              label='father_surname_match'),
        
        String("mother's forename", "mother's forename",
               method='jarowinkler', threshold=0.8, label='mother_forename_similarity'),
        
        Exact("mother's maiden surname", "mother's maiden surname", 
              label='mother_surname_match'),
        
        String("address", "address", 
               method='jarowinkler', threshold=0.7, label='address_similarity'),
        
        # Age consistency check (birth year + age at death ≈ death year)
        Numeric("birth year", "year", 
                method='linear', scale=5, offset=0, missing_value=0.0, 
                label='year_consistency')
    ])
    
    # Compute features
    features = comparer.compute(candidate_links, birth_df, death_df)
    print(f"Features computed for {len(features)} pairs")
    
    return features, candidate_links


### Birth-Marriage linkage

In [16]:

# ============================================================================
# 2. BIRTH-MARRIAGE LINKAGE (Birth child as groom/bride)
# ============================================================================

def link_birth_marriage_groom(birth_df, marriage_df):
    print("\n=== LINKING BIRTH AND MARRIAGE RECORDS (GROOM) ===")
    
    # Indexation step
    indexer = rl.Index()
    indexer.block(left_on="child's surname", right_on="surname of groom")
    candidate_links = indexer.index(birth_df, marriage_df)
    print(f"Candidate pairs (groom): {len(candidate_links)}")
    
    # Comparison step using low-level syntax
    comparer = rl.Compare([
        String("child's forname(s)", "forename of groom",
               method='jarowinkler', threshold=0.8, label='forename_similarity'),
        
        Exact("child's surname", "surname of groom", 
              label='surname_match'),
        
        String("father's forename", "groom's father's forename",
               method='jarowinkler', threshold=0.8, label='father_forename_similarity'),
        
        Exact("father's surname", "groom's father's surname", 
              label='father_surname_match'),
        
        String("mother's forename", "groom's mother's forename",
               method='jarowinkler', threshold=0.8, label='mother_forename_similarity'),
        
        Exact("mother's maiden surname", "groom's mother's maiden surname",
              label='mother_surname_match'),
        
        String("address", "address of groom", 
               method='jarowinkler', threshold=0.7, label='address_similarity'),
        
        # Age consistency (birth year + age at marriage ≈ marriage year)
        Numeric("birth year", "year", 
                method='linear', scale=3, offset=0, missing_value=0.0, 
                label='year_consistency')
    ])
    
    features = comparer.compute(candidate_links, birth_df, marriage_df)
    print(f"Features computed for {len(features)} pairs")
    
    return features, candidate_links

def link_birth_marriage_bride(birth_df, marriage_df):
    print("\n=== LINKING BIRTH AND MARRIAGE RECORDS (BRIDE) ===")
    
    # Indexation step
    indexer = rl.Index()
    indexer.block(left_on="child's surname", right_on="surname of bride")
    candidate_links = indexer.index(birth_df, marriage_df)
    print(f"Candidate pairs (bride): {len(candidate_links)}")
    
    # Comparison step using low-level syntax
    comparer = rl.Compare([
        String("child's forname(s)", "forename of bride",
               method='jarowinkler', threshold=0.8, label='forename_similarity'),
        
        Exact("child's surname", "surname of bride", 
              label='surname_match'),
        
        String("father's forename", "bride's father's forename",
               method='jarowinkler', threshold=0.8, label='father_forename_similarity'),
        
        Exact("father's surname", "bride's father's surname",
              label='father_surname_match'),
        
        String("mother's forename", "bride's mother's forename",
               method='jarowinkler', threshold=0.8, label='mother_forename_similarity'),
        
        Exact("mother's maiden surname", "bride's mother's maiden surname",
              label='mother_surname_match'),
        
        String("address", "address of bride", 
               method='jarowinkler', threshold=0.7, label='address_similarity'),
        
        # Age consistency (birth year + age at marriage ≈ marriage year)
        Numeric("birth year", "year", 
                method='linear', scale=3, offset=0, missing_value=0.0, 
                label='year_consistency')
    ])
    
    features = comparer.compute(candidate_links, birth_df, marriage_df)
    print(f"Features computed for {len(features)} pairs")
    
    return features, candidate_links


### Death-Marriage linkage

In [17]:

# ============================================================================
# 3. DEATH-MARRIAGE LINKAGE
# ============================================================================

def link_death_marriage_groom(death_df, marriage_df):
    print("\n=== LINKING DEATH AND MARRIAGE RECORDS (GROOM) ===")
    
    # Indexation step
    indexer = rl.Index()
    indexer.block(left_on="surname of deceased", right_on="surname of groom")
    candidate_links = indexer.index(death_df, marriage_df)
    print(f"Candidate pairs (groom): {len(candidate_links)}")
    
    # Comparison step using low-level syntax
    comparer = rl.Compare([
        String("forename(s) of deceased", "forename of groom",
               method='jarowinkler', threshold=0.8, label='forename_similarity'),
        
        Exact("surname of deceased", "surname of groom", 
              label='surname_match'),
        
        String("name of spouse", "forename of bride",
               method='jarowinkler', threshold=0.8, label='spouse_name_similarity'),
        
        String("father's forename", "groom's father's forename",
               method='jarowinkler', threshold=0.8, label='father_forename_similarity'),
        
        Exact("father's surname", "groom's father's surname",
              label='father_surname_match'),
        
        String("mother's forename", "groom's mother's forename",
               method='jarowinkler', threshold=0.8, label='mother_forename_similarity'),
        
        Exact("mother's maiden surname", "groom's mother's maiden surname",
              label='mother_surname_match'),
        
        String("address", "address of groom", 
               method='jarowinkler', threshold=0.7, label='address_similarity'),
        
        # Age at death consistency
        Numeric("age at death", "age of groom", 
                method='linear', scale=5, offset=0, missing_value=0.0, 
                label='age_consistency')
    ])
    
    features = comparer.compute(candidate_links, death_df, marriage_df)
    print(f"Features computed for {len(features)} pairs")
    
    return features, candidate_links

def link_death_marriage_bride(death_df, marriage_df):
    print("\n=== LINKING DEATH AND MARRIAGE RECORDS (BRIDE) ===")
    
    # Indexation step
    indexer = rl.Index()
    indexer.block(left_on="surname of deceased", right_on="surname of bride")
    candidate_links = indexer.index(death_df, marriage_df)
    print(f"Candidate pairs (bride): {len(candidate_links)}")
    
    # Comparison step using low-level syntax
    comparer = rl.Compare([
        String("forename(s) of deceased", "forename of bride",
               method='jarowinkler', threshold=0.8, label='forename_similarity'),
        
        Exact("surname of deceased", "surname of bride", 
              label='surname_match'),
        
        String("name of spouse", "forename of groom",
               method='jarowinkler', threshold=0.8, label='spouse_name_similarity'),
        
        String("father's forename", "bride's father's forename",
               method='jarowinkler', threshold=0.8, label='father_forename_similarity'),
        
        Exact("father's surname", "bride's father's surname",
              label='father_surname_match'),
        
        String("mother's forename", "bride's mother's forename",
               method='jarowinkler', threshold=0.8, label='mother_forename_similarity'),
        
        Exact("mother's maiden surname", "bride's mother's maiden surname",
              label='mother_surname_match'),
        
        String("address", "address of bride", 
               method='jarowinkler', threshold=0.7, label='address_similarity'),
        
        # Age at death consistency  
        Numeric("age at death", "age of bride", 
                method='linear', scale=5, offset=0, missing_value=0.0, 
                label='age_consistency')
    ])
    
    features = comparer.compute(candidate_links, death_df, marriage_df)
    print(f"Features computed for {len(features)} pairs")
    
    return features, candidate_links


## Classification
- Machine Learning Model

### Random Forest Classification

In [18]:
from sklearn.ensemble import RandomForestClassifier
from recordlinkage.base import BaseClassifier
# from recordlinkage.adapters import SKLearnClassifier
from recordlinkage.adapters import SKLearnAdapter


# Create minimal Random Forest classifier
class RandomForest(SKLearnAdapter, BaseClassifier):
    def __init__(self, *args, **kwargs):
        super(RandomForest, self).__init__()
        # Set the classifier
        self.kernel = RandomForestClassifier(*args, **kwargs)

def classify_with_random_forest_supervised(features, ground_truth_labels):
    """
    Use Random Forest to classify matches with ground truth labels.
    This is proper supervised learning approach.
    """
    print(f"\n=== SUPERVISED RANDOM FOREST CLASSIFICATION ===")
    print(f"Total feature vectors: {len(features)}")
    
    if len(features) == 0:
        print("No features to classify!")
        return pd.Index([]), features
    
    # Handle missing values
    features_clean = features.fillna(0)
    
    # Align ground truth labels with features
    # Assuming ground_truth_labels is indexed by the same MultiIndex as features
    aligned_labels = ground_truth_labels.reindex(features_clean.index, fill_value=0)
    
    print(f"Ground truth statistics:")
    print(f"  True matches (1): {sum(aligned_labels == 1)}")
    print(f"  Non-matches (0): {sum(aligned_labels == 0)}")
    print(f"  Match rate: {sum(aligned_labels == 1) / len(aligned_labels) * 100:.1f}%")
    
    if sum(aligned_labels == 1) < 2:
        print("Not enough positive examples in ground truth!")
        return pd.Index([]), pd.DataFrame()
    
    # Split data for training and testing
    from sklearn.model_selection import train_test_split
    
    X_train, X_test, y_train, y_test = train_test_split(
        features_clean, aligned_labels, 
        test_size=0.3, 
        random_state=42, 
        stratify=aligned_labels if len(set(aligned_labels)) > 1 else None
    )
    
    print(f"Training set: {len(X_train)} samples")
    print(f"Test set: {len(X_test)} samples")
    
    # Train Random Forest with ground truth labels
    rf = RandomForestClassifier(
        n_estimators=500,
        max_depth=15,               # Can be deeper with real labels
        min_samples_split=5,
        min_samples_leaf=2,
        max_features='sqrt',
        bootstrap=True,
        oob_score=True,
        random_state=42,
        class_weight='balanced',
        n_jobs=-1
    )
    
    print("Training Random Forest with ground truth labels...")
    rf.fit(X_train, y_train)
    
    # Model performance metrics
    print(f"Out-of-bag score: {rf.oob_score_:.3f}")
    
    # Evaluate on test set
    from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
    
    y_pred = rf.predict(X_test)
    y_pred_proba = rf.predict_proba(X_test)[:, 1]
    
    print(f"\n=== MODEL PERFORMANCE ===")
    print(f"Test set accuracy: {rf.score(X_test, y_test):.3f}")
    print(f"ROC AUC Score: {roc_auc_score(y_test, y_pred_proba):.3f}")
    
    print(f"\nClassification Report:")
    print(classification_report(y_test, y_pred, target_names=['Non-Match', 'Match']))
    
    print(f"\nConfusion Matrix:")
    cm = confusion_matrix(y_test, y_pred)
    print(f"                 Predicted")
    print(f"Actual    Non-Match  Match")
    print(f"Non-Match    {cm[0,0]:6d}  {cm[0,1]:5d}")
    print(f"Match        {cm[1,0]:6d}  {cm[1,1]:5d}")
    
    # Predict on all data
    print(f"\n=== PREDICTING ON ALL DATA ===")
    all_predictions = rf.predict(features_clean)
    all_probabilities = rf.predict_proba(features_clean)[:, 1]
    
    # Use probability threshold for final matches
    prob_threshold = 0.5  # Standard threshold, can adjust based on precision/recall needs
    final_matches = all_probabilities >= prob_threshold
    
    matches = features_clean[final_matches]
    
    print(f"Matches predicted: {len(matches)}")
    print(f"Average confidence for matches: {all_probabilities[final_matches].mean():.3f}")
    
    # Feature importance
    feature_importance = pd.DataFrame({
        'feature': features_clean.columns,
        'importance': rf.feature_importances_
    }).sort_values('importance', ascending=False)
    
    print(f"\nFeature Importance:")
    print(feature_importance)
    
    # Return matches with their confidence scores
    match_confidence = pd.Series(all_probabilities[final_matches], index=matches.index)
    
    return matches.index, matches, match_confidence

# def classify_with_random_forest(features, threshold_percentile=80):
#     """
#     Use Random Forest to classify matches.
#     Since we don't have labeled training data, we'll use an unsupervised approach
#     where high-scoring pairs are treated as positive examples.
#     """
#     print(f"\n=== RANDOM FOREST CLASSIFICATION ===")
#     print(f"Total feature vectors: {len(features)}")
    
#     if len(features) == 0:
#         print("No features to classify!")
#         return pd.Index([]), features  # Return empty features too
    
#     # Handle missing values
#     features_clean = features.fillna(0)
    
#     # IMPROVED FILTERING: Apply strict name matching requirements
#     print("Applying strict name matching filters...")
    
#     # Rule 1: Forename similarity must be above threshold for valid matches
#     name_threshold = 0.7  # Minimum similarity for names
#     valid_name_matches = (features_clean.get('forename_similarity', 0) >= name_threshold)
    
#     print(f"Records with forename similarity >= {name_threshold}: {sum(valid_name_matches)}")
    
#     # Rule 2: Must have surname match OR very high forename similarity
#     surname_or_strong_name = (
#         (features_clean.get('surname_match', 0) == 1) | 
#         (features_clean.get('forename_similarity', 0) >= 0.9)
#     )
    
#     print(f"Records with surname match OR strong forename match: {sum(surname_or_strong_name)}")
    
#     # Rule 3: Combine both requirements
#     name_requirements = valid_name_matches & surname_or_strong_name
#     print(f"Records meeting name requirements: {sum(name_requirements)}")
    
#     if sum(name_requirements) == 0:
#         print("No records meet the strict name matching requirements!")
#         return pd.Index([]), pd.DataFrame()  # Return empty results
    
#     # Filter features to only include records meeting name requirements
#     features_filtered = features_clean[name_requirements]
#     print(f"Features after name filtering: {len(features_filtered)}")
    
#     # Create pseudo-labels based on feature sum (unsupervised approach)
#     feature_sums = features_filtered.sum(axis=1)
#     threshold = np.percentile(feature_sums, threshold_percentile)
#     pseudo_labels = (feature_sums >= threshold).astype(int)
    
#     print(f"Threshold (95th percentile): {threshold:.2f}")
#     print(f"Positive examples: {sum(pseudo_labels)}")
    
#     if sum(pseudo_labels) < 2:
#         print("Not enough positive examples for Random Forest training")
#         # Fall back to simple threshold with name requirements
#         simple_threshold = features_filtered.sum(axis=1).quantile(0.9)
#         matches = features_filtered[feature_sums > simple_threshold]
#         print(f"Matches found (fallback with name filter): {len(matches)}")
#         return matches.index, matches  # Return both matches and filtered features
    
#     # Train Random Forest with enhanced parameters
#     rf = RandomForestClassifier(
#         n_estimators=500,           # Increased trees for better performance
#         max_depth=10,               # Prevent overfitting while allowing complexity
#         min_samples_split=5,        # Require minimum samples to split nodes
#         min_samples_leaf=2,         # Minimum samples in leaf nodes
#         max_features='sqrt',        # Use square root of features for each tree
#         bootstrap=True,             # Bootstrap sampling for diversity
#         oob_score=True,             # Out-of-bag score for model evaluation
#         random_state=42,            # Reproducible results
#         class_weight='balanced',    # Handle class imbalance
#         n_jobs=-1                   # Use all available processors
#     )
#     rf.fit(features_filtered, pseudo_labels)
    
#     # Display model performance metrics
#     print(f"Out-of-bag score: {rf.oob_score_:.3f}")
#     print(f"Number of features used: {rf.n_features_in_}")
#     print(f"Training completed with {rf.n_estimators} trees")
    
#     # Predict probabilities
#     probabilities = rf.predict_proba(features_filtered)[:, 1]
    
#     # Use a high probability threshold for matches
#     prob_threshold = 0.8
#     predicted_matches = probabilities >= prob_threshold
    
#     matches = features_filtered[predicted_matches]
    
#     print(f"Matches found with Random Forest (prob >= {prob_threshold}): {len(matches)}")
    
#     # Print feature importance
#     feature_importance = pd.DataFrame({
#         'feature': features_filtered.columns,
#         'importance': rf.feature_importances_
#     }).sort_values('importance', ascending=False)
    
#     print("\nFeature Importance:")
#     print(feature_importance)
    
#     return matches.index, matches  # Return both match indices and the filtered features


# old possible version
# def classify_with_random_forest(features, threshold_percentile=95):
#     """
#     Use Random Forest to classify matches.
#     Since we don't have labeled training data, we'll use an unsupervised approach
#     where high-scoring pairs are treated as positive examples.
#     """
#     print(f"\n=== RANDOM FOREST CLASSIFICATION ===")
#     print(f"Total feature vectors: {len(features)}")
    
#     if len(features) == 0:
#         print("No features to classify!")
#         return pd.Index([])
    
#     # Handle missing values
#     features_clean = features.fillna(0)
    
#     # IMPROVED FILTERING: Apply strict name matching requirements
#     print("Applying strict name matching filters...")
    
#     # Rule 1: Forename similarity must be above threshold for valid matches
#     name_threshold = 0.7  # Minimum similarity for names
#     valid_name_matches = (features_clean.get('forename_similarity', 0) >= name_threshold)
    
#     print(f"Records with forename similarity >= {name_threshold}: {sum(valid_name_matches)}")
    
#     # Rule 2: Must have surname match OR very high forename similarity
#     surname_or_strong_name = (
#         (features_clean.get('surname_match', 0) == 1) | 
#         (features_clean.get('forename_similarity', 0) >= 0.9)
#     )
    
#     print(f"Records with surname match OR strong forename match: {sum(surname_or_strong_name)}")
    
#     # Rule 3: Combine both requirements
#     name_requirements = valid_name_matches & surname_or_strong_name
#     print(f"Records meeting name requirements: {sum(name_requirements)}")
    
#     if sum(name_requirements) == 0:
#         print("No records meet the strict name matching requirements!")
#         return pd.Index([])
    
#     # Filter features to only include records meeting name requirements
#     features_filtered = features_clean[name_requirements]
#     print(f"Features after name filtering: {len(features_filtered)}")
    
#     # Create pseudo-labels based on feature sum (unsupervised approach)
#     feature_sums = features_filtered.sum(axis=1)
#     threshold = np.percentile(feature_sums, threshold_percentile)
#     pseudo_labels = (feature_sums >= threshold).astype(int)
    
#     print(f"Threshold (95th percentile): {threshold:.2f}")
#     print(f"Positive examples: {sum(pseudo_labels)}")
    
#     if sum(pseudo_labels) < 2:
#         print("Not enough positive examples for Random Forest training")
#         # Fall back to simple threshold with name requirements
#         simple_threshold = features_filtered.sum(axis=1).quantile(0.9)
#         matches = features_filtered[feature_sums > simple_threshold]
#         print(f"Matches found (fallback with name filter): {len(matches)}")
#         return matches.index
    
#     # Train Random Forest with enhanced parameters
#     rf = RandomForestClassifier(
#         n_estimators=500,           # Increased trees for better performance
#         max_depth=10,               # Prevent overfitting while allowing complexity
#         min_samples_split=5,        # Require minimum samples to split nodes
#         min_samples_leaf=2,         # Minimum samples in leaf nodes
#         max_features='sqrt',        # Use square root of features for each tree
#         bootstrap=True,             # Bootstrap sampling for diversity
#         oob_score=True,             # Out-of-bag score for model evaluation
#         random_state=42,            # Reproducible results
#         class_weight='balanced',    # Handle class imbalance
#         n_jobs=-1                   # Use all available processors
#     )
#     rf.fit(features_filtered, pseudo_labels)
    
#     # Display model performance metrics
#     print(f"Out-of-bag score: {rf.oob_score_:.3f}")
#     print(f"Number of features used: {rf.n_features_in_}")
#     print(f"Training completed with {rf.n_estimators} trees")
    
#     # Predict probabilities
#     probabilities = rf.predict_proba(features_filtered)[:, 1]
    
#     # Use a high probability threshold for matches
#     prob_threshold = 0.8
#     predicted_matches = probabilities >= prob_threshold
    
#     matches = features_filtered[predicted_matches]
    
#     print(f"Matches found with Random Forest (prob >= {prob_threshold}): {len(matches)}")
    
#     # Print feature importance
#     feature_importance = pd.DataFrame({
#         'feature': features_filtered.columns,
#         'importance': rf.feature_importances_
#     }).sort_values('importance', ascending=False)
    
#     print("\nFeature Importance:")
#     print(feature_importance)
    
#     return matches.index




## Execute Linkage

### Ground-truth function

In [19]:
def create_ground_truth_birth_death(birth_df, death_df, candidates):
    """
    Create ground truth labels from CHILD_IDENTITY and DECEASED_IDENTITY columns
    """
    ground_truth = {}
    
    for left_idx, right_idx in candidates:
        # Get identity values
        birth_child_id = birth_df.iloc[left_idx].get('CHILD_IDENTITY')
        death_deceased_id = death_df.iloc[right_idx].get('DECEASED_IDENTITY')
        
        # They match if they have the same identity (and both are not null)
        if pd.notna(birth_child_id) and pd.notna(death_deceased_id):
            is_match = 1 if birth_child_id == death_deceased_id else 0
        else:
            is_match = 0  # Unknown cases treated as non-matches
            
        ground_truth[(left_idx, right_idx)] = is_match
    
    return pd.Series(ground_truth)

# Usage:
bd_ground_truth = create_ground_truth_birth_death(birth_clean, death_clean, bd_candidates)

In [20]:

# Perform all linkages
try:
    # Birth-Death linkage
    bd_features, bd_candidates = link_birth_death(birth_clean, death_clean)
    bd_matches, bd_features_filtered = classify_with_random_forest_supervised(bd_features, bd_ground_truth)
    
    # bd_matches = classify_with_random_forest(bd_features)
    
    print(f"\n=== BIRTH-DEATH LINKAGE RESULTS ===")
    print(f"Total candidate pairs: {len(bd_candidates)}")
    print(f"Matched pairs: {len(bd_matches)}")
    
except Exception as e:
    print(f"Error in birth-death linkage: {e}")




=== LINKING BIRTH AND DEATH RECORDS ===
Candidate pairs: 69733
Features computed for 69733 pairs

=== SUPERVISED RANDOM FOREST CLASSIFICATION ===
Total feature vectors: 69733
Ground truth statistics:
  True matches (1): 1001
  Non-matches (0): 68732
  Match rate: 1.4%
Training set: 48813 samples
Test set: 20920 samples
Training Random Forest with ground truth labels...
Out-of-bag score: 0.997

=== MODEL PERFORMANCE ===
Test set accuracy: 0.997
ROC AUC Score: 0.999

Classification Report:
              precision    recall  f1-score   support

   Non-Match       1.00      1.00      1.00     20620
       Match       0.82      1.00      0.90       300

    accuracy                           1.00     20920
   macro avg       0.91      1.00      0.95     20920
weighted avg       1.00      1.00      1.00     20920


Confusion Matrix:
                 Predicted
Actual    Non-Match  Match
Non-Match     20556     64
Match             1    299

=== PREDICTING ON ALL DATA ===
Matches predicted: 1

#### Birth-Death execute

## Evaluation

In [21]:

def inspect_matches(features, matches_idx, left_df, right_df, candidates, n_samples=10):
    """Inspect sample matches to verify quality"""
    if len(matches_idx) == 0:
        print("No matches to inspect")
        return
        
    print(f"\n🔍 INSPECTING {min(n_samples, len(matches_idx))} SAMPLE MATCHES")
    print(f"Total matches available: {len(matches_idx)}")
    
    # Calculate total scores for sorting
    total_scores = features.loc[matches_idx].sum(axis=1)
    
    # Sort matches by total score (ascending to show lowest first)
    sorted_matches = total_scores.sort_values(ascending=True)
    
    print(f"Score range: {total_scores.min():.3f} to {total_scores.max():.3f}")
    print("=" * 80)
    
    # Convert matches_idx to list if it's an Index
    if hasattr(sorted_matches.index, 'tolist'):
        sample_matches = sorted_matches.index.tolist()[:min(n_samples, len(sorted_matches))]
    else:
        sample_matches = list(sorted_matches.index)[:min(n_samples, len(sorted_matches))]
    
    for i, match_idx in enumerate(sample_matches):
        try:
            total_score = sorted_matches.loc[match_idx]
            
            print(f"\n📋 MATCH {i+1} of {len(sample_matches)} (TOTAL SCORE: {total_score:.3f})")
            if i == 0:
                print("🔻 LOWEST SCORING MATCH")
            elif i == len(sample_matches) - 1 and len(sample_matches) > 1:
                print("🔺 HIGHEST SCORING MATCH")
            print("-" * 60)
            
            # The match_idx is already a tuple (left_idx, right_idx) from the MultiIndex
            if isinstance(match_idx, tuple) and len(match_idx) == 2:
                left_idx, right_idx = match_idx
            else:
                # If it's not a tuple, it might be a single index we need to look up
                left_idx, right_idx = candidates[match_idx]
            
            print(f"Match Index: {match_idx} | Left Record: {left_idx} | Right Record: {right_idx}")
            
            # Get feature scores for this match
            if match_idx in features.index:
                feature_scores = features.loc[match_idx].to_dict()
                
                print(f"\n📊 FEATURE SCORES:")
                # Sort features by score to highlight weak links
                sorted_features = sorted(feature_scores.items(), key=lambda x: x[1])
                
                for feature, score in sorted_features:
                    # Add indicators for very low scores
                    indicator = ""
                    if score == 0:
                        indicator = " ❌ (NO MATCH)"
                    elif score < 0.3:
                        indicator = " ⚠️ (WEAK)"
                    elif score >= 0.8:
                        indicator = " ✅ (STRONG)"
                    
                    print(f"  {feature:25}: {score:.3f}{indicator}")
                print(f"  {'TOTAL SCORE':25}: {sum(feature_scores.values()):.3f}")
            else:
                print("⚠️ Feature scores not available for this match")
            
            # Show relevant columns from both records
            left_record = left_df.iloc[left_idx]
            right_record = right_df.iloc[right_idx]
            
            print(f"\n📋 RECORD COMPARISON:")
            print(f"{'Field':<25} {'Birth Record':<25} {'Death Record':<25} {'Match'}")
            print("-" * 90)
            
            # Field mappings for birth-death linkage
            field_mappings = [
                ("child's forname(s)", "forename(s) of deceased", "👤 Name"),
                ("child's surname", "surname of deceased", "👨‍👩‍👧‍👦 Surname"),
                ("birth year", "year", "📅 Year"),
                ("father's forename", "father's forename", "👨 Father's Name"),
                ("father's surname", "father's surname", "👨 Father's Surname"),
                ("mother's forename", "mother's forename", "👩 Mother's Name"),
                ("mother's maiden surname", "mother's maiden surname", "👩 Mother's Maiden"),
                ("address", "address", "🏠 Address"),
                ("sex", "sex", "⚥ Sex")
            ]
            
            for left_col, right_col, display_name in field_mappings:
                left_val = 'N/A'
                right_val = 'N/A'
                
                if left_col in left_df.columns:
                    left_val = str(left_record.get(left_col, 'N/A')).strip() if pd.notna(left_record.get(left_col)) else 'N/A'
                
                if right_col in right_df.columns:
                    right_val = str(right_record.get(right_col, 'N/A')).strip() if pd.notna(right_record.get(right_col)) else 'N/A'
                
                # Highlight matches with more detailed indicators
                if left_val.lower() == right_val.lower() and left_val != 'N/A':
                    match_indicator = "✅ EXACT"
                elif left_val != 'N/A' and right_val != 'N/A':
                    # Check partial similarity for strings
                    if len(left_val) > 0 and len(right_val) > 0:
                        if left_val.lower() in right_val.lower() or right_val.lower() in left_val.lower():
                            match_indicator = "🔶 PARTIAL"
                        else:
                            match_indicator = "❌ DIFFERENT"
                    else:
                        match_indicator = "❌ DIFFERENT"
                else:
                    match_indicator = "⚪ MISSING"
                
                print(f"{display_name:<25} {left_val:<25} {right_val:<25} {match_indicator}")
            
            # Add death-specific fields
            if "age at death" in right_df.columns:
                age_val = str(right_record.get("age at death", 'N/A')).strip() if pd.notna(right_record.get("age at death")) else 'N/A'
                print(f"{'💀 Age at Death':<25} {'N/A':<25} {age_val:<25} {'⚪ DEATH ONLY'}")
            
            if "marital status" in right_df.columns:
                marital_val = str(right_record.get("marital status", 'N/A')).strip() if pd.notna(right_record.get("marital status")) else 'N/A'
                print(f"{'💒 Marital Status':<25} {'N/A':<25} {marital_val:<25} {'⚪ DEATH ONLY'}")
            
            # Calculate potential age consistency
            try:
                birth_year = pd.to_numeric(left_record.get('birth year', 0), errors='coerce')
                death_year = pd.to_numeric(right_record.get('year', 0), errors='coerce')
                age_at_death = pd.to_numeric(right_record.get('age at death', 0), errors='coerce')
                
                if birth_year > 0 and death_year > 0:
                    calculated_age = death_year - birth_year
                    
                    print(f"\n🎂 AGE CONSISTENCY ANALYSIS:")
                    print(f"  📅 Birth Year          : {int(birth_year)}")
                    print(f"  ⚰️ Death Year          : {int(death_year)}")
                    print(f"  🧮 Calculated Age      : {int(calculated_age)}")
                    print(f"  📜 Recorded Age at Death: {int(age_at_death) if age_at_death > 0 else 'N/A'}")
                    
                    if age_at_death > 0:
                        age_diff = abs(calculated_age - age_at_death)
                        print(f"  📏 Age Difference      : {int(age_diff)} years")
                        
                        if age_diff <= 2:
                            status = "✅ EXCELLENT age consistency"
                        elif age_diff <= 5:
                            status = "⚠️ MODERATE age consistency"
                        else:
                            status = "❌ POOR age consistency"
                        print(f"  📊 Assessment          : {status}")
                        
                        # Specific guidance for low-scoring matches
                        if total_score < 6 and age_diff > 3:
                            print(f"  🚨 WARNING: Low total score + poor age consistency suggests FALSE POSITIVE")
                            
            except:
                print(f"\n🎂 AGE CONSISTENCY ANALYSIS:")
                print(f"  ⚠️ Could not calculate age consistency")
                    
        except Exception as e:
            print(f"\n❌ Error inspecting match {i+1}")
            print(f"Error: {e}")
            print(f"Match index type: {type(match_idx)}")
            print(f"Features index type: {type(features.index)}")
            continue
    
    print(f"\n" + "=" * 80)
    print(f"✅ INSPECTION COMPLETE")
    print(f"{len(sample_matches)} matches reviewed out of {len(matches_idx)} total matches")
    print(f"📊 Showing LOWEST to HIGHEST scoring matches")
    print("=" * 80)

# Example: Inspect birth-death matches using filtered features
# if 'bd_matches' in locals() and len(bd_matches) > 0:
#     inspect_matches(bd_features_filtered, bd_matches, birth_clean, death_clean, bd_candidates, n_samples=10)

if 'bd_matches' in locals() and len(bd_matches) > 0:
    # For inspection, we need to get the filtered features that the classifier used
    bd_features_clean = bd_features.fillna(0)
    bd_valid_names = (bd_features_clean.get('forename_similarity', 0) >= 0.7)
    bd_features_filtered = bd_features_clean[bd_valid_names]
    
    inspect_matches(bd_features_filtered, bd_matches, birth_clean, death_clean, bd_candidates, n_samples=10)


print("\n=== LINKAGE ANALYSIS COMPLETE ===")


=== LINKAGE ANALYSIS COMPLETE ===
