In [38]:
import pandas as pd
import numpy as np
import time

# generate the medical diagnostic data
# Module-level timestamp kept for any other cell that may still read it; the
# generator below times every call itself and no longer depends on it.
start_time = time.perf_counter()
def generate_health_dataset(
    # generate the scale of the data set
    num_rows=100_000, 
    num_unique_ids=20_000,
    output_file='1health_data.csv',
    seed=None):
    """Generate a synthetic, deliberately dirty patient-visit data set.

    Each of ``num_unique_ids`` patients gets a gender, a base age and
    gender-dependent base measurements.  ``num_rows`` visits are spread over
    the patients with a multinomial draw; every visit gets a date in
    2020-01-01..2025-12-31, per-visit noise on the measurements and a 20%
    chance of flipping the base disease flag.  Afterwards roughly 5% of the
    values in each of blood_pressure/height/weight/disease are replaced with
    implausible values, and roughly 5% of all cells except ``ID`` and ``date``
    are blanked.  ``BMI`` is computed last, so it inherits those errors and
    gaps.

    Parameters
    ----------
    num_rows : int
        Total number of visit rows.
    num_unique_ids : int
        Number of distinct patient IDs (1..num_unique_ids).
    output_file : str or None
        CSV path to write; a falsy value skips writing.
    seed : int or None
        When given, ``np.random.seed(seed)`` is called first so the output is
        reproducible.  This reseeds NumPy's global legacy generator.

    Returns
    -------
    (pandas.DataFrame, float)
        The generated frame sorted by ``ID`` and ``date``, and the seconds
        spent inside this call, rounded to 5 decimals.

    Raises
    ------
    ValueError
        From ``np.random.choice(..., replace=False)`` if one patient is
        assigned more visits than there are days in the date range.
    """
    # Time this call only; the old code measured from when the cell was defined.
    call_start = time.perf_counter()
    if seed is not None:
        np.random.seed(seed)

    # generate gender and attributes based on gender
    # each tuple is (mean, standard deviation, decimals to round to)
    gender = np.random.choice(['male', 'female'], num_unique_ids)
    params = {
        'male': {
            'blood_pressure': (140, 8, 1),
            'height': (1.70, 0.05, 2),
            'weight': (75, 6, 1)
        },
        'female': {
            'blood_pressure': (110, 7, 1),
            'height': (1.60, 0.05, 2),
            'weight': (55, 5, 1)
        }
    }

    # generate random number of visits for one same patient
    base_ids = np.arange(1, num_unique_ids + 1)
    repeat_counts = np.random.multinomial(num_rows, [1/num_unique_ids]*num_unique_ids)
    ids = np.repeat(base_ids, repeat_counts)

    def generate_values(gender_arr, param_type):
        # draw one male and one female value per patient, keep the one
        # matching the patient's gender
        male_params = params['male'][param_type]
        female_params = params['female'][param_type]

        return np.where(
            gender_arr == 'male',
            np.random.normal(male_params[0], male_params[1], num_unique_ids).round(male_params[2]),
            np.random.normal(female_params[0], female_params[1], num_unique_ids).round(female_params[2])
        )

    # generate data frame of base data (one row per patient)
    base_data = pd.DataFrame({
        'ID': base_ids,
        'gender': gender,
        'base_age': np.random.randint(18, 80, num_unique_ids),
        'base_blood_pressure': generate_values(gender, 'blood_pressure'),
        'base_height': generate_values(gender, 'height'),
        'base_weight': generate_values(gender, 'weight'),
        'base_disease': np.random.randint(0, 2, num_unique_ids).astype(int)
    })
    df = pd.DataFrame({'ID': ids}).merge(base_data, on='ID', how='left')

    # generate random data from the same patient and attributes' changes based on the date
    # `ids` is sorted, so concatenating the per-patient sorted dates in
    # bincount order lines up with the rows of `df`
    date_rng = pd.date_range(start='2020-01-01', end='2025-12-31')
    df['date'] = np.concatenate([
        np.sort(np.random.choice(date_rng, size=count, replace=False))
        for count in np.bincount(ids) if count > 0
    ])

    df['age'] = df['base_age'] + (df['date'].dt.year - 2020)

    # per-visit noise: (mean, standard deviation, decimals to round to)
    noise_config = {
        'blood_pressure': (0, 3, 1),
        'height': (0, 0.05, 2),
        'weight': (0, 1.5, 1)
    }

    for column in ['blood_pressure', 'height', 'weight']:
        base_column = f'base_{column}'
        average, standard_deviation, decimals = noise_config[column]
        df[column] = (df[base_column] + np.random.normal(average, standard_deviation, len(df))).round(decimals)

    # disease reversal possibility
    df['disease'] = np.where(
        np.random.rand(len(df)) < 0.2,
        1 - df['base_disease'],
        df['base_disease']
    )

    # delete the base fields
    df = df.drop(columns=['base_age', 'base_blood_pressure', 
                     'base_height', 'base_weight', 'base_disease'])

    # generate 5% clearly wrong data in specific attributes
    error_rules = {
        'blood_pressure': lambda size: np.where(
            np.random.rand(size) < 0.5,
            np.random.randint(300, 500, size),
            np.random.randint(0, 50, size)
        ),
        'height': lambda size: np.random.choice([-1.0, 4.5], size=size),
        'weight': lambda size: np.random.choice([-10, 800], size=size),
        'disease': lambda size: np.random.choice([-1, 2], size=size),
    }

    target_cols = df.columns.difference(['ID', 'gender', 'date', 'age'])

    for col in target_cols:
        mask = np.random.rand(len(df)) < 0.05
        num_errors = mask.sum()
        errors = error_rules[col](num_errors)
        df.loc[mask, col] = errors

    # generate 5% random data missing
    cols_to_missing = df.columns.difference(['ID', 'date'])
    mask = np.random.rand(*df[cols_to_missing].shape) < 0.05
    df[cols_to_missing] = df[cols_to_missing].mask(mask)

    # calculate the BMI (from the already corrupted / blanked height and weight)
    df['BMI'] = (df.weight / (df.height ** 2)).round(1)

    # sort by ID and date
    df = df.sort_values(['ID', 'date']).reset_index(drop=True)

    # save the generated data set as csv file
    # NOTE(review): float_format='%.1f' writes every float column with one
    # decimal, so height (rounded to 2 decimals above) loses its second
    # decimal in the CSV while BMI was computed from the 2-decimal value.
    # Confirm whether that is intended.
    if output_file:
        df.to_csv(
            output_file,
            index=False,
            sep=',',
            encoding='utf-8',
            float_format='%.1f'
        )
    # count time
    elapsed = time.perf_counter() - call_start
    return df, round(elapsed, 5)

# Entry point: generate the data set with the default arguments (which also
# write '1health_data.csv'), then report the run time and preview the rows.
if __name__ == "__main__":
    # `df` and `run_time` become module-level names in the notebook session.
    df, run_time = generate_health_dataset()
    print(f"total {run_time} seconds")
    print(df.head(20))

total 1.08677 seconds
    ID  gender       date   age  blood_pressure  height  weight  disease   BMI
0    1  female 2020-06-18  26.0           122.5    1.65    57.1      1.0  21.0
1    1  female 2020-10-25  26.0           113.6    1.63     NaN      1.0   NaN
2    1  female 2022-01-11  28.0           122.8    1.66    58.1      0.0  21.1
3    2     NaN 2021-05-08  79.0           115.9    1.57   -10.0      0.0  -4.1
4    2  female 2021-08-25  79.0           117.9    1.61    58.4      0.0  22.5
5    2  female 2022-10-18  80.0           116.6    1.57    56.5      0.0  22.9
6    2  female 2024-09-11  82.0           119.4    1.65    56.5      1.0  20.8
7    2  female 2025-05-12  83.0           115.6    1.73    57.4      0.0  19.2
8    3  female 2020-04-02  59.0             NaN    1.58     NaN      1.0   NaN
9    3  female 2020-08-17  59.0           112.4   -1.00    52.5      1.0  52.5
10   3  female 2021-02-26  60.0           108.5    1.63    49.2      2.0  18.5
11   3  female 2021-03-02  60.

In [39]:
# correct the missing age and gender according to the same ID and date
import pandas as pd
from datetime import datetime
import numpy as np
import time

def correct_age_gender(df):
    """Fill missing ``gender`` and ``age`` values using each patient's other rows.

    Gender: missing values are replaced by the patient's most frequent gender;
    patients with no known gender at all are dropped.

    Age: rows of one patient are ordered by date and missing ages are filled
    in that order from the nearest known ages, using the calendar-year gap:
    interpolation when a known age exists on both sides, otherwise
    extrapolation from the single known neighbour.  Patients with no known
    age keep missing ages.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns ``ID``, ``gender``, ``date`` and ``age``.  The frame
        is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame grouped by ``ID`` and sorted by date within each patient,
        with a fresh 0..n-1 index, ``date`` as datetime and ``age`` as
        nullable ``Int64``.  An empty input is returned as an unchanged copy.
    """
    if df.empty:
        return df.copy()

    # Work on a copy so the caller's frame is left untouched.
    df = df.copy()

    # correct the gender
    def _fill_with_mode(values):
        modes = values.mode()
        return values.fillna(modes[0]) if not modes.empty else values

    df["gender"] = df.groupby("ID")["gender"].transform(_fill_with_mode)
    # After the fill a patient has either no missing gender or only missing
    # gender, so dropping the remaining NaN rows drops exactly those patients.
    df = df[df["gender"].notna()].copy()

    # correct the age
    df['date'] = pd.to_datetime(df['date'])
    df['year'] = df['date'].dt.year

    processed_dfs = []

    for _, group in df.groupby('ID'):
        # Order by the full date (stable) so same-year rows stay chronological.
        group = group.sort_values('date', kind='stable').reset_index(drop=True)
        ages = group['age'].copy()
        years = group['year']

        for i in range(len(group)):
            if not pd.isna(ages[i]):
                continue

            # `ages` is updated in place, so values filled earlier in this
            # loop count as known neighbours for later rows.
            prev_age, prev_year = find_previous_value(ages, years, i)
            next_age, next_year = find_next_value(ages, years, i)

            if prev_age is not None and next_age is not None:
                year_diff = next_year - prev_year
                if year_diff == 0:
                    # Same calendar year on both sides: take the shared age,
                    # or the floored midpoint when the neighbours disagree.
                    if prev_age == next_age:
                        ages[i] = prev_age
                    else:
                        ages[i] = (prev_age + next_age) // 2
                else:
                    exact_age = prev_age + ((years[i] - prev_year) / year_diff) * (next_age - prev_age)
                    ages[i] = int(round(exact_age))
            elif prev_age is not None:
                ages[i] = prev_age + (years[i] - prev_year)
            elif next_age is not None:
                ages[i] = next_age - (next_year - years[i])

        # The in-order fill above already covers leading and trailing gaps,
        # so no separate backward/forward pass is needed.
        group['age'] = ages.astype('Int64')
        processed_dfs.append(group)

    if not processed_dfs:
        # Every patient was dropped for lacking a gender.
        return df.drop(columns=["year"]).reset_index(drop=True)

    # ignore_index avoids repeating each group's 0..k index in the result.
    return pd.concat(processed_dfs, ignore_index=True).drop(columns=["year"])

# Helpers that find the nearest non-missing age (and its year) before or after a given row of one patient's records
def find_previous_value(ages, years, current_idx):
    """Return ``(age, year)`` of the closest earlier position with a known age.

    Walks backwards from ``current_idx - 1`` to position 0 and stops at the
    first non-missing entry of ``ages``; returns ``(None, None)`` if there is
    none.
    """
    position = current_idx - 1
    while position >= 0:
        candidate = ages[position]
        if pd.notna(candidate):
            return candidate, years[position]
        position -= 1
    return None, None

def find_next_value(ages, years, current_idx):
    """Return ``(age, year)`` of the closest later position with a known age.

    Scans forward from ``current_idx + 1`` to the end of ``ages`` and stops at
    the first non-missing entry; returns ``(None, None)`` if there is none.
    """
    later_positions = range(current_idx + 1, len(ages))
    hit = next((p for p in later_positions if pd.notna(ages[p])), None)
    if hit is None:
        return None, None
    return ages[hit], years[hit]

# Entry point: read the generated CSV, repair gender/age per patient, write
# the result to a new CSV and report the wall-clock time of the whole step.
if __name__ == '__main__':
    total_start = time.perf_counter()
    # 'date' arrives as text here; correct_age_gender converts it to datetime.
    df = pd.read_csv("1health_data.csv")
    processed_df = correct_age_gender(df)
    processed_df.to_csv("2processed_health_data.csv", index=False)
    total_end = time.perf_counter()
    # The timing covers reading, correcting and writing.
    total_run_time = total_end - total_start
    print(f"Total run time: {total_run_time} seconds")
    print(processed_df.head(20))

Total run time: 12.30446099999972 seconds
   ID  gender       date  age  blood_pressure  height  weight  disease   BMI
0   1  female 2020-06-18   26           122.5     1.6    57.1      1.0  21.0
1   1  female 2020-10-25   26           113.6     1.6     NaN      1.0   NaN
2   1  female 2022-01-11   28           122.8     1.7    58.1      0.0  21.1
0   2  female 2021-05-08   79           115.9     1.6   -10.0      0.0  -4.1
1   2  female 2021-08-25   79           117.9     1.6    58.4      0.0  22.5
2   2  female 2022-10-18   80           116.6     1.6    56.5      0.0  22.9
3   2  female 2024-09-11   82           119.4     1.6    56.5      1.0  20.8
4   2  female 2025-05-12   83           115.6     1.7    57.4      0.0  19.2
0   3  female 2020-04-02   59             NaN     1.6     NaN      1.0   NaN
1   3  female 2020-08-17   59           112.4    -1.0    52.5      1.0  52.5
2   3  female 2021-02-26   60           108.5     1.6    49.2      2.0  18.5
3   3  female 2021-03-02   60     

In [40]:
# Keep only rows whose values all fall within plausible ranges; rows with out-of-range or missing values are dropped
import pandas as pd

# Load the gender/age-corrected data; `df` stays a module-level name.
df = pd.read_csv("2processed_health_data.csv")
# A row is valid only if every condition below holds. `between` uses
# inclusive bounds by default, and a missing value satisfies neither
# `between` nor `isin`, so rows with gaps in these columns are rejected too.
valid_conditions = (
    df["age"].between(10, 100)
    & df["height"].between(0.5, 2.5)
    & df["weight"].between(20, 300)
    & (df["blood_pressure"].between(60, 200))
    & df["disease"].isin([0, 1])
    & df["gender"].isin(["male", "female"])
)

# Boolean selection keeps the original row labels (note the gaps in the
# printed index).
cleaned_df = df[valid_conditions]

cleaned_df.to_csv("3cleaned_health_data.csv", index=False)
print(cleaned_df.head(20))

    ID  gender        date   age  blood_pressure  height  weight  disease  \
0    1  female  2020-06-18  26.0           122.5     1.6    57.1      1.0   
2    1  female  2022-01-11  28.0           122.8     1.7    58.1      0.0   
4    2  female  2021-08-25  79.0           117.9     1.6    58.4      0.0   
5    2  female  2022-10-18  80.0           116.6     1.6    56.5      0.0   
6    2  female  2024-09-11  82.0           119.4     1.6    56.5      1.0   
7    2  female  2025-05-12  83.0           115.6     1.7    57.4      0.0   
14   3  female  2025-12-15  64.0           111.0     1.6    51.3      1.0   
15   4    male  2020-06-13  65.0           141.1     1.6    76.1      1.0   
17   4    male  2021-07-15  66.0           135.9     1.6    79.3      1.0   
21   6    male  2020-07-02  30.0           142.8     1.6    84.0      0.0   
22   6    male  2021-05-12  31.0           145.7     1.7    82.7      0.0   
23   6    male  2023-03-03  33.0           148.7     1.7    86.1      0.0   

In [41]:
import pandas as pd
def split_and_save_data(
    file_path: str,
    target_column: str,
    split_conditions: dict,
) -> None:
    """Split a CSV file into several CSV files based on one column.

    Args:
        file_path: CSV file to read.
        target_column: Column whose values are passed to every condition.
        split_conditions: Maps an output file name to a callable that takes
            the target column and returns the row selector for that file.

    Each selected subset is written to its output file without the index.
    """
    source = pd.read_csv(file_path)

    for out_name in split_conditions:
        selector = split_conditions[out_name](source[target_column])
        subset = source[selector]
        subset.to_csv(out_name, index=False)

# split the data set by disease condition
disease_conditions = {
    "no_disease.csv": lambda x: x == 0,
    "disease.csv": lambda x: x == 1
}

# Columns are addressed by name rather than by position in a `df` left over
# from an earlier cell, so this cell also works on a fresh kernel and does not
# break if the column order changes.
split_and_save_data(
    file_path = "3cleaned_health_data.csv",
    target_column = "disease",
    split_conditions = disease_conditions,
)

# split the disease and no_disease data set by gender
split_and_save_data(
    file_path="disease.csv",
    target_column = "gender",
    split_conditions={
        "male_disease.csv": lambda x: x == "male",
        "female_disease.csv": lambda x: x == "female"
    }
)

split_and_save_data(
    file_path="no_disease.csv",
    target_column = "gender",
    split_conditions={
        "male_no_disease.csv": lambda x: x == "male",
        "female_no_disease.csv": lambda x: x == "female"
    }
)

In [4]:
import pandas as pd
import numpy as np
from scipy.spatial.distance import cdist
from scipy.optimize import linear_sum_assignment

# Load the two outcome groups (all genders); the main() functions of the
# matching cells read these module-level frames.
df_disease = pd.read_csv("disease.csv")
df_no_disease = pd.read_csv("no_disease.csv")

def preprocess_data(df, features):
    """Encode gender numerically and keep only the matching columns.

    ``gender`` is mapped to 0 (female) / 1 (male); any other value becomes
    NaN.  The result holds the ``features`` columns followed by ``ID`` and
    ``date``.  The input frame is not modified.
    """
    gender_codes = {'female': 0, 'male': 1}
    encoded = df.assign(gender=df['gender'].map(gender_codes))
    wanted_columns = features + ['ID', 'date']
    return encoded[wanted_columns]

# standardization
def standardize_features(disease_df, no_disease_df, features):
    """Z-score the feature columns of both groups on a common scale.

    Mean and (sample) standard deviation are computed per feature over the
    rows of both frames together, then applied to each frame separately.
    Returns ``(disease_features, no_disease_features)``.
    """
    case_block = disease_df[features]
    control_block = no_disease_df[features]

    pooled = pd.concat([case_block, control_block])
    center, spread = pooled.mean(), pooled.std()

    return (case_block - center) / spread, (control_block - center) / spread

# compute the weighted covariance matrix
def compute_weighted_covariance(disease_features, no_disease_features, weights):
    """Return the inverse of the weight-scaled pooled covariance matrix.

    The covariance of the stacked feature rows is divided element-wise by
    ``outer(weights, weights)`` and then inverted; if NumPy reports the matrix
    as singular, the pseudo-inverse is returned instead.
    """
    pooled = pd.concat([disease_features, no_disease_features])
    scaled_cov = pooled.cov().values / np.outer(weights, weights)

    try:
        return np.linalg.inv(scaled_cov)
    except np.linalg.LinAlgError:
        return np.linalg.pinv(scaled_cov)

In [43]:
def greedy_matches(disease_data, no_disease_data, 
                disease_features, no_disease_features, 
                cov_inv, enable_deduplication=True):
    """Greedily pair each disease record with its nearest no-disease record.

    Disease rows are handled in order.  Each one is matched to the candidate
    no-disease row with the smallest Mahalanobis distance (``cdist`` with
    ``VI=cov_inv``).  Rows with the same ``ID`` as the disease row are never
    candidates, and with ``enable_deduplication`` a no-disease row can be used
    only once.  Disease rows without any candidate are skipped.

    Parameters
    ----------
    disease_data, no_disease_data : pandas.DataFrame
        Frames providing the ``ID`` and ``date`` columns.
    disease_features, no_disease_features : pandas.DataFrame
        Numeric feature rows, in the same row order as the matching data frame.
    cov_inv : numpy.ndarray
        Inverse covariance matrix passed to the distance computation.
    enable_deduplication : bool
        Whether a matched no-disease row is removed from later candidates.

    Returns
    -------
    (pandas.DataFrame, float)
        One row per match with the columns ``Disease ID``, ``Disease Date``,
        ``no Disease ID``, ``no Disease Date`` and ``Distance`` (present even
        when there are no matches), and the elapsed seconds.
    """
    start_time = time.perf_counter()
    columns = ['Disease ID', 'Disease Date', 'no Disease ID', 'no Disease Date', 'Distance']

    # Pull everything into arrays once; per-row pandas lookups dominated the loop.
    case_ids = disease_data['ID'].to_numpy()
    case_dates = disease_data['date'].to_numpy()
    control_ids = no_disease_data['ID'].to_numpy()
    control_dates = no_disease_data['date'].to_numpy()
    case_matrix = np.asarray(disease_features, dtype=float)
    control_matrix = np.asarray(no_disease_features, dtype=float)

    # Positional availability flags replace rebuilding an `isin` mask per row.
    available = np.ones(len(control_ids), dtype=bool)
    matches = []

    for i in range(len(case_ids)):
        candidate_pos = np.flatnonzero(available & (control_ids != case_ids[i]))
        if candidate_pos.size == 0:
            continue

        distances = cdist(
            case_matrix[i:i + 1],
            control_matrix[candidate_pos],
            metric='mahalanobis',
            VI=cov_inv
        )[0]

        nearest = int(np.argmin(distances))
        chosen = candidate_pos[nearest]

        # record the matched results
        matches.append({
            'Disease ID': case_ids[i],
            'Disease Date': case_dates[i],
            'no Disease ID': control_ids[chosen],
            'no Disease Date': control_dates[chosen],
            'Distance': distances[nearest]
        })

        if enable_deduplication:
            available[chosen] = False

    elapsed_time = time.perf_counter() - start_time
    return pd.DataFrame(matches, columns=columns), elapsed_time

def main():
    """Run greedy matching on the six features and report the result.

    Uses the module-level ``df_disease`` / ``df_no_disease`` frames, writes
    the matches to ``match_greedy.csv`` and prints the first matches, the
    summed distance and the matching time.
    """
    feature_names = ['age', 'gender', 'blood_pressure', 'height', 'weight', 'BMI']
    feature_weights = np.array([0.2, 0.1, 0.3, 0.1, 0.1, 0.2])

    cases = preprocess_data(df_disease, feature_names)
    controls = preprocess_data(df_no_disease, feature_names)

    cases_z, controls_z = standardize_features(cases, controls, feature_names)
    inverse_cov = compute_weighted_covariance(cases_z, controls_z, feature_weights)

    pairs, seconds = greedy_matches(
        disease_data=cases,
        no_disease_data=controls,
        disease_features=cases_z,
        no_disease_features=controls_z,
        cov_inv=inverse_cov,
        enable_deduplication=True
    )

    pairs.to_csv("match_greedy.csv", index=False)
    print(pairs.head(20))
    print(f"Total distance: {pairs['Distance'].sum():.4f}")
    print(f"Total time: {seconds} seconds")

# Run the greedy-matching pipeline defined in this cell.
if __name__ == "__main__":
    main()

    Disease ID Disease Date  no Disease ID no Disease Date  Distance
0            1   2020-06-18          17005      2024-10-14  0.052047
1            2   2024-09-11           3729      2022-04-18  0.040974
2            3   2025-12-15           6881      2021-12-11  0.039838
3            4   2020-06-13           9124      2025-02-23  0.063913
4            4   2021-07-15            748      2022-01-14  0.056420
5            7   2023-06-21           1407      2025-10-01  0.053924
6            8   2020-06-22          12457      2024-02-16  0.080968
7            9   2024-01-08          11406      2021-07-07  0.032691
8            9   2025-02-10          13948      2023-08-14  0.046451
9           10   2020-04-21          15901      2025-11-24  0.044970
10          12   2025-07-18          14218      2025-03-13  0.039009
11          13   2021-04-01          13091      2021-11-26  0.040198
12          13   2023-08-15           6542      2024-02-09  0.065627
13          13   2024-08-23       

In [None]:
def quick_match(disease_data, no_disease_data, 
                disease_features, no_disease_features, 
                cov_inv, enable_deduplication=True, random_state=None):
    """Pair each disease record with a randomly chosen no-disease record.

    Disease rows are handled in order.  For each one a candidate is drawn
    uniformly at random; rows with the same ``ID`` are never candidates, and
    with ``enable_deduplication`` a no-disease row can be used only once.
    The Mahalanobis distance of the drawn pair (``cdist`` with ``VI=cov_inv``)
    is recorded.  Disease rows without any candidate are skipped.

    Parameters
    ----------
    disease_data, no_disease_data : pandas.DataFrame
        Frames providing the ``ID`` and ``date`` columns.
    disease_features, no_disease_features : pandas.DataFrame
        Numeric feature rows, in the same row order as the matching data frame.
    cov_inv : numpy.ndarray
        Inverse covariance matrix passed to the distance computation.
    enable_deduplication : bool
        Whether a matched no-disease row is removed from later candidates.
    random_state : int or None
        Seed for a private ``numpy.random.RandomState``; ``None`` draws from
        NumPy's global generator.

    Returns
    -------
    (pandas.DataFrame, float)
        One row per match with the columns ``Disease ID``, ``Disease Date``,
        ``no Disease ID``, ``no Disease Date`` and ``Distance`` (present even
        when there are no matches), and the elapsed seconds.
    """
    start_time = time.perf_counter()
    columns = ['Disease ID', 'Disease Date', 'no Disease ID', 'no Disease Date', 'Distance']
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    case_ids = disease_data['ID'].to_numpy()
    case_dates = disease_data['date'].to_numpy()
    control_ids = no_disease_data['ID'].to_numpy()
    control_dates = no_disease_data['date'].to_numpy()
    case_matrix = np.asarray(disease_features, dtype=float)
    control_matrix = np.asarray(no_disease_features, dtype=float)

    available = np.ones(len(control_ids), dtype=bool)
    matches = []

    for i in range(len(case_ids)):
        candidate_pos = np.flatnonzero(available & (control_ids != case_ids[i]))
        if candidate_pos.size == 0:
            continue

        # choose the matched target randomly: one uniform draw instead of
        # shuffling every candidate just to take the first one
        chosen = candidate_pos[rng.randint(candidate_pos.size)]

        distance = cdist(
            case_matrix[i:i + 1],
            control_matrix[chosen:chosen + 1],
            metric='mahalanobis',
            VI=cov_inv
        )[0][0]

        matches.append({
            'Disease ID': case_ids[i],
            'Disease Date': case_dates[i],
            'no Disease ID': control_ids[chosen],
            'no Disease Date': control_dates[chosen],
            'Distance': distance,
        })

        if enable_deduplication:
            available[chosen] = False

    elapsed_time = time.perf_counter() - start_time
    return pd.DataFrame(matches, columns=columns), elapsed_time

def main():
    """Run random matching on the six features and report the result.

    Uses the module-level ``df_disease`` / ``df_no_disease`` frames, prints
    the summed distance, writes the matches to ``match_quick.csv`` and prints
    the first matches and the matching time.
    """
    feature_names = ['age', 'gender', 'blood_pressure', 'height', 'weight', 'BMI']
    feature_weights = np.array([0.2, 0.1, 0.3, 0.1, 0.1, 0.2])

    cases = preprocess_data(df_disease, feature_names)
    controls = preprocess_data(df_no_disease, feature_names)

    cases_z, controls_z = standardize_features(cases, controls, feature_names)
    inverse_cov = compute_weighted_covariance(cases_z, controls_z, feature_weights)

    pairs, seconds = quick_match(
        disease_data=cases,
        no_disease_data=controls,
        disease_features=cases_z,
        no_disease_features=controls_z,
        cov_inv=inverse_cov,
        enable_deduplication=True
    )
    print(f"Total distance: {pairs['Distance'].sum():.4f}")

    pairs.to_csv("match_quick.csv", index=False)
    print(pairs.head(20))

    print(f"Total time: {seconds} seconds")

# Run the random-matching pipeline defined in this cell.
if __name__ == "__main__":
    main()

Total distance: 30505.7435
    Disease ID Disease Date  no Disease ID no Disease Date  Distance
0            1   2020-06-18           1015      2021-08-23  1.146671
1            2   2024-09-11           9291      2024-08-28  1.144668
2            3   2025-12-15           3196      2021-08-28  0.385724
3            4   2020-06-13           5946      2021-08-17  0.511117
4            4   2021-07-15           7430      2023-03-23  1.082180
5            7   2023-06-21           2701      2021-03-13  1.412100
6            8   2020-06-22          13592      2021-04-14  0.637323
7            9   2024-01-08           7380      2022-07-15  1.391220
8            9   2025-02-10           5164      2020-03-17  0.560209
9           10   2020-04-21          18162      2021-08-27  1.247319
10          12   2025-07-18           8703      2025-06-15  0.612143
11          13   2021-04-01          18473      2023-01-30  0.888610
12          13   2023-08-15          10199      2025-03-03  0.585722
13     

In [45]:
def optimal_matches(disease_data, no_disease_data, 
                disease_features, no_disease_features, 
                cov_inv):
    """Pair disease and no-disease records with minimum total distance.

    Builds the full Mahalanobis distance matrix (``cdist`` with
    ``VI=cov_inv``) and solves the assignment problem with
    ``linear_sum_assignment``.  Pairs of rows with the same ``ID`` are
    forbidden: they get a finite penalty large enough that the solver uses one
    only when it cannot be avoided, and such pairs are removed from the
    result.  A finite penalty is used because the solver raises on a matrix
    with no complete finite-cost assignment instead of returning the forced
    pairs.

    Parameters
    ----------
    disease_data, no_disease_data : pandas.DataFrame
        Frames providing the ``ID`` and ``date`` columns.
    disease_features, no_disease_features : pandas.DataFrame
        Numeric feature rows, in the same row order as the matching data frame.
    cov_inv : numpy.ndarray
        Inverse covariance matrix passed to the distance computation.

    Returns
    -------
    (pandas.DataFrame, float)
        One row per match with the columns ``Disease ID``, ``Disease Date``,
        ``no Disease ID``, ``no Disease Date`` and ``Distance`` (present even
        when there are no matches), and the elapsed seconds.
    """
    start_time = time.perf_counter()
    columns = ['Disease ID', 'Disease Date', 'no Disease ID', 'no Disease Date', 'Distance']
    n = len(disease_data)
    m = len(no_disease_data)

    if n == 0 or m == 0:
        return pd.DataFrame(columns=columns), time.perf_counter() - start_time

    case_ids = disease_data['ID'].to_numpy()
    case_dates = disease_data['date'].to_numpy()
    control_ids = no_disease_data['ID'].to_numpy()
    control_dates = no_disease_data['date'].to_numpy()

    # initialize the cost matrix with all pairwise distances in one call
    cost_matrix = cdist(
        np.asarray(disease_features, dtype=float),
        np.asarray(no_disease_features, dtype=float),
        metric='mahalanobis',
        VI=cov_inv
    )

    # One forbidden pair must cost more than any complete assignment of
    # allowed pairs (at most min(n, m) pairs, each <= the largest distance).
    forbidden_cost = (cost_matrix.max() + 1.0) * min(n, m)
    cost_matrix[case_ids[:, None] == control_ids[None, :]] = forbidden_cost

    # apply the Hungarian algorithm
    row_ind, col_ind = linear_sum_assignment(cost_matrix)

    # Drop assignments the solver was forced to make between same-ID rows.
    allowed = case_ids[row_ind] != control_ids[col_ind]
    row_ind, col_ind = row_ind[allowed], col_ind[allowed]

    matches = pd.DataFrame({
        'Disease ID': case_ids[row_ind],
        'Disease Date': case_dates[row_ind],
        'no Disease ID': control_ids[col_ind],
        'no Disease Date': control_dates[col_ind],
        'Distance': cost_matrix[row_ind, col_ind]
    }, columns=columns)

    elapsed_time = time.perf_counter() - start_time
    return matches, elapsed_time

def main():
    """Run optimal (assignment-based) matching on the six features.

    Uses the module-level ``df_disease`` / ``df_no_disease`` frames, writes
    the matches to ``match_optimal.csv`` and prints the first matches, the
    summed distance and the matching time.
    """
    feature_names = ['age', 'gender', 'blood_pressure', 'height', 'weight', 'BMI']
    feature_weights = np.array([0.2, 0.1, 0.3, 0.1, 0.1, 0.2])

    cases = preprocess_data(df_disease, feature_names)
    controls = preprocess_data(df_no_disease, feature_names)

    cases_z, controls_z = standardize_features(cases, controls, feature_names)
    inverse_cov = compute_weighted_covariance(cases_z, controls_z, feature_weights)

    pairs, seconds = optimal_matches(
        disease_data=cases,
        no_disease_data=controls,
        disease_features=cases_z,
        no_disease_features=controls_z,
        cov_inv=inverse_cov,
    )

    pairs.to_csv("match_optimal.csv", index=False)
    print(pairs.head(20))
    print(f"Total distance: {pairs['Distance'].sum():.4f}")
    print(f"Total time: {seconds}")

# Run the optimal-matching pipeline defined in this cell.
if __name__ == "__main__":
    main()

    Disease ID Disease Date  no Disease ID no Disease Date  Distance
0            1   2020-06-18          12492      2024-05-14  0.080365
1            2   2024-09-11           3729      2022-04-18  0.040974
2            3   2025-12-15           6881      2021-12-11  0.039838
3            4   2020-06-13           9124      2025-02-23  0.063913
4            4   2021-07-15            748      2022-01-14  0.056420
5            7   2023-06-21          16303      2022-12-01  0.055337
6            8   2020-06-22          18073      2022-12-13  0.099548
7            9   2024-01-08          11406      2021-07-07  0.032691
8            9   2025-02-10          13948      2023-08-14  0.046451
9           10   2020-04-21          16672      2020-04-13  0.051719
10          12   2025-07-18           4823      2022-07-23  0.046232
11          13   2021-04-01          13091      2021-11-26  0.040198
12          13   2023-08-15           6542      2024-02-09  0.065627
13          13   2024-08-23       

In [46]:
import pandas as pd
import numpy as np
from scipy.spatial.distance import cdist

# Load the male-only outcome groups; this rebinds the module-level frames
# that main() in this cell reads.
df_disease = pd.read_csv("male_disease.csv")
df_no_disease = pd.read_csv("male_no_disease.csv")

def preprocess_data(df, features):
    """Return the rows of ``df`` with no missing value in any of ``features``.

    All columns are kept and the result gets a fresh 0..n-1 index; the input
    frame is not modified.
    """
    return df.dropna(subset=features).reset_index(drop=True)

# standardization
def standardize_features(disease_df, no_disease_df, features):
    """Z-score the feature columns of both groups on a common scale.

    Mean and (sample) standard deviation are computed per feature over the
    rows of both frames together, then applied to each frame separately.
    Returns ``(disease_features, no_disease_features)``.
    """
    case_block = disease_df[features]
    control_block = no_disease_df[features]

    pooled = pd.concat([case_block, control_block])
    center, spread = pooled.mean(), pooled.std()

    return (case_block - center) / spread, (control_block - center) / spread

# compute the weighted covariance matrix
def compute_weighted_covariance(disease_features, no_disease_features, weights):
    """Return the inverse of the weight-scaled pooled covariance matrix.

    The covariance of the stacked feature rows is divided element-wise by
    ``outer(weights, weights)`` and then inverted; if NumPy reports the matrix
    as singular, the pseudo-inverse is returned instead.
    """
    pooled = pd.concat([disease_features, no_disease_features])
    scaled_cov = pooled.cov().values / np.outer(weights, weights)

    try:
        return np.linalg.inv(scaled_cov)
    except np.linalg.LinAlgError:
        return np.linalg.pinv(scaled_cov)


def greedy_matches(disease_data, no_disease_data, 
                disease_features, no_disease_features, 
                cov_inv, enable_deduplication=True):
    """Greedily pair each disease record with its nearest no-disease record.

    Disease rows are handled in order.  Each one is matched to the candidate
    no-disease row with the smallest Mahalanobis distance (``cdist`` with
    ``VI=cov_inv``).  Rows with the same ``ID`` as the disease row are never
    candidates, and with ``enable_deduplication`` a no-disease row can be used
    only once.  Disease rows without any candidate are skipped.

    The feature frames must be in the same row order as their data frames.

    Returns
    -------
    pandas.DataFrame
        One row per match with the columns ``Disease ID``, ``Disease Date``,
        ``no Disease ID``, ``no Disease Date`` and ``Distance`` (present even
        when there are no matches).
    """
    columns = ['Disease ID', 'Disease Date', 'no Disease ID', 'no Disease Date', 'Distance']

    # Pull everything into arrays once; per-row pandas lookups dominated the loop.
    case_ids = disease_data['ID'].to_numpy()
    case_dates = disease_data['date'].to_numpy()
    control_ids = no_disease_data['ID'].to_numpy()
    control_dates = no_disease_data['date'].to_numpy()
    case_matrix = np.asarray(disease_features, dtype=float)
    control_matrix = np.asarray(no_disease_features, dtype=float)

    # Positional availability flags replace rebuilding an `isin` mask per row.
    available = np.ones(len(control_ids), dtype=bool)
    matches = []

    for i in range(len(case_ids)):
        candidate_pos = np.flatnonzero(available & (control_ids != case_ids[i]))
        if candidate_pos.size == 0:
            continue

        distances = cdist(
            case_matrix[i:i + 1],
            control_matrix[candidate_pos],
            metric='mahalanobis',
            VI=cov_inv
        )[0]

        nearest = int(np.argmin(distances))
        chosen = candidate_pos[nearest]

        # record the matched results
        matches.append({
            'Disease ID': case_ids[i],
            'Disease Date': case_dates[i],
            'no Disease ID': control_ids[chosen],
            'no Disease Date': control_dates[chosen],
            'Distance': distances[nearest]
        })

        if enable_deduplication:
            available[chosen] = False

    return pd.DataFrame(matches, columns=columns)

def main():
    """Run greedy matching without deduplication on the male subsets.

    Uses the module-level ``df_disease`` / ``df_no_disease`` frames and five
    features (no gender), writes the matches to ``male_greedy_match.csv`` and
    prints the first matches.
    """
    feature_names = ['age', 'blood_pressure', 'height', 'weight', 'BMI']
    feature_weights = np.array([0.2, 0.3, 0.1, 0.2, 0.2])

    cases = preprocess_data(df_disease, feature_names)
    controls = preprocess_data(df_no_disease, feature_names)

    cases_z, controls_z = standardize_features(cases, controls, feature_names)
    inverse_cov = compute_weighted_covariance(cases_z, controls_z, feature_weights)

    pairs = greedy_matches(
        disease_data=cases,
        no_disease_data=controls,
        disease_features=cases_z,
        no_disease_features=controls_z,
        cov_inv=inverse_cov,
        enable_deduplication=False
    )

    pairs.to_csv("male_greedy_match.csv", index=False)
    print(pairs.head(20))

# Run the male-subset greedy matching defined in this cell.
if __name__ == "__main__":
    main()

    Disease ID Disease Date  no Disease ID no Disease Date  Distance
0            4   2020-06-13          11009      2020-04-13  0.086840
1            4   2021-07-15            962      2024-01-28  0.075920
2            9   2024-01-08          11406      2021-07-07  0.035475
3            9   2025-02-10           5595      2022-08-25  0.042749
4           10   2020-04-21          16672      2020-04-13  0.039130
5           14   2020-04-02          11995      2022-05-29  0.035946
6           14   2020-04-09          19950      2024-01-31  0.027881
7           16   2020-05-11           4593      2020-01-24  0.036505
8           16   2022-02-13          10423      2025-07-11  0.068332
9           16   2022-03-29          17978      2024-02-01  0.032128
10          19   2020-05-08          15740      2022-12-25  0.071778
11          19   2020-09-17          17035      2024-10-08  0.023550
12          22   2020-02-11          10934      2025-09-27  0.052035
13          22   2025-11-12       

In [47]:
import pandas as pd
import numpy as np
from scipy.spatial.distance import cdist

# Load the female-only outcome groups; this rebinds the module-level frames
# that main() in this cell reads.
df_disease = pd.read_csv("female_disease.csv")
df_no_disease = pd.read_csv("female_no_disease.csv")

def preprocess_data(df, features):
    """Return the rows of ``df`` with no missing value in any of ``features``.

    All columns are kept and the result gets a fresh 0..n-1 index; the input
    frame is not modified.
    """
    return df.dropna(subset=features).reset_index(drop=True)

# standardization
def standardize_features(disease_df, no_disease_df, features):
    """Z-score the feature columns of both groups on a common scale.

    Mean and (sample) standard deviation are computed per feature over the
    rows of both frames together, then applied to each frame separately.
    Returns ``(disease_features, no_disease_features)``.
    """
    case_block = disease_df[features]
    control_block = no_disease_df[features]

    pooled = pd.concat([case_block, control_block])
    center, spread = pooled.mean(), pooled.std()

    return (case_block - center) / spread, (control_block - center) / spread

# compute the weighted covariance matrix
def compute_weighted_covariance(disease_features, no_disease_features, weights):
    """Return the inverse of the weight-scaled pooled covariance matrix.

    The covariance of the stacked feature rows is divided element-wise by
    ``outer(weights, weights)`` and then inverted; if NumPy reports the matrix
    as singular, the pseudo-inverse is returned instead.
    """
    pooled = pd.concat([disease_features, no_disease_features])
    scaled_cov = pooled.cov().values / np.outer(weights, weights)

    try:
        return np.linalg.inv(scaled_cov)
    except np.linalg.LinAlgError:
        return np.linalg.pinv(scaled_cov)


def greedy_matches(disease_data, no_disease_data, 
                disease_features, no_disease_features, 
                cov_inv, enable_deduplication=True):
    """Greedily pair each disease record with its nearest no-disease record.

    Disease rows are handled in order.  Each one is matched to the candidate
    no-disease row with the smallest Mahalanobis distance (``cdist`` with
    ``VI=cov_inv``).  Rows with the same ``ID`` as the disease row are never
    candidates, and with ``enable_deduplication`` a no-disease row can be used
    only once.  Disease rows without any candidate are skipped.

    The feature frames must be in the same row order as their data frames.

    Returns
    -------
    pandas.DataFrame
        One row per match with the columns ``Disease ID``, ``Disease Date``,
        ``no Disease ID``, ``no Disease Date`` and ``Distance`` (present even
        when there are no matches).
    """
    columns = ['Disease ID', 'Disease Date', 'no Disease ID', 'no Disease Date', 'Distance']

    # Pull everything into arrays once; per-row pandas lookups dominated the loop.
    case_ids = disease_data['ID'].to_numpy()
    case_dates = disease_data['date'].to_numpy()
    control_ids = no_disease_data['ID'].to_numpy()
    control_dates = no_disease_data['date'].to_numpy()
    case_matrix = np.asarray(disease_features, dtype=float)
    control_matrix = np.asarray(no_disease_features, dtype=float)

    # Positional availability flags replace rebuilding an `isin` mask per row.
    available = np.ones(len(control_ids), dtype=bool)
    matches = []

    for i in range(len(case_ids)):
        candidate_pos = np.flatnonzero(available & (control_ids != case_ids[i]))
        if candidate_pos.size == 0:
            continue

        distances = cdist(
            case_matrix[i:i + 1],
            control_matrix[candidate_pos],
            metric='mahalanobis',
            VI=cov_inv
        )[0]

        nearest = int(np.argmin(distances))
        chosen = candidate_pos[nearest]

        # record the matched results
        matches.append({
            'Disease ID': case_ids[i],
            'Disease Date': case_dates[i],
            'no Disease ID': control_ids[chosen],
            'no Disease Date': control_dates[chosen],
            'Distance': distances[nearest]
        })

        if enable_deduplication:
            available[chosen] = False

    return pd.DataFrame(matches, columns=columns)

def main():
    """Run greedy matching without deduplication on the female subsets.

    Uses the module-level ``df_disease`` / ``df_no_disease`` frames and five
    features (no gender), writes the matches to ``female_greedy_match.csv``
    and prints the first matches.
    """
    feature_names = ['age', 'blood_pressure', 'height', 'weight', 'BMI']
    feature_weights = np.array([0.2, 0.3, 0.1, 0.2, 0.2])

    cases = preprocess_data(df_disease, feature_names)
    controls = preprocess_data(df_no_disease, feature_names)

    cases_z, controls_z = standardize_features(cases, controls, feature_names)
    inverse_cov = compute_weighted_covariance(cases_z, controls_z, feature_weights)

    pairs = greedy_matches(
        disease_data=cases,
        no_disease_data=controls,
        disease_features=cases_z,
        no_disease_features=controls_z,
        cov_inv=inverse_cov,
        enable_deduplication=False
    )

    pairs.to_csv("female_greedy_match.csv", index=False)
    print(pairs.head(20))

# Run the female-subset greedy matching defined in this cell.
if __name__ == "__main__":
    main()

    Disease ID Disease Date  no Disease ID no Disease Date  Distance
0            1   2020-06-18           3754      2021-10-16  0.079439
1            2   2024-09-11           3729      2022-04-18  0.046553
2            3   2025-12-15           3114      2021-04-23  0.059186
3            7   2023-06-21          16303      2022-12-01  0.039368
4            8   2020-06-22          11273      2023-10-18  0.104599
5           12   2025-07-18           4823      2022-07-23  0.049613
6           13   2021-04-01           5480      2024-09-10  0.085793
7           13   2023-08-15          13294      2024-11-17  0.087089
8           13   2024-08-23          15223      2025-07-19  0.031627
9           13   2025-03-22           7447      2021-11-18  0.098775
10          15   2021-04-27          12113      2023-11-07  0.078114
11          20   2024-07-05          15524      2024-12-23  0.046085
12          21   2024-05-13           5065      2024-05-20  0.051176
13          21   2025-06-08       