In [16]:
import pandas as pd
import numpy as np
import time

# generate the medical diagnostic data
start_time = time.perf_counter()
def generate_health_dataset(
    # generate the scale of the data set
    num_rows=100_000, 
    num_unique_ids=20_000,
    output_file='health_data.csv'):
    
    # generate gender and attributes based on gender
    gender = np.random.choice(['male', 'female'], num_unique_ids)
    params = {
        'male': {
            'blood_pressure': (140, 8, 1),
            'height': (1.70, 0.05, 2),
            'weight': (75, 6, 1)
        },
        'female': {
            'blood_pressure': (110, 7, 1),
            'height': (1.60, 0.05, 2),
            'weight': (55, 5, 1)
        }
    }

    # generate random number of visits for one same patient
    base_ids = np.arange(1, num_unique_ids + 1)
    repeat_counts = np.random.multinomial(num_rows, [1/num_unique_ids]*num_unique_ids)
    ids = np.repeat(base_ids, repeat_counts)
    
    def generate_values(gender_arr, param_type):
    # generate data frame of base data
        male_params = params['male'][param_type]
        female_params = params['female'][param_type]
        
        return np.where(
            gender_arr == 'male',
            np.random.normal(male_params[0], male_params[1], num_unique_ids).round(male_params[2]),
            np.random.normal(female_params[0], female_params[1], num_unique_ids).round(female_params[2])
        )

    base_data = pd.DataFrame({
        'ID': base_ids,
        'gender': gender,
        'base_age': np.random.randint(18, 80, num_unique_ids),
        'base_blood_pressure': generate_values(gender, 'blood_pressure'),
        'base_height': generate_values(gender, 'height'),
        'base_weight': generate_values(gender, 'weight'),
        'base_disease': np.random.randint(0, 2, num_unique_ids).astype(int)
    })
    df = pd.DataFrame({'ID': ids}).merge(base_data, on='ID', how='left')

    # generate random data from the same patient and attributes' changes based on the date
    date_rng = pd.date_range(start='2020-01-01', end='2025-12-31')
    df['date'] = np.concatenate([
        np.sort(np.random.choice(date_rng, size=count, replace=False))
        for count in np.bincount(ids) if count > 0
    ])

    df['age'] = df['base_age'] + (df['date'].dt.year - 2020)

    noise_config = {
        'blood_pressure': (0, 3, 1),
        'height': (0, 0.05, 2),
        'weight': (0, 1.5, 1)
    }
    
    for column in ['blood_pressure', 'height', 'weight']:
        base_column = f'base_{column}'
        average, standard_deviation, decimals = noise_config[column]
        df[column] = (df[base_column] + np.random.normal(average, standard_deviation, len(df))).round(decimals)

    # disease reversal possibility
    df['disease'] = np.where(
        np.random.rand(len(df)) < 0.2,
        1 - df['base_disease'],
        df['base_disease']
    )

    # delete the base fields and sort by ID and date
    df = df.drop(columns=['base_age', 'base_blood_pressure', 
                     'base_height', 'base_weight', 'base_disease'])
    
    # generate 5% clearly wrong data in specific attributes
    error_rules = {
        'blood_pressure': lambda size: np.where(
            np.random.rand(size) < 0.5,
            np.random.randint(300, 500, size),
            np.random.randint(0, 50, size)
        ),
        'height': lambda size: np.random.choice([-1.0, 4.5], size=size),
        'weight': lambda size: np.random.choice([-10, 800], size=size),
        'disease': lambda size: np.random.choice([-1, 2], size=size),
    }

    target_cols = df.columns.difference(['ID', 'gender', 'date', 'age'])

    for col in target_cols:
        mask = np.random.rand(len(df)) < 0.05
        num_errors = mask.sum()
        errors = error_rules[col](num_errors)
        df.loc[mask, col] = errors
    
    # generate 5% random data missing
    cols_to_missing = df.columns.difference(['ID', 'date'])
    mask = np.random.rand(*df[cols_to_missing].shape) < 0.05
    df[cols_to_missing] = df[cols_to_missing].mask(mask)

    # calculate the BMI
    df['BMI'] = (df.weight / (df.height ** 2)).round(1)

    # sort by ID and date
    df = df.sort_values(['ID', 'date']).reset_index(drop=True)

    # save the generated data set as csv file
    if output_file:
        df.to_csv(
            output_file,
            index=False,
            sep=',',
            encoding='utf-8',
            float_format='%.1f'
        )
    # count time
    elapsed = time.perf_counter() - start_time
    return df, round(elapsed, 5)

if __name__ == "__main__":
    df, run_time = generate_health_dataset()
    print(f"total {run_time} seconds")
    print(df.head(20))

total 1.07391 seconds
    ID  gender       date   age  blood_pressure  height  weight  disease  \
0    1    male 2020-02-05  63.0           139.0    1.73    73.1      0.0   
1    1    male 2021-02-04  64.0           130.8    1.67    74.8      0.0   
2    1    male 2021-07-31  64.0           138.2    1.81    72.5      0.0   
3    1    male 2022-08-12  65.0           141.2    1.82    73.6      0.0   
4    1    male 2023-10-01  66.0           383.0    1.79    72.2      0.0   
5    2    male 2020-05-16  67.0           136.6    1.71     NaN      NaN   
6    2    male 2021-01-05  68.0           134.1    1.64    71.5      2.0   
7    2    male 2025-01-28  72.0           134.3     NaN    74.3      0.0   
8    2    male 2025-04-03  72.0           135.3    1.69    74.4      NaN   
9    3  female 2020-06-29  39.0           122.0    1.67    45.9      1.0   
10   3  female 2022-03-28  41.0             NaN    1.61    48.1      1.0   
11   3  female 2024-10-18  43.0           131.4    1.60   800.0   

In [17]:
# correct the missing age and gender according to the same ID and date
import pandas as pd
from datetime import datetime
import numpy as np
import time

def correct_age_gender(df):
# correct the gender
    df["gender"] = df.groupby("ID")["gender"].transform(
    lambda x: x.fillna(x.mode()[0]) if not x.mode().empty else x
)
    df = df.groupby("ID").filter(
    lambda group: ~group["gender"].isna().all()
)
# correct the age
    df['date'] = pd.to_datetime(df['date'])
    df['year'] = df['date'].dt.year

    grouped = df.groupby('ID')
    
    processed_dfs = []
    
    for id, group in grouped:
        # group by id and sort by date and age
        group = group.sort_values('year').reset_index(drop=True)
        ages = group['age'].copy()
        years = group['year']

        for i in range(len(group)):
            # scan every empty space of age, call functions to get the previous and next age of the selected one
            if pd.isna(ages[i]):
                prev_age, prev_year = find_previous_value(ages, years, i)
                next_age, next_year = find_next_value(ages, years, i)
                
                # different cases of nearest valid information
                if prev_age is not None and next_age is not None:
                    year_diff = next_year - prev_year
                    if year_diff == 0:
                        if prev_age == next_age:
                            ages[i] = prev_age
                        else:
                            ages[i] = (prev_age + next_age) // 2
                    else:
                        exact_age = prev_age + ((years[i] - prev_year) / year_diff) * (next_age - prev_age)
                        ages[i] = int(round(exact_age))
                elif prev_age is not None:
                    ages[i] = prev_age + (years[i] - prev_year)
                elif next_age is not None:
                    ages[i] = next_age - (next_year - years[i])
                    
        # sort to find the first valid age
        first_valid_idx = ages.first_valid_index()
        if first_valid_idx is not None and first_valid_idx > 0:
            first_valid_age = ages[first_valid_idx]
            first_valid_year = years[first_valid_idx]
            # reverse loop to get the former id without age
            for i in range(first_valid_idx-1, -1, -1):
                ages[i] = first_valid_age - (first_valid_year - years[i])

        # sort to find the last valid age
        last_valid_idx = ages.last_valid_index()
        if last_valid_idx is not None and last_valid_idx < len(ages)-1:
            last_valid_age = ages[last_valid_idx]
            last_valid_year = years[last_valid_idx]
            # loop to get the last id without age if needed
            for i in range(last_valid_idx+1, len(ages)):
                ages[i] = last_valid_age + (years[i] - last_valid_year)
        
        group['age'] = ages.astype('Int64')
        processed_dfs.append(group)
    
    final_df = pd.concat(processed_dfs)
    df = final_df.drop(columns=["year"])
    return df

# find the nearest date and age information in the same IDs
def find_previous_value(ages, years, current_idx):
    for i in range(current_idx-1, -1, -1):
        if not pd.isna(ages[i]):
            return ages[i], years[i]
    return None, None

def find_next_value(ages, years, current_idx):
    for i in range(current_idx+1, len(ages)):
        if not pd.isna(ages[i]):
            return ages[i], years[i]
    return None, None

if __name__ == '__main__':
    total_start = time.perf_counter()
    df = pd.read_csv("health_data.csv")
    processed_df = correct_age_gender(df)
    processed_df.to_csv("processed_health_data.csv", index=False)
    total_end = time.perf_counter()
    total_run_time = total_end - total_start
    print(f"Total run time: {total_run_time:.5f} seconds")
    print(processed_df.head(20))

Total run time: 12.32096 seconds
   ID  gender       date  age  blood_pressure  height  weight  disease    BMI
0   1    male 2020-02-05   63           139.0     1.7    73.1      0.0   24.4
1   1    male 2021-02-04   64           130.8     1.7    74.8      0.0   26.8
2   1    male 2021-07-31   64           138.2     1.8    72.5      0.0   22.1
3   1    male 2022-08-12   65           141.2     1.8    73.6      0.0   22.2
4   1    male 2023-10-01   66           383.0     1.8    72.2      0.0   22.5
0   2    male 2020-05-16   67           136.6     1.7     NaN      NaN    NaN
1   2    male 2021-01-05   68           134.1     1.6    71.5      2.0   26.6
2   2    male 2025-01-28   72           134.3     NaN    74.3      0.0    NaN
3   2    male 2025-04-03   72           135.3     1.7    74.4      NaN   26.0
0   3  female 2020-06-29   39           122.0     1.7    45.9      1.0   16.5
1   3  female 2022-03-28   41             NaN     1.6    48.1      1.0   18.6
2   3  female 2024-10-18   43  

In [18]:
# restrict all the dataset in the correct range in order to delete rows contained wrong and missing data
import pandas as pd

df = pd.read_csv("processed_health_data.csv")
valid_conditions = (
    df["age"].between(10, 100)
    & df["height"].between(0.5, 2.5)
    & df["weight"].between(20, 300)
    & (df["blood_pressure"].between(60, 200))
    & df["disease"].isin([0, 1])
    & df["gender"].isin(["male", "female"])
)

cleaned_df = df[valid_conditions]

cleaned_df.to_csv("cleaned_health_data.csv", index=False)
print(cleaned_df.head(20))

    ID  gender        date   age  blood_pressure  height  weight  disease  \
0    1    male  2020-02-05  63.0           139.0     1.7    73.1      0.0   
1    1    male  2021-02-04  64.0           130.8     1.7    74.8      0.0   
2    1    male  2021-07-31  64.0           138.2     1.8    72.5      0.0   
3    1    male  2022-08-12  65.0           141.2     1.8    73.6      0.0   
9    3  female  2020-06-29  39.0           122.0     1.7    45.9      1.0   
13   3  female  2025-08-21  44.0           127.5     1.7    50.6      0.0   
15   4    male  2023-10-21  25.0           141.3     1.9    81.8      0.0   
16   5    male  2020-12-01  39.0           142.0     1.6    76.0      0.0   
17   5    male  2021-09-07  40.0           144.5     1.6    71.2      0.0   
18   5    male  2021-10-17  40.0           137.4     1.6    72.0      0.0   
19   5    male  2021-10-26  40.0           143.1     1.6    72.3      0.0   
20   5    male  2021-12-14  40.0           137.9     1.6    71.8      0.0   

In [19]:
import pandas as pd
def split_and_save_data(
    file_path: str,
    target_column: str,
    split_conditions: dict,
) -> None:
    
    df = pd.read_csv(file_path)
    df_cleaned = df.copy()

    for filename, condition_func in split_conditions.items():
        mask = condition_func(df_cleaned[target_column])
        split_df = df_cleaned[mask]
        split_df.to_csv(filename, index=False)

# split the data set by disease condition
disease_conditions = {
    "no_disease.csv": lambda x: x == 0,
    "disease.csv": lambda x: x == 1
}

split_and_save_data(
    file_path = "cleaned_health_data.csv",
    target_column = df.columns[7],
    split_conditions = disease_conditions,
)

# split the disease and no_disease data set by gender
split_and_save_data(
    file_path="disease.csv",
    target_column = df.columns[1],
    split_conditions={
        "male_disease.csv": lambda x: x == "male",
        "female_disease.csv": lambda x: x == "female"
    }
)

split_and_save_data(
    file_path="no_disease.csv",
    target_column = df.columns[1],
    split_conditions={
        "male_no_disease.csv": lambda x: x == "male",
        "female_no_disease.csv": lambda x: x == "female"
    }
)

In [26]:
import pandas as pd
import numpy as np
from scipy.spatial.distance import cdist

df_disease = pd.read_csv("disease.csv")
df_no_disease = pd.read_csv("no_disease.csv")

def preprocess_data(df, features):
    df = df.copy()
    df['gender'] = df['gender'].map({'female': 0, 'male': 1})
    return df[features + ['ID', 'date']]

# standardization
def standardize_features(disease_df, no_disease_df, features):
    combined = pd.concat([disease_df[features], no_disease_df[features]])
    mean = combined.mean()
    std = combined.std()

    disease_features = (disease_df[features] - mean) / std
    no_disease_features = (no_disease_df[features] - mean) / std
    return disease_features, no_disease_features

# compute the weighted covariance matrix
def compute_weighted_covariance(disease_features, no_disease_features, weights):

    combined = pd.concat([disease_features, no_disease_features])
    cov_matrix = combined.cov().values

    W_matrix = np.outer(weights, weights)
    cov_weighted = cov_matrix / W_matrix

    try:
        cov_inv = np.linalg.inv(cov_weighted)
    except np.linalg.LinAlgError:
        cov_inv = np.linalg.pinv(cov_weighted)
    return cov_inv

In [27]:
def exact_matches(disease_data, no_disease_data, 
                disease_features, no_disease_features, 
                cov_inv, enable_deduplication=True):
    matches = []
    used_indices = set() if enable_deduplication else None
    
    total_time = 0.0
    for i in range(len(disease_data)):
        disease_id = disease_data.iloc[i]['ID']
        disease_date = disease_data.iloc[i]['date']
        current_features = disease_features.iloc[i].values.reshape(1, -1)

        mask = (no_disease_data['ID'] != disease_id)
        if enable_deduplication:
            mask &= (~no_disease_data.index.isin(used_indices))
            
        candidates = no_disease_data[mask]
        if candidates.empty:
            continue

        distances = cdist(
            current_features, 
            no_disease_features[mask], 
            metric='mahalanobis', 
            VI=cov_inv
        )

        min_idx = np.argmin(distances)
        best_match_idx = candidates.index[min_idx]

        end_time = time.perf_counter()
        elapsed_time = end_time - start_time
        total_time += elapsed_time

# record the matched results
        record = {
            'Disease ID': disease_id,
            'Disease Date': disease_date,
            'no Disease ID': no_disease_data.loc[best_match_idx, 'ID'],
            'no Disease Date': no_disease_data.loc[best_match_idx, 'date'],
            'Distance': distances[0, min_idx]
        }
        matches.append(record)

        if enable_deduplication:
            used_indices.add(best_match_idx)
            
    return pd.DataFrame(matches), total_time

def main():

    features = ['age', 'gender', 'blood_pressure', 'height', 'weight', 'BMI']
    weights = np.array([0.2, 0.1, 0.3, 0.1, 0.1, 0.2])
    output_path = "exact_match.csv"

    X_disease = preprocess_data(df_disease, features)
    X_no_disease = preprocess_data(df_no_disease, features)

    disease_features, no_disease_features = standardize_features(
        X_disease, X_no_disease, features
    )

    cov_inv = compute_weighted_covariance(
        disease_features, no_disease_features, weights
    )

    matches_df, total_time = exact_matches(
        disease_data=X_disease,
        no_disease_data=X_no_disease,
        disease_features=disease_features,
        no_disease_features=no_disease_features,
        cov_inv=cov_inv,
        enable_deduplication=True
    )

    matches_df.to_csv(output_path, index=False)
    print(matches_df.head(20))

    total_distance = matches_df['Distance'].sum()
    print("Total distance:", total_distance)
    print(f"Total time: {total_time} seconds")

if __name__ == "__main__":
    main()

    Disease ID Disease Date  no Disease ID no Disease Date  Distance
0            3   2020-06-29           5297      2024-03-08  0.095427
1            5   2022-07-13           7617      2022-08-13  0.068708
2            5   2024-08-24          13310      2025-02-13  0.020420
3            6   2023-01-24          10772      2022-11-06  0.046558
4            6   2023-07-09          17858      2025-02-12  0.058789
5            6   2024-03-16          13336      2022-12-09  0.074670
6            6   2025-08-17          10897      2025-12-25  0.052381
7            8   2021-02-17          11248      2020-11-19  0.041541
8            8   2021-12-01          10128      2024-09-06  0.034247
9            8   2022-09-13          19581      2025-04-24  0.029912
10           8   2023-03-09          12450      2021-02-04  0.074480
11           8   2023-10-06          11518      2024-12-18  0.045872
12           8   2024-08-28           3317      2021-05-23  0.067742
13           8   2025-02-17       

In [30]:
import time
def quick_match(disease_data, no_disease_data, 
                disease_features, no_disease_features, 
                cov_inv, enable_deduplication=True):
    matches = []
    used_indices = set() if enable_deduplication else None
    total_time = 0.0

    for i in range(len(disease_data)):

        disease_id = disease_data.iloc[i]['ID']
        disease_date = disease_data.iloc[i]['date']
        current_features = disease_features.iloc[i].values.reshape(1, -1)

        mask = (no_disease_data['ID'] != disease_id)
        if enable_deduplication:
            mask &= (~no_disease_data.index.isin(used_indices))
            
        start_time = time.perf_counter()

        candidates = no_disease_data[mask]
        if candidates.empty:
            continue

# choose the matched target randomly
        candidates = candidates.sample(frac=1)
        first_candidate_idx = candidates.index[0]
        
        candidate_features = no_disease_features.loc[first_candidate_idx].values.reshape(1, -1)
        distance = cdist(
            current_features,
            candidate_features,
            metric='mahalanobis',
            VI=cov_inv
        )[0][0]

        end_time = time.perf_counter()
        elapsed_time = end_time - start_time
        total_time += elapsed_time
        
        matches.append({
            'Disease ID': disease_id,
            'Disease Date': disease_date,
            'no Disease ID': no_disease_data.loc[first_candidate_idx, 'ID'],
            'no Disease Date': no_disease_data.loc[first_candidate_idx, 'date'],
            'Distance': distance,
        })

        if enable_deduplication:
            used_indices.add(first_candidate_idx)
            
    return pd.DataFrame(matches), total_time

def main():
    features = ['age', 'gender', 'blood_pressure', 'height', 'weight', 'BMI']
    weights = np.array([0.2, 0.1, 0.3, 0.1, 0.1, 0.2])

    X_disease = preprocess_data(df_disease, features)
    X_no_disease = preprocess_data(df_no_disease, features)

    disease_features, no_disease_features = standardize_features(
        X_disease, X_no_disease, features
    )

    cov_inv = compute_weighted_covariance(
        disease_features, no_disease_features, weights
    )

    matches_df, total_time = quick_match(
        disease_data=X_disease,
        no_disease_data=X_no_disease,
        disease_features=disease_features,
        no_disease_features=no_disease_features,
        cov_inv=cov_inv,
        enable_deduplication=True
    )

    matches_df.to_csv("quick_match.csv", index=False)
    print(matches_df.head(20))
    print(f"Total time: {total_time} seconds")

if __name__ == "__main__":
    main()

    Disease ID Disease Date  no Disease ID no Disease Date  Distance
0            3   2020-06-29            938      2022-02-15  0.858506
1            5   2022-07-13          14945      2021-03-03  1.613660
2            5   2024-08-24          17094      2024-09-29  0.731714
3            6   2023-01-24           6989      2021-08-11  1.026536
4            6   2023-07-09           4511      2025-02-28  0.913302
5            6   2024-03-16           3199      2024-03-20  0.717037
6            6   2025-08-17          11904      2022-05-08  0.630327
7            8   2021-02-17           4297      2020-04-22  1.095101
8            8   2021-12-01          18283      2024-10-29  1.038392
9            8   2022-09-13           1896      2020-11-05  0.804035
10           8   2023-03-09          10247      2023-08-29  0.421928
11           8   2023-10-06           2199      2023-08-07  0.687302
12           8   2024-08-28           6343      2025-12-20  0.765744
13           8   2025-02-17       

In [23]:
import pandas as pd
import numpy as np
from scipy.spatial.distance import cdist

df_disease = pd.read_csv("male_disease.csv")
df_no_disease = pd.read_csv("male_no_disease.csv")

def preprocess_data(df, features):
    df = df.copy()
    df_clean = df.dropna(subset=features).reset_index(drop=True)
    return df_clean

# standardization
def standardize_features(disease_df, no_disease_df, features):
    combined = pd.concat([disease_df[features], no_disease_df[features]])
    mean = combined.mean()
    std = combined.std()

    disease_features = (disease_df[features] - mean) / std
    no_disease_features = (no_disease_df[features] - mean) / std
    return disease_features, no_disease_features

# compute the weighted covariance matrix
def compute_weighted_covariance(disease_features, no_disease_features, weights):

    combined = pd.concat([disease_features, no_disease_features])
    cov_matrix = combined.cov().values

    W_matrix = np.outer(weights, weights)
    cov_weighted = cov_matrix / W_matrix

    try:
        cov_inv = np.linalg.inv(cov_weighted)
    except np.linalg.LinAlgError:
        cov_inv = np.linalg.pinv(cov_weighted)
    return cov_inv


def exact_matches(disease_data, no_disease_data, 
                disease_features, no_disease_features, 
                cov_inv, enable_deduplication=True):
    matches = []
    used_indices = set() if enable_deduplication else None
    
    for i in range(len(disease_data)):
        disease_id = disease_data.iloc[i]['ID']
        disease_date = disease_data.iloc[i]['date']
        current_features = disease_features.iloc[i].values.reshape(1, -1)

        mask = (no_disease_data['ID'] != disease_id)
        if enable_deduplication:
            mask &= (~no_disease_data.index.isin(used_indices))
            
        candidates = no_disease_data[mask]
        if candidates.empty:
            continue

        distances = cdist(
            current_features, 
            no_disease_features[mask], 
            metric='mahalanobis', 
            VI=cov_inv
        )

        min_idx = np.argmin(distances)
        best_match_idx = candidates.index[min_idx]
# record the matched results
        record = {
            'Disease ID': disease_id,
            'Disease Date': disease_date,
            'no Disease ID': no_disease_data.loc[best_match_idx, 'ID'],
            'no Disease Date': no_disease_data.loc[best_match_idx, 'date'],
            'Distance': distances[0, min_idx]
        }
        matches.append(record)

        if enable_deduplication:
            used_indices.add(best_match_idx)
            
    return pd.DataFrame(matches)

def main():

    features = ['age', 'blood_pressure', 'height', 'weight', 'BMI']
    weights = np.array([0.2, 0.3, 0.1, 0.2, 0.2])
    output_path = "male_exact_match.csv"

    X_disease = preprocess_data(df_disease, features)
    X_no_disease = preprocess_data(df_no_disease, features)

    disease_features, no_disease_features = standardize_features(
        X_disease, X_no_disease, features
    )

    cov_inv = compute_weighted_covariance(
        disease_features, no_disease_features, weights
    )

    matches_df = exact_matches(
        disease_data=X_disease,
        no_disease_data=X_no_disease,
        disease_features=disease_features,
        no_disease_features=no_disease_features,
        cov_inv=cov_inv,
        enable_deduplication=False
    )

    matches_df.to_csv(output_path, index=False)
    print(matches_df.head(20))

if __name__ == "__main__":
    main()

    Disease ID Disease Date  no Disease ID no Disease Date  Distance
0            5   2022-07-13          10557      2023-11-30  0.087765
1            5   2024-08-24          13310      2025-02-13  0.016314
2            8   2021-02-17          11248      2020-11-19  0.029549
3            8   2021-12-01          10128      2024-09-06  0.030839
4            8   2022-09-13          19581      2025-04-24  0.027471
5            8   2023-03-09          15997      2022-09-05  0.076631
6            8   2023-10-06          11518      2024-12-18  0.044540
7            8   2024-08-28           3317      2021-05-23  0.045626
8            8   2025-02-17           7062      2023-01-14  0.051697
9            8   2025-04-27          16606      2025-11-23  0.079455
10          10   2021-10-28           8685      2020-10-05  0.040774
11          12   2020-05-08           7573      2022-11-26  0.031477
12          12   2022-07-26          15063      2021-11-01  0.071587
13          12   2024-07-09       

In [24]:
import pandas as pd
import numpy as np
from scipy.spatial.distance import cdist

df_disease = pd.read_csv("female_disease.csv")
df_no_disease = pd.read_csv("female_no_disease.csv")

def preprocess_data(df, features):
    df = df.copy()
    df_clean = df.dropna(subset=features).reset_index(drop=True)
    return df_clean

# standardization
def standardize_features(disease_df, no_disease_df, features):
    combined = pd.concat([disease_df[features], no_disease_df[features]])
    mean = combined.mean()
    std = combined.std()

    disease_features = (disease_df[features] - mean) / std
    no_disease_features = (no_disease_df[features] - mean) / std
    return disease_features, no_disease_features

# compute the weighted covariance matrix
def compute_weighted_covariance(disease_features, no_disease_features, weights):

    combined = pd.concat([disease_features, no_disease_features])
    cov_matrix = combined.cov().values

    W_matrix = np.outer(weights, weights)
    cov_weighted = cov_matrix / W_matrix

    try:
        cov_inv = np.linalg.inv(cov_weighted)
    except np.linalg.LinAlgError:
        cov_inv = np.linalg.pinv(cov_weighted)
    return cov_inv


def exact_matches(disease_data, no_disease_data, 
                disease_features, no_disease_features, 
                cov_inv, enable_deduplication=True):
    matches = []
    used_indices = set() if enable_deduplication else None
    
    for i in range(len(disease_data)):
        disease_id = disease_data.iloc[i]['ID']
        disease_date = disease_data.iloc[i]['date']
        current_features = disease_features.iloc[i].values.reshape(1, -1)

        mask = (no_disease_data['ID'] != disease_id)
        if enable_deduplication:
            mask &= (~no_disease_data.index.isin(used_indices))
            
        candidates = no_disease_data[mask]
        if candidates.empty:
            continue

        distances = cdist(
            current_features, 
            no_disease_features[mask], 
            metric='mahalanobis', 
            VI=cov_inv
        )

        min_idx = np.argmin(distances)
        best_match_idx = candidates.index[min_idx]
# record the matched results
        record = {
            'Disease ID': disease_id,
            'Disease Date': disease_date,
            'no Disease ID': no_disease_data.loc[best_match_idx, 'ID'],
            'no Disease Date': no_disease_data.loc[best_match_idx, 'date'],
            'Distance': distances[0, min_idx]
        }
        matches.append(record)

        if enable_deduplication:
            used_indices.add(best_match_idx)
            
    return pd.DataFrame(matches)

def main():

    features = ['age', 'blood_pressure', 'height', 'weight', 'BMI']
    weights = np.array([0.2, 0.3, 0.1, 0.2, 0.2])
    output_path = "female_exact_match.csv"

    X_disease = preprocess_data(df_disease, features)
    X_no_disease = preprocess_data(df_no_disease, features)

    disease_features, no_disease_features = standardize_features(
        X_disease, X_no_disease, features
    )

    cov_inv = compute_weighted_covariance(
        disease_features, no_disease_features, weights
    )

    matches_df = exact_matches(
        disease_data=X_disease,
        no_disease_data=X_no_disease,
        disease_features=disease_features,
        no_disease_features=no_disease_features,
        cov_inv=cov_inv,
        enable_deduplication=False
    )

    matches_df.to_csv(output_path, index=False)
    print(matches_df.head(20))

if __name__ == "__main__":
    main()

    Disease ID Disease Date  no Disease ID no Disease Date  Distance
0            3   2020-06-29           9309      2025-04-07  0.075228
1            6   2023-01-24          10772      2022-11-06  0.048375
2            6   2023-07-09          17858      2025-02-12  0.062506
3            6   2024-03-16            353      2025-07-29  0.069464
4            6   2025-08-17          10897      2025-12-25  0.054152
5            9   2020-01-14          12393      2022-07-18  0.032949
6            9   2021-02-19          14964      2020-01-08  0.019129
7            9   2022-10-11          19284      2021-11-30  0.066531
8           13   2022-11-24          13732      2025-10-30  0.097965
9           15   2020-08-10           9998      2024-10-16  0.151889
10          15   2021-05-23           4705      2021-02-06  0.069934
11          15   2023-02-24          12781      2023-09-28  0.030996
12          15   2024-07-17           6281      2021-11-09  0.028148
13          16   2020-06-20       

In [None]:
import numpy as np
import pandas as pd
from sklearn.neighbors import NearestNeighbors
from scipy.optimize import linear_sum_assignment

def preprocess_data(df, features):
    df = df.copy()
    if 'gender' in features:
        df['gender'] = df['gender'].map({'female': 0, 'male': 1})
    return df[features + ['ID', 'date']]

def standardize_features(*dfs, features):
    combined = pd.concat([df[features] for df in dfs])
    mean, std = combined.mean(), combined.std()
    return [(df[features] - mean) / std for df in dfs]

def compute_global_covariance(disease_feat, no_disease_feat, weights):
    combined = pd.concat([disease_feat, no_disease_feat])
    cov = combined.cov().values
    W = np.outer(weights, weights)
    weighted_cov = cov / (W + 1e-8)
    return np.linalg.pinv(weighted_cov)

class DataBlockManager:

    def __init__(self, disease_df, no_disease_df, features, cov_inv):
        self.disease_data = disease_df.reset_index(drop=True)
        self.no_disease_data = no_disease_df.reset_index(drop=True)
        self.disease_allocated = np.zeros(len(disease_df), dtype=bool)
        self.no_disease_allocated = np.zeros(len(no_disease_df), dtype=bool)

        self.disease_nn = NearestNeighbors(
            n_neighbors=1000, 
            metric='mahalanobis',
            metric_params={'VI': cov_inv}
        )
        self.disease_nn.fit(self.disease_data[features].values)
        
        self.no_disease_nn = NearestNeighbors(
            n_neighbors=1000, 
            metric='mahalanobis',
            metric_params={'VI': cov_inv}
        )
        self.no_disease_nn.fit(self.no_disease_data[features].values)
    
    def allocate_block(self, seed, k=50):
# find the same number of disease and no_disease targets
        _, disease_indices = self.disease_nn.kneighbors([seed])
        disease_indices = disease_indices[0]
        disease_selected = []
        for idx in disease_indices:
            if not self.disease_allocated[idx]:
                disease_selected.append(idx)
                if len(disease_selected) == k: break
        if len(disease_selected) < k: return None

        _, no_disease_indices = self.no_disease_nn.kneighbors([seed])
        no_disease_indices = no_disease_indices[0]
        no_disease_selected = []
        for idx in no_disease_indices:
            if not self.no_disease_allocated[idx]:
                no_disease_selected.append(idx)
                if len(no_disease_selected) == k: break
        if len(no_disease_selected) < k: return None

        self.disease_allocated[disease_selected] = True
        self.no_disease_allocated[no_disease_selected] = True
        return (disease_selected, no_disease_selected)

def main():
    features = ['age', 'gender', 'blood_pressure', 'height', 'weight', 'BMI']
    weights = np.array([0.2, 0.1, 0.3, 0.1, 0.1, 0.2])
    n_seeds = 1000
    block_size = 100

    df_disease = pd.read_csv("disease.csv")
    df_no_disease = pd.read_csv("no_disease.csv")
    X_disease = preprocess_data(df_disease, features)
    X_no_disease = preprocess_data(df_no_disease, features)

    disease_feat, no_disease_feat = standardize_features(
        X_disease, X_no_disease, features=features
    )

    cov_inv = compute_global_covariance(
        disease_feat, no_disease_feat, weights
    )
    
    # choose the seeds randomly
    combined_feat = pd.concat([disease_feat, no_disease_feat]).values
    seed_indices = np.random.choice(len(combined_feat), n_seeds, replace=False)
    seeds = combined_feat[seed_indices]
    
    # initialize the data block manager
    manager = DataBlockManager(
        X_disease, X_no_disease, 
        features=features,
        cov_inv=cov_inv
    )
    total_distance = 0.0

    for seed in seeds:
        allocated = manager.allocate_block(seed)
        if allocated is None: continue
        
        # get the sepecialized data
        disease_idx, no_disease_idx = allocated
        block_disease_feat = disease_feat.iloc[disease_idx].values
        block_no_disease_feat = no_disease_feat.iloc[no_disease_idx].values

        total_distance += compute_global_covariance(
            block_disease_feat, block_no_disease_feat, cov_inv
        )
    
    print(f"Total distance: {total_distance:.2f}")

if __name__ == "__main__":
    main()

TypeError: cannot concatenate object of type '<class 'numpy.ndarray'>'; only Series and DataFrame objs are valid