In [83]:
import pandas as pd
import numpy as np
import time

# generate the medical diagnostic data
start_time = time.perf_counter()
def generate_health_dataset(
    # generate the scale of the data set
    num_rows=50, 
    num_unique_ids=25,
    output_file='health_data.csv'):
    
    # generate gender and attributes based on gender
    gender = np.random.choice(['male', 'female'], num_unique_ids)
    params = {
        'male': {
            'blood_pressure': (140, 8, 1),
            'height': (1.70, 0.05, 2),
            'weight': (75, 6, 1)
        },
        'female': {
            'blood_pressure': (110, 7, 1),
            'height': (1.60, 0.05, 2),
            'weight': (55, 5, 1)
        }
    }

    # generate random number of visits for one same patient
    base_ids = np.arange(1, num_unique_ids + 1)
    repeat_counts = np.random.multinomial(num_rows, [1/num_unique_ids]*num_unique_ids)
    ids = np.repeat(base_ids, repeat_counts)
    
    def generate_values(gender_arr, param_type):
    # generate data frame of base data
        male_params = params['male'][param_type]
        female_params = params['female'][param_type]
        
        return np.where(
            gender_arr == 'male',
            np.random.normal(male_params[0], male_params[1], num_unique_ids).round(male_params[2]),
            np.random.normal(female_params[0], female_params[1], num_unique_ids).round(female_params[2])
        )

    base_data = pd.DataFrame({
        'ID': base_ids,
        'gender': gender,
        'base_age': np.random.randint(18, 80, num_unique_ids),
        'base_blood_pressure': generate_values(gender, 'blood_pressure'),
        'base_height': generate_values(gender, 'height'),
        'base_weight': generate_values(gender, 'weight'),
        'base_disease': np.random.randint(0, 2, num_unique_ids).astype(int)
    })
    df = pd.DataFrame({'ID': ids}).merge(base_data, on='ID', how='left')

    # generate random data from the same patient and attributes' changes based on the date
    date_rng = pd.date_range(start='2020-01-01', end='2025-12-31')
    df['date'] = np.concatenate([
        np.sort(np.random.choice(date_rng, size=count, replace=False))
        for count in np.bincount(ids) if count > 0
    ])

    df['age'] = df['base_age'] + (df['date'].dt.year - 2020)

    noise_config = {
        'blood_pressure': (0, 3, 1),
        'height': (0, 0.05, 2),
        'weight': (0, 1.5, 1)
    }
    
    for column in ['blood_pressure', 'height', 'weight']:
        base_column = f'base_{column}'
        average, standard_deviation, decimals = noise_config[column]
        df[column] = (df[base_column] + np.random.normal(average, standard_deviation, len(df))).round(decimals)

    # disease reversal possibility
    df['disease'] = np.where(
        np.random.rand(len(df)) < 0.2,
        1 - df['base_disease'],
        df['base_disease']
    )

    # delete the base fields and sort by ID and date
    df = df.drop(columns=['base_age', 'base_blood_pressure', 
                     'base_height', 'base_weight', 'base_disease'])
    
    # generate 5% clearly wrong data in specific attributes
    error_rules = {
        'blood_pressure': lambda size: np.where(
            np.random.rand(size) < 0.5,
            np.random.randint(300, 500, size),
            np.random.randint(0, 50, size)
        ),
        'height': lambda size: np.random.choice([-1.0, 4.5], size=size),
        'weight': lambda size: np.random.choice([-10, 800], size=size),
        'disease': lambda size: np.random.choice([-1, 2], size=size),
    }

    target_cols = df.columns.difference(['ID', 'gender', 'date', 'age'])

    for col in target_cols:
        mask = np.random.rand(len(df)) < 0.05
        num_errors = mask.sum()
        errors = error_rules[col](num_errors)
        df.loc[mask, col] = errors
    
    # generate 5% random data missing
    cols_to_missing = df.columns.difference(['ID', 'date'])
    mask = np.random.rand(*df[cols_to_missing].shape) < 0.05
    df[cols_to_missing] = df[cols_to_missing].mask(mask)

    # calculate the BMI
    df['BMI'] = (df.weight / (df.height ** 2)).round(1)

    # sort by ID and date
    df = df.sort_values(['ID', 'date']).reset_index(drop=True)

    # save the generated data set as csv file
    if output_file:
        df.to_csv(
            output_file,
            index=False,
            sep=',',
            encoding='utf-8',
            float_format='%.1f'
        )
    # count time
    elapsed = time.perf_counter() - start_time
    return df, round(elapsed, 5)

if __name__ == "__main__":
    df, run_time = generate_health_dataset()
    print(f"total {run_time} seconds")
    print(df.head(20))

total 0.00862 seconds
    ID  gender       date   age  blood_pressure  height  weight  disease   BMI
0    1  female 2022-11-15  25.0           100.7    1.51    44.6      0.0  19.6
1    2    male 2020-09-28  39.0           133.2    1.75    75.6      0.0  24.7
2    3  female 2020-05-30  67.0           109.0    1.50    41.0      1.0  18.2
3    3  female 2024-05-23  71.0           104.8    1.56    42.4      1.0  17.4
4    3  female 2025-04-21  72.0           110.5    1.50    40.8      1.0  18.1
5    4    male 2022-09-30  75.0           126.4    1.69    77.1      1.0  27.0
6    4    male 2024-10-03  77.0           120.9    1.63    77.4      NaN  29.1
7    5  female 2023-04-23  21.0           121.0     NaN    60.0      0.0   NaN
8    5  female 2025-05-15  23.0           117.8    1.51    54.6      0.0  23.9
9    6  female 2020-04-01  66.0            93.6    1.66    52.9      0.0  19.2
10   6  female 2024-06-29  70.0            98.7    1.74    50.2     -1.0  16.6
11   6  female 2025-05-07  71.

In [84]:
# correct the missing age and gender according to the same ID and date
import pandas as pd
from datetime import datetime
import numpy as np
import time

def correct_age_gender(df):
# correct the gender
    df["gender"] = df.groupby("ID")["gender"].transform(
    lambda x: x.fillna(x.mode()[0]) if not x.mode().empty else x
)
    df = df.groupby("ID").filter(
    lambda group: ~group["gender"].isna().all()
)
# correct the age
    df['date'] = pd.to_datetime(df['date'])
    df['year'] = df['date'].dt.year

    grouped = df.groupby('ID')
    
    processed_dfs = []
    
    for id, group in grouped:
        # group by id and sort by date and age
        group = group.sort_values('year').reset_index(drop=True)
        ages = group['age'].copy()
        years = group['year']

        for i in range(len(group)):
            # scan every empty space of age, call functions to get the previous and next age of the selected one
            if pd.isna(ages[i]):
                prev_age, prev_year = find_previous_value(ages, years, i)
                next_age, next_year = find_next_value(ages, years, i)
                
                # different cases of nearest valid information
                if prev_age is not None and next_age is not None:
                    year_diff = next_year - prev_year
                    if year_diff == 0:
                        if prev_age == next_age:
                            ages[i] = prev_age
                        else:
                            ages[i] = (prev_age + next_age) // 2
                    else:
                        exact_age = prev_age + ((years[i] - prev_year) / year_diff) * (next_age - prev_age)
                        ages[i] = int(round(exact_age))
                elif prev_age is not None:
                    ages[i] = prev_age + (years[i] - prev_year)
                elif next_age is not None:
                    ages[i] = next_age - (next_year - years[i])
                    
        # sort to find the first valid age
        first_valid_idx = ages.first_valid_index()
        if first_valid_idx is not None and first_valid_idx > 0:
            first_valid_age = ages[first_valid_idx]
            first_valid_year = years[first_valid_idx]
            # reverse loop to get the former id without age
            for i in range(first_valid_idx-1, -1, -1):
                ages[i] = first_valid_age - (first_valid_year - years[i])

        # sort to find the last valid age
        last_valid_idx = ages.last_valid_index()
        if last_valid_idx is not None and last_valid_idx < len(ages)-1:
            last_valid_age = ages[last_valid_idx]
            last_valid_year = years[last_valid_idx]
            # loop to get the last id without age if needed
            for i in range(last_valid_idx+1, len(ages)):
                ages[i] = last_valid_age + (years[i] - last_valid_year)
        
        group['age'] = ages.astype('Int64')
        processed_dfs.append(group)
    
    final_df = pd.concat(processed_dfs)
    df = final_df.drop(columns=["year"])
    return df

# find the nearest date and age information in the same IDs
def find_previous_value(ages, years, current_idx):
    for i in range(current_idx-1, -1, -1):
        if not pd.isna(ages[i]):
            return ages[i], years[i]
    return None, None

def find_next_value(ages, years, current_idx):
    for i in range(current_idx+1, len(ages)):
        if not pd.isna(ages[i]):
            return ages[i], years[i]
    return None, None

if __name__ == '__main__':
    total_start = time.perf_counter()
    df = pd.read_csv("health_data.csv")
    processed_df = correct_age_gender(df)
    processed_df.to_csv("processed_health_data.csv", index=False)
    total_end = time.perf_counter()
    total_run_time = total_end - total_start
    print(f"Total run time: {total_run_time:.5f} seconds")
    print(processed_df.head(20))

Total run time: 0.02413 seconds
   ID  gender       date  age  blood_pressure  height  weight  disease   BMI
0   1  female 2022-11-15   25           100.7     1.5    44.6      0.0  19.6
0   2    male 2020-09-28   39           133.2     1.8    75.6      0.0  24.7
0   3  female 2020-05-30   67           109.0     1.5    41.0      1.0  18.2
1   3  female 2024-05-23   71           104.8     1.6    42.4      1.0  17.4
2   3  female 2025-04-21   72           110.5     1.5    40.8      1.0  18.1
0   4    male 2022-09-30   75           126.4     1.7    77.1      1.0  27.0
1   4    male 2024-10-03   77           120.9     1.6    77.4      NaN  29.1
0   5  female 2023-04-23   21           121.0     NaN    60.0      0.0   NaN
1   5  female 2025-05-15   23           117.8     1.5    54.6      0.0  23.9
0   6  female 2020-04-01   66            93.6     1.7    52.9      0.0  19.2
1   6  female 2024-06-29   70            98.7     1.7    50.2     -1.0  16.6
2   6  female 2025-05-07   71            95.

In [85]:
# restrict all the dataset in the correct range in order to delete rows contained wrong and missing data
import pandas as pd

df = pd.read_csv("processed_health_data.csv")
valid_conditions = (
    df["age"].between(10, 100)
    & df["height"].between(0.5, 2.5)
    & df["weight"].between(20, 300)
    & (df["blood_pressure"].between(60, 200))
    & df["disease"].isin([0, 1])
    & df["gender"].isin(["male", "female"])
)

cleaned_df = df[valid_conditions]

cleaned_df.to_csv("cleaned_health_data.csv", index=False)
print(cleaned_df.head(20))

    ID  gender        date  age  blood_pressure  height  weight  disease   BMI
0    1  female  2022-11-15   25           100.7     1.5    44.6      0.0  19.6
1    2    male  2020-09-28   39           133.2     1.8    75.6      0.0  24.7
2    3  female  2020-05-30   67           109.0     1.5    41.0      1.0  18.2
3    3  female  2024-05-23   71           104.8     1.6    42.4      1.0  17.4
4    3  female  2025-04-21   72           110.5     1.5    40.8      1.0  18.1
5    4    male  2022-09-30   75           126.4     1.7    77.1      1.0  27.0
8    5  female  2025-05-15   23           117.8     1.5    54.6      0.0  23.9
9    6  female  2020-04-01   66            93.6     1.7    52.9      0.0  19.2
11   6  female  2025-05-07   71            95.2     1.7    50.6      0.0  16.9
12   7    male  2020-02-10   53           150.7     1.7    77.2      0.0  25.8
13   7    male  2021-03-08   54           146.0     1.7    80.1      1.0  27.4
17   9    male  2022-07-11   30           153.6     

In [86]:
import pandas as pd
def split_and_save_data(
    file_path: str,
    target_column: str,
    split_conditions: dict,
) -> None:
    
    df = pd.read_csv(file_path)
    df_cleaned = df.copy()

    for filename, condition_func in split_conditions.items():
        mask = condition_func(df_cleaned[target_column])
        split_df = df_cleaned[mask]
        split_df.to_csv(filename, index=False)

# split the data set by disease condition
disease_conditions = {
    "no_disease.csv": lambda x: x == 0,
    "disease.csv": lambda x: x == 1
}

split_and_save_data(
    file_path = "cleaned_health_data.csv",
    target_column = df.columns[7],
    split_conditions = disease_conditions,
)

# split the disease and no_disease data set by gender
split_and_save_data(
    file_path="disease.csv",
    target_column = df.columns[1],
    split_conditions={
        "male_disease.csv": lambda x: x == "male",
        "female_disease.csv": lambda x: x == "female"
    }
)

split_and_save_data(
    file_path="no_disease.csv",
    target_column = df.columns[1],
    split_conditions={
        "male_no_disease.csv": lambda x: x == "male",
        "female_no_disease.csv": lambda x: x == "female"
    }
)

In [87]:
import pandas as pd
import numpy as np
from scipy.spatial.distance import cdist

df_disease = pd.read_csv("disease.csv")
df_no_disease = pd.read_csv("no_disease.csv")

def preprocess_data(df, features):
    df = df.copy()
    df['gender'] = df['gender'].map({'female': 0, 'male': 1})
    return df[features + ['ID', 'date']]

# standardization
def standardize_features(disease_df, no_disease_df, features):
    combined = pd.concat([disease_df[features], no_disease_df[features]])
    mean = combined.mean()
    std = combined.std()

    disease_features = (disease_df[features] - mean) / std
    no_disease_features = (no_disease_df[features] - mean) / std
    return disease_features, no_disease_features

# compute the weighted covariance matrix
def compute_weighted_covariance(disease_features, no_disease_features, weights):

    combined = pd.concat([disease_features, no_disease_features])
    cov_matrix = combined.cov().values

    W_matrix = np.outer(weights, weights)
    cov_weighted = cov_matrix / W_matrix

    try:
        cov_inv = np.linalg.inv(cov_weighted)
    except np.linalg.LinAlgError:
        cov_inv = np.linalg.pinv(cov_weighted)
    return cov_inv

In [9]:
def exact_matches(disease_data, no_disease_data, 
                disease_features, no_disease_features, 
                cov_inv, enable_deduplication=True):
    matches = []
    used_indices = set() if enable_deduplication else None
    
    for i in range(len(disease_data)):
        disease_id = disease_data.iloc[i]['ID']
        disease_date = disease_data.iloc[i]['date']
        current_features = disease_features.iloc[i].values.reshape(1, -1)

        mask = (no_disease_data['ID'] != disease_id)
        if enable_deduplication:
            mask &= (~no_disease_data.index.isin(used_indices))
            
        candidates = no_disease_data[mask]
        if candidates.empty:
            continue

        distances = cdist(
            current_features, 
            no_disease_features[mask], 
            metric='mahalanobis', 
            VI=cov_inv
        )

        min_idx = np.argmin(distances)
        best_match_idx = candidates.index[min_idx]
# record the matched results
        record = {
            'Disease ID': disease_id,
            'Disease Date': disease_date,
            'no Disease ID': no_disease_data.loc[best_match_idx, 'ID'],
            'no Disease Date': no_disease_data.loc[best_match_idx, 'date'],
            'Distance': distances[0, min_idx]
        }
        matches.append(record)

        if enable_deduplication:
            used_indices.add(best_match_idx)
            
    return pd.DataFrame(matches)

def main():

    features = ['age', 'gender', 'blood_pressure', 'height', 'weight', 'BMI']
    weights = np.array([0.2, 0.1, 0.3, 0.1, 0.1, 0.2])
    output_path = "exact_match.csv"

    X_disease = preprocess_data(df_disease, features)
    X_no_disease = preprocess_data(df_no_disease, features)

    disease_features, no_disease_features = standardize_features(
        X_disease, X_no_disease, features
    )

    cov_inv = compute_weighted_covariance(
        disease_features, no_disease_features, weights
    )

    matches_df = exact_matches(
        disease_data=X_disease,
        no_disease_data=X_no_disease,
        disease_features=disease_features,
        no_disease_features=no_disease_features,
        cov_inv=cov_inv,
        enable_deduplication=True
    )

    matches_df.to_csv(output_path, index=False)
    print(matches_df.head(20))

    total_distance = matches_df['Distance'].sum()
    print("Total distance:", total_distance)

if __name__ == "__main__":
    main()

    Disease ID Disease Date  no Disease ID no Disease Date  Distance
0            3   2020-05-30             19      2020-04-24  0.467316
1            3   2024-05-23              6      2025-05-07  0.498367
2            3   2025-04-21             19      2020-08-18  0.645619
3            4   2022-09-30             19      2020-10-17  0.577036
4            7   2021-03-08             22      2020-11-18  0.211340
5           10   2020-06-19             19      2020-12-25  0.416014
6           13   2025-05-31              5      2025-05-15  0.281334
7           17   2021-01-28             21      2025-06-17  0.536621
8           18   2023-01-01             20      2023-03-02  0.530879
9           18   2025-04-12             25      2020-04-20  0.531043
10          21   2024-02-01              2      2020-09-28  0.234399
Total distance: 4.92996838323383


In [89]:
import time
def quick_match(disease_data, no_disease_data, 
                disease_features, no_disease_features, 
                cov_inv, enable_deduplication=True):
    matches = []
    used_indices = set() if enable_deduplication else None
    total_time = 0.0

    for i in range(len(disease_data)):

        disease_id = disease_data.iloc[i]['ID']
        disease_date = disease_data.iloc[i]['date']
        current_features = disease_features.iloc[i].values.reshape(1, -1)

        mask = (no_disease_data['ID'] != disease_id)
        if enable_deduplication:
            mask &= (~no_disease_data.index.isin(used_indices))
            
        start_time = time.perf_counter()

        candidates = no_disease_data[mask]
        if candidates.empty:
            continue
        
        end_time = time.perf_counter()
        elapsed_time = end_time - start_time
        total_time += elapsed_time

# choose the matched target randomly
        candidates = candidates.sample(frac=1)
        first_candidate_idx = candidates.index[0]
        
        candidate_features = no_disease_features.loc[first_candidate_idx].values.reshape(1, -1)
        distance = cdist(
            current_features, 
            candidate_features, 
            metric='mahalanobis', 
            VI=cov_inv
        )[0][0]

        matches.append({
            'Disease ID': disease_id,
            'Disease Date': disease_date,
            'no Disease ID': no_disease_data.loc[first_candidate_idx, 'ID'],
            'no Disease Date': no_disease_data.loc[first_candidate_idx, 'date'],
            'Distance': distance,
        })

        if enable_deduplication:
            used_indices.add(first_candidate_idx)

            
    return pd.DataFrame(matches), total_time

def main():
    features = ['age', 'gender', 'blood_pressure', 'height', 'weight', 'BMI']
    weights = np.array([0.2, 0.1, 0.3, 0.1, 0.1, 0.2])

    X_disease = preprocess_data(df_disease, features)
    X_no_disease = preprocess_data(df_no_disease, features)

    disease_features, no_disease_features = standardize_features(
        X_disease, X_no_disease, features
    )

    cov_inv = compute_weighted_covariance(
        disease_features, no_disease_features, weights
    )

    matches_df, total_time = quick_match(
        disease_data=X_disease,
        no_disease_data=X_no_disease,
        disease_features=disease_features,
        no_disease_features=no_disease_features,
        cov_inv=cov_inv,
        enable_deduplication=True
    )

    matches_df.to_csv("quick_match.csv", index=False)
    print(matches_df.head(20))
    print(f"Matches saved. Total time: {total_time} seconds")

if __name__ == "__main__":
    main()

    Disease ID Disease Date  no Disease ID no Disease Date  Distance
0            3   2020-05-30              5      2025-05-15  0.990307
1            3   2024-05-23              9      2025-03-24  2.345014
2            3   2025-04-21              9      2022-07-11  1.951586
3            4   2022-09-30              7      2020-02-10  1.017205
4            7   2021-03-08             19      2020-12-25  1.315722
5           10   2020-06-19             22      2025-07-19  1.748595
6           13   2025-05-31             20      2023-03-02  0.586361
7           17   2021-01-28             19      2020-08-18  0.493180
8           18   2023-01-01             19      2020-10-17  0.530515
9           18   2025-04-12              6      2025-05-07  1.054780
10          21   2024-02-01              6      2020-04-01  1.370403
Matches saved. Total time: 0.0010505999744054861 seconds


In [90]:
import pandas as pd
import numpy as np
from scipy.spatial.distance import cdist

df_disease = pd.read_csv("male_disease.csv")
df_no_disease = pd.read_csv("male_no_disease.csv")

def preprocess_data(df, features):
    df = df.copy()
    df_clean = df.dropna(subset=features).reset_index(drop=True)
    return df_clean

# standardization
def standardize_features(disease_df, no_disease_df, features):
    combined = pd.concat([disease_df[features], no_disease_df[features]])
    mean = combined.mean()
    std = combined.std()

    disease_features = (disease_df[features] - mean) / std
    no_disease_features = (no_disease_df[features] - mean) / std
    return disease_features, no_disease_features

# compute the weighted covariance matrix
def compute_weighted_covariance(disease_features, no_disease_features, weights):

    combined = pd.concat([disease_features, no_disease_features])
    cov_matrix = combined.cov().values

    W_matrix = np.outer(weights, weights)
    cov_weighted = cov_matrix / W_matrix

    try:
        cov_inv = np.linalg.inv(cov_weighted)
    except np.linalg.LinAlgError:
        cov_inv = np.linalg.pinv(cov_weighted)
    return cov_inv


def exact_matches(disease_data, no_disease_data, 
                disease_features, no_disease_features, 
                cov_inv, enable_deduplication=True):
    matches = []
    used_indices = set() if enable_deduplication else None
    
    for i in range(len(disease_data)):
        disease_id = disease_data.iloc[i]['ID']
        disease_date = disease_data.iloc[i]['date']
        current_features = disease_features.iloc[i].values.reshape(1, -1)

        mask = (no_disease_data['ID'] != disease_id)
        if enable_deduplication:
            mask &= (~no_disease_data.index.isin(used_indices))
            
        candidates = no_disease_data[mask]
        if candidates.empty:
            continue

        distances = cdist(
            current_features, 
            no_disease_features[mask], 
            metric='mahalanobis', 
            VI=cov_inv
        )

        min_idx = np.argmin(distances)
        best_match_idx = candidates.index[min_idx]
# record the matched results
        record = {
            'Disease ID': disease_id,
            'Disease Date': disease_date,
            'no Disease ID': no_disease_data.loc[best_match_idx, 'ID'],
            'no Disease Date': no_disease_data.loc[best_match_idx, 'date'],
            'Distance': distances[0, min_idx]
        }
        matches.append(record)

        if enable_deduplication:
            used_indices.add(best_match_idx)
            
    return pd.DataFrame(matches)

def main():

    features = ['age', 'blood_pressure', 'height', 'weight', 'BMI']
    weights = np.array([0.2, 0.3, 0.1, 0.2, 0.2])
    output_path = "male_exact_match.csv"

    X_disease = preprocess_data(df_disease, features)
    X_no_disease = preprocess_data(df_no_disease, features)

    disease_features, no_disease_features = standardize_features(
        X_disease, X_no_disease, features
    )

    cov_inv = compute_weighted_covariance(
        disease_features, no_disease_features, weights
    )

    matches_df = exact_matches(
        disease_data=X_disease,
        no_disease_data=X_no_disease,
        disease_features=disease_features,
        no_disease_features=no_disease_features,
        cov_inv=cov_inv,
        enable_deduplication=False
    )

    matches_df.to_csv(output_path, index=False)
    print(matches_df.head(20))

if __name__ == "__main__":
    main()

   Disease ID Disease Date  no Disease ID no Disease Date  Distance
0           4   2022-09-30              2      2020-09-28  0.686888
1           7   2021-03-08             22      2020-11-18  0.331066
2          21   2024-02-01              2      2020-09-28  0.298963


In [91]:
import pandas as pd
import numpy as np
from scipy.spatial.distance import cdist

df_disease = pd.read_csv("female_disease.csv")
df_no_disease = pd.read_csv("female_no_disease.csv")

def preprocess_data(df, features):
    df = df.copy()
    df_clean = df.dropna(subset=features).reset_index(drop=True)
    return df_clean

# standardization
def standardize_features(disease_df, no_disease_df, features):
    combined = pd.concat([disease_df[features], no_disease_df[features]])
    mean = combined.mean()
    std = combined.std()

    disease_features = (disease_df[features] - mean) / std
    no_disease_features = (no_disease_df[features] - mean) / std
    return disease_features, no_disease_features

# compute the weighted covariance matrix
def compute_weighted_covariance(disease_features, no_disease_features, weights):

    combined = pd.concat([disease_features, no_disease_features])
    cov_matrix = combined.cov().values

    W_matrix = np.outer(weights, weights)
    cov_weighted = cov_matrix / W_matrix

    try:
        cov_inv = np.linalg.inv(cov_weighted)
    except np.linalg.LinAlgError:
        cov_inv = np.linalg.pinv(cov_weighted)
    return cov_inv


def exact_matches(disease_data, no_disease_data, 
                disease_features, no_disease_features, 
                cov_inv, enable_deduplication=True):
    matches = []
    used_indices = set() if enable_deduplication else None
    
    for i in range(len(disease_data)):
        disease_id = disease_data.iloc[i]['ID']
        disease_date = disease_data.iloc[i]['date']
        current_features = disease_features.iloc[i].values.reshape(1, -1)

        mask = (no_disease_data['ID'] != disease_id)
        if enable_deduplication:
            mask &= (~no_disease_data.index.isin(used_indices))
            
        candidates = no_disease_data[mask]
        if candidates.empty:
            continue

        distances = cdist(
            current_features, 
            no_disease_features[mask], 
            metric='mahalanobis', 
            VI=cov_inv
        )

        min_idx = np.argmin(distances)
        best_match_idx = candidates.index[min_idx]
# record the matched results
        record = {
            'Disease ID': disease_id,
            'Disease Date': disease_date,
            'no Disease ID': no_disease_data.loc[best_match_idx, 'ID'],
            'no Disease Date': no_disease_data.loc[best_match_idx, 'date'],
            'Distance': distances[0, min_idx]
        }
        matches.append(record)

        if enable_deduplication:
            used_indices.add(best_match_idx)
            
    return pd.DataFrame(matches)

def main():

    features = ['age', 'blood_pressure', 'height', 'weight', 'BMI']
    weights = np.array([0.2, 0.3, 0.1, 0.2, 0.2])
    output_path = "female_exact_match.csv"

    X_disease = preprocess_data(df_disease, features)
    X_no_disease = preprocess_data(df_no_disease, features)

    disease_features, no_disease_features = standardize_features(
        X_disease, X_no_disease, features
    )

    cov_inv = compute_weighted_covariance(
        disease_features, no_disease_features, weights
    )

    matches_df = exact_matches(
        disease_data=X_disease,
        no_disease_data=X_no_disease,
        disease_features=disease_features,
        no_disease_features=no_disease_features,
        cov_inv=cov_inv,
        enable_deduplication=False
    )

    matches_df.to_csv(output_path, index=False)
    print(matches_df.head(20))

if __name__ == "__main__":
    main()

   Disease ID Disease Date  no Disease ID no Disease Date  Distance
0           3   2020-05-30             19      2020-08-18  0.510398
1           3   2024-05-23             19      2020-08-18  0.460687
2           3   2025-04-21             19      2020-10-17  0.577145
3          10   2020-06-19             19      2020-12-25  0.426859
4          13   2025-05-31              5      2025-05-15  0.318579
5          17   2021-01-28              5      2025-05-15  0.304208
6          18   2023-01-01             19      2020-04-24  0.262519
7          18   2025-04-12             19      2020-04-24  0.400112


In [4]:

import numpy as np
import pandas as pd
from scipy.spatial.distance import cdist
from itertools import permutations
from tqdm import tqdm
import math
import sys
import psutil

df_disease = pd.read_csv("disease.csv")
df_no_disease = pd.read_csv("no_disease.csv")

def preprocess_data(df, features):
    df = df.copy()
    df['gender'] = df['gender'].map({'female': 0, 'male': 1})
    return df[features + ['ID', 'date']]

# standardization
def standardize_features(disease_df, no_disease_df, features):
    combined = pd.concat([disease_df[features], no_disease_df[features]])
    mean = combined.mean()
    std = combined.std()

    disease_features = (disease_df[features] - mean) / std
    no_disease_features = (no_disease_df[features] - mean) / std
    return disease_features, no_disease_features

# compute the weighted covariance matrix
def compute_weighted_covariance(disease_features, no_disease_features, weights):

    combined = pd.concat([disease_features, no_disease_features])
    cov_matrix = combined.cov().values

    W_matrix = np.outer(weights, weights)
    cov_weighted = cov_matrix / W_matrix

    try:
        cov_inv = np.linalg.inv(cov_weighted)
    except np.linalg.LinAlgError:
        cov_inv = np.linalg.pinv(cov_weighted)
    return cov_inv

# 内存监控函数
def get_mem_usage():
    process = psutil.Process()
    return f"{process.memory_info().rss / 1024 ** 2:.1f}MB"

def exact_matches(disease_data, no_disease_data, 
                disease_features, no_disease_features, 
                cov_inv):
    n = len(disease_data)
    matches = []

    # 计算全量距离矩阵
    distance_matrix = cdist(
        disease_features, 
        no_disease_features,
        metric='mahalanobis', 
        VI=cov_inv
    )

    min_total = float('inf')
    best_perm = None
    
    # 安全阈值检查
    MAX_PERMS = 40_000_000  # 最多允许处理1百万次排列
    if math.factorial(n) > MAX_PERMS:
        raise ValueError(f"n={n}的排列数超过安全阈值{MAX_PERMS:,}")
    
    # 进度条配置
    total_perms = math.factorial(n)
    progress_bar = tqdm(
        total=total_perms,
        desc=f"暴力枚举进度 (n={n})",
        unit="perm",
        bar_format="{l_bar}{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}] 内存：{postfix[0]}",
        postfix=[get_mem_usage()]
    )

    try:
        for perm in permutations(range(n)):
            # 计算当前排列的总距离
            total_distance = sum(distance_matrix[i, perm[i]] for i in range(n))
            
            # 保留最优解
            if total_distance < min_total:
                min_total = total_distance
                best_perm = perm
            
            # 每100次更新一次进度条
            if progress_bar.n % 100 == 0:
                progress_bar.set_postfix_str(get_mem_usage())
                progress_bar.update(100)
                
        # 处理最后剩余部分
        if progress_bar.n < total_perms:
            progress_bar.update(total_perms - progress_bar.n)
            
    except KeyboardInterrupt:
        print("\n用户中断！当前最优解总距离：", min_total)
        sys.exit(1)
    finally:
        progress_bar.close()

    # 构建匹配结果
    for i in range(n):
        disease_row = disease_data.iloc[i]
        no_disease_row = no_disease_data.iloc[best_perm[i]]
        
        record = {
            'Disease ID': disease_row['ID'],
            'Disease Date': disease_row['date'],
            'no Disease ID': no_disease_row['ID'],
            'no Disease Date': no_disease_row['date'],
            'Distance': distance_matrix[i, best_perm[i]]
        }
        matches.append(record)
        
    return pd.DataFrame(matches)

def main():

    features = ['age', 'gender', 'blood_pressure', 'height', 'weight', 'BMI']
    weights = np.array([0.2, 0.1, 0.3, 0.1, 0.1, 0.2])
    output_path = "optimal_match.csv"

    X_disease = preprocess_data(df_disease, features)
    X_no_disease = preprocess_data(df_no_disease, features)

    disease_features, no_disease_features = standardize_features(
        X_disease, X_no_disease, features
    )

    cov_inv = compute_weighted_covariance(
        disease_features, no_disease_features, weights
    )

    
    if len(X_disease) != len(X_no_disease):
        print("Warning: Truncating to smaller size")
        min_len = min(len(X_disease), len(X_no_disease))
        X_disease = X_disease.iloc[:min_len]
        X_no_disease = X_no_disease.iloc[:min_len]

    matches_df = exact_matches(
        disease_data=X_disease,
        no_disease_data=X_no_disease,
        disease_features=disease_features,
        no_disease_features=no_disease_features,
        cov_inv=cov_inv,
    )

    matches_df.to_csv(output_path, index=False)
    print(matches_df.head(20))

    total_distance = matches_df['Distance'].sum()
    print("Total distance:", total_distance)

if __name__ == "__main__":
    main()



暴力枚举进度 (n=11):   0%|          | 0/39916800 [00:00<?] 内存：,      

暴力枚举进度 (n=11): |          | 40478900/? [1:23:30<00:00] 内存：,           


用户中断！当前最优解总距离： 7.389363006955237





SystemExit: 1

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [3]:
import numpy as np
import pandas as pd
from scipy.spatial.distance import cdist
from itertools import permutations
from tqdm import tqdm
import math
import sys
import time  # 新增时间模块
import psutil

# --------------------------
# 数据预处理函数（保持不变）
# --------------------------
def preprocess_data(df, features):
    df = df.copy()
    df['gender'] = df['gender'].map({'female': 0, 'male': 1})
    return df[features + ['ID', 'date']]

def standardize_features(disease_df, no_disease_df, features):
    disease_idx = disease_df.index
    no_disease_idx = no_disease_df.index
    
    combined = pd.concat([disease_df[features], no_disease_df[features]])
    mean = combined.mean()
    std = combined.std()
    disease_features = (disease_df[features] - mean) / std
    no_disease_features = (no_disease_df[features] - mean) / std

    disease_features.index = disease_idx
    no_disease_features.index = no_disease_idx
    return disease_features, no_disease_features

def compute_weighted_covariance(disease_features, no_disease_features, weights):
    combined = pd.concat([disease_features, no_disease_features])
    cov_matrix = combined.cov().values
    W_matrix = np.outer(weights, weights)
    cov_weighted = cov_matrix / W_matrix
    try:
        cov_inv = np.linalg.inv(cov_weighted)
    except np.linalg.LinAlgError:
        cov_inv = np.linalg.pinv(cov_weighted)
    return cov_inv

# --------------------------
# 暴力枚举匹配算法（优化版）
# --------------------------
def exact_matches(disease_data, no_disease_data, 
                disease_features, no_disease_features, 
                cov_inv):
    n = len(disease_data)
    matches = []

    # 计算全量距离矩阵（优化存储）
    distance_matrix = cdist(
        disease_features.values.astype(np.float32),  # 使用单精度浮点节省内存
        no_disease_features.values.astype(np.float32),
        metric='mahalanobis',
        VI=cov_inv.astype(np.float32)
    )
    no_disease_data = no_disease_data.reset_index(drop=True)
    no_disease_features = no_disease_features.reset_index(drop=True)

    min_total = float('inf')
    best_perm = None
    indices = np.arange(n)  # 预生成索引
    
    # 安全阈值检查
    MAX_PERMS = 10**8
    if math.factorial(n) > MAX_PERMS:
        raise ValueError(f"排列数超过安全阈值 {MAX_PERMS}")

    # 进度条配置
    progress_bar = tqdm(
        permutations(range(n)),
        total=math.factorial(n),
        desc=f"暴力枚举 (n={n})",
        bar_format="{l_bar}{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}, {rate_fmt}] 内存：{postfix[0]}",
        postfix=[psutil.Process().memory_info().rss // 1024**2]
    )

    start_time = time.time()
    try:
        for perm in progress_bar:
            # 向量化计算总距离
            total_distance = distance_matrix[indices, list(perm)].sum()
            
            if total_distance < min_total:
                min_total = total_distance
                best_perm = perm
            
            # 每100次更新内存和预计时间
            if progress_bar.n % 100 == 0:
                elapsed = time.time() - start_time
                per_perm = elapsed / (progress_bar.n + 1)
                remaining = per_perm * (progress_bar.total - progress_bar.n)
                progress_bar.set_postfix(
                    mem_mb=psutil.Process().memory_info().rss // 1024**2,
                    est_remain=f"{remaining:.1f}s"
                )
                
    except KeyboardInterrupt:
        print(f"\n用户中断！当前最小总距离：{min_total:.2f}")
        sys.exit(1)
    finally:
        progress_bar.close()
    for i in range(n):
        disease_row = disease_data.iloc[i]
        no_disease_row = no_disease_data.iloc[best_perm[i]]
        
        record = {
            'Disease ID': disease_row['ID'],
            'Disease Date': disease_row['date'],
            'no Disease ID': no_disease_row['ID'],
            'no Disease Date': no_disease_row['date'],
            'Distance': distance_matrix[i, best_perm[i]]
        }
        matches.append(record)

    # 构建结果（略，同原代码）
    return pd.DataFrame

# --------------------------
# 主函数（添加安全限制）
# --------------------------
def main():
    # 数据加载
    df_disease = pd.read_csv("disease.csv")
    df_no_disease = pd.read_csv("no_disease.csv")

    # 预处理
    features = ['age', 'gender', 'blood_pressure', 'height', 'weight', 'BMI']
    X_disease = preprocess_data(df_disease, features)
    X_no_disease = preprocess_data(df_no_disease, features)

    # 强制规模限制
    MAX_SAFE_SAMPLES = 15  # 最大允许暴力计算的样本数
    min_len = min(len(X_disease), len(X_no_disease))
    if min_len > MAX_SAFE_SAMPLES:
        print(f"错误：数据规模 {min_len} 超过安全阈值 {MAX_SAFE_SAMPLES}")
        sys.exit(1)
    X_disease = X_disease.iloc[:MAX_SAFE_SAMPLES]
    X_no_disease = X_no_disease.iloc[:MAX_SAFE_SAMPLES]

    # 标准化与协方差计算
    disease_features, no_disease_features = standardize_features(
        X_disease, X_no_disease, features
    )
    cov_inv = compute_weighted_covariance(
        disease_features, no_disease_features, 
        weights=np.array([0.2, 0.1, 0.3, 0.1, 0.1, 0.2])
    )

    # 执行匹配
    matches_df = exact_matches(
        X_disease, X_no_disease, 
        disease_features, no_disease_features, 
        cov_inv
    )
    matches_df.to_csv("brute_force_results.csv", index=False)

if __name__ == "__main__":
    main()

暴力枚举 (n=11): 100%|██████████| 39916800/39916800 [03:45<00:00, 177080.18it/s] 内存：,


AttributeError: 'str' object has no attribute 'to_frame'