# In [4]:
import numpy as np
import pandas as pd

# Names of the columns that hold continuous measurements. When this list is
# passed to add_noise_to_dataset, every column not named here is handled as
# a binary column instead.
continuous_columns = [
    'careplan_length',
    'Diastolic Blood Pressure',
    'Systolic Blood Pressure',
    'Body Mass Index',
    'Total Cholesterol',
    'High Density Lipoprotein Cholesterol',
    'Triglycerides',
    'Low Density Lipoprotein Cholesterol',
    'Glucose',
    'Hemoglobin A1c/Hemoglobin.total in Blood',
    'Sodium',
    'Chloride',
    'Potassium',
    'Carbon Dioxide',
    'Calcium',
    'Urea Nitrogen',
    'Estimated Glomerular Filtration Rate',
]

def add_laplace_noise_continuous(data, column, epsilon, sensitivity):
    """Return ``data[column]`` perturbed with Laplace noise and floored at zero.

    One noise value is drawn per row of ``data`` from a Laplace distribution
    centred on 0 with scale ``sensitivity / epsilon``. ``data`` itself is not
    modified.

    Args:
        data: Table containing the column to perturb.
        column: Label of the column to read from ``data``.
        epsilon: Divisor of the noise scale; a larger value means less noise.
        sensitivity: Numerator of the noise scale.

    Returns:
        The noisy column, with every value below 0 replaced by 0.
    """
    laplace_scale = sensitivity / epsilon
    perturbation = np.random.laplace(0, laplace_scale, size=len(data))
    perturbed = data[column] + perturbation
    # Lower bound only: anything that went negative is pulled back to 0.
    return np.clip(perturbed, 0, None)

def flip_binary_values(data, column, flip_probability):
    """Return ``data[column]`` as integers with randomly chosen entries toggled.

    One uniform draw in [0, 1) is made per row of ``data``; rows whose draw is
    below ``flip_probability`` have their value XOR-ed with 1.

    Args:
        data: Table containing the column to perturb; it is not modified.
        column: Label of the column to read from ``data``.
        flip_probability: Threshold compared against each uniform draw.

    Returns:
        The integer-cast column after the XOR with the flip mask.
    """
    uniform_draws = np.random.rand(len(data))
    flip_bits = (uniform_draws < flip_probability).astype(int)
    current_bits = data[column].astype(int)
    # 0 ^ 1 == 1 and 1 ^ 1 == 0, so a set flip bit toggles a 0/1 value while
    # keeping the result an integer.
    return current_bits ^ flip_bits

def add_noise_to_dataset(data, continuous_cols, feature_importances, base_epsilon=1.0, sensitivity=1.0,
                         max_flip_probability=1.0):
    """Return a noisy copy of ``data`` with importance-scaled noise per column.

    Importances are normalized to sum to 1. Each column then gets
    ``epsilon = base_epsilon / (normalized_importance + 1e-12)``, so a larger
    importance gives a smaller epsilon and therefore more noise. Columns named
    in ``continuous_cols`` receive Laplace noise via
    ``add_laplace_noise_continuous``; every other column of ``data`` is treated
    as binary and passed to ``flip_binary_values`` with probability
    ``min(sensitivity / epsilon, max_flip_probability)``.

    Args:
        data: DataFrame to perturb; it is copied, not modified.
        continuous_cols: Labels of the columns that receive Laplace noise.
        feature_importances: Mapping of column label to importance weight.
            Columns missing from the mapping use a normalized weight of 1.
        base_epsilon: Numerator of the per-column epsilon.
        sensitivity: Sensitivity forwarded to the Laplace mechanism and used
            as the numerator of the flip probability.
        max_flip_probability: Upper bound for the flip probability of binary
            columns. The default of 1.0 keeps the historical behaviour.

    Returns:
        A new DataFrame with the same columns as ``data`` and noise applied.

    Raises:
        ValueError: If ``max_flip_probability`` is outside [0, 1], or if the
            importances are non-empty but sum to zero (they cannot be
            normalized).
    """
    if not 0 <= max_flip_probability <= 1:
        raise ValueError("max_flip_probability must be between 0 and 1")

    noisy_data = data.copy()

    # Normalize feature importances to sum to 1 if not already.
    total_importance = sum(feature_importances.values())
    if feature_importances and total_importance == 0:
        # Previously this surfaced as an opaque ZeroDivisionError below.
        raise ValueError("feature_importances sum to zero and cannot be normalized")
    normalized_importances = {k: v / total_importance for k, v in feature_importances.items()}

    def _column_epsilon(column):
        # The 1e-12 term keeps the division defined for a zero importance.
        return base_epsilon / (normalized_importances.get(column, 1) + 1e-12)

    # Apply noise to continuous columns.
    for column in continuous_cols:
        noisy_data[column] = add_laplace_noise_continuous(
            noisy_data, column, _column_epsilon(column), sensitivity
        )

    # Every remaining column is treated as binary. The set is built once so
    # the membership test is not repeated against the list for each column.
    continuous_lookup = set(continuous_cols)
    for column in data.columns:
        if column in continuous_lookup:
            continue
        # NOTE(review): a column absent from feature_importances uses weight 1,
        # so with the default base_epsilon, sensitivity and
        # max_flip_probability it is flipped with probability 1 (every value
        # inverted). A flip probability above 0.5 inverts more values than it
        # keeps; pass max_flip_probability=0.5 if that is not intended.
        flip_probability = min(sensitivity / _column_epsilon(column), max_flip_probability)
        noisy_data[column] = flip_binary_values(noisy_data, column, flip_probability)

    return noisy_data

# Read the CSV whose rows will receive the noise.
synthetic_data = pd.read_csv(filepath_or_buffer='/content/CTGAN_real_train_positive.csv')

# Importance weight per column label, looked up by add_noise_to_dataset to
# scale the noise of each column. Replace this mapping with your own list.
feature_importances = {
    "Amoxicillin 250 MG / Clavulanate 125 MG [Augmentin]": 0.6070259934088039,
    "careplan_within_24": 0.20196921117810945,
    "careplan_length": 0.0387404832992168,
    "Diastolic Blood Pressure": 0.01593174528114472,
    "Triglycerides": 0.013285744643897259,
    "Low Density Lipoprotein Cholesterol": 0.011156745043195106,
    "Systolic Blood Pressure": 0.011067519087524449,
    "Total Cholesterol": 0.010531923316692417,
    "Glucose": 0.01042754906238533,
    "encounter_type_emergency room visit": 0.009333681325762986,
    "Body Mass Index": 0.008730519405456768,
    "High Density Lipoprotein Cholesterol": 0.007936415133410847,
    "Throat culture (procedure)": 0.006606263992270559,
    "Hemoglobin A1c/Hemoglobin.total in Blood": 0.006303760356980994,
    "encounter_type_inpatient": 0.005856087719265674,
    "Documentation of current medications": 0.005441269840608777,
    "Chloride": 0.004144733624870049,
    "Urea Nitrogen": 0.003576803137127369,
    "Carbon Dioxide": 0.003048204013348675,
    "Sodium": 0.002693111108425654,
    "Estimated Glomerular Filtration Rate": 0.0023907123321869057,
    "Measurement of respiratory function (procedure)": 0.0016928165228212457,
    "Acetaminophen 160 MG": 0.0013037144637531137,
    "Potassium": 0.0005872619777878974,
    "Pneumococcal conjugate PCV 13": 0.0005684080815809761,
    "Sputum examination (procedure)": 0.0005661014176933233,
    "Naproxen sodium 220 MG Oral Tablet": 0.0005284994623340063,
    "ETHNICITY_chinese": 0.0004971493343409692,
    "ETHNICITY_asian_indian": 0.0004751397604113708,
    "Dextromethorphan Hydrobromide 1 MG/ML": 0.0004611668432281288,
    "ETHNICITY_french": 0.0004422913826401808,
    "ETHNICITY_swedish": 0.00042388640184692103,
    "Calcium": 0.0004238019239992515,
    "ETHNICITY_puerto_rican": 0.00041704896670753937,
    "MARITAL_M": 0.00038786427857969775,
    "ETHNICITY_polish": 0.0003798651072036011,
    "encounter_type_outpatient": 0.0003796169885940584,
    "ETHNICITY_portuguese": 0.0003542206478322841,
    "RACE_black": 0.0003468532065781131,
    "ETHNICITY_central_american": 0.00033266131592813944,
    "RACE_white": 0.00031254024599374787,
    "MARITAL_S": 0.00030616752730520973,
    "ETHNICITY_russian": 0.0002703578673445152,
    "ETHNICITY_mexican": 0.00026747319126676084,
    "ETHNICITY_french_canadian": 0.0002667909952212075,
    "RACE_asian": 0.00025457117005489165,
    "ETHNICITY_german": 0.00022120260498054584,
    "GENDER_F": 0.0002197621675960548,
    "ETHNICITY_irish": 0.00020553964689645247,
    "ETHNICITY_american": 0.00018853587082225104,
    "GENDER_M": 0.0001804390513286463,
    "ETHNICITY_scottish": 0.0001423932783752203,
    "ETHNICITY_west_indian": 0.00011673897235512194,
    "Ibuprofen 200 MG Oral Tablet": 0.00010885407904905236,
    "ETHNICITY_english": 7.310565550174923e-05,
    "ETHNICITY_dominican": 7.095201838538164e-05,
    "RACE_hispanic": 2.7731262977638095e-05,
    "Spirometry (procedure)": 0.0,
    "ETHNICITY_italian": 0.0,
    "RACE_native": 0.0,
    "varicella": 0.0,
    "ETHNICITY_american_indian": 0.0,
    "DTaP": 0.0,
    "MMR": 0.0,
    "ETHNICITY_african": 0.0,
}

# Give every zero-importance feature the smallest positive importance found in
# the mapping, so no feature enters the normalization with a weight of zero.
# The floor is derived from the mapping rather than hard-coded, so it cannot go
# stale when the importances are replaced. If no importance is positive, 1.0 is
# used, which yields equal weights once add_noise_to_dataset normalizes them.
smallest_positive_importance = min(
    (value for value in feature_importances.values() if value > 0),
    default=1.0,
)
feature_importances = {
    name: (value if value != 0 else smallest_positive_importance)
    for name, value in feature_importances.items()
}

# Apply noise to the dataset
noisy_synthetic_data = add_noise_to_dataset(synthetic_data, continuous_columns, feature_importances)

# Save the noisy dataset (without the DataFrame index column)
noisy_synthetic_data.to_csv('Synthetic_Data_with_Noise.csv', index=False)
