In [16]:
pip install pgmpy pandas scikit-learn

Note: you may need to restart the kernel to use updated packages.


## Discretizing

In [17]:
import pandas as pd
from sklearn.preprocessing import KBinsDiscretizer
import time

generate_start_time = time.time()

# Load the dataset
df = pd.read_csv("/Users/arshiailaty/Documents/SDSU/Research Paper/Evaluation_Framework_Paper/Data/diabetes/diabetes_train.csv")

# Drop missing values (optional)
#df.dropna(inplace=True)

# Optionally discretize continuous variables if needed
# For example, we can discretize age, BMI, HbA1c_level, and blood_glucose_level into 5 bins
# Use KBinsDiscretizer to convert continuous data into bins (e.g., 5 bins)
# Columns to discretize
columns_to_discretize = ['age', 'bmi', 'HbA1c_level', 'blood_glucose_level']

# Initialize the KBinsDiscretizer (we'll use 5 bins, you can adjust if needed)
discretizer = KBinsDiscretizer(n_bins=5, encode='ordinal', strategy='uniform')

# Fit and transform the continuous columns
df[columns_to_discretize] = discretizer.fit_transform(df[columns_to_discretize])

# Display the processed dataset
print(df.head())


   gender  age  hypertension  heart_disease smoking_history  bmi  HbA1c_level  \
0    Male  4.0             0              0          former  0.0          0.0   
1    Male  4.0             0              0           never  0.0          2.0   
2  Female  2.0             0              0            ever  0.0          0.0   
3  Female  1.0             0              0     not current  0.0          1.0   
4  Female  3.0             1              0         current  0.0          0.0   

   blood_glucose_level  diabetes  
0                  0.0         0  
1                  1.0         1  
2                  1.0         0  
3                  0.0         0  
4                  0.0         0  




# Structure learning: discover the relationships among variables

## Score-based methods: BIC

In [18]:
from pgmpy.models import BayesianNetwork
from pgmpy.estimators import HillClimbSearch, BicScore

# Define the structure using a Hill-Climbing algorithm with BIC scoring
hc = HillClimbSearch(df)
best_model = hc.estimate(scoring_method=BicScore(df))

# Print the learned structure (edges)
print(best_model.edges())

  0%|          | 0/1000000 [00:00<?, ?it/s]

[('gender', 'bmi'), ('age', 'bmi'), ('age', 'hypertension'), ('age', 'heart_disease'), ('heart_disease', 'gender'), ('heart_disease', 'hypertension'), ('smoking_history', 'age'), ('smoking_history', 'gender'), ('HbA1c_level', 'diabetes'), ('diabetes', 'blood_glucose_level'), ('diabetes', 'age'), ('diabetes', 'smoking_history'), ('diabetes', 'bmi'), ('diabetes', 'hypertension'), ('diabetes', 'heart_disease')]


### After learning the structure, estimate the conditional probability distributions (CPDs) for each node in the network.

In [19]:
from pgmpy.estimators import BayesianEstimator

# Create a Bayesian Network model based on the learned structure
model = BayesianNetwork(best_model.edges())

# Start timing for the entire process
start_time = time.time()

# Fit the CPDs (Conditional Probability Distributions) using the data
model.fit(df, estimator=BayesianEstimator, prior_type='BDeu')  # BDeu prior to avoid overfitting

fit_end_time = time.time() 

# Check the CPDs for each node
for cpd in model.get_cpds():
    print(cpd)


+-----------------+-----+------------------------------+
| heart_disease   | ... | heart_disease(1)             |
+-----------------+-----+------------------------------+
| smoking_history | ... | smoking_history(not current) |
+-----------------+-----+------------------------------+
| gender(Female)  | ... | 0.3536747790289381           |
+-----------------+-----+------------------------------+
| gender(Male)    | ... | 0.6457198208015499           |
+-----------------+-----+------------------------------+
| gender(Other)   | ... | 0.0006054001695120475        |
+-----------------+-----+------------------------------+
+----------+-----------------------+-----+---------------+
| age      | age(0.0)              | ... | age(4.0)      |
+----------+-----------------------+-----+---------------+
| diabetes | diabetes(0)           | ... | diabetes(1)   |
+----------+-----------------------+-----+---------------+
| gender   | gender(Female)        | ... | gender(Other) |
+----------+-------

In [20]:
from pgmpy.sampling import BayesianModelSampling

fit_start_time = time.time() 
# Use the fitted model to generate synthetic data
sampler = BayesianModelSampling(model)
# synthetic_data = sampler.forward_sample
synthetic_data = sampler.forward_sample(len(df))  # Generate synthetic data with the same number of records as the original data

generate_end_time = time.time()  # End time for sampling

# Display the synthetic data
print(synthetic_data.head())


  0%|          | 0/9 [00:00<?, ?it/s]



   gender  bmi  age  hypertension  heart_disease smoking_history  HbA1c_level  \
0    Male  1.0  1.0             0              0         No Info          2.0   
1  Female  1.0  1.0             0              0           never          2.0   
2  Female  1.0  3.0             0              0           never          2.0   
3    Male  0.0  4.0             1              0           never          2.0   
4  Female  1.0  4.0             0              0         No Info          2.0   

   diabetes  blood_glucose_level  
0         0                  1.0  
1         0                  1.0  
2         0                  1.0  
3         0                  0.0  
4         0                  2.0  


In [21]:
synthetic_data.to_csv("diabetes_synthetic_BN_sep12.csv", index=False)

In [22]:
# Calculate the total running time
end_time = time.time()
total_time = end_time - start_time
fit_time = fit_end_time - fit_start_time
generation_time = generate_end_time - generate_start_time

# Display timing results
print(f"Total running time: {total_time} seconds")
print(f"Fitting time: {fit_time} seconds")
print(f"Generation time: {generation_time} seconds")

Total running time: 0.54305100440979 seconds
Fitting time: -0.013506174087524414 seconds
Generation time: 1.1931531429290771 seconds


## Evaluation of Synthetic Data

In [23]:
from scipy.stats import ks_2samp

# Evaluate continuous variables (e.g., age) using KS test
ks_stat, p_value = ks_2samp(df['age'], synthetic_data['age'])
print(f"KS Statistic: {ks_stat}, P-Value: {p_value}")

# Optionally use Chi-square for categorical data


KS Statistic: 0.0018624999999999892, P-Value: 0.9990389339376528


In [25]:
X_test = pd.read_csv("/Users/arshiailaty/Documents/SDSU/Research Paper/Evaluation_Framework_Paper/Data/diabetes/diabetes_test.csv")

In [27]:
# Ensure the synthetic_data has the same length as X_test
print(df.shape,synthetic_data.shape)

(80000, 9) (80000, 9)


In [28]:
synthetic_data_discretized = synthetic_data.copy()

## Inverse_Discretize

In [29]:
def map_bin_to_value(data, bins):
    bin_midpoints = [(bins[i] + bins[i+1]) / 2 for i in range(len(bins) - 1)]
    return [bin_midpoints[int(val)] if not np.isnan(val) else val for val in data]

# Convert each column back to continuous form using the bin ranges
synthetic_data_discretized['age'] = map_bin_to_value(synthetic_data_discretized['age'], 5)
synthetic_data_discretized['bmi'] = map_bin_to_value(synthetic_data_discretized['bmi'], 5)
synthetic_data_discretized['HbA1c_level'] = map_bin_to_value(synthetic_data_discretized['HbA1c_level'], 5)
synthetic_data_discretized['blood_glucose_level'] = map_bin_to_value(synthetic_data_discretized['blood_glucose_level'], 5)

TypeError: object of type 'int' has no len()

In [None]:
from table_evaluator import TableEvaluator

synthetic_data_discretized = synthetic_data_discretized[:len(X_test)]  # Adjust if needed

# Initialize TableEvaluator for evaluation
table_evaluator = TableEvaluator(X_test, synthetic_data_discretized)

# Perform visual evaluation and other evaluations like quality, similarity, etc.
table_evaluator.visual_evaluation()


In [None]:
# Check for rare events (e.g., rare value counts)
rare_values_real = df['rare_event_column'].value_counts()
rare_values_synthetic = synthetic_data['rare_event_column'].value_counts()

print("Real Data Rare Events:\n", rare_values_real)
print("Synthetic Data Rare Events:\n", rare_values_synthetic)


KeyError: 'rare_event_column'