<a href="https://colab.research.google.com/github/maribelhandy/firstpython.py/blob/master/Copy_of_Synthetic_Data_Generation_for_Healthcare_IT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install the matplotlib-venn package with pip (IPython shell escape).
# NOTE(review): no cell visible in this export imports matplotlib-venn -- confirm it is still needed.
!pip install matplotlib-venn



In [None]:
# Install the libfluidsynth1 system package via apt-get (-qq: quiet, -y: assume yes).
# NOTE(review): nothing visible in this export references fluidsynth -- confirm it is still needed.
!apt-get -qq install -y libfluidsynth1

^C


In [None]:
# https://pypi.python.org/pypi/libarchive
# Install the libarchive-dev system package, then (only if that succeeds, because
# of `&&`) install/upgrade the `libarchive` Python binding and import it.
# NOTE(review): `libarchive` is not used by any other cell visible in this export.
!apt-get -qq install -y libarchive-dev && pip install -U libarchive
import libarchive

^C


In [None]:
# https://pypi.python.org/pypi/pydot
# Install the graphviz system package, then (only if that succeeds, because of
# `&&`) install the `pydot` Python package and import it.
# NOTE(review): `pydot` is not used by any other cell visible in this export.
!apt-get -qq install -y graphviz && pip install pydot
import pydot

In [None]:
# Install cartopy with pip and import it.
# NOTE(review): `cartopy` is not used by any other cell visible in this export.
!pip install cartopy
import cartopy

In [None]:


# dissertation_example.py
#
# A Python script to demonstrate the core methodology for the dissertation topic:
# "Synthetic Data Generation for Training and Testing IT Systems in Data-Scarce Environments"
#
# This script covers:
# 1. Simulating a 'real' patient dataset.
# 2. Training a Conditional Tabular GAN (CTGAN) model on this data.
# 3. Generating a synthetic dataset from the trained model.
# 4. Evaluating the synthetic data using two key methods:
#    a) Statistical Fidelity Comparison.
#    b) Machine Learning Utility (the "Train-Synthetic-Test-Real" approach).

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, classification_report
from sdv.tabular import CTGAN
from sdv.evaluation import evaluate

# --- STAGE 1: Simulate a 'Real' Patient Dataset ---
# In a real dissertation, you would use a real, sensitive dataset here.
# For this example, we generate a realistic-looking but entirely fake dataset
# to avoid any privacy concerns.
print("Step 1: Simulating a 'real' patient dataset...")

# Seed NumPy's global generator so every run simulates the same cohort; the
# train/test split and the classifiers further down already pin random_state=42.
np.random.seed(42)

# Define the structure and characteristics of our fake data
num_samples = 2000
data = {
    'age': np.random.normal(loc=55, scale=15, size=num_samples).astype(int),
    'gender': np.random.choice(['Male', 'Female'], size=num_samples, p=[0.48, 0.52]),
    'blood_pressure': np.random.normal(loc=130, scale=20, size=num_samples).astype(int),
    'cholesterol': np.random.normal(loc=200, scale=30, size=num_samples).astype(int),
    'disease_severity': np.random.choice(['Low', 'Medium', 'High'], size=num_samples, p=[0.6, 0.3, 0.1]),
    # The target variable we want to predict (filled in below)
    'had_complication': np.zeros(num_samples, dtype=int),
}
real_data = pd.DataFrame(data)

# Create a plausible relationship for the target variable 'had_complication':
#   * high cholesterol (> 220)            -> complication with probability 0.3
#   * age > 65 AND blood pressure > 150   -> always a complication
high_cholesterol = real_data['cholesterol'] > 220
high_risk = (real_data['age'] > 65) & (real_data['blood_pressure'] > 150)

real_data.loc[high_cholesterol, 'had_complication'] = np.random.choice(
    [0, 1], size=int(high_cholesterol.sum()), p=[0.7, 0.3]
)
# Apply the deterministic rule LAST so the random cholesterol draw cannot reset
# a high-risk patient who also has high cholesterol back to 0.
real_data.loc[high_risk, 'had_complication'] = 1

# Clip age to a realistic range. Done after labelling; clipping to [18, 90]
# cannot move any value across the 65 threshold used above.
real_data['age'] = real_data['age'].clip(18, 90)

print("Real dataset created with {} records.\n".format(num_samples))
print("Real Data Head:")
print(real_data.head())
print("\n" + "="*50 + "\n")


# --- STAGE 2: Train the Generative Model (CTGAN) ---
# Research question addressed here: "Which generative models are most effective?"
# CTGAN is the generative model used; it is fitted directly on `real_data`.
print("Step 2: Training the CTGAN model on the real data...")
# Training can be slow in a real project, so the demo keeps the epoch count
# small. Increase "epochs" for better quality.
ctgan_settings = {"epochs": 100, "verbose": True}
model = CTGAN(**ctgan_settings)
model.fit(real_data)
print("CTGAN model training complete.\n")
print("=" * 50 + "\n")


# --- STAGE 3: Generate Synthetic Data ---
# Sample a brand-new table from the fitted model. The intent is for it to
# follow the statistical properties of the real data without being a
# row-for-row copy of it.
print("Step 3: Generating synthetic data...")
synthetic_data = model.sample(num_rows=num_samples)
# The message reports the requested row count (num_samples).
print(f"Synthetic dataset generated with {num_samples} records.\n")
print("Synthetic Data Head:")
synthetic_preview = synthetic_data.head()
print(synthetic_preview)
print("\n" + "=" * 50 + "\n")


# --- STAGE 4: Evaluate the Synthetic Data ---
# The most critical stage for the dissertation: it addresses
# "High-Fidelity Generation" and "Utility & Task Performance".

# --- 4a. Statistical Fidelity Evaluation ---
print("Step 4a: Evaluating Statistical Fidelity...")
# Compare the synthetic table against the real one with SDV's `evaluate`.
# NOTE(review): with aggregate=False this presumably returns a per-metric
# breakdown rather than a single 0-to-1 score, and the "Column Shapes" /
# "Column Pair Trends" wording printed below may not match the names in that
# output -- confirm against the installed SDV version.
quality_report = evaluate(synthetic_data, real_data, aggregate=False)
print("--- Synthetic Data Quality Report ---")
print(quality_report)
for report_line in (
    "\nInterpreting the report:",
    "- Column Shapes: How similar are the distributions of individual columns? (Higher is better)",
    "- Column Pair Trends: How well does the synthetic data capture correlations between columns? (Higher is better)\n",
    "=" * 50 + "\n",
):
    print(report_line)


# --- 4b. Machine Learning Utility Evaluation (Train-Synthetic-Test-Real) ---
print("Step 4b: Evaluating Machine Learning Utility (Train-Synthetic-Test-Real)...")

# Define features (X) and the target variable (y)
features = ['age', 'gender', 'blood_pressure', 'cholesterol', 'disease_severity']
target = 'had_complication'
categorical_cols = ['gender', 'disease_severity']

# Restrict both tables to the declared columns and give every categorical
# column the SAME category levels, taken from the real data. Encoding the two
# frames independently would let a category that happens to be absent from the
# synthetic sample change which dummy `drop_first=True` removes, so the two
# design matrices would no longer mean the same thing.
real_model_input = real_data[features + [target]].copy()
synthetic_model_input = synthetic_data[features + [target]].copy()
for col in categorical_cols:
    levels = sorted(real_data[col].unique())
    real_model_input[col] = pd.Categorical(real_model_input[col], categories=levels)
    synthetic_model_input[col] = pd.Categorical(synthetic_model_input[col], categories=levels)

# Pre-process data: One-hot encode categorical variables for the model
real_data_processed = pd.get_dummies(real_model_input, columns=categorical_cols, drop_first=True)
synthetic_data_processed = pd.get_dummies(synthetic_model_input, columns=categorical_cols, drop_first=True)

# Separate labels from features
real_labels = real_data_processed[target]
real_features = real_data_processed.drop(columns=[target])

synthetic_labels = synthetic_data_processed[target]
synthetic_features = synthetic_data_processed.drop(columns=[target])

# Use the real feature columns, in their own order, as the reference layout.
# (A set intersection would give a hash-dependent order and would silently
# drop any feature missing from one side; indexing fails loudly instead.)
common_cols = list(real_features.columns)
real_features = real_features[common_cols]
synthetic_features = synthetic_features[common_cols]

# Split the REAL data into a training set and a held-out test set
X_train_real, X_test_real, y_train_real, y_test_real = train_test_split(
    real_features, real_labels, test_size=0.3, random_state=42, stratify=real_labels
)

# --- Model 1: Trained and Tested on REAL data (The Benchmark) ---
# Fit on the real training split and score on the real held-out split; these
# numbers are the reference the synthetic-trained model is compared against.
print("\n--- Training Benchmark Model (Real Data) ---")
benchmark_model = LogisticRegression(max_iter=1000, random_state=42)
benchmark_model.fit(X_train_real, y_train_real)
y_pred_real = benchmark_model.predict(X_test_real)

# Score the predictions with both metrics in one pass.
benchmark_accuracy, benchmark_f1 = (
    metric(y_test_real, y_pred_real) for metric in (accuracy_score, f1_score)
)

print("Benchmark Model Accuracy: {:.4f}".format(benchmark_accuracy))
print("Benchmark Model F1-Score: {:.4f}".format(benchmark_f1))
print("Classification Report (Benchmark):")
benchmark_report = classification_report(y_test_real, y_pred_real)
print(benchmark_report)


# --- Model 2: Trained on SYNTHETIC data, Tested on REAL data ---
print("\n--- Training Synthetic Model ---")
# Same classifier configuration as the benchmark, but fitted on every
# synthetic row (no split is taken from the synthetic table).
synthetic_model = LogisticRegression(max_iter=1000, random_state=42)
synthetic_model.fit(synthetic_features, synthetic_labels)

# Score against the SAME real held-out test set used for the benchmark so the
# two models are directly comparable.
y_pred_synthetic = synthetic_model.predict(X_test_real)

synthetic_scores = {
    "accuracy": accuracy_score(y_test_real, y_pred_synthetic),
    "f1": f1_score(y_test_real, y_pred_synthetic),
}
synthetic_accuracy = synthetic_scores["accuracy"]
synthetic_f1 = synthetic_scores["f1"]

print("Synthetic Model Accuracy: {:.4f}".format(synthetic_accuracy))
print("Synthetic Model F1-Score: {:.4f}".format(synthetic_f1))
print("Classification Report (Synthetic):")
synthetic_report = classification_report(y_test_real, y_pred_synthetic)
print(synthetic_report)


# --- Final Comparison ---
# Report both F1 scores and the relative gap between them, then state a
# conclusion that actually depends on the measured gap.
print("\n--- Dissertation Experiment Conclusion ---")
print(f"Performance of Benchmark Model (trained on real data): {benchmark_f1:.4f} F1-Score")
print(f"Performance of Synthetic Model (trained on synthetic data): {synthetic_f1:.4f} F1-Score")

# Largest relative F1 drop (in percent) still reported as "high utility".
# This is a demonstration threshold, not an established standard -- adjust it
# to whatever the study's protocol defines.
UTILITY_TOLERANCE_PCT = 10.0

if benchmark_f1 > 0:
    # Relative F1 shortfall of the synthetic-trained model, in percent.
    # Negative values mean the synthetic-trained model scored higher.
    performance_diff = ((benchmark_f1 - synthetic_f1) / benchmark_f1) * 100
    if performance_diff <= 0:
        print(f"\nThe synthetic model matched or exceeded the benchmark model (relative F1 gap: {performance_diff:.2f}%).")
    else:
        print(f"\nThe synthetic model achieved performance within {performance_diff:.2f}% of the benchmark model.")

    if performance_diff <= UTILITY_TOLERANCE_PCT:
        print("This result indicates that the synthetic data has high machine learning utility.")
    else:
        print(f"This result indicates that the synthetic data has limited machine learning utility (gap exceeds {UTILITY_TOLERANCE_PCT:.0f}%).")
else:
    # A benchmark F1 of 0 makes the relative gap undefined; avoid dividing by zero.
    performance_diff = float("nan")
    print("\nThe benchmark model has an F1-Score of 0, so the relative performance gap is undefined.")
    print("No conclusion about machine learning utility can be drawn from this run.")

In [None]:
# Install/upgrade SDV to the latest release, bypassing pip's cache.
# NOTE(review): this cell appears AFTER the script cell that imports `sdv`, so on
# a fresh runtime it has to be run first. Also, that script imports
# `sdv.tabular.CTGAN` and `sdv.evaluation.evaluate`; presumably those module paths
# exist only in older SDV releases, in which case `--upgrade` would break the
# imports -- confirm and pin a compatible version if so.
!pip install sdv --upgrade --no-cache-dir