In [None]:
pip install sdv pandas

Collecting sdv
  Downloading sdv-1.24.1-py3-none-any.whl.metadata (14 kB)
Collecting boto3<2.0.0,>=1.28 (from sdv)
  Downloading boto3-1.40.1-py3-none-any.whl.metadata (6.7 kB)
Collecting graphviz>=0.13.2 (from sdv)
  Downloading graphviz-0.21-py3-none-any.whl.metadata (12 kB)
Collecting copulas>=0.12.1 (from sdv)
  Downloading copulas-0.12.3-py3-none-any.whl.metadata (9.5 kB)
Collecting ctgan>=0.11.0 (from sdv)
  Downloading ctgan-0.11.0-py3-none-any.whl.metadata (10 kB)
Collecting deepecho>=0.7.0 (from sdv)
  Downloading deepecho-0.7.0-py3-none-any.whl.metadata (10 kB)
Collecting rdt>=1.17.0 (from sdv)
  Downloading rdt-1.17.1-py3-none-any.whl.metadata (10 kB)
Collecting sdmetrics>=0.21.0 (from sdv)
  Downloading sdmetrics-0.22.0-py3-none-any.whl.metadata (9.4 kB)
Collecting platformdirs>=4.0 (from sdv)
  Downloading platformdirs-4.3.8-py3-none-any.whl.metadata (12 kB)
Collecting botocore<2.0.0,>=1.31 (from sdv)
  Downloading botocore-1.40.1-py3-none-any.whl.metadata (5.7 kB)
Collect

: 

In [None]:
import seaborn as sns
import pandas as pd
from ctgan import CTGAN
import pickle
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Train a CTGAN synthesizer on the cleaned Titanic data, sample a synthetic
# copy of the same size, then check utility with a "train on synthetic, test
# on real" random-forest classifier.

# One seed for CTGAN training/sampling and for the classifier, so that
# Restart & Run All reproduces the same numbers.
SEED = 42

# Load the Titanic dataset, drop redundant / mostly-missing columns, drop rows
# with missing values, and cast the target to a categorical dtype so it is
# picked up as a discrete column below.
df = (
    sns.load_dataset('titanic')
    .drop(columns=['deck', 'embark_town', 'alive', 'class', 'who'])
    .dropna()
    .astype({'survived': 'category'})
)

# Identify discrete (categorical) columns. Boolean flags are included too:
# otherwise CTGAN would model them as continuous values.
discrete_columns = df.select_dtypes(include=['object', 'category', 'bool']).columns.tolist()

# Print preprocessed info
print("📊 Cleaned Titanic Dataset Shape:", df.shape)
print("🧩 Discrete Columns:", discrete_columns)

# Train CTGAN model (seeded before fitting so training is repeatable)
ctgan = CTGAN(epochs=100)
ctgan.set_random_state(SEED)
ctgan.fit(df, discrete_columns=discrete_columns)

# Save CTGAN model
with open("ctgan_titanic_model.pkl", "wb") as f:
    pickle.dump(ctgan, f)
print("✅ CTGAN model saved as 'ctgan_titanic_model.pkl'")

# Generate as many synthetic rows as there are real rows
synthetic_data = ctgan.sample(len(df))

# Encode both datasets for ML in a single pass. Encoding the stacked frame
# guarantees that real and synthetic rows share the same dummy columns and the
# same dropped baseline level, even if the synthetic sample happens to miss a
# category (separate encoding + inner alignment could silently drop features).
combined = pd.concat([df, synthetic_data], keys=['real', 'synthetic'])
# Normalise the target dtype so the label column is always 'survived_1'.
combined['survived'] = combined['survived'].astype(int).astype('category')
encoded = pd.get_dummies(combined, drop_first=True)
real_encoded = encoded.loc['real']
synthetic_encoded = encoded.loc['synthetic']

# Split into features and label
X_real = real_encoded.drop('survived_1', axis=1)
y_real = real_encoded['survived_1']

X_synth = synthetic_encoded.drop('survived_1', axis=1)
y_synth = synthetic_encoded['survived_1']

# Train classifier on synthetic data, test on real data
clf = RandomForestClassifier(random_state=SEED)
clf.fit(X_synth, y_synth)
y_pred = clf.predict(X_real)

# Evaluation
print("\n🎯 Evaluation on Real Data (trained on synthetic data):\n")
print(classification_report(y_real, y_pred))


📊 Cleaned Titanic Dataset Shape: (712, 10)
🧩 Discrete Columns: ['survived', 'sex', 'embarked']


In [None]:
import pickle

# Restore the CTGAN synthesizer that was pickled to disk.
# NOTE(review): pickle.load can execute arbitrary code; only load model files
# that you produced yourself.
with open("ctgan_titanic_model.pkl", "rb") as model_file:
    ctgan = pickle.load(model_file)

# Draw a fresh batch of synthetic passengers (e.g., 500 rows).
n_rows = 500
synthetic_data = ctgan.sample(n_rows)

# Show a heading followed by the sampled frame.
print("🧪 Synthetic Titanic Data:", synthetic_data, sep="\n")

# Persist the sample as CSV, without the index column.
csv_path = "synthetic_titanic_data.csv"
synthetic_data.to_csv(csv_path, index=False)
print("✅ Synthetic data saved to 'synthetic_titanic_data.csv'")


🧪 Synthetic Titanic Data:
    survived  pclass     sex        age  sibsp  parch        fare embarked  \
0          1       1    male  16.031350      1      2   21.592631        S   
1          0       2  female  13.741602      0      1   -2.302188        Q   
2          0       2    male  22.798399      1      0   99.394238        S   
3          0       3    male  11.235020      1      0   25.030481        S   
4          1       2  female   7.190223      1      1   21.340302        S   
..       ...     ...     ...        ...    ...    ...         ...      ...   
495        0       2    male  17.519042      1      2   15.143950        S   
496        0       1  female   8.046962      1      1    6.965926        S   
497        0       2    male  13.977467      0      1  236.560399        C   
498        0       3    male  21.370587      0      1   72.526781        C   
499        1       2    male  20.598925      3      1   84.969059        S   

     adult_male  alone  
0          T