
# DAMO630 – Assignment 1 (Final Patched)

**Generated:** 2025-10-04 02:01  

- Fixes `yes`/`no` (and similar) string targets in **TSTR** (Train on Synthetic, Test on Real) by mapping them to 0/1.
- Ensures correlation-preservation metric is always a scalar.
- Adds a consolidated **Results Table** + CSV export.
- `%pip install` cells enabled for Colab.


In [None]:

# Install dependencies (run once per session)
# Core analysis stack used by the Part A cells (pandas, numpy, scipy, scikit-learn, matplotlib).
# NOTE(review): Faker is installed here but is not imported by any cell visible in this notebook — confirm it is still needed.
%pip install --quiet pandas numpy scipy scikit-learn matplotlib Faker
# Synthetic-data libraries; the notebook imports CTGANSynthesizer and GaussianCopulaSynthesizer from sdv.
%pip install --quiet sdv ctgan copulas
# Spark bindings for Part B.
%pip install --quiet pyspark


## Part A — Privacy-Preserving Analytics with Synthetic Data

In [None]:

import pandas as pd, numpy as np, matplotlib.pyplot as plt

# Source table: the AER "HealthInsurance" dataset as published by Rdatasets.
URL = "https://vincentarelbundock.github.io/Rdatasets/csv/AER/HealthInsurance.csv"
df = pd.read_csv(URL)

# Rdatasets CSV exports carry the original R row names as a leading identifier
# column. It is a row counter, not a feature: keeping it would feed a row id
# into the KS, correlation and TSTR steps below. Drop it when present
# (errors="ignore" keeps the cell working if the export has no such column).
df = df.drop(columns=["rownames", "Unnamed: 0"], errors="ignore")

# Split the columns by dtype; later cells reuse both lists.
numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
categorical_cols = df.select_dtypes(exclude=[np.number]).columns.tolist()

print("Shape:", df.shape)
print("Numeric:", numeric_cols)
print("Categorical:", categorical_cols)
display(df.describe(include='all').transpose())
print("\nMissing values per column:\n", df.isna().sum())


In [None]:

# Quick EDA plots: one histogram per numeric column, then one bar chart of the
# 20 most frequent values per categorical column.
for col in numeric_cols:
    plt.figure()
    df[col].plot(kind='hist', bins=30, edgecolor='k', alpha=0.8)
    plt.title(f"Histogram: {col}")
    plt.xlabel(col)
    plt.ylabel("Count")
    plt.show()

for col in categorical_cols:
    plt.figure()
    top_counts = df[col].value_counts().head(20)
    top_counts.plot(kind='bar', edgecolor='k', alpha=0.8)
    plt.title(f"Bar chart: {col} (Top 20)")
    plt.xlabel(col)
    plt.ylabel("Count")
    plt.show()


In [None]:

# Correlation matrix (numeric only): show the table, then draw it as a heatmap.
if len(numeric_cols) <= 1:
    print("Not enough numeric columns for correlation matrix.")
else:
    corr = df[numeric_cols].corr(numeric_only=True)
    display(corr)

    plt.figure(figsize=(6,5))
    plt.imshow(corr, cmap='coolwarm', interpolation='nearest')
    plt.title("Correlation matrix (numeric)")
    plt.colorbar()

    # Label both axes with the column names, in matrix order.
    tick_positions = range(len(corr.columns))
    plt.xticks(tick_positions, corr.columns, rotation=90)
    plt.yticks(tick_positions, corr.columns)
    plt.tight_layout()
    plt.show()


### Baseline Synthetic Data (independent per-column resampling + Gaussian noise)

In [None]:

from numpy.random import default_rng
rng = default_rng(42)

def baseline_synth(df_real, noise_scale=0.05, n_samples=None, random_state=None):
    """Build a naive synthetic table by resampling every column independently.

    Each column is bootstrapped (sampled with replacement) from its own
    non-missing values. Numeric columns additionally receive zero-mean
    Gaussian noise whose standard deviation is ``noise_scale`` times the
    column's standard deviation. Because columns are drawn independently,
    relationships between columns are not reproduced.

    Parameters
    ----------
    df_real : pandas.DataFrame
        Table to imitate.
    noise_scale : float, default 0.05
        Noise standard deviation as a fraction of each numeric column's
        standard deviation.
    n_samples : int, optional
        Number of rows to generate; defaults to ``len(df_real)``.
    random_state : int, numpy.random.Generator or None, default None
        Seed or generator for a reproducible, self-contained draw. When
        ``None`` the module-level ``rng`` is used, exactly as before.

    Returns
    -------
    pandas.DataFrame
        Synthetic table with the same columns as ``df_real``. Columns that
        contain no non-missing values are filled with ``None``.
    """
    # Default to the shared notebook generator so existing calls are unchanged.
    gen = rng if random_state is None else np.random.default_rng(random_state)

    if n_samples is None:
        n_samples = len(df_real)

    data = {}
    for col in df_real.columns:
        s = df_real[col].dropna()
        if s.empty:
            # Nothing to sample from: keep the column but leave it empty.
            data[col] = [None] * n_samples
            continue

        vals = s.to_numpy()
        choices = gen.choice(vals, size=n_samples, replace=True)

        # Booleans pass is_numeric_dtype, but adding Gaussian noise to them
        # would turn flags into meaningless floats, so they are resampled
        # like categorical values instead.
        add_noise = (pd.api.types.is_numeric_dtype(s)
                     and not pd.api.types.is_bool_dtype(s))
        if add_noise:
            std = np.std(vals)
            if not std > 0:
                # Constant (or undefined-spread) column: use a tiny scale.
                std = 1e-8
            choices = choices + gen.normal(0, std * noise_scale, size=n_samples)

        data[col] = choices
    return pd.DataFrame(data)

# Generate the baseline synthetic table with the default settings
# (same number of rows as the real data) and report its size.
synth_base = baseline_synth(df)
print(
    "Baseline synthetic shape:",
    synth_base.shape,
)


### SDV Models: CTGAN & GaussianCopula

In [None]:

from sdv.single_table import CTGANSynthesizer, GaussianCopulaSynthesizer
from sdv.metadata import SingleTableMetadata

# Infer the table schema from the real data; both synthesizers share it.
metadata = SingleTableMetadata()
metadata.detect_from_dataframe(df)

# CTGAN: fit for 200 epochs, then sample as many rows as the real table has.
ctgan = CTGANSynthesizer(metadata, epochs=200)
ctgan.fit(df)
synth_ctgan = ctgan.sample(num_rows=len(df))

# Gaussian copula: fit with default settings and sample the same number of rows.
gauss = GaussianCopulaSynthesizer(metadata)
gauss.fit(df)
synth_gauss = gauss.sample(num_rows=len(df))

print(
    "CTGAN:", synth_ctgan.shape,
    "GaussianCopula:", synth_gauss.shape,
)


### Evaluation Helpers (KS, Correlation, TSTR, Privacy)

In [None]:

from scipy.stats import ks_2samp
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, accuracy_score
import numpy as _np

def ks_similarity(real, synth, numeric_cols):
    """Run a two-sample Kolmogorov-Smirnov test per numeric column.

    Parameters
    ----------
    real, synth : pandas.DataFrame
        Real and synthetic tables; both must contain every column listed in
        ``numeric_cols``.
    numeric_cols : list of str
        Columns to compare.

    Returns
    -------
    pandas.DataFrame
        One row per tested column with ``feature``, ``KS_stat`` and
        ``pvalue``, sorted by ``feature``. Columns with five or fewer
        non-missing values on either side are skipped. If nothing is tested
        the result is an empty frame that still has the three columns.
    """
    columns = ["feature", "KS_stat", "pvalue"]
    rows = []
    for col in numeric_cols:
        r = real[col].dropna()
        s = synth[col].dropna()
        # Too few observations make the test uninformative; skip the column.
        if len(r) > 5 and len(s) > 5:
            stat, p = ks_2samp(r, s)
            rows.append({"feature": col, "KS_stat": float(stat), "pvalue": float(p)})
    # Passing the column list keeps "feature" present even when `rows` is
    # empty; without it sort_values raises KeyError on the empty frame.
    return pd.DataFrame(rows, columns=columns).sort_values("feature")

def corr_preservation(real, synth, numeric_cols):
    """Measure how closely the synthetic data keeps pairwise correlations.

    Computes the correlation matrix of ``numeric_cols`` for both tables and
    averages the absolute difference over the strictly upper triangle (each
    column pair counted once, diagonal excluded).

    Returns a one-row DataFrame with the column ``mean_abs_corr_diff``; the
    value is ``None`` when fewer than two columns are supplied.
    """
    column = "mean_abs_corr_diff"
    if len(numeric_cols) < 2:
        return pd.DataFrame({column: [None]})

    real_corr = real[numeric_cols].corr(numeric_only=True).to_numpy()
    synth_corr = synth[numeric_cols].corr(numeric_only=True).to_numpy()

    # Boolean selector for the entries above the main diagonal.
    upper = _np.triu(_np.ones_like(real_corr, dtype=bool), k=1)
    gaps = _np.abs(real_corr[upper] - synth_corr[upper])
    return pd.DataFrame({column: [float(gaps.mean())]})

def _encode_binary_target(real_col, synth_col):
    """Encode a two-class target as 0/1 using one mapping for both tables.

    Returns ``(real_labels, synth_labels)`` as float Series holding 0.0/1.0,
    with NaN wherever a label is missing or cannot be mapped.
    """
    if pd.api.types.is_numeric_dtype(real_col):
        real_num = pd.to_numeric(real_col, errors="coerce").astype(float)
        synth_num = pd.to_numeric(synth_col, errors="coerce").astype(float)
        classes = real_num.dropna().unique()
        # Split at the midpoint of the two real classes so that synthetic
        # values carrying noise snap to the nearest class instead of being
        # truncated by an integer cast.
        cut = (classes.min() + classes.max()) / 2.0
        real_y = (real_num > cut).astype(float).where(real_num.notna())
        synth_y = (synth_num > cut).astype(float).where(synth_num.notna())
        return real_y, synth_y

    known = {"yes": 1, "no": 0, "true": 1, "false": 0, "1": 1, "0": 0}

    def _clean(col):
        # Normalise case/whitespace but keep missing values missing.
        return col.astype(str).str.strip().str.lower().where(col.notna())

    real_txt = _clean(real_col)
    synth_txt = _clean(synth_col)
    classes = sorted(real_txt.dropna().unique())
    if all(label in known for label in classes):
        mapping = known
    else:
        # Arbitrary label pair (e.g. "male"/"female"): number the real
        # classes in sorted order rather than collapsing them all to 0.
        mapping = {label: code for code, label in enumerate(classes)}
    return real_txt.map(mapping).astype(float), synth_txt.map(mapping).astype(float)


def tstr_utility(real, synth):
    """Train-on-Synthetic, Test-on-Real utility with logistic regression.

    The target is the first column of ``real`` that has exactly two distinct
    non-missing values and also exists in ``synth``. If there is none, a
    target is derived by splitting the first numeric column of ``real`` at
    its median (that column is then removed from the features, since the
    target is a direct function of it); with no numeric column at all, a
    random target is used.

    Target labels are encoded to 0/1 with a single mapping derived from the
    real data, so any pair of labels works, not only yes/no style strings.
    Rows whose target is missing or unmappable are left out.

    Parameters
    ----------
    real, synth : pandas.DataFrame
        Real (test) and synthetic (train) tables with matching columns.

    Returns
    -------
    dict
        ``{"ACC": float, "ROC_AUC": float}`` measured on the real rows. A
        metric is NaN when it cannot be computed (the synthetic target has
        fewer than two classes, no usable rows or features remain, or the
        real target has a single class for ROC AUC).
    """
    nan_scores = {"ACC": float("nan"), "ROC_AUC": float("nan")}

    # Pick an existing binary target if present in both tables.
    target = next(
        (col for col in real.columns
         if col in synth.columns and real[col].dropna().nunique() == 2),
        None,
    )

    if target is not None:
        y_real, y_synth = _encode_binary_target(real[target], synth[target])
        excluded = [target]
    else:
        numc = real.select_dtypes(include=[np.number]).columns.tolist()
        if numc:
            # Median split of the first numeric column, using the real
            # threshold for both tables.
            source = numc[0]
            thr = real[source].median()
            y_real = (real[source] > thr).astype(float)
            y_synth = (synth[source] > thr).astype(float)
            # Keeping the source column as a feature would leak the target.
            excluded = [source]
        else:
            y_real = pd.Series((_np.random.rand(len(real)) > 0.5).astype(float),
                               index=real.index)
            y_synth = pd.Series((_np.random.rand(len(synth)) > 0.5).astype(float),
                                index=synth.index)
            excluded = []

    # Positional masks avoid index-alignment surprises between the tables.
    keep_real = y_real.notna().to_numpy()
    keep_synth = y_synth.notna().to_numpy()
    Xr = real.loc[keep_real].drop(columns=excluded, errors='ignore')
    yr = y_real[keep_real].astype(int)
    Xs = synth.loc[keep_synth].drop(columns=excluded, errors='ignore')
    ys = y_synth[keep_synth].astype(int)

    # A classifier cannot be fitted on one class or evaluated on no rows.
    if len(yr) == 0 or Xr.shape[1] == 0 or ys.nunique() < 2:
        return nan_scores

    num_cols = Xr.select_dtypes(include=[np.number]).columns.tolist()
    cat_cols = [c for c in Xr.columns if c not in num_cols]

    pre = ColumnTransformer([("num", "passthrough", num_cols),
                             ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols)])

    clf = Pipeline([("pre", pre), ("lr", LogisticRegression(max_iter=1000))])
    clf.fit(Xs, ys)
    # Labels are 0/1, so column 1 is the probability of the positive class.
    proba = clf.predict_proba(Xr)[:, 1]
    preds = (proba >= 0.5).astype(int)

    acc = float(accuracy_score(yr, preds))
    # ROC AUC is undefined when the real target contains a single class.
    auc = float(roc_auc_score(yr, proba)) if yr.nunique() == 2 else float("nan")
    return {"ACC": acc, "ROC_AUC": auc}

def privacy_row_duplication(real, synth):
    """Count distinct real rows that reappear verbatim in the synthetic data.

    Rows are compared on the columns present in both tables, taken in the
    order of ``real``, so a different column order or an extra column in one
    table no longer hides duplicates. Missing values compare equal to each
    other, so a copied row that contains gaps is still detected.

    Parameters
    ----------
    real, synth : pandas.DataFrame
        Real and synthetic tables.

    Returns
    -------
    dict
        ``{"exact_row_overlaps": int}`` - the number of distinct rows found
        in both tables (0 when the tables share no columns).
    """
    shared = [col for col in real.columns if col in synth.columns]
    if not shared:
        return {"exact_row_overlaps": 0}

    def _row_keys(frame):
        # Hashable row tuples; NaN never equals NaN, so missing cells are
        # replaced by None to make them comparable.
        view = frame[shared]
        values = view.astype(object).to_numpy().tolist()
        missing = view.isna().to_numpy().tolist()
        return {
            tuple(None if gone else cell for cell, gone in zip(row, flags))
            for row, flags in zip(values, missing)
        }

    return {"exact_row_overlaps": len(_row_keys(real) & _row_keys(synth))}


### Per-model Metrics Preview

In [None]:

# Print every evaluation metric for each synthetic table, one model at a time.
for name, s in {"Baseline": synth_base, "CTGAN": synth_ctgan, "GaussianCopula": synth_gauss}.items():
    print(f"\n=== {name} ===")

    # Distribution similarity: first ten per-feature KS results.
    ks = ks_similarity(df, s, numeric_cols)
    display(ks.head(10))

    # Correlation preservation: pull the single value out of the one-row frame.
    corrp = corr_preservation(df, s, numeric_cols)
    try:
        corr_val = float(corrp["mean_abs_corr_diff"].iloc[0])
    except Exception:
        # The helper reports None when there are too few numeric columns.
        corr_val = None
    print("Correlation preservation (mean abs corr diff):", corr_val)

    # Downstream utility and exact-copy check.
    t = tstr_utility(df, s)
    print("TSTR:", t)
    priv = privacy_row_duplication(df, s)
    print("Privacy:", priv)


### Consolidated Results Table + CSV

In [None]:

# Collect one summary row per synthetic model, then show and export the table.
rows = []
for name, s in {"Baseline": synth_base, "CTGAN": synth_ctgan, "GaussianCopula": synth_gauss}.items():
    # Summarise the per-feature KS statistics (None when nothing was tested).
    ksdf = ks_similarity(df, s, numeric_cols)
    if ksdf.empty:
        ks_mean = None
        ks_median = None
    else:
        ks_mean = float(ksdf["KS_stat"].mean())
        ks_median = float(ksdf["KS_stat"].median())

    # Correlation preservation as a plain number (None when unavailable).
    corrp = corr_preservation(df, s, numeric_cols)
    try:
        corr_mae = float(corrp["mean_abs_corr_diff"].iloc[0])
    except Exception:
        corr_mae = None

    # Utility and privacy metrics.
    tstr = tstr_utility(df, s)
    acc = tstr.get("ACC")
    auc = tstr.get("ROC_AUC")
    overlap = privacy_row_duplication(df, s).get("exact_row_overlaps")

    rows.append({
        "Model": name,
        "KS_mean": ks_mean,
        "KS_median": ks_median,
        "Corr_MAE": corr_mae,
        "TSTR_ACC": acc,
        "TSTR_ROC_AUC": auc,
        "Exact_Row_Overlaps": overlap,
    })

# One row per model, indexed by model name.
results_df = pd.DataFrame(rows).set_index("Model")
display(results_df)

# Write the same table to disk, keeping the model name as the first column.
results_path = "Assignment1_results.csv"
results_df.to_csv(results_path, index=True)
print("Saved:", results_path)


## Part B — NYC Taxi Trip Data with PySpark

In [None]:

from pyspark.sql import SparkSession
# Obtain the Spark session for Part B via the builder's getOrCreate().
spark = (
    SparkSession.builder
    .appName("NYC-Taxi-Analysis")
    .getOrCreate()
)
# Add your Spark I/O and analytics here (FPGrowth, KMeans, etc.) as needed.
