In [None]:
import numpy as np
import pandas as pd

import plotly.graph_objects as go
from plotly.subplots import make_subplots
from plotly.offline import iplot

## This notebook contains a simple synthetic data generation pipeline:
First, the data is read from the default location and stored in a `DataFrame`. As all the features are numeric
and approximately normally distributed, their basic statistics (mean and standard deviation) are collected and used for generation
(new data points are sampled from a normal distribution). In the last section, the synthetic data points are compared with
the original ones using PCA.

__The outputs of this notebook were cleared in order to reduce the size__
Feel free to run it yourself, but do not forget to download the data first (e.g., run the training pipeline)

In [None]:
# Relative path to the raw CSV; it is resolved against the kernel's working directory.
raw_data_path = "../data/raw/breast-cancer-dataset.csv"

In [None]:
from itertools import chain

# Base measurement names; every one of them appears in the data with three suffixes.
FEATURE_TYPES = (
    "radius",
    "texture",
    "peri",
    "area",
    "smoothness",
    "compactness",
    "concavity",
    "concave_points",
    "symmetry",
    "fractal_dim",
)

# Full feature column names, grouped by suffix: all "*_mean" columns first,
# then all "*_se" columns, then all "*_worst" columns.
FEATURES = tuple(
    f"{feature_type}_{suffix}"
    for suffix in ("mean", "se", "worst")
    for feature_type in FEATURE_TYPES
)

# Column names applied when reading the raw file: identifier, target, then the features.
DEFAULT_COLUMN_NAMES = ["id", "diag"] + list(FEATURES)


In [None]:
# Read the CSV with explicit column names, then discard the "id" column.
# NOTE(review): passing `names` without `header` makes pandas treat the first
# line of the file as data — this assumes the CSV has no header row; confirm.
raw_data = pd.read_csv(raw_data_path, names=DEFAULT_COLUMN_NAMES).drop(columns=["id"])

In [None]:
# Split the frame into the "diag" target column and the remaining feature columns.
target = raw_data["diag"]
features = raw_data.drop(columns=["diag"])

In [None]:
# Display the feature frame for a quick visual check.
features

In [None]:
from dataclasses import dataclass

@dataclass
class Stats:
    """Per-feature mean and standard deviation used for sampling and scaling.

    Attributes:
        mean: Per-feature means.
        std: Per-feature standard deviations.
    """

    mean: np.ndarray
    std: np.ndarray

    def normalize(self, data: np.ndarray) -> np.ndarray:
        """Standardize ``data`` as ``(data - mean) / std``.

        Features whose ``std`` is exactly zero are divided by 1 instead, so a
        constant feature yields finite values rather than NaN/inf.

        Args:
            data: Array whose last axis broadcasts against ``mean`` and ``std``.

        Returns:
            The centered and scaled array.
        """
        # Guard against division by zero for constant features.
        safe_std = np.where(self.std == 0, 1.0, self.std)
        return (data - self.mean) / safe_std


def extract_statistics(df: pd.DataFrame) -> Stats:
    """Collect the column-wise mean and standard deviation of ``df``.

    The standard deviation is computed with NumPy's ``ndarray.std`` defaults.

    Args:
        df: Frame whose columns are the features to summarise.

    Returns:
        A ``Stats`` holding one mean and one std per column.
    """
    values = df.to_numpy()
    column_means = values.mean(axis=0)
    column_stds = values.std(axis=0)
    return Stats(mean=column_means, std=column_stds)

In [None]:
# Per-feature mean/std of the original data; reused below for sampling and scaling.
features_stats = extract_statistics(features)

In [None]:
class NumericFaker:
    """Generates synthetic rows by sampling every feature from its own normal distribution.

    Features are sampled independently of each other, using the mean and
    standard deviation stored in ``stats``.
    """

    def __init__(self, stats: Stats, seed: int = 1) -> None:
        """Store the statistics and create a seeded NumPy random generator."""
        self.stats = stats
        self.rng = np.random.default_rng(seed=seed)

    def generate_points(self, n_points: int) -> np.ndarray:
        """Draw ``n_points`` synthetic rows.

        Args:
            n_points: Number of rows to sample.

        Returns:
            Array with one row per sampled point and one column per feature.
        """
        columns = []
        # Features are drawn one after another, so the random stream is
        # consumed feature by feature.
        for loc, scale in zip(self.stats.mean, self.stats.std):
            columns.append(self.rng.normal(loc, scale, size=n_points))

        return np.column_stack(columns)

In [None]:
# Smoke test: draw a small batch of 10 synthetic rows with a fixed seed.
gen = NumericFaker(stats=features_stats, seed=42)
fakes = gen.generate_points(n_points=10)

In [None]:
# Label the synthetic columns with the feature names and display the result.
points = pd.DataFrame(fakes, columns=FEATURES)
points

In [None]:
def compare_distributions(
    trues: np.ndarray,
    fakes: np.ndarray,
    feature_names=None,
    n_cols: int = 10,
):
    """Plot per-feature histograms of real vs. synthetic values on a subplot grid.

    Args:
        trues: Real data with one row per feature (i.e. the transposed sample matrix).
        fakes: Synthetic data in the same layout as ``trues``.
        feature_names: Names used as hover text, one per feature. Defaults to
            the module-level ``FEATURES``.
        n_cols: Number of subplot columns; the number of rows is derived from
            the number of features being plotted.

    Returns:
        The assembled plotly figure (one subplot per feature).

    Raises:
        ValueError: If ``n_cols`` is smaller than 1.
    """
    if n_cols < 1:
        raise ValueError("n_cols must be a positive integer")
    if feature_names is None:
        feature_names = FEATURES

    # zip() stops at the shortest input, so the grid is sized from the triples
    # that will actually be drawn.
    triples = list(zip(trues, fakes, feature_names))
    n_rows = max(1, -(-len(triples) // n_cols))  # ceiling division, at least one row

    fig = make_subplots(rows=n_rows, cols=n_cols)
    fig.update_layout(
        width=1500, height=1000,
        title_text="Original vs. Fake data comparison"
    )

    styles = (("Real", "crimson"), ("Fake", "mediumseagreen"))
    for i, (real, fake, feature) in enumerate(triples):
        grid_row, grid_col = divmod(i, n_cols)
        for values, (label, color) in zip((real, fake), styles):
            histogram = go.Histogram(
                x=values, name=label, hovertext=feature,
                opacity=0.8, histnorm="probability",
                legendgroup=label,
                # One legend entry per group is enough: the traces share a
                # legendgroup, so the first subplot's entries explain the colours.
                showlegend=(i == 0),
                marker=dict(color=color)
            )
            # add_trace with row/col replaces the deprecated append_trace.
            fig.add_trace(histogram, row=grid_row + 1, col=grid_col + 1)

    return fig

In [None]:
# Re-create the generator so this cell is reproducible on its own, then compare
# 500 synthetic rows with the original data. Both arrays are transposed because
# compare_distributions iterates over one feature per row.
gen = NumericFaker(stats=features_stats, seed=42)
fig = compare_distributions(features.to_numpy().T, gen.generate_points(n_points=500).T)
iplot(fig)

The figure above demonstrates that the generated data is pretty similar to the original. This is possible because the original features are almost normally distributed (if they had, say, a uniform or bimodal distribution, such sampling would not have yielded such pretty results).

An important preprocessing step before applying PCA is normalization (it prevents the transformed data from being skewed due to varying feature scales).
This can be easily done, as we already have the required statistics.

In [None]:
# Draw 400 synthetic rows from a freshly seeded generator.
gen = NumericFaker(stats=features_stats, seed=42)
fakes = gen.generate_points(n_points=400)

# Standardize both samples with the statistics of the original features.
original_normed = features_stats.normalize(features.to_numpy())
fake_normed = features_stats.normalize(fakes)

In [None]:
from sklearn.decomposition import PCA

# Fit PCA on the standardized original data and project the standardized fakes
# with the same fitted components, so both samples share one coordinate system.
# The raw arrays must not be used here: unscaled features with large ranges
# would dominate the principal components.
pca = PCA(n_components=3)
original_3d = pca.fit_transform(original_normed)
fake_3d = pca.transform(fake_normed)

In [None]:
def pca_graph(real, fake, title: str = "Original vs. Fake after PCA transform"):
    """Build a 3D scatter plot of two PCA-projected samples.

    Args:
        real: 2-D array of projected original points; columns 0-2 are plotted.
        fake: 2-D array of projected synthetic points; columns 0-2 are plotted.
        title: Figure title.

    Returns:
        A plotly figure with one "Real" and one "Fake" 3D scatter trace.
    """
    fig = go.Figure()
    fig.update_layout(title_text=title, width=1000, height=900)

    for label, projected in (("Real", real), ("Fake", fake)):
        # The first three columns become the named principal-component axes.
        components = pd.DataFrame(
            {axis: projected[:, idx] for idx, axis in enumerate(("PC1", "PC2", "PC3"))}
        )
        fig.add_scatter3d(
            x=components.PC1,
            y=components.PC2,
            z=components.PC3,
            name=label,
            mode="markers",
            opacity=0.6,
        )

    return fig


def pca_pipeline(stats: Stats, original: np.ndarray, fake_points: int, seed: int = 1, scale: bool = False) -> go.Figure:
    """Generate synthetic points, project both samples with PCA and plot them.

    PCA is fitted on the original data only; the synthetic points are projected
    with the same fitted transform.

    Args:
        stats: Statistics used both for sampling and, if requested, for scaling.
        original: Original feature matrix (one row per sample).
        fake_points: Number of synthetic rows to generate.
        seed: Seed for the synthetic data generator.
        scale: When true, both samples are standardized with ``stats`` before PCA.

    Returns:
        The 3D scatter figure produced by ``pca_graph``.
    """
    fakes = NumericFaker(stats=stats, seed=seed).generate_points(fake_points)

    if scale:
        original = stats.normalize(original)
        fakes = stats.normalize(fakes)

    projector = PCA(n_components=3)
    original_3d = projector.fit_transform(original)
    fake_3d = projector.transform(fakes)

    mode = "normalized" if scale else "no scaling"
    return pca_graph(
        original_3d,
        fake_3d,
        title=f"Original vs. Fake after PCA transform ({mode})",
    )

In [None]:
# PCA comparison on the raw (unscaled) features, 200 synthetic points.
iplot(pca_pipeline(stats=features_stats, original=features.to_numpy(), fake_points=200, seed=42, scale=False))

In [None]:
# Same comparison with both samples standardized before PCA.
iplot(pca_pipeline(stats=features_stats, original=features.to_numpy(), fake_points=200, seed=42, scale=True))

Interesting results: when PCA is fitted on standard-scaled data (that is, data with zero mean and unit std),
the fake data forms a 3-dimensional Gaussian cloud (I guess?), so it looks like the PCA transform does preserve the distribution properties of the
fakes (this sample essentially forms a 30-dimensional Gaussian cloud).
However, when scaling is not performed, the data is skewed and the fakes are easily distinguished.

It was a nice Jupyter time; the following cell dumps the generated data to a CSV file.

In [None]:
# Name of the CSV file that receives the synthetic rows; the relative path is
# resolved against the kernel's working directory.
DUMP_FILE = "synthetic-features.csv"

# Re-create the seeded generator instead of reusing whatever `gen` is left over
# from an earlier cell: the dump is then identical on every run of this cell and
# does not depend on how many points were sampled before it.
gen = NumericFaker(stats=features_stats, seed=42)
pd.DataFrame(gen.generate_points(n_points=50), columns=FEATURES).to_csv(DUMP_FILE, sep=",", index=False)