<a href="https://colab.research.google.com/github/maryzhang1028/project-0/blob/main/tabular_data_augmentation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Why are we doing this?
Real-world datasets are often limited in size, imbalanced across categories, or missing the diversity needed to train robust machine learning models. Data augmentation helps us address these challenges by creating new, realistic samples that preserve the patterns in the original data. This improves generalization, reduces bias, and prepares us to build models that perform well on unseen data.

# What does this notebook do?
In this notebook, we:
- Load a class dataset from Hugging Face.
- Clean and preprocess the survey-style responses.
- Apply several augmentation strategies for tabular data:
  - **Numeric jitter**: Adds small variations to continuous values.
  - **SMOTE** (here the SMOTE-NC variant for mixed numeric/categorical features): Synthesizes new minority-class samples to balance a categorical target.
  - **Mixup**: Blends pairs of samples for richer variability.
  - **CTGAN**: Uses a generative model to create realistic synthetic rows.
- Combine the augmented samples into a single table and store it alongside the original data, as separate splits of a new dataset ready for downstream tasks.

# Let's dive in!

## Installation and setup

In [None]:
# Install libraries that don't come for free in Google Colab
!pip install datasets imbalanced-learn sdv pandas huggingface_hub --quiet

In [None]:
import random
import typing

import numpy
import pandas
import pandas.api.types

import datasets
import huggingface_hub
import imblearn.over_sampling
import sdv

## Load data

In [None]:
# Load the dataset and take a look at it
HF_DATASET_ID = "ccm/2025-24679-tabular-dataset"

# Load every available split into one DatasetDict; the split to work on is picked in a later cell
ds = datasets.load_dataset(HF_DATASET_ID)
print(ds)

README.md:   0%|          | 0.00/986 [00:00<?, ?B/s]

original-00000-of-00001.parquet:   0%|          | 0.00/6.02k [00:00<?, ?B/s]

augmented-00000-of-00001.parquet:   0%|          | 0.00/7.99k [00:00<?, ?B/s]

Generating original split:   0%|          | 0/36 [00:00<?, ? examples/s]

Generating augmented split:   0%|          | 0/134 [00:00<?, ? examples/s]

DatasetDict({
    original: Dataset({
        features: ['About how many hours per week do you spend listening to music?', 'Approximately how many songs are in your music library?', 'Approximately how many playlists have you created yourself?', 'How often do you share music with others?', 'Which decade of music do you listen to most?', 'How often do you attend live music events?', 'Do you prefer songs with lyrics or instrumental music?', 'Do you usually listen to music alone or with others?'],
        num_rows: 36
    })
    augmented: Dataset({
        features: ['About how many hours per week do you spend listening to music?', 'Approximately how many songs are in your music library?', 'Approximately how many playlists have you created yourself?', 'How often do you share music with others?', 'Which decade of music do you listen to most?', 'How often do you attend live music events?', 'Do you prefer songs with lyrics or instrumental music?', 'Do you usually listen to music alone or wit

In [None]:
# Convert the "original" split to pandas for ease of manipulation
df = ds["original"].to_pandas()
# Preview the first rows
display(df.head())

Unnamed: 0,About how many hours per week do you spend listening to music?,Approximately how many songs are in your music library?,Approximately how many playlists have you created yourself?,How often do you share music with others?,Which decade of music do you listen to most?,How often do you attend live music events?,Do you prefer songs with lyrics or instrumental music?,Do you usually listen to music alone or with others?
0,3.0,500,10,1,5,1,1,0
1,20.0,500,10,1,4,1,1,0
2,20.0,50,10,1,4,1,0,0
3,24.0,600,12,0,4,0,1,0
4,20.0,205,15,2,5,1,0,0


In [None]:
# Split the columns by position: the first three are treated as continuous,
# the remaining ones as integer-encoded answers.
# NOTE(review): this slice also puts the last column — the one passed as the target
# to the SMOTE and Mixup calls below — into `integer_columns`; confirm that is intended.
continuous_columns = df.columns[:3]
integer_columns = df.columns[3:]

## Augmentations

### Adding Numeric Jitter

In [None]:
def jitter_aug(
    X: pandas.DataFrame,
    continuous_cols: typing.Iterable[str],
    encoded_int_cols: typing.Iterable[str],
    noise_frac: float = 0.05,
    p_stay: float = 0.5,
    n_copies: int = 1,
    random_state: typing.Optional[int] = None,
) -> pandas.DataFrame:
    """
    Create jittered copies of a tabular dataset.

    - Continuous columns receive Gaussian noise whose standard deviation is
      `noise_frac` times the column's own standard deviation.
    - Integer-encoded columns move by -1 or +1 with probability `1 - p_stay`
      and are clipped to the range observed in `X`.

    Parameters
    ----------
    X : pandas.DataFrame
        Data to augment. It is not modified.
    continuous_cols : Iterable[str]
        Columns to perturb with Gaussian noise. Names missing from `X` are ignored.
    encoded_int_cols : Iterable[str]
        Integer-encoded columns to perturb by one step. Names missing from `X` are ignored.
    noise_frac : float, default=0.05
        Noise scale as a fraction of each continuous column's standard deviation.
    p_stay : float, default=0.5
        Probability that an encoded value is left unchanged.
    n_copies : int, default=1
        Number of jittered copies of `X` to generate.
    random_state : int | None, default=None
        Seed for the random generator; None draws fresh entropy on every call.

    Returns
    -------
    pandas.DataFrame
        `n_copies * len(X)` rows with the columns of `X`. Continuous columns are
        returned as float64 and encoded columns as nullable Int64. If `n_copies`
        is zero, an empty DataFrame with the columns of `X` is returned.
    """
    rng = numpy.random.default_rng(random_state)
    X = X.copy()
    cont = [c for c in continuous_cols if c in X.columns]
    enc = [c for c in encoded_int_cols if c in X.columns]

    # Numeric view of the continuous columns, always as float so that adding noise
    # never has to be squeezed back into an integer column.
    if cont:
        cont_values = X[cont].apply(pandas.to_numeric, errors="coerce").astype(float)
        # A constant column (std 0) or a single row (std NaN) still gets a tiny, finite scale.
        std = cont_values.std().fillna(0.0).replace(0, 1e-8).to_numpy()

    # Numeric view and observed bounds of the encoded columns; a column without any
    # valid value gets no bounds and is passed through as all-missing.
    enc_values = {c: pandas.to_numeric(X[c], errors="coerce").to_numpy(dtype=float) for c in enc}
    bounds = {}
    for c, values in enc_values.items():
        present = ~numpy.isnan(values)
        if present.any():
            bounds[c] = (int(values[present].min()), int(values[present].max()))

    aug = []
    for _ in range(n_copies):
        Xn = X.copy()

        # continuous noise
        if cont:
            eps = rng.standard_normal((len(Xn), len(cont))) * (std * noise_frac)
            noisy = cont_values.to_numpy() + eps
            # Whole-column replacement switches the dtype to float cleanly, unlike
            # `.loc[:, cont] = ...`, which tries to keep the existing integer dtype.
            for j, c in enumerate(cont):
                Xn[c] = noisy[:, j]

        # encoded ±1 step
        for c in enc:
            out = enc_values[c].copy()
            if c in bounds:
                present = ~numpy.isnan(out)
                n_present = int(present.sum())
                move = rng.random(n_present) > p_stay
                step = numpy.zeros(n_present, dtype=float)
                step[move] = rng.choice([-1.0, 1.0], size=int(move.sum()))
                out[present] = out[present] + step
                lo, hi = bounds[c]
                out = numpy.clip(out, lo, hi)  # NaN entries stay NaN
            # NaN becomes <NA> in the nullable integer dtype
            Xn[c] = pandas.Series(out, index=Xn.index).round().astype("Int64")

        aug.append(Xn)

    return pandas.concat(aug, ignore_index=True) if aug else pandas.DataFrame(columns=X.columns)

In [None]:
# Create one jittered copy of the data: noise scaled to 5% of each continuous
# column's standard deviation, and a 50% chance that an encoded value stays put
jitter_augmented_data = jitter_aug(
    df,
    continuous_columns,
    integer_columns,
    n_copies=1,
    noise_frac=0.05,
    p_stay=0.5,
)

  8.34530514e+01  2.90436883e+02  1.72833786e+02  2.89149989e+03
  1.00026140e+03  1.03103664e+03  9.64152774e+02  2.19746133e+01
 -8.67163854e+00  1.91435592e+03  1.67719729e+02  4.11164865e+03
  1.00214393e+04  3.08400554e+03  1.26032626e+02  2.04645349e+03
  6.14285567e+03  8.02291680e+03  2.07061818e+03  5.51641137e+02
  3.49613406e+03  5.00498120e+02  2.19485578e+03  4.03556185e+02
  9.32155943e+01  1.06467057e+03  4.40747986e+02  2.58755223e+03
  1.05157830e+03 -1.33818221e+02  1.10339948e+02  1.00254341e+01]' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  Xn.loc[:, cont] = pandas.to_numeric(Xn[cont].stack(), errors="coerce").unstack().values + eps
   4.33256909   6.109726    39.2600608   -6.31314946  15.17066488
   2.80110822   4.3233468    6.69901754  72.60385129   6.08520814
 101.29854756 292.88601054   9.77788989  -0.62952987  -0.71819066
 417.3749281    3.31639989  20.01525175  -4.04499688   8.12459889
  -5.76293176  12.25960681   9.

In [None]:
# Preview the first jittered rows
display(jitter_augmented_data.head())

Unnamed: 0,About how many hours per week do you spend listening to music?,Approximately how many songs are in your music library?,Approximately how many playlists have you created yourself?,How often do you share music with others?,Which decade of music do you listen to most?,How often do you attend live music events?,Do you prefer songs with lyrics or instrumental music?,Do you usually listen to music alone or with others?
0,2.876721,416.907216,21.180168,2,4,0,0,0
1,21.958294,505.965505,8.081546,2,4,1,1,1
2,20.451194,4.443616,10.799642,1,4,1,1,0
3,23.66342,461.547642,13.739228,0,5,0,1,0
4,19.843991,83.453051,10.972434,1,5,1,0,1


### Augmenting with SMOTE

In [None]:
def smote_aug(
    df: pandas.DataFrame,
    target_col: str,
    encoded_int_cols: typing.Iterable[str],
    problem_type: typing.Literal["classification", "regression"],
    sampling_strategy: typing.Union[str, float, dict] = "auto",
    k_neighbors: int = 5,
    random_state: typing.Optional[int] = 0,
) -> pandas.DataFrame:
    """
    Perform SMOTE-NC on a mixed (numeric + integer-encoded categorical) dataset.

    Parameters
    ----------
    df : pandas.DataFrame
        Full dataset including the target column.
    target_col : str
        Name of the target column in `df`.
    encoded_int_cols : Iterable[str]
        Columns in `df` that are integer-encoded categorical/ordinal *features*.
        `target_col` is ignored if it is listed here.
    problem_type : {"classification","regression"}
        Augmentation is only performed for "classification". For "regression",
        an empty DataFrame is returned.
    sampling_strategy : Union[str, float, dict], default="auto"
        Passed to imblearn.over_sampling.SMOTENC.
    k_neighbors : int, default=5
        Passed to imblearn.over_sampling.SMOTENC.
    random_state : int | None, default=0
        Passed to imblearn.over_sampling.SMOTENC; the default keeps runs repeatable.

    Returns
    -------
    pandas.DataFrame
        A DataFrame containing only the *newly synthesized* rows (features + target),
        with the same column order as `df` and, where the cast is possible, the same
        dtypes. Interpolated values headed for an integer column are rounded to the
        nearest integer. If augmentation is not performed, fails, or yields no new
        rows, an empty DataFrame with `df`'s schema is returned.

    Warns
    -----
    RuntimeWarning
        If SMOTE-NC itself raises; the error message is included in the warning.
    """
    import warnings

    def _empty() -> pandas.DataFrame:
        # Zero rows, but the columns and dtypes of `df`
        return pandas.DataFrame(columns=df.columns).astype(df.dtypes.to_dict())

    # Basic guards
    if problem_type != "classification" or target_col not in df.columns:
        return _empty()

    # Split features/target
    X = df.drop(columns=[target_col])
    y = df[target_col]

    # Encoded feature columns that actually exist (this also drops the target)
    enc = [c for c in encoded_int_cols if c in X.columns]
    # SMOTENC identifies categorical features by position
    cat_idx = [X.columns.get_loc(c) for c in enc]

    # Coerce features to numeric; values that cannot be parsed become NaN
    X_num = X.copy()
    for c in X_num.columns:
        X_num[c] = pandas.to_numeric(X_num[c], errors="coerce")

    # Keep only rows that are complete in both features and target
    mask = X_num.notna().all(axis=1) & y.notna()
    Xv, yv = X_num.loc[mask], y.loc[mask]

    # Not enough rows to resample
    if len(Xv) < 2:
        return _empty()

    sm = imblearn.over_sampling.SMOTENC(
        categorical_features=cat_idx,
        sampling_strategy=sampling_strategy,
        k_neighbors=k_neighbors,
        random_state=random_state,
    )

    try:
        X_res, y_res = sm.fit_resample(Xv, yv)
    except Exception as exc:
        # Best effort: e.g. a class with too few samples for `k_neighbors`.
        # Report the reason instead of failing silently, then return no rows.
        warnings.warn(
            f"SMOTE-NC resampling failed; returning no synthetic rows: {exc}",
            RuntimeWarning,
            stacklevel=2,
        )
        return _empty()

    # If nothing new was created, bail out
    n_new = len(X_res) - len(Xv)
    if n_new <= 0:
        return _empty()

    # The synthesized rows are the trailing `n_new` rows of the resampled output
    X_new = pandas.DataFrame(X_res.iloc[-n_new:], columns=Xv.columns).reset_index(drop=True)
    y_new = pandas.Series(y_res.iloc[-n_new:]).reset_index(drop=True)
    y_new.name = target_col

    # Re-merge features + target and restore the original column order
    df_new = pandas.concat([X_new, y_new], axis=1)
    df_new = df_new[list(df.columns)]

    # Cast back to the original dtypes where possible. Float values headed for an
    # integer column are rounded first, because a plain cast would truncate them.
    for c in df_new.columns:
        target_dtype = df[c].dtype
        column = df_new[c]
        try:
            if pandas.api.types.is_integer_dtype(target_dtype) and pandas.api.types.is_float_dtype(column.dtype):
                column = column.round()
            df_new[c] = column.astype(target_dtype)
        except (TypeError, ValueError):
            # Leave the column as produced when it cannot be cast cleanly
            pass

    return df_new

In [None]:
# Synthesize new rows with SMOTE-NC, using the last survey question as the class
# label; k_neighbors is lowered to 3 here (the function default is 5)
smote_augmented_data = smote_aug(
    df,
    "Do you usually listen to music alone or with others?",
    integer_columns,
    problem_type="classification",
    sampling_strategy="auto",
    k_neighbors=3,
)

In [None]:
# Preview the first SMOTE-generated rows
display(smote_augmented_data.head())

Unnamed: 0,About how many hours per week do you spend listening to music?,Approximately how many songs are in your music library?,Approximately how many playlists have you created yourself?,How often do you share music with others?,Which decade of music do you listen to most?,How often do you attend live music events?,Do you prefer songs with lyrics or instrumental music?,Do you usually listen to music alone or with others?
0,11.31137,31,1,4,5,0,1,1
1,29.878115,1997,8,3,2,0,1,1
2,15.185375,36,3,3,4,2,2,1
3,32.635169,2631,14,3,2,0,1,1
4,10.428448,114,1,4,4,0,1,1


### Augment with Mixup

In [None]:
def mixup_aug(
    df: pandas.DataFrame,
    target_col: str,
    continuous_cols: typing.Iterable[str],
    encoded_int_cols: typing.Iterable[str],
    problem_type: typing.Literal["classification", "regression"],
    alpha: float = 0.4,
    n_samples: int | None = None,
    p_stay_enc: float = 0.5,
    step_size: int = 1,
) -> pandas.DataFrame:
    """
    MixUp-style augmentation for mixed tabular data.

    - Continuous columns use convex combinations with Beta(alpha, alpha) weights.
    - Integer-encoded categorical columns take a 1-step move toward the paired sample (or stay put).

    Parameters
    ----------
    df : pandas.DataFrame
        Full dataset including the target column.
    target_col : str
        Name of the target column in `df`.
    continuous_cols : Iterable[str]
        Feature columns to be treated as continuous (float-like).
    encoded_int_cols : Iterable[str]
        Feature columns that are integer-encoded categorical/ordinal (int-like).
        (Do not include `target_col` here.)
    problem_type : {"classification","regression"}
        Determines how labels are mixed (pick-one vs convex-combo).
    alpha : float, default=0.4
        Beta distribution parameter for MixUp.
    n_samples : int | None, default=None
        Number of synthetic samples to create (defaults to len(valid_rows)).
    p_stay_enc : float, default=0.5
        Probability that an encoded-int feature does NOT move toward its pair.
    step_size : int, default=1
        Step size (in integer units) for encoded-int features when they move.

    Returns
    -------
    pandas.DataFrame
        DataFrame containing only the newly synthesized rows (features + target),
        with the same column order and dtypes as `df`. If augmentation is not
        possible (e.g., insufficient valid rows), returns an empty DataFrame
        with `df`'s schema.
    """
    # Basic guards and setup
    if target_col not in df.columns:
        return pandas.DataFrame(columns=df.columns).astype(df.dtypes.to_dict())

    # Split features / target
    X = df.drop(columns=[target_col]).copy()
    y = df[target_col].copy()

    # Columns that actually exist in X
    cont = [c for c in continuous_cols if c in X.columns]
    enc  = [c for c in encoded_int_cols if c in X.columns]

    # Build a "valid" mask: we need numeric-able values in the columns we operate on
    X_work = X.copy()
    for c in cont:
        X_work[c] = pandas.to_numeric(X_work[c], errors="coerce")
    for c in enc:
        X_work[c] = pandas.to_numeric(X_work[c], errors="coerce")

    # For regression, the target must be numeric-able; for classification we just need it present
    if problem_type == "regression":
        y_work = pandas.to_numeric(y, errors="coerce")
        mask = X_work[cont + enc].notna().all(axis=1) & y_work.notna()
    else:
        y_work = y.copy()
        mask = X_work[cont + enc].notna().all(axis=1) & y.notna()

    Xv = X.loc[mask].reset_index(drop=True)
    yv = y_work.loc[mask].reset_index(drop=True)
    if len(Xv) < 2:
        return pandas.DataFrame(columns=df.columns).astype(df.dtypes.to_dict())

    # Default number of synthetic samples
    if n_samples is None:
        n_samples = len(Xv)

    rng = numpy.random.default_rng()
    i1 = rng.integers(0, len(Xv), size=n_samples)
    i2 = rng.integers(0, len(Xv), size=n_samples)
    lam = rng.beta(alpha, alpha, size=n_samples)

    X1 = Xv.iloc[i1].reset_index(drop=True).copy()
    X2 = Xv.iloc[i2].reset_index(drop=True)
    Xm = X1.copy()

    # Continuous columns: convex combination
    if cont:
        v1 = pandas.DataFrame({c: pandas.to_numeric(X1[c], errors="coerce") for c in cont}).to_numpy(dtype=float)
        v2 = pandas.DataFrame({c: pandas.to_numeric(X2[c], errors="coerce") for c in cont}).to_numpy(dtype=float)
        Xm.loc[:, cont] = lam.reshape(-1, 1) * v1 + (1.0 - lam).reshape(-1, 1) * v2

    # Encoded integer columns: 1-step move toward partner (or stay)
    if enc:
        # Compute per-column integer bounds from the valid subset
        bounds: dict[str, tuple[int, int]] = {}
        for c in enc:
            colnum = pandas.to_numeric(Xv[c], errors="coerce")
            bounds[c] = (int(colnum.min(skipna=True)), int(colnum.max(skipna=True)))

        move = rng.random(n_samples) > p_stay_enc
        for c in enc:
            a = pandas.to_numeric(X1[c], errors="coerce").to_numpy(dtype=float)
            b = pandas.to_numeric(X2[c], errors="coerce").to_numpy(dtype=float)
            out = a.copy()
            direction = numpy.sign(b - a)
            step = numpy.zeros_like(out)
            step[move] = direction[move] * float(step_size)
            lo, hi = bounds[c]
            out = numpy.clip(out + step, lo, hi)
            # Temporarily store as float; we'll cast back after re-merge
            Xm[c] = pandas.Series(out).round()

    # Labels
    if problem_type == "classification":
        y1 = yv.iloc[i1].reset_index(drop=True)
        y2 = yv.iloc[i2].reset_index(drop=True)
        choose_y1 = rng.random(n_samples) < lam
        ym = pandas.Series(numpy.where(choose_y1, y1, y2), dtype=y.dtype)
    else:
        y1 = pandas.to_numeric(yv.iloc[i1].reset_index(drop=True), errors="coerce").astype(float)
        y2 = pandas.to_numeric(yv.iloc[i2].reset_index(drop=True), errors="coerce").astype(float)
        ym = pandas.Series(lam * y1 + (1.0 - lam) * y2)

    # Re-attach target and restore column order
    df_new = pandas.concat([Xm.reset_index(drop=True), ym.rename(target_col)], axis=1)
    df_new = df_new[[c for c in df.columns]]  # original order

    # Cast back to original dtypes where possible
    # (Encoded-int columns will be coerced to the original dtype.)
    try:
        df_new = df_new.astype(df.dtypes.to_dict(), errors="ignore")
    except Exception:
        # Best-effort casting; ignore if some columns can't be cast cleanly
        pass

    return df_new

In [None]:
# Generate as many Mixup rows as there are original rows, using the last survey
# question as the classification target
mixup_augmented_data = mixup_aug(
    df,
    "Do you usually listen to music alone or with others?",
    continuous_columns,
    integer_columns,
    problem_type="classification",
    alpha=0.4,
    n_samples=len(df),
    p_stay_enc=0.5,
    step_size=1,
)

 5609.85083104 1994.20830327 2196.33414205  299.10656426 2359.75258509
 1210.20079702  103.14782508  432.75735998   70.88978834  306.63326817
 5844.63583421  192.05639815 1937.91108611 1384.56622728  265.40811281
 1133.1219271  2975.1402002   998.85201922 1543.52137781  265.69127346
  105.72153125 2869.20420869  499.69477877  500.          996.89996843
   33.73495028  211.15952546 9999.72269314 3455.3873812   364.28766255
   86.24525891]' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  Xm.loc[:, cont] = lam.reshape(-1, 1) * v1 + (1.0 - lam).reshape(-1, 1) * v2
 376.51883958  18.95475237  14.97167099   9.97563357  65.6098966
   5.987249     5.98600967   6.4913604    3.4533873    5.96205213
 174.91505827   4.31774407  18.67129399  18.64341792   7.65408113
  10.42208128  39.77764043  10.          37.9415927    9.31382547
   4.21434609  12.49419065  10.           9.9991638    2.02610553
   2.49398011   4.22319051 299.99137268  11.84169716  10.
   2.

In [None]:
# Preview the first Mixup-generated rows
display(mixup_augmented_data.head())

Unnamed: 0,About how many hours per week do you spend listening to music?,Approximately how many songs are in your music library?,Approximately how many playlists have you created yourself?,How often do you share music with others?,Which decade of music do you listen to most?,How often do you attend live music events?,Do you prefer songs with lyrics or instrumental music?,Do you usually listen to music alone or with others?
0,6.533959,7466,15,3,4,2,1,1
1,10.0,1670,4,2,5,2,1,1
2,6.916603,495,9,2,5,2,0,0
3,6.06993,40,3,1,4,0,1,0
4,39.845287,2994,12,4,5,1,1,1


### Augment with CTGAN

In [None]:
import typing
import pandas
import sdv.single_table
import sdv.metadata


def ctgan_fit_and_sample_single_table(
    data: pandas.DataFrame,
    n_rows: int,
    encoded_int_cols: typing.Iterable[str],
    table_name: str = "table",
    primary_key: str | None = None,
    epochs: int = 300,
    verbose: bool = True,
    enforce_rounding: bool = True,
    enforce_min_max_values: bool = True,
    cuda: bool | None = None,
) -> pandas.DataFrame:
    """
    Train a CTGAN synthesizer on a *single-table* DataFrame and return sampled rows.

    This follows the up-to-date SDV workflow:
      1) Build SingleTableMetadata and auto-detect from the DataFrame
      2) Optionally set a primary key
      3) Optionally override sdtypes (e.g., treat integer-encoded columns as 'categorical')
      4) Fit sdv.single_table.CTGANSynthesizer(metadata)
      5) sample(num_rows=n_rows)

    Args:
        data: pandas DataFrame (real data to learn from)
        n_rows: number of synthetic rows to sample
        table_name: logical name for the table in metadata
        primary_key: optional column to mark as primary key (unique ids)
        encoded_int_as_categorical: columns that are integer-encoded choices; will be forced to 'categorical'
        epochs: GAN epochs (defaults to 300 per SDV docs)
        verbose: print per-epoch losses if True
        enforce_rounding: match decimal digits of numeric real data
        enforce_min_max_values: keep numeric outputs within observed min/max
        cuda: True/False/None (None = let SDV decide; True requires CUDA available)

    Returns:
        pandas.DataFrame with n_rows synthetic samples (same columns as input, order preserved).
    """
    if not isinstance(data, pandas.DataFrame):
        raise TypeError("`data` must be a pandas.DataFrame")

    # --- 1) Detect metadata from the DataFrame ---
    metadata = sdv.metadata.SingleTableMetadata()
    # Detect column sdtypes and (optionally) keys from the DF
    metadata.detect_from_dataframe(data=data)

    # --- 2) Primary key (optional) ---
    if primary_key is not None:
        if primary_key not in data.columns:
            raise ValueError(f"primary_key '{primary_key}' not found in columns")
        # Modern SDV supports programmatic PK updates via the Metadata API.
        # Use set_primary_key when available, otherwise update sdtype to 'id'.
        if hasattr(metadata, "set_primary_key"):
            metadata.set_primary_key(column_name=primary_key)
        else:
            metadata.update_column(column_name=primary_key, sdtype="id")

    # --- 3) Override sdtypes for integer-encoded categorical columns (optional) ---
    if len(encoded_int_cols):
        for col in encoded_int_cols:
            if col in data.columns:
                metadata.update_column(column_name=col, sdtype="categorical")

    # --- 4) Create & fit the CTGAN synthesizer ---
    # Build kwargs only when set to avoid surprising defaults across versions.
    synth_kwargs: dict[str, typing.Any] = {
        "enforce_rounding": enforce_rounding,
        "enforce_min_max_values": enforce_min_max_values,
        "epochs": epochs,
        "verbose": verbose,
    }
    if cuda is not None:
        synth_kwargs["cuda"] = bool(cuda)

    synthesizer = sdv.single_table.CTGANSynthesizer(metadata, **synth_kwargs)
    synthesizer.fit(data)

    # --- 5) Sample synthetic rows ---
    synthetic = synthesizer.sample(num_rows=int(n_rows))

    # Reorder columns to match original DataFrame (nice-to-have)
    synthetic = synthetic.reindex(columns=list(data.columns), copy=False)

    return synthetic

In [None]:
# Train CTGAN on the original data (300 epochs, CUDA disabled) and sample as many
# synthetic rows as the original data has
ctgan_augmented_data = ctgan_fit_and_sample_single_table(
    df,
    n_rows=len(df),
    encoded_int_cols=integer_columns,
    table_name="table",
    primary_key=None,
    epochs=300,
    verbose=False,
    enforce_rounding=True,
    enforce_min_max_values=True,
    cuda=False,
)



In [None]:
# Preview the first CTGAN-generated rows
display(ctgan_augmented_data.head())

Unnamed: 0,About how many hours per week do you spend listening to music?,Approximately how many songs are in your music library?,Approximately how many playlists have you created yourself?,How often do you share music with others?,Which decade of music do you listen to most?,How often do you attend live music events?,Do you prefer songs with lyrics or instrumental music?,Do you usually listen to music alone or with others?
0,27.2,30,1,3,5,1,0,0
1,20.1,2899,41,1,5,1,0,0
2,17.8,1787,8,1,5,2,1,0
3,30.1,30,30,1,4,2,2,0
4,13.4,30,7,1,2,2,0,0


## Combine and save the data

In [None]:
# Stack the four augmented dataframes into one table. The original rows are not
# included here; they are kept as their own split further down.
# ignore_index=True gives the combined rows a fresh 0..n-1 index instead of
# repeating each part's row labels.
augmented_df = pandas.concat(
    [
        jitter_augmented_data,
        smote_augmented_data,
        mixup_augmented_data,
        ctgan_augmented_data,
    ],
    ignore_index=True,
)


In [None]:
# Columns that hold integers in the original data
int_cols = [c for c in df.columns if pandas.api.types.is_integer_dtype(df[c])]

# Work on a copy so the combined frame itself stays untouched
augmented_df_clean = augmented_df.copy()

# Bring each of those columns back to its original integer dtype:
# parse as numbers, round to whole values, then cast
for c in (name for name in int_cols if name in augmented_df_clean.columns):
    augmented_df_clean[c] = (
        pandas.to_numeric(augmented_df_clean[c], errors="coerce")
        .round()
        .astype(df[c].dtype)  # preserves int64 vs Int64
    )

# Keep exactly the original columns, in the original order
augmented_df_clean = augmented_df_clean[df.columns]

In [None]:

# Bundle the untouched data and the cleaned augmented rows as two named splits.
# preserve_index=False asks `from_pandas` not to carry the pandas index over.
new_ds = datasets.DatasetDict({
    "original": datasets.Dataset.from_pandas(df, preserve_index=False),
    "augmented": datasets.Dataset.from_pandas(augmented_df_clean, preserve_index=False),
})

# Show the resulting splits, their columns and row counts
print(new_ds)

DatasetDict({
    original: Dataset({
        features: ['About how many hours per week do you spend listening to music?', 'Approximately how many songs are in your music library?', 'Approximately how many playlists have you created yourself?', 'How often do you share music with others?', 'Which decade of music do you listen to most?', 'How often do you attend live music events?', 'Do you prefer songs with lyrics or instrumental music?', 'Do you usually listen to music alone or with others?'],
        num_rows: 36
    })
    augmented: Dataset({
        features: ['About how many hours per week do you spend listening to music?', 'Approximately how many songs are in your music library?', 'Approximately how many playlists have you created yourself?', 'How often do you share music with others?', 'Which decade of music do you listen to most?', 'How often do you attend live music events?', 'Do you prefer songs with lyrics or instrumental music?', 'Do you usually listen to music alone or wit

In [None]:
# Log in to the Hugging Face Hub from the notebook before uploading
huggingface_hub.notebook_login()

In [None]:
# Upload both splits to the Hub.
# NOTE(review): this repo id has no namespace, unlike HF_DATASET_ID ("ccm/...");
# presumably it resolves to the logged-in user's account — confirm the intended destination.
new_ds.push_to_hub("2025-24679-tabular-dataset")