In [1]:
import pandas as pd
from pathlib import Path
import os 

def sample_and_save_dataframe(
    df: pd.DataFrame,
    n_rows: int = 100,
    columns: list[str] | None = None,
    output_path: str = "data/sample.csv",
    file_format: str | None = None,
    random_state: int = 42,
    shuffle: bool = True,
) -> pd.DataFrame:
    """
    Take a full pandas DataFrame, sample a subset, and save it.

    Parameters
    ----------
    df : pd.DataFrame
        Full input dataframe.
    n_rows : int, default 100
        Number of rows to include in the sample.
    columns : list of str or None, default None
        Optional list of columns to keep. If None, keep all columns.
    output_path : str, default "data/sample.csv"
        File path where the sampled dataset will be saved.
    file_format : {"csv", "parquet"} or None
        Output format. If None, it is inferred from the file extension.
    random_state : int, default 42
        Random seed for reproducible sampling.
    shuffle : bool, default True
        Whether to randomly sample rows. If False, just take top n_rows.

    Returns
    -------
    sampled_df : pd.DataFrame
        The sampled subset of the dataframe.
    """
    # Optional column selection
    if columns is not None:
        missing = set(columns) - set(df.columns)
        if missing:
            raise ValueError(f"These columns are not in df: {missing}")
        df_to_sample = df[columns]
    else:
        df_to_sample = df

    # Row sampling
    if shuffle:
        sampled_df = df_to_sample.sample(
            n=min(n_rows, len(df_to_sample)),
            random_state=random_state
        )
    else:
        sampled_df = df_to_sample.head(n_rows)

    # Infer format from extension if needed
    path = Path(output_path)
    if file_format is None:
        ext = path.suffix.lower().lstrip(".")
        if ext in {"csv", "parquet"}:
            file_format = ext
        else:
            raise ValueError(
                "Could not infer file_format from extension. "
                "Please set file_format='csv' or 'parquet'."
            )

    # Save to disk
    path.parent.mkdir(parents=True, exist_ok=True)

    if file_format == "csv":
        sampled_df.to_csv(path, index=False)
    elif file_format == "parquet":
        sampled_df.to_parquet(path, index=False)
    else:
        raise ValueError("file_format must be 'csv' or 'parquet'.")

    print(f"Saved sample with {len(sampled_df)} rows to: {path}")
    return sampled_df


In [2]:
# Data directory layout, anchored one level above the current working directory.
# DATA_DIR stays a plain absolute string; the derived locations are Path objects.
DATA_DIR = os.path.abspath(os.path.join(os.getcwd(), "..", "data"))
DATA_RAW_DIR = Path(DATA_DIR) / "raw"
DATA_PROCESSED_DIR = Path(DATA_DIR) / "processed"

# Raw dataset file and the sibling file name used for its sampled copy.
raw_path = DATA_RAW_DIR / "oxygen.csv"
raw_path_sample = DATA_RAW_DIR / "oxygen_sample.csv"

In [3]:
# Load the full raw CSV at raw_path into memory as a DataFrame.
df_raw = pd.read_csv(raw_path)

In [4]:
# Draw a 10,000-row random sample of the raw data (function defaults: seed 42,
# shuffle on; capped at the frame's length) and write it to raw_path_sample.
sampled = sample_and_save_dataframe(df_raw, output_path=raw_path_sample, n_rows=10_000)

Saved sample with 10000 rows to: /Users/musarratrahman/Documents/anomaly_detection_and_forecasting_solution/data/raw/oxygen_sample.csv
