# Step 5: Data Segregation
This notebook splits the preprocessed data into training and testing sets using stratified sampling, then logs each split as a separate artifact to Weights & Biases (W&B).

In [1]:
# Use the %pip magic (not !pip) so the package is installed into the
# environment of the running kernel rather than whichever pip is on PATH.
%pip install -q wandb


[notice] A new release of pip is available: 23.1.2 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
import os
import logging
import tempfile
import pandas as pd
from sklearn.model_selection import train_test_split
import wandb

In [3]:
# Notebook-wide logging: INFO and above, each message prefixed with a
# day-month-year timestamp. The root logger is used for all messages.
logging.basicConfig(
    datefmt='%d-%m-%Y %H:%M:%S',
    format='%(asctime)s %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger()

In [4]:
def load_artifact_data(project, artifact_path, entity="ngocnhi-p4work-national-economics-university",
                       filename='preprocessed_data.csv'):
    """Start a W&B run, download an artifact and load its CSV into a DataFrame.

    Args:
        project: W&B project name the run is created in.
        artifact_path: Artifact reference passed to ``run.use_artifact``.
        entity: W&B entity (user or team) that owns the project.
        filename: Name of the CSV file inside the downloaded artifact
            directory. Defaults to ``'preprocessed_data.csv'``.

    Returns:
        A ``(df, run)`` tuple: the loaded DataFrame and the W&B run, which is
        still open on success so the caller can log to it and must finish it.

    Raises:
        Any exception raised while fetching or reading the artifact is
        re-raised after the run has been closed as failed.
    """
    wandb.login()
    run = wandb.init(project=project, entity=entity, job_type='data_segregation')
    try:
        artifact = run.use_artifact(artifact_path)
        artifact_dir = artifact.download()
        df = pd.read_csv(os.path.join(artifact_dir, filename))
    except Exception:
        # The caller never receives `run` on failure, so close it here instead
        # of leaving a dangling run alive in the kernel.
        run.finish(exit_code=1)
        raise
    logger.info(f"Loaded dataset with shape: {df.shape}")
    return df, run

In [5]:
def split_data(df, test_size=0.3, seed=42, stratify_col='OUTCOME'):
    """Split a DataFrame into train and test partitions.

    Args:
        df: DataFrame to split.
        test_size: Forwarded to ``train_test_split`` as ``test_size``.
        seed: Forwarded to ``train_test_split`` as ``random_state`` so the
            split is reproducible.
        stratify_col: Column whose values are used for stratification.
            Pass ``None`` to split without stratification.

    Returns:
        A dict with keys ``'train'`` and ``'test'`` mapping to the two
        resulting DataFrames.
    """
    # Only index into the frame when a stratification column was requested;
    # df[None] would otherwise fail with a KeyError.
    stratify_values = None if stratify_col is None else df[stratify_col]
    train_df, test_df = train_test_split(
        df, test_size=test_size, random_state=seed, stratify=stratify_values
    )
    logger.info(f"Train shape: {train_df.shape}, Test shape: {test_df.shape}")
    return {'train': train_df, 'test': test_df}

In [6]:
def log_splits_to_wandb(splits, run, artifact_type='segregated_data'):
    """Upload every split to W&B as its own CSV artifact.

    Args:
        splits: Mapping of split name to DataFrame.
        run: W&B run the artifacts are logged to.
        artifact_type: Value used as the ``type`` of each created artifact.

    Each frame is written (without its index) to ``<name>.csv`` inside a
    temporary directory, logged as an artifact named ``<name>.csv``, and the
    function waits for that artifact before moving on to the next split.
    """
    with tempfile.TemporaryDirectory() as staging_dir:
        for split_name, frame in splits.items():
            csv_name = f"{split_name}.csv"
            csv_path = os.path.join(staging_dir, csv_name)
            frame.to_csv(csv_path, index=False)

            split_artifact = wandb.Artifact(
                name=csv_name,
                type=artifact_type,
                description=f"{split_name} split of dataset",
            )
            split_artifact.add_file(csv_path)
            run.log_artifact(split_artifact)
            # Wait while the temporary file still exists on disk.
            split_artifact.wait()
            logger.info(f"Uploaded {split_name}.csv to W&B")

In [7]:
# === RUN PIPELINE ===
# Download the preprocessed dataset, split it, and upload both splits.
project = 'diabetes'
artifact_path = 'ngocnhi-p4work-national-economics-university/diabetes/preprocessed_data.csv:latest'
df, run = load_artifact_data(project, artifact_path)
try:
    splits = split_data(df)
    log_splits_to_wandb(splits, run)
except Exception:
    # Close the run as failed so it does not stay open in the kernel.
    run.finish(exit_code=1)
    raise
else:
    run.finish()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mmlcolongmay[0m ([33mmlcolongmay-neu[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


[34m[1mwandb[0m:   1 of 1 files downloaded.  
03-05-2025 22:47:26 Loaded dataset with shape: (768, 9)
03-05-2025 22:47:26 Train shape: (537, 9), Test shape: (231, 9)
03-05-2025 22:47:29 Uploaded train.csv to W&B
03-05-2025 22:47:33 Uploaded test.csv to W&B
