# Step 1: Fetch and Log Raw Data

In [1]:
!pip install -q wandb


[notice] A new release of pip is available: 23.1.2 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
import os
import pandas as pd
import wandb

In [3]:
def fetch_and_save_data(input_path='diabetes.csv', output_path='raw_data.csv'):
    """Read a CSV file and write an index-free copy of it to ``output_path``.

    Args:
        input_path: Path of the CSV file to read.
        output_path: Path the copy is written to.

    Returns:
        ``output_path``, unchanged, so it can be passed to the next step.

    Raises:
        FileNotFoundError: If ``input_path`` is missing or is not a regular file.
    """
    # isfile (rather than exists) also rejects directories, which would
    # otherwise slip past the guard and fail inside read_csv with a
    # platform-dependent error.
    if not os.path.isfile(input_path):
        raise FileNotFoundError(f"Input data file {input_path} does not exist.")
    df = pd.read_csv(input_path)
    df.to_csv(output_path, index=False)
    print(f"Loaded {df.shape[0]} rows and {df.shape[1]} columns.")
    print(f"Saved to {output_path}.")
    return output_path

In [4]:
def log_data_to_wandb(output_path, artifact_name='raw_data.csv', project='diabetes', entity='ngocnhi-p4work-national-economics-university'):
    """Upload a local file to Weights & Biases as a dataset artifact.

    Args:
        output_path: Path of the local file to attach to the artifact.
        artifact_name: Name given to the W&B artifact.
        project: W&B project the run is created in.
        entity: W&B entity (user or team) passed to ``wandb.init``.
    """
    wandb.login()
    # Using the run as a context manager finishes it even when adding or
    # logging the artifact raises, so a failed upload does not leave the
    # run open.
    with wandb.init(project=project, entity=entity, job_type='fetch_data') as run:
        artifact = wandb.Artifact(name=artifact_name, type='dataset', description='Raw diabetes data')
        artifact.add_file(output_path)
        run.log_artifact(artifact)
    print(f"Logged {artifact_name} to Weights & Biases.")

In [6]:
# === RUN PIPELINE ===
# Copy the source CSV to the default output path, then upload that copy
# to Weights & Biases as an artifact.
# NOTE(review): 'diabetess.csv' differs from the function default
# 'diabetes.csv' — confirm the filename is intended.
output_file = fetch_and_save_data(input_path='diabetess.csv')
log_data_to_wandb(output_path=output_file)

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


Loaded 768 rows and 9 columns.
Saved to raw_data.csv.


[34m[1mwandb[0m: Currently logged in as: [33mmlcolongmay[0m ([33mmlcolongmay-neu[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Logged raw_data.csv to Weights & Biases.
