<a href="https://colab.research.google.com/github/madhudevi25/data-bias-auditor-agent/blob/main/bias_auditor_agent.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# ===============================
# STEP 0: Setup & Imports
# ===============================
# Authenticate with Google Cloud and import libraries
from google.colab import auth
auth.authenticate_user()

from google.cloud import storage, bigquery
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from google.colab import files
import zipfile, os, datetime

print("✅ Libraries imported and Google Cloud authenticated")


✅ Libraries imported and Google Cloud authenticated


In [12]:
# ===============================
# STEP 1: Define Dataset-Specific Values
# ===============================
# Update these values for healthcare / retail / finance
DATASET_NAME = "healthcare"
KAGGLE_ID = "fedesoriano/stroke-prediction-dataset"
CSV_NAME = "healthcare-dataset-stroke-data.csv"  # inside zip

PROJECT_ID = "genai-bias-bi-auditor-agent"
BQ_DATASET = "genaiBias_auditor_db"
BQ_TABLE = DATASET_NAME

BUCKET_NAME = "genai-bias_auditor_storage"

FEATURES = ["age"]        # independent variables
LABEL = "stroke"          # target variable
SENSITIVE_COL = "gender"  # column to check bias

print(f"✅ Dataset-specific values set for {DATASET_NAME}")


✅ Dataset-specific values set for healthcare


In [13]:
# ===============================
# STEP 2: Delete Existing GCS Bucket (if exists)
# ===============================
# This ensures a clean start without conflicts

storage_client = storage.Client(project=PROJECT_ID)

try:
    bucket = storage_client.get_bucket(BUCKET_NAME)
    # Delete all objects first
    blobs = list(bucket.list_blobs())
    for blob in blobs:
        blob.delete()
    # Delete bucket
    bucket.delete()
    print(f"✅ Deleted existing bucket {BUCKET_NAME} and all its contents")
except Exception as e:
    print(f"⚠ Bucket not found or already deleted: {e}")

# Recreate bucket
bucket = storage_client.bucket(BUCKET_NAME)
bucket.location = "US"
bucket = storage_client.create_bucket(bucket)
print(f"✅ Created fresh bucket: {BUCKET_NAME}")


✅ Deleted existing bucket genai-bias_auditor_storage and all its contents


  bucket.location = "US"


✅ Created fresh bucket: genai-bias_auditor_storage


In [6]:
# ===============================
# STEP 3.1:  Get Kaggle API Token
# ===============================
# Without the file, the kaggle command cannot authenticate and download datasets.

from google.colab import files
uploaded = files.upload()  # Choose the kaggle.json file


Saving kaggle.json to kaggle.json


In [7]:
# ===============================
# STEP 3.2: Move Kaggle Token to Correct Location
# ===============================
#
!mkdir -p ~/.kaggle
!cp /content/kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json


In [14]:
# ===============================
# STEP 4: Pull Dataset #1 from Kaggle
# ===============================

!mkdir -p ./data/{DATASET_NAME}
!kaggle datasets download -d {KAGGLE_ID} -p ./data/{DATASET_NAME}

# Extract zip
import zipfile
with zipfile.ZipFile(f"./data/{DATASET_NAME}/{KAGGLE_ID.split('/')[-1]}.zip", 'r') as zip_ref:
    zip_ref.extractall(f"./data/{DATASET_NAME}")

print("✅ Dataset downloaded and extracted:")
os.listdir(f"./data/{DATASET_NAME}")

Dataset URL: https://www.kaggle.com/datasets/fedesoriano/stroke-prediction-dataset
License(s): copyright-authors
stroke-prediction-dataset.zip: Skipping, found more recently modified local copy (use --force to force download)
✅ Dataset downloaded and extracted:


['stroke-prediction-dataset.zip', 'healthcare-dataset-stroke-data.csv']

In [10]:
#List extracted files to confirm the exact CSV name:

import os

extracted_files = os.listdir(f"./data/{DATASET_NAME}")
print("Files extracted:", extracted_files)


Files extracted: ['stroke-prediction-dataset.zip', 'healthcare-dataset-stroke-data.csv']


In [15]:
# ===============================
# STEP 5: Upload CSV to GCS
# ===============================
file_path = f"./data/{DATASET_NAME}/{CSV_NAME}"

blob = bucket.blob(f"raw/{CSV_NAME}")
blob.upload_from_filename(file_path)
print(f"✅ Uploaded CSV to GCS: {BUCKET_NAME}/raw/{CSV_NAME}")


✅ Uploaded CSV to GCS: genai-bias_auditor_storage/raw/healthcare-dataset-stroke-data.csv


In [16]:
# ===============================
# STEP 6: Load CSV into BigQuery
# ===============================
table_id = f"{PROJECT_ID}.{BQ_DATASET}.{BQ_TABLE}"

job_config = bigquery.LoadJobConfig(
    source_format=bigquery.SourceFormat.CSV,
    skip_leading_rows=1,
    autodetect=True
)

uri = f"gs://{BUCKET_NAME}/raw/{CSV_NAME}"
load_job = client.load_table_from_uri(uri, table_id, job_config=job_config)
load_job.result()
print(f"✅ Loaded CSV into BigQuery table: {table_id}")


✅ Loaded CSV into BigQuery table: genai-bias-bi-auditor-agent.genaiBias_auditor_db.healthcare


In [17]:
# ===============================
# STEP 7: Query Data from BigQuery
# ===============================
query = f"SELECT * FROM `{table_id}`"
df = client.query(query).to_dataframe()
print(f"✅ Retrieved {len(df)} rows from BigQuery")
df.head()


✅ Retrieved 5110 rows from BigQuery


Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,47350,Female,0.08,0,0,False,children,Urban,139.67,14.1,Unknown,0
1,29955,Male,0.08,0,0,False,children,Rural,70.33,16.9,Unknown,0
2,22877,Male,0.16,0,0,False,children,Urban,114.71,17.4,Unknown,0
3,41500,Male,0.16,0,0,False,children,Rural,69.79,13.0,Unknown,0
4,8247,Male,0.16,0,0,False,children,Urban,109.52,13.9,Unknown,0


In [18]:
# ===============================
# STEP 8: Preprocess, Train Model, Predict
# ===============================
X = df[FEATURES].fillna(0)
y = df[LABEL]

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

model = LogisticRegression()
model.fit(X_scaled, y)
y_pred = model.predict(X_scaled)

print("✅ Model trained and predictions generated")


✅ Model trained and predictions generated


In [19]:
# ===============================
# STEP 9: Compute Bias Metrics
# ===============================
bias_summary = []
for val in df[SENSITIVE_COL].unique():
    mask = df[SENSITIVE_COL] == val
    accuracy = np.mean(y_pred[mask] == y[mask])
    bias_summary.append({
        "Dataset": DATASET_NAME,
        "Sensitive Feature": SENSITIVE_COL,
        "Group": val,
        "Accuracy": round(accuracy, 3)
    })

bias_df = pd.DataFrame(bias_summary)
display(bias_df)


Unnamed: 0,Dataset,Sensitive Feature,Group,Accuracy
0,healthcare,gender,Female,0.953
1,healthcare,gender,Male,0.949
2,healthcare,gender,Other,1.0


In [20]:
# ===============================
# STEP 10: Save & Upload Bias Report
# ===============================
timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
report_file = f"{DATASET_NAME}_bias_report_{timestamp}.csv"

bias_df.to_csv(report_file, index=False)

# Upload to GCS reports folder
blob = bucket.blob(f"reports/{report_file}")
blob.upload_from_filename(report_file)

print(f"✅ Bias report uploaded to GCS: {BUCKET_NAME}/reports/{report_file}")

# Optional: download in Colab
files.download(report_file)


✅ Bias report uploaded to GCS: genai-bias_auditor_storage/reports/healthcare_bias_report_20250920_012759.csv


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>