In [8]:
import pandas as pd
from sklearn.model_selection import train_test_split

# 1. Load the cleaned diabetes dataset
df = pd.read_csv("featured_cleaned_diabetes_prediction_dataset.csv")

# 2. First split: 80% train_val, 20% test (stratified)
train_val, test = train_test_split(
    df,
    test_size=0.20,
    stratify=df["diabetes"],
    random_state=42
)

# 3. Second split: from the 80%, make 75% train (i.e. 60% total) and 25% val (i.e. 20% total)
train, val = train_test_split(
    train_val,
    test_size=0.25,                # 0.25 * 0.80 = 0.20 of original
    stratify=train_val["diabetes"],
    random_state=42
)

# 4. Save to CSV
train.to_csv("train.csv", index=False)
val.to_csv("val.csv",     index=False)
test.to_csv("test.csv",    index=False)

print("Saved splits:")
print(f"  train.csv ({len(train)} rows)")
print(f"  val.csv   ({len(val)} rows)")
print(f"  test.csv  ({len(test)} rows)")


Saved splits:
  train.csv (57687 rows)
  val.csv   (19229 rows)
  test.csv  (19230 rows)


In [6]:
# Check class balance
train_df = pd.read_csv("train.csv")
print("Class distribution:\n", train_df['diabetes'].value_counts())


Class distribution:
 diabetes
0    54900
1     5100
Name: count, dtype: int64
