# 📚 Synthetic Menopause Dataset Creation

In [None]:
# -- Imports --
import pandas as pd
import numpy as np
import random

# -- Seed both random number generators so every run yields the same data --
np.random.seed(42)
random.seed(42)

# -- Subject identifiers: Subject_1 through Subject_20 --
names = [f"Subject_{number}" for number in range(1, 21)]

In [None]:
# -- Helper Function --
def random_choice(options, weights=None):
    """Pick a single element of ``options`` at random.

    Args:
        options: Sequence of candidates (``random.choices`` raises on an
            empty sequence).
        weights: Optional relative weights, one per option. When ``None``
            every option is equally likely.

    Returns:
        One element of ``options``.
    """
    # random.choices returns a list of k picks (k defaults to 1);
    # unpack the sole pick.
    (picked,) = random.choices(options, weights=weights)
    return picked

# -- Dataset Generation Function --
def generate_dataset():
    """Build the base synthetic table, one row per entry in ``names``.

    For each subject a set of demographic, lifestyle and reproductive
    covariates is sampled with the ``random`` module. Some of them shift a
    baseline menopause age of 50; Gaussian noise (mean 0, sd 1) from
    ``np.random`` is added, and the result is clamped to [40, 60] and rounded
    to one decimal.

    Only six fields are kept in the output. The remaining covariates are
    sampled (and several feed the menopause-age formula) but are not
    returned.

    Returns:
        pd.DataFrame with columns ``Name``, ``Current_Age``, ``BMI``,
        ``Parity``, ``Mother_Menopause_Age`` and ``Menopause_Age``.
    """
    yes_no = ["Yes", "No"]
    columns = [
        "Name", "Current_Age", "BMI", "Parity", "Mother_Menopause_Age", "Menopause_Age"
    ]

    rows = []
    for subject in names:
        # NOTE: every draw below advances the seeded `random` stream, so the
        # order of these statements (including draws whose value is not used
        # later) determines the generated values.
        current_age = random.randint(35, 55)
        body_mass_index = round(random.uniform(18.5, 35.0), 1)
        smokes = random_choice(yes_no, [0.3, 0.7])
        alcohol_use = random_choice(["None", "Low", "Moderate", "High"], [0.2, 0.4, 0.3, 0.1])
        activity = random_choice(["Low", "Moderate", "High"], [0.3, 0.5, 0.2])
        parity = random.randint(0, 5)
        # Age at first birth is only drawn for subjects with at least one child.
        first_birth_age = None
        if parity > 0:
            first_birth_age = random.randint(18, 35)
        pill_use = random_choice(yes_no, [0.6, 0.4])
        schooling = random_choice(["Primary", "Secondary", "Tertiary"], [0.2, 0.4, 0.4])
        job_status = random_choice(["Employed", "Unemployed"])
        ses = random_choice(["Low", "Middle", "High"], [0.3, 0.5, 0.2])
        ethnic_group = random_choice(["Caucasian", "Asian", "African", "Hispanic"])
        irregular_cycles = random_choice(yes_no, [0.4, 0.6])
        mother_age = random.randint(40, 55)
        exposure = random_choice(yes_no, [0.2, 0.8])
        nursing = random_choice(["None", "<6 months", "6+ months"], [0.2, 0.4, 0.4])
        intimacy = random_choice(["Weekly", "Monthly", "Rarely"], [0.4, 0.4, 0.2])

        # Per-factor shifts (in years) applied to the baseline, ending with
        # the Gaussian noise term.
        shifts = (
            0.5 if smokes == "No" else -1,
            0.5 if alcohol_use in ["Low", "Moderate"] else -0.5,
            1 if activity == "High" else 0,
            0.3 * parity,
            1 if pill_use == "Yes" else -0.5,
            0.5 if schooling == "Tertiary" else -0.5,
            1 if ses == "High" else -1,
            0.5 if nursing == "6+ months" else -0.5,
            0.5 if intimacy == "Weekly" else -0.5,
            np.random.normal(0, 1),
        )
        # Accumulate strictly left to right, starting from the baseline of 50.
        estimate = 50
        for shift in shifts:
            estimate = estimate + shift

        # Clamp to the 40-60 range, then keep one decimal place.
        floored = max(estimate, 40)
        menopause_age = round(min(floored, 60), 1)

        rows.append(
            (subject, current_age, body_mass_index, parity, mother_age, menopause_age)
        )

    return pd.DataFrame(rows, columns=columns)

# -- Extend Dataset with Biomarkers --
def extend_dataset(df):
    """Return a copy of ``df`` with three simulated hormone columns appended.

    Each column is drawn independently from a normal distribution through
    ``np.random`` (so results depend on the global NumPy seed), floored at
    zero and rounded to one decimal:

    * ``FSH_Level_mIU_mL`` -- mean 25, sd 10
    * ``LH_Level_mIU_mL``  -- mean 20, sd 7
    * ``Estradiol_pg_mL``  -- mean 50, sd 20

    The draws are not conditioned on any existing column of ``df``.

    Args:
        df: Input table; it is not modified.

    Returns:
        A new DataFrame containing the original columns plus the three
        biomarker columns.
    """
    # (column name, mean, standard deviation); sampled in this order.
    biomarker_params = (
        ("FSH_Level_mIU_mL", 25, 10),
        ("LH_Level_mIU_mL", 20, 7),
        ("Estradiol_pg_mL", 50, 20),
    )

    df = df.copy()
    n_rows = len(df)
    for column, mean, sd in biomarker_params:
        samples = np.random.normal(mean, sd, size=n_rows)
        # An unbounded normal draw is occasionally negative, which is not a
        # valid concentration. Clip before rounding so no "-0.0" is left over.
        df[column] = np.round(np.clip(samples, 0, None), 1)
    return df

# -- Generate and Extend Dataset --
# Base table: one row per subject, including the simulated menopause age.
df = generate_dataset()
# extend_dataset works on a copy, so `df` keeps only the base columns while
# `extended_df` additionally carries the three biomarker columns.
extended_df = extend_dataset(df)

In [None]:
# -- View the dataset --
# head() returns the first five rows. As the last expression of a notebook
# cell, the result is presumably rendered as the cell output.
extended_df.head()