<a href="https://colab.research.google.com/github/nmansour67/skills-introduction-to-github/blob/main/The_Fem_Data_Scrub.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [7]:
import pandas as pd
import numpy as np
from scipy.stats import truncnorm, bernoulli

# ---------------------------------------------------------------------------
# Synthetic heart-patient dataset
#
# Builds a 1000-row DataFrame of simulated patients (gender, age, three binary
# symptoms and a binary heart-attack label), prints summary statistics and
# writes the result to 'realistic_heart_patient_dataset.csv'.
#
# NOTE: every sampling call below consumes NumPy's global random stream, so
# the ORDER of the draws (gender -> age -> chest pain -> nausea -> fatigue ->
# heart attack) is part of what makes the output reproducible. Reordering the
# sections changes the generated data.
# ---------------------------------------------------------------------------

# Seed NumPy's legacy global RNG for reproducibility. The scipy.stats ``rvs``
# calls are made without ``random_state`` and fall back to this global stream.
np.random.seed(42)

# Number of patients
n_patients = 1000

# Age model: normal(mean, sd) truncated to [AGE_MIN, AGE_MAX].
AGE_MIN, AGE_MAX = 30, 90
AGE_MEAN, AGE_SD = 60, 15

# Patient IDs: 'P0001' ... 'P1000' (zero-padded to four digits)
patient_ids = [f"P{str(i).zfill(4)}" for i in range(1, n_patients + 1)]

# Gender: drawn independently per patient with P(Male)=0.65, P(Female)=0.35
genders = np.random.choice(['Male', 'Female'], size=n_patients, p=[0.65, 0.35])

# Ages: truncnorm takes its bounds in standard-deviation units relative to
# ``loc``, hence the (bound - mean) / sd conversion. The distribution is
# symmetric about AGE_MEAN (it is not skewed); ``astype(int)`` then drops the
# fractional part so ages are whole years.
ages = truncnorm.rvs(
    a=(AGE_MIN - AGE_MEAN) / AGE_SD,
    b=(AGE_MAX - AGE_MEAN) / AGE_SD,
    loc=AGE_MEAN,
    scale=AGE_SD,
    size=n_patients,
).astype(int)

# --- Symptoms (each is a per-patient Bernoulli draw) ------------------------

# Chest pain: baseline 0.6 for males / 0.4 for females, +0.15 if age > 60,
# then clipped to [0.3, 0.8].
chest_pain_prob = np.where(genders == 'Male', 0.6, 0.4)
chest_pain_prob = np.where(ages > 60, chest_pain_prob + 0.15, chest_pain_prob)
chest_pain_prob = np.clip(chest_pain_prob, 0.3, 0.8)
chest_pain = bernoulli.rvs(chest_pain_prob)

# Nausea: baseline 0.3 for females / 0.1 for males, +0.1 if the patient has
# chest pain, then clipped to [0.1, 0.4].
nausea_prob = np.where(genders == 'Female', 0.3, 0.1)
nausea_prob = np.where(chest_pain == 1, nausea_prob + 0.1, nausea_prob)
nausea_prob = np.clip(nausea_prob, 0.1, 0.4)
nausea = bernoulli.rvs(nausea_prob)

# Fatigue: baseline 0.4 if age > 60 else 0.2, +0.15 if the patient has nausea,
# then clipped to [0.15, 0.55].
fatigue_prob = np.where(ages > 60, 0.4, 0.2)
fatigue_prob = np.where(nausea == 1, fatigue_prob + 0.15, fatigue_prob)
fatigue_prob = np.clip(fatigue_prob, 0.15, 0.55)
fatigue = bernoulli.rvs(fatigue_prob)

# --- Outcome label ----------------------------------------------------------

# Heart attack: baseline 0.3 for males / 0.15 for females, +0.15 if age > 60,
# +0.1 if chest pain is present, then clipped to [0.1, 0.5]. The clip is
# active here: a male over 60 with chest pain would otherwise reach 0.55.
# The overall prevalence is not pinned to a target; it follows from these terms.
heart_attack_prob = np.where(genders == 'Male', 0.3, 0.15)
heart_attack_prob = np.where(ages > 60, heart_attack_prob + 0.15, heart_attack_prob)
heart_attack_prob = np.where(chest_pain == 1, heart_attack_prob + 0.1, heart_attack_prob)
heart_attack_prob = np.clip(heart_attack_prob, 0.1, 0.5)
heart_attack = bernoulli.rvs(heart_attack_prob)

# --- Assemble the DataFrame -------------------------------------------------
df = pd.DataFrame({
    'Patient_ID': patient_ids,
    'Gender': genders,
    'Age': ages,
    'Chest_Pain': chest_pain,
    'Nausea': nausea,
    'Fatigue': fatigue,
    'Heart_Attack': heart_attack
})

# --- Summary output ---------------------------------------------------------
print("Realistic Heart Patient Dataset Generated Successfully!")
print(f"\nTotal Patients: {len(df)}")
print("\nFirst 10 records:")
print(df.head(10))

print("\nGender Distribution:")
print(df['Gender'].value_counts(normalize=True))

print("\nAge Distribution:")
print(df['Age'].describe())

print("\nSymptom Distribution by Gender:")
print(df.groupby('Gender')[['Chest_Pain', 'Nausea', 'Fatigue']].mean())

print("\nHeart Attack Prevalence by Gender:")
print(df.groupby('Gender')['Heart_Attack'].mean())

print("\nHeart Attack Prevalence by Age Group:")
# Bins are left-closed / right-open ([30, 40), [40, 50), ...) so that each
# label matches its interval: age 40 belongs to '40-49', and the minimum age
# 30 belongs to '30-39' instead of falling outside every bin. The final edge
# is AGE_MAX + 1 so that an age of exactly 90 still lands in '80-90'.
age_bins = [30, 40, 50, 60, 70, 80, AGE_MAX + 1]
age_labels = ['30-39', '40-49', '50-59', '60-69', '70-79', '80-90']
df['Age_Group'] = pd.cut(df['Age'], bins=age_bins, labels=age_labels, right=False)
# Age_Group is categorical; pass ``observed`` explicitly rather than relying
# on the groupby default, and keep every age group in the printed result.
print(df.groupby('Age_Group', observed=False)['Heart_Attack'].mean())

# --- Persist ----------------------------------------------------------------
# The CSV contains all columns of ``df``, including the derived Age_Group.
df.to_csv('realistic_heart_patient_dataset.csv', index=False)
print("\nDataset saved as 'realistic_heart_patient_dataset.csv'")


Realistic Heart Patient Dataset Generated Successfully!

Total Patients: 1000

First 10 records:
  Patient_ID  Gender  Age  Chest_Pain  Nausea  Fatigue  Heart_Attack
0      P0001    Male   47           1       0        0             0
1      P0002  Female   61           1       1        0             0
2      P0003  Female   75           0       0        1             1
3      P0004    Male   68           1       0        0             0
4      P0005    Male   72           1       0        0             1
5      P0006    Male   65           0       0        0             0
6      P0007    Male   67           1       1        1             1
7      P0008  Female   74           0       0        0             1
8      P0009    Male   50           1       0        0             0
9      P0010  Female   59           0       0        0             0

Gender Distribution:
Gender
Male      0.671
Female    0.329
Name: proportion, dtype: float64

Age Distribution:
count    1000.000000
mean      

  print(df.groupby('Age_Group')['Heart_Attack'].mean())
