<a href="https://colab.research.google.com/github/nmansour67/skills-introduction-to-github/blob/main/Fix_Oversampling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from imblearn.over_sampling import RandomOverSampler

# --- Step 1: Simulate a Biased Dataset ---
np.random.seed(42)

n_patients = 1000
genders = np.random.choice(['Male', 'Female'], size=n_patients, p=[0.67, 0.33])
ages = np.random.randint(30, 90, size=n_patients)

chest_pain = np.random.binomial(1, p=np.where(genders == 'Male', 0.7, 0.4))
nausea = np.random.binomial(1, p=np.where(genders == 'Female', 0.6, 0.2))
fatigue = np.random.binomial(1, p=np.where(genders == 'Female', 0.5, 0.3))

heart_attack = np.random.binomial(1, p=np.where(genders == 'Male', 0.3, 0.15))

df = pd.DataFrame({
    'Gender': genders,
    'Age': ages,
    'Chest_Pain': chest_pain,
    'Nausea': nausea,
    'Fatigue': fatigue,
    'Heart_Attack': heart_attack
})

print("Original Gender Distribution:")
print(df['Gender'].value_counts(normalize=True))

# --- Step 2: Prepare Data for Training ---
X = df.drop(columns=['Heart_Attack'])
y = df['Heart_Attack']

# Split into train and test sets (stratify by Heart_Attack and Gender)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=df[['Heart_Attack', 'Gender']]
)

print("\nGender Distribution in Test Set:")
print(X_test['Gender'].value_counts(normalize=True))

# --- Step 3: Oversample Female Patients to 50% ---
# Separate male and female patients
X_train_male = X_train[X_train['Gender'] == 'Male']
X_train_female = X_train[X_train['Gender'] == 'Female']
y_train_male = y_train[X_train['Gender'] == 'Male']
y_train_female = y_train[X_train['Gender'] == 'Female']

# Calculate the number of samples needed for 50/50 balance
n_male = len(X_train_male)
n_female = len(X_train_female)
n_samples_needed = n_male - n_female

# Oversample female patients to match the number of male patients
ros = RandomOverSampler(sampling_strategy='minority', random_state=42)
X_train_female_resampled, y_train_female_resampled = ros.fit_resample(X_train_female, y_train_female)

# Combine oversampled female data with original male data
X_train_balanced = pd.concat([X_train_male, pd.DataFrame(X_train_female_resampled, columns=X_train.columns)])
y_train_balanced = pd.concat([pd.Series(y_train_male), pd.Series(y_train_female_resampled)])

# Shuffle the balanced dataset
shuffle_idx = np.random.permutation(len(X_train_balanced))
X_train_balanced = X_train_balanced.iloc[shuffle_idx]
y_train_balanced = y_train_balanced.iloc[shuffle_idx]

print("\nGender Distribution After Oversampling (Training Data):")
print(X_train_balanced['Gender'].value_counts(normalize=True))

# --- Step 4: Retrain Model on Gender-Balanced Data ---
model_fair = LogisticRegression(max_iter=1000)
model_fair.fit(X_train_balanced.drop(columns=['Gender']), y_train_balanced)

# --- Step 5: Evaluate Model Performance ---
X_test_no_gender = X_test.drop(columns=['Gender'])
y_pred = model_fair.predict(X_test_no_gender)

test_audit = X_test.copy()
test_audit['Actual'] = y_test
test_audit['Predicted_Fair'] = y_pred

female_acc = accuracy_score(
    test_audit[test_audit['Gender'] == 'Female']['Actual'],
    test_audit[test_audit['Gender'] == 'Female']['Predicted_Fair']
)

male_acc = accuracy_score(
    test_audit[test_audit['Gender'] == 'Male']['Actual'],
    test_audit[test_audit['Gender'] == 'Male']['Predicted_Fair']
)

print(f"\nAccuracy for Women (Fair Model): {female_acc * 100:.1f}%")
print(f"Accuracy for Men (Fair Model): {male_acc * 100:.1f}%")

# --- Step 6: Compare with Original Model (Trained on Biased Data) ---
model_biased = LogisticRegression(max_iter=1000)
model_biased.fit(X_train.drop(columns=['Gender']), y_train)
y_pred_biased = model_biased.predict(X_test_no_gender)

test_audit['Predicted_Biased'] = y_pred_biased

female_acc_biased = accuracy_score(
    test_audit[test_audit['Gender'] == 'Female']['Actual'],
    test_audit[test_audit['Gender'] == 'Female']['Predicted_Biased']
)

male_acc_biased = accuracy_score(
    test_audit[test_audit['Gender'] == 'Male']['Actual'],
    test_audit[test_audit['Gender'] == 'Male']['Predicted_Biased']
)

print(f"\nAccuracy for Women (Biased Model): {female_acc_biased * 100:.1f}%")
print(f"Accuracy for Men (Biased Model): {male_acc_biased * 100:.1f}%")


Original Gender Distribution:
Gender
Male      0.688
Female    0.312
Name: proportion, dtype: float64

Gender Distribution in Test Set:
Gender
Male      0.686667
Female    0.313333
Name: proportion, dtype: float64

Gender Distribution After Oversampling (Training Data):
Gender
Male      0.559165
Female    0.440835
Name: proportion, dtype: float64

Accuracy for Women (Fair Model): 80.9%
Accuracy for Men (Fair Model): 68.9%

Accuracy for Women (Biased Model): 87.2%
Accuracy for Men (Biased Model): 69.4%
