In [None]:
print("Hello, this is my MSc bias project")

Hello, this is my MSc bias project


In [None]:
print("Next: I will download a public disease dataset (e.g. pneumonia or sepsis) with age, sex, outcome columns and upload its CSV here.")


Next: I will download a public disease dataset (e.g. pneumonia or sepsis) with age, sex, outcome columns and upload its CSV here.


In [None]:
import pandas as pd

# replace with your real file name if different
data = pd.read_csv("/content/Covid_Dataset.csv")

data.head()
data.columns


Index(['USMER', 'MEDICAL_UNIT', 'SEX', 'PATIENT_TYPE', 'DATE_DIED', 'INTUBED',
       'PNEUMONIA', 'AGE', 'PREGNANT', 'DIABETES', 'COPD', 'ASTHMA', 'INMSUPR',
       'HIPERTENSION', 'OTHER_DISEASE', 'CARDIOVASCULAR', 'OBESITY',
       'RENAL_CHRONIC', 'TOBACCO', 'CLASIFFICATION_FINAL', 'ICU'],
      dtype='object')

In [None]:
import numpy as np

# 1) Keep only rows where ICU information is present
data = data[data["ICU"].notna()]

# 2) Quick look at the codes
print("SEX codes:", data["SEX"].unique())
print("ICU codes:", data["ICU"].unique())


SEX codes: [1 2]
ICU codes: [0 1]


In [None]:
# 3) Define target: icu_yes = 1 if patient went to ICU (severe), else 0
data["icu_yes"] = np.where(data["ICU"] != 0, 1, 0)

# 4) Define sex_female: 1 = female, 0 = male
# (TEMP mapping – we may adjust after seeing codes)
data["sex_female"] = np.where(data["SEX"] == 2, 1, 0)

# 5) Create simple age groups: 0=<40, 1=40–60, 2=>60
def make_age_group(age):
    if age < 40:
        return 0
    elif age <= 60:
        return 1
    else:
        return 2

data["age_group"] = data["AGE"].apply(make_age_group)

# Check
print(data[["SEX","sex_female","AGE","age_group","ICU","icu_yes"]].head())


   SEX  sex_female  AGE  age_group  ICU  icu_yes
0    1           0   65          2    0        0
1    2           1   72          2    0        0
2    2           1   55          1    1        1
3    1           0   53          1    0        0
4    2           1   68          2    0        0


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score, recall_score

# 1. Select features (X) and target (y)
feature_cols = [
    "AGE",
    "PNEUMONIA",
    "DIABETES",
    "COPD",
    "ASTHMA",
    "HIPERTENSION",
    "OTHER_DISEASE",
    "CARDIOVASCULAR",
    "OBESITY",
    "RENAL_CHRONIC",
    "TOBACCO"
]

X = data[feature_cols].copy()
y = data["icu_yes"].copy()

# 2. Train-test split (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y    # keep ICU proportion similar in train and test
)

print("Train size:", X_train.shape[0])
print("Test size:", X_test.shape[0])

# 3. Train simple Logistic Regression
model_lr = LogisticRegression(max_iter=1000)
model_lr.fit(X_train, y_train)

# 4. Predictions
y_pred = model_lr.predict(X_test)
y_proba = model_lr.predict_proba(X_test)[:, 1]

# 5. Basic metrics
acc = accuracy_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_proba)
rec = recall_score(y_test, y_pred)

print("\nBaseline Logistic Regression performance:")
print("Accuracy:", round(acc, 3))
print("AUC:", round(auc, 3))
print("Recall:", round(rec, 3))


Train size: 159999
Test size: 40000

Baseline Logistic Regression performance:
Accuracy: 0.78
AUC: 0.819
Recall: 0.567


In [None]:
import pandas as pd
from sklearn.metrics import accuracy_score, recall_score

# 1. Get sex_female values for the test rows
sex_test = data.loc[X_test.index, "sex_female"]

# 2. Create a small results table by sex
results_by_sex = []

for value, name in [(0, "Male (sex_female=0)"),
                    (1, "Female (sex_female=1)")]:
    mask = (sex_test == value)
    y_true_g = y_test[mask]
    y_pred_g = y_pred[mask]

    acc_g = accuracy_score(y_true_g, y_pred_g)
    rec_g = recall_score(y_true_g, y_pred_g)

    results_by_sex.append({
        "Group": name,
        "N_test_patients": len(y_true_g),
        "Accuracy": round(acc_g, 3),
        "Recall": round(rec_g, 3)
    })

pd.DataFrame(results_by_sex)


Unnamed: 0,Group,N_test_patients,Accuracy,Recall
0,Male (sex_female=0),19028,0.809,0.592
1,Female (sex_female=1),20972,0.753,0.55


In [None]:
# positive prediction rates and parity metrics
pos_rate_male = np.mean(y_pred[sex_test == 0])
pos_rate_fem  = np.mean(y_pred[sex_test == 1])

dp_diff = pos_rate_fem - pos_rate_male
di_ratio = pos_rate_fem / pos_rate_male if pos_rate_male > 0 else np.nan

print("Male positive rate:", round(pos_rate_male, 3))
print("Female positive rate:", round(pos_rate_fem, 3))
print("Demographic Parity Difference (female - male):", round(dp_diff, 3))
print("Disparate Impact Ratio (female / male):", round(di_ratio, 3))


Male positive rate: 0.244
Female positive rate: 0.287
Demographic Parity Difference (female - male): 0.043
Disparate Impact Ratio (female / male): 1.177


In [None]:
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import accuracy_score, roc_auc_score, recall_score

# compute balanced class weights for ICU vs non-ICU
classes = np.array([0, 1])
class_weights = compute_class_weight(
    class_weight="balanced",
    classes=classes,
    y=y_train
)
cw = {0: class_weights[0], 1: class_weights[1]}
print("Class weights:", cw)

# train weighted logistic regression
model_lr_w = LogisticRegression(max_iter=1000, class_weight=cw)
model_lr_w.fit(X_train, y_train)

# predictions
y_pred_w = model_lr_w.predict(X_test)
y_proba_w = model_lr_w.predict_proba(X_test)[:, 1]

# overall metrics
acc_w  = accuracy_score(y_test, y_pred_w)
auc_w  = roc_auc_score(y_test, y_proba_w)
rec_w  = recall_score(y_test, y_pred_w)

print("Weighted model - Accuracy:", round(acc_w, 3))
print("Weighted model - AUC:", round(auc_w, 3))
print("Weighted model - Recall:", round(rec_w, 3))


Class weights: {0: np.float64(0.7652159357214596), 1: np.float64(1.442628124211058)}
Weighted model - Accuracy: 0.76
Weighted model - AUC: 0.82
Weighted model - Recall: 0.732


In [None]:
results_by_sex_w = []

for value, name in [(0, "Male (sex_female=0)"),
                    (1, "Female (sex_female=1)")]:
    mask = (sex_test == value)
    y_true_g = y_test[mask]
    y_pred_g = y_pred_w[mask]

    acc_g = accuracy_score(y_true_g, y_pred_g)
    rec_g = recall_score(y_true_g, y_pred_g)

    results_by_sex_w.append({
        "Group": name,
        "N_test_patients": len(y_true_g),
        "Accuracy": round(acc_g, 3),
        "Recall": round(rec_g, 3)
    })

pd.DataFrame(results_by_sex_w)


Unnamed: 0,Group,N_test_patients,Accuracy,Recall
0,Male (sex_female=0),19028,0.768,0.74
1,Female (sex_female=1),20972,0.753,0.727


In [None]:
pos_rate_male_w = np.mean(y_pred_w[sex_test == 0])
pos_rate_fem_w  = np.mean(y_pred_w[sex_test == 1])

dp_diff_w = pos_rate_fem_w - pos_rate_male_w
di_ratio_w = pos_rate_fem_w / pos_rate_male_w if pos_rate_male_w > 0 else np.nan

print("Weighted - Male positive rate:", round(pos_rate_male_w, 3))
print("Weighted - Female positive rate:", round(pos_rate_fem_w, 3))
print("Weighted - Demographic Parity Difference (female - male):", round(dp_diff_w, 3))
print("Weighted - Disparate Impact Ratio (female / male):", round(di_ratio_w, 3))


Weighted - Male positive rate: 0.37
Weighted - Female positive rate: 0.428
Weighted - Demographic Parity Difference (female - male): 0.058
Weighted - Disparate Impact Ratio (female / male): 1.156
