In [1]:
import requests
import zipfile
from collections import Counter
import os

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
from imblearn.over_sampling import SMOTE
import xgboost as xgb
import shap
import matplotlib.pyplot as plt
import numpy as np
import dotenv

pd.set_option('display.max_columns', 50)
pd.set_option('display.max_rows', 20)


In [2]:
dotenv.load_dotenv()

True

In [3]:
if not os.path.exists('data/fetal-health-classification.zip'):
    r = requests.get('https://www.kaggle.com/api/v1/datasets/download/andrewmvd/fetal-health-classification')
    with open('data/fetal-health-classification.zip', 'wb') as f:
        f.write(r.content)
    
if not os.path.exists('data/fetal_health.csv'):
    with zipfile.ZipFile('data/fetal-health-classification.zip', 'r') as zip_ref:
        zip_ref.extractall('.')


df = pd.read_csv(r"data/fetal_health.csv")
df.head()

Unnamed: 0,baseline value,accelerations,fetal_movement,uterine_contractions,light_decelerations,severe_decelerations,prolongued_decelerations,abnormal_short_term_variability,mean_value_of_short_term_variability,percentage_of_time_with_abnormal_long_term_variability,mean_value_of_long_term_variability,histogram_width,histogram_min,histogram_max,histogram_number_of_peaks,histogram_number_of_zeroes,histogram_mode,histogram_mean,histogram_median,histogram_variance,histogram_tendency,fetal_health
0,120.0,0.0,0.0,0.0,0.0,0.0,0.0,73.0,0.5,43.0,2.4,64.0,62.0,126.0,2.0,0.0,120.0,137.0,121.0,73.0,1.0,2.0
1,132.0,0.006,0.0,0.006,0.003,0.0,0.0,17.0,2.1,0.0,10.4,130.0,68.0,198.0,6.0,1.0,141.0,136.0,140.0,12.0,0.0,1.0
2,133.0,0.003,0.0,0.008,0.003,0.0,0.0,16.0,2.1,0.0,13.4,130.0,68.0,198.0,5.0,1.0,141.0,135.0,138.0,13.0,0.0,1.0
3,134.0,0.003,0.0,0.008,0.003,0.0,0.0,16.0,2.4,0.0,23.0,117.0,53.0,170.0,11.0,0.0,137.0,134.0,137.0,13.0,1.0,1.0
4,132.0,0.007,0.0,0.008,0.0,0.0,0.0,16.0,2.4,0.0,19.9,117.0,53.0,170.0,9.0,0.0,137.0,136.0,138.0,11.0,1.0,1.0


In [4]:
X = df.drop('fetal_health', axis=1)
# Map classes 1,2,3 -> 0,1,2
y = df['fetal_health'] - 1  # subtract 1 from all labels

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# scaler = StandardScaler()
# X_train = scaler.fit_transform(X_train)
# X_test = scaler.transform(X_test)


In [5]:
X_test[:1] , y_test[:1]

(      baseline value  accelerations  fetal_movement  uterine_contractions  \
 1354           132.0          0.005             0.0                 0.002   
 
       light_decelerations  severe_decelerations  prolongued_decelerations  \
 1354                0.003                   0.0                       0.0   
 
       abnormal_short_term_variability  mean_value_of_short_term_variability  \
 1354                             25.0                                   1.5   
 
       percentage_of_time_with_abnormal_long_term_variability  \
 1354                                                0.0        
 
       mean_value_of_long_term_variability  histogram_width  histogram_min  \
 1354                                 11.0             92.0           76.0   
 
       histogram_max  histogram_number_of_peaks  histogram_number_of_zeroes  \
 1354          168.0                        4.0                         2.0   
 
       histogram_mode  histogram_mean  histogram_median  histogram_varia

In [None]:
# X_test.to_csv('data/ctg_X_test.csv', index=False)

In [6]:
X[:1]

Unnamed: 0,baseline value,accelerations,fetal_movement,uterine_contractions,light_decelerations,severe_decelerations,prolongued_decelerations,abnormal_short_term_variability,mean_value_of_short_term_variability,percentage_of_time_with_abnormal_long_term_variability,mean_value_of_long_term_variability,histogram_width,histogram_min,histogram_max,histogram_number_of_peaks,histogram_number_of_zeroes,histogram_mode,histogram_mean,histogram_median,histogram_variance,histogram_tendency
0,120.0,0.0,0.0,0.0,0.0,0.0,0.0,73.0,0.5,43.0,2.4,64.0,62.0,126.0,2.0,0.0,120.0,137.0,121.0,73.0,1.0


In [5]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import pickle

rf_clf = RandomForestClassifier(n_estimators=100, random_state=42)
rf_clf.fit(X_train, y_train)
y_pred_rf = rf_clf.predict(X_test)
accuracy_rf = accuracy_score(y_test, y_pred_rf)
print(f"Random Forest Accuracy: {accuracy_rf}")
# save the trained Random Forest model
with open('models/random_forest_model_stillbirth.pkl', 'wb') as f:
    pickle.dump(rf_clf, f)

Random Forest Accuracy: 0.9248826291079812


In [7]:
# Check original distribution
print("Before SMOTE:", Counter(y_train))

# Apply SMOTE
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

# Check new distribution
print("After SMOTE:", Counter(y_train_res))


Before SMOTE: Counter({0.0: 1323, 1.0: 236, 2.0: 141})
After SMOTE: Counter({2.0: 1323, 0.0: 1323, 1.0: 1323})


In [None]:
X = [1,2]
y = [0,1]

In [24]:

with open("models/tabpfn_model.pt", "rb") as f:
    clf = torch.load(f, weights_only=True)

UnpicklingError: Weights only load failed. This file can still be loaded, to do so you have two options, [1mdo those steps only if you trust the source of the checkpoint[0m. 
	(1) In PyTorch 2.6, we changed the default value of the `weights_only` argument in `torch.load` from `False` to `True`. Re-running `torch.load` with `weights_only` set to `False` will likely succeed, but it can result in arbitrary code execution. Do it only if you got the file from a trusted source.
	(2) Alternatively, to load with `weights_only=True` please check the recommended steps in the following error message.
	WeightsUnpickler error: Unsupported global: GLOBAL tabpfn.classifier.TabPFNClassifier was not an allowed global by default. Please use `torch.serialization.add_safe_globals([tabpfn.classifier.TabPFNClassifier])` or the `torch.serialization.safe_globals([tabpfn.classifier.TabPFNClassifier])` context manager to allowlist this global if you trust this class/function.

Check the documentation of torch.load to learn more about types accepted by default with weights_only https://pytorch.org/docs/stable/generated/torch.load.html.

In [26]:
from tabpfn import TabPFNClassifier

# Initialize the classifier
clf = TabPFNClassifier()
clf.load_from_fit_state("models/tabpfn_model.tabpfn_fit")

# # Fit the model
# clf.fit(X_train, y_train)
# clf.fit(X, y)

# # Make predictions
predictions = clf.predict(X_test)

accuracy = (predictions == y_test).mean()
print(f"Accuracy: {accuracy:.4f}")

KeyboardInterrupt: 

In [None]:
model = xgb.XGBClassifier(
    objective='multi:softmax',
    # objective='multi:softprob',  # outputs probabilities
    num_class=3,
    eval_metric='mlogloss',
    use_label_encoder=False,
    random_state=42
)

# Train on SMOTE-resampled data
model.fit(X_train_res, y_train_res)

# Evaluate
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

In [None]:
# save the trained model
model.save_model("models/fetal_xgb_model.json")

In [None]:
explainer = shap.Explainer(model.predict_proba, X_test)
shap_values = explainer(X_test)

In [None]:
shap.save_html("output/shap_fetal_health.html", shap_values)

In [None]:
shap.summary_plot(shap_values, X_test, max_display=10)

In [None]:
# The shap_values object from the previous cell already contains the values for each class
# We can directly use it for plotting

# Plot the summary plot with feature importance for each class
shap.summary_plot(shap_values, X_test, plot_type="bar")

In [None]:
N = 50  # number of samples to plot
samples = np.arange(N)
# Probabilities for each class
y_proba = model.predict_proba(X_test)

plt.figure(figsize=(12,6))
plt.bar(samples, y_proba[:N,0], label='Class 0 (Normal)')
plt.bar(samples, y_proba[:N,1], bottom=y_proba[:N,0], label='Class 1 (Suspect)')
plt.bar(samples, y_proba[:N,2], bottom=y_proba[:N,0]+y_proba[:N,1], label='Class 2 (Pathological)')

plt.xlabel('Test Samples')
plt.ylabel('Predicted Probability')
plt.title('Predicted Probabilities for Test Samples')
plt.legend()
plt.show()


In [None]:
# Evaluate
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

In [None]:
# Map numerical classes to labels
class_map = {0: "Normal", 1: "Suspect", 2: "Pathological"}

def patient_risk_with_recommendation(patient_data: pd.DataFrame):
    """
    patient_data: single-row dataframe with same features as training set
    Returns predicted class, top 3 SHAP features, and patient recommendation
    """
    # 1️⃣ Predict probabilities and class
    probs = model.predict_proba(patient_data)[0]
    pred_class_idx = np.argmax(probs)
    pred_class_name = class_map[pred_class_idx]

    # 2️⃣ Compute SHAP values for this patient
    shap_values = explainer(patient_data)
    shap_vals_for_class = shap_values[:, :, pred_class_idx].values[0]

    # 3️⃣ Create a dataframe for features and SHAP values
    feature_df = pd.DataFrame({
        "feature": patient_data.columns,
        "shap_value": shap_vals_for_class,
        "feature_value": patient_data.iloc[0].values
    }).sort_values(by="shap_value", key=abs, ascending=False)

    # 4️⃣ Top 3 most important features
    top_features = feature_df.head(3)

    # 5️⃣ Generate recommendations based on SHAP values and class
    recommendations = []
    for idx, row in top_features.iterrows():
        feature, value, shap_val = row['feature'], row['feature_value'], row['shap_value']
        if pred_class_name == "Normal":
            # Positive features reassuring
            if shap_val > 0:
                recommendations.append(f"{feature} ({value}) supports normal fetal health.")
            else:
                recommendations.append(f"{feature} ({value}) slightly reduces reassurance, monitor routinely.")
        elif pred_class_name == "Suspect":
            if shap_val > 0:
                recommendations.append(f"{feature} ({value}) increases risk; closer monitoring recommended.")
            else:
                recommendations.append(f"{feature} ({value}) slightly reduces risk, but patient still at suspect level.")
        elif pred_class_name == "Pathological":
            if shap_val > 0:
                recommendations.append(f"{feature} ({value}) strongly indicates high risk; urgent monitoring/intervention required.")
            else:
                recommendations.append(f"{feature} ({value}) reduces risk but patient still at pathological level.")

    # 6️⃣ Output
    return {
        "predicted_class": pred_class_name,
        "predicted_probabilities": probs,
        "top_features": top_features,
        "recommendations": recommendations
    }

# --------------------------
# Example usage
# --------------------------
patient_example = X_test.iloc[[0]]  # pick one patient from test set
result = patient_risk_with_recommendation(patient_example)

print("Predicted Class:", result['predicted_class'])
print("Predicted Probabilities:", result['predicted_probabilities'])
print("\nTop Features Driving Prediction:\n", result['top_features'])
print("\nRecommendations:\n", "\n".join(result['recommendations']))
