# Import Libraries

In [15]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report, confusion_matrix
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE
import joblib
import warnings
warnings.filterwarnings('ignore')

# Load and Clean Data

In [16]:
# Load dataset
df = pd.read_csv("Preeclampsia.csv")

# Remove irrelevant column if exists
df = df.drop(columns=["Unnamed: 12"], errors='ignore')

# Strip whitespace from column names
df.columns = df.columns.str.strip()

# Rename columns for clarity
df.rename(columns={
    "Age (yrs)": "Age",
    "BMI  [kg/m²]": "BMI",
    "History of hypertension (y/n)": "Hypertension_History",
    "gestational age (weeks)": "Gestational_Age_Weeks",
    "fetal weight(kgs)": "Fetal_Weight",
    "Protien Uria": "Protein_Uria",
    "Uterine Artery Doppler Resistance Index (RI)": "Uterine_RI",
    "Uterine Artery Doppler Pulsatility Index (PI": "Uterine_PI",
    "amniotic fluid levels(cm)": "Amniotic_Fluid_Levels"
}, inplace=True)

# Convert numeric columns
numeric_cols = [
    'gravida', 'parity', 'Gestational_Age_Weeks', 'Age', 'BMI', 
    'Systolic BP', 'Diastolic BP', 'HB', 'Fetal_Weight', 
    'Uterine_RI', 'Uterine_PI', 'Amniotic_Fluid_Levels'
]
for col in numeric_cols:
    df[col] = pd.to_numeric(df[col], errors='coerce')

# Drop rows with critical missing values
df.dropna(subset=['Age', 'Systolic BP', 'Diastolic BP', 'Fetal_Weight'], inplace=True)

# Fill remaining NaNs with column mean
df.fillna(df.mean(numeric_only=True), inplace=True)

# Reset index
df.reset_index(drop=True, inplace=True)

# Display info
print(df.info())
print(df.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6264 entries, 0 to 6263
Data columns (total 16 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   gravida                6264 non-null   float64
 1   parity                 6264 non-null   float64
 2   Gestational_Age_Weeks  6264 non-null   float64
 3   Age                    6264 non-null   float64
 4   BMI                    6264 non-null   float64
 5   diabetes               6264 non-null   object 
 6   Hypertension_History   6264 non-null   object 
 7   Systolic BP            6264 non-null   float64
 8   Diastolic BP           6264 non-null   float64
 9   HB                     6264 non-null   float64
 10  Fetal_Weight           6264 non-null   float64
 11  Protein_Uria           6264 non-null   object 
 12  Uterine_RI             6264 non-null   float64
 13  Uterine_PI             6264 non-null   float64
 14  Amniotic_Fluid_Levels  6264 non-null   float64
 15  Risk

# Remove Duplicates and Additional irrelevant columns

In [17]:
# Remove duplicate header row if exists
df = df[df['gravida'] != 'gravida']

# Drop other irrelevant columns if present
columns_to_drop = ['Uterine_RI', 'Uterine_PI']
df = df.drop(columns=columns_to_drop, errors='ignore')

# Data Type Checks and Handling

In [18]:
# Confirm data types
print(df.dtypes)

# Check nulls
print(df.isnull().sum())

gravida                  float64
parity                   float64
Gestational_Age_Weeks    float64
Age                      float64
BMI                      float64
diabetes                  object
Hypertension_History      object
Systolic BP              float64
Diastolic BP             float64
HB                       float64
Fetal_Weight             float64
Protein_Uria              object
Amniotic_Fluid_Levels    float64
Risk_level                object
dtype: object
gravida                  0
parity                   0
Gestational_Age_Weeks    0
Age                      0
BMI                      0
diabetes                 0
Hypertension_History     0
Systolic BP              0
Diastolic BP             0
HB                       0
Fetal_Weight             0
Protein_Uria             0
Amniotic_Fluid_Levels    0
Risk_level               0
dtype: int64


# Binary Columns Processing

In [19]:
# Define binary columns
binary_columns = ['diabetes', 'Hypertension_History', 'Protein_Uria']

# Imputer for binary columns
bin_imputer = SimpleImputer(strategy='most_frequent')

# Convert binary columns to 0/1
for col in binary_columns:
    if col in df.columns:
        df[col] = df[col].apply(lambda x: 1 if str(x).lower() in ['y', '1', 'yes', 'true']
                              else 0 if str(x).lower() in ['n', '0', 'no', 'false']
                              else np.nan)
df[binary_columns] = bin_imputer.fit_transform(df[binary_columns])

#  Feature Engineering

In [20]:
#feature Engineering
# BP ratio
df['BP_ratio'] = df['Systolic BP'] / df['Diastolic BP'].replace(0, 1)
df['BP_ratio'].replace([np.inf, -np.inf], df['BP_ratio'].median(), inplace=True)

# BMI category
df['BMI_category'] = pd.cut(df['BMI'], 
                             bins=[0, 18.5, 25, 30, 100], 
                             labels=[0, 1, 2, 3], 
                             include_lowest=True)
df['BMI_category'] = df['BMI_category'].cat.codes
df['BMI_category'].replace(-1, 1, inplace=True)

# Final Data Checks & Encoding

In [21]:
# Check for NaNs
if df.isna().any().any():
    print("NaNs found in columns:", df.columns[df.isna().any()].tolist())
    df = df.fillna(df.median(numeric_only=True))
    
# Encode target variable
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
df['Risk_level'] = le.fit_transform(df['Risk_level'])

# Prepare Features and Target

In [22]:
X = df.drop('Risk_level', axis=1)
y = df['Risk_level']

# Standardize features
scaler = StandardScaler()
X = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)

# Final NaN check
if X.isna().any().any():
    print("NaNs in X after preprocessing:", X.columns[X.isna().any()].tolist())

# Model Training with SMOTE and Hyperparameter Tuning

In [23]:
from imblearn.over_sampling import SMOTE

# Apply SMOTE
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Split into train/test
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

# Initialize model
xgb_model = XGBClassifier(random_state=42, eval_metric='mlogloss')

# Hyperparameter grid
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1]
}

# Grid Search
grid_search = GridSearchCV(xgb_model, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train, y_train)

# Best model
best_model = grid_search.best_estimator_
print("Best Parameters:", grid_search.best_params_)

# Cross-validation score
cv_scores = cross_val_score(best_model, X_train, y_train, cv=5, scoring='accuracy')
print("Cross-Validation Accuracy: {:.2f} ± {:.2f}".format(cv_scores.mean(), cv_scores.std()))

Best Parameters: {'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 200}
Cross-Validation Accuracy: 0.81 ± 0.01


# Model Evaluation

In [24]:
# Predict on test set
y_pred = best_model.predict(X_test)

# Classification report
print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=le.classes_))

# Confusion matrix
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))


Classification Report:
              precision    recall  f1-score   support

        high       0.78      0.70      0.74       782
         low       0.80      0.82      0.81       708
        low        1.00      1.00      1.00       764
         mid       0.67      0.72      0.69       732

    accuracy                           0.81      2986
   macro avg       0.81      0.81      0.81      2986
weighted avg       0.81      0.81      0.81      2986


Confusion Matrix:
[[549  51   1 181]
 [ 44 582   0  82]
 [  0   0 764   0]
 [107  98   1 526]]


# Save Model and Artifacts

In [25]:
# Save model and preprocessing objects
joblib.dump(best_model, 'models\preeclampsia_model_improved.pkl')
joblib.dump(le, 'models\label_encoder_improved.pkl')
joblib.dump(scaler, 'models\scaler.pkl')
joblib.dump(X_train.columns, 'models\columns_improved.pkl')
print("Model and artifacts saved.")

Model and artifacts saved.


# Define Load and Predict Function

In [26]:
def load_and_predict(new_data, model_path='models\preeclampsia_model_improved.pkl', 
                     le_path='models\label_encoder_improved.pkl', 
                     scaler_path='models\scaler.pkl', 
                     columns_path='models\columns_improved.pkl'):
    model = joblib.load(model_path)
    label_encoder = joblib.load(le_path)
    scaler = joblib.load(scaler_path)
    saved_columns = joblib.load(columns_path)
    
    # Feature engineering
    new_data['BP_ratio'] = new_data['Systolic BP'] / new_data['Diastolic BP'].replace(0, 1)
    new_data['BP_ratio'].replace([np.inf, -np.inf], new_data['BP_ratio'].median(), inplace=True)

    new_data['BMI_category'] = pd.cut(new_data['BMI'], 
                                     bins=[0, 18.5, 25, 30, 100], 
                                     labels=[0, 1, 2, 3], 
                                     include_lowest=True)
    new_data['BMI_category'] = new_data['BMI_category'].cat.codes
    new_data['BMI_category'].replace(-1, 1, inplace=True)

    # Reindex columns
    new_data = new_data.reindex(columns=saved_columns, fill_value=0)
    
    # Scale
    new_data = scaler.transform(new_data)
    
    # Predict
    predictions = model.predict(new_data)
    predicted_labels = label_encoder.inverse_transform(predictions)
    return predicted_labels

# Example Prediction

In [27]:
# Example new data
new_data = pd.DataFrame({
    'gravida': [3], 'parity': [1], 'Gestational_Age_Weeks': [22.2], 'Age': [25],
    'BMI': [21.0], 'diabetes': [0], 'Hypertension_History': [0],
    'Systolic BP': [110], 'Diastolic BP': [70], 'HB': [9.3], 'Fetal_Weight': [0.501],
    'Protein_Uria': [0], 'Amniotic_Fluid_Levels': [10.0]
})

# Make prediction
predictions = load_and_predict(new_data)
print("\nPrediction for new data:", predictions)


Prediction for new data: ['low']


In [32]:
# Example : High risk profile
new_data2 = pd.DataFrame({
    'gravida': [4],
    'parity': [0],
    'Gestational_Age_Weeks': [20],
    'Age': [35],
    'BMI': [32],
    'diabetes': [1],
    'Hypertension_History': [1],
    'Systolic BP': [160],
    'Diastolic BP': [110],
    'HB': [8.0],
    'Fetal_Weight': [0.4],
    'Protein_Uria': [1],
    'Amniotic_Fluid_Levels': [8]
})

prediction2 = load_and_predict(new_data2)
print("Prediction:", prediction2)

Prediction: ['high']
