# predicts whether a woman is likely to be pregnant or not, based on various factors such as:

- PCOS status (whether the woman has PCOS or not),
- Hormonal levels (e.g., FSH, LH, AMH),
- BMI,
- Cycle length,
- Lifestyle factors like exercise and fast food consumption,
- Other health metrics such as blood pressure and follicle sizes.

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression

In [2]:
#Read the Dataset
df = pd.read_csv('../datasets/PCOS_data.csv')
df.head()

# Display basic information and the first few rows
print(df.info())
display(df.head())

# Generate descriptive statistics
display(df.describe())

# Check for missing values
print(df.isnull().sum())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 541 entries, 0 to 540
Data columns (total 44 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Sl. No                  541 non-null    int64  
 1   Patient File No.        541 non-null    int64  
 2   PCOS (Y/N)              541 non-null    int64  
 3    Age (yrs)              541 non-null    int64  
 4   Weight (Kg)             541 non-null    float64
 5   Height(Cm)              541 non-null    float64
 6   BMI                     541 non-null    float64
 7   Blood Group             541 non-null    int64  
 8   Pulse rate(bpm)         541 non-null    int64  
 9   RR (breaths/min)        541 non-null    int64  
 10  Hb(g/dl)                541 non-null    float64
 11  Cycle(R/I)              541 non-null    int64  
 12  Cycle length(days)      541 non-null    int64  
 13  Marraige Status (Yrs)   540 non-null    float64
 14  Pregnant(Y/N)           541 non-null    in

Unnamed: 0,Sl. No,Patient File No.,PCOS (Y/N),Age (yrs),Weight (Kg),Height(Cm),BMI,Blood Group,Pulse rate(bpm),RR (breaths/min),...,Pimples(Y/N),Fast food (Y/N),Reg.Exercise(Y/N),BP _Systolic (mmHg),BP _Diastolic (mmHg),Follicle No. (L),Follicle No. (R),Avg. F size (L) (mm),Avg. F size (R) (mm),Endometrium (mm)
0,1,1,0,28,44.6,152.0,19.3,15,78,22,...,0,1.0,0,110,80,3,3,18.0,18.0,8.5
1,2,2,0,36,65.0,161.5,24.9,15,74,20,...,0,0.0,0,120,70,3,5,15.0,14.0,3.7
2,3,3,1,33,68.8,165.0,25.3,11,72,18,...,1,1.0,0,120,80,13,15,18.0,20.0,10.0
3,4,4,0,37,65.0,148.0,29.7,13,72,20,...,0,0.0,0,120,70,2,2,15.0,14.0,7.5
4,5,5,0,25,52.0,161.0,20.1,11,72,18,...,0,0.0,0,120,80,3,4,16.0,14.0,7.0


Unnamed: 0,Sl. No,Patient File No.,PCOS (Y/N),Age (yrs),Weight (Kg),Height(Cm),BMI,Blood Group,Pulse rate(bpm),RR (breaths/min),...,Pimples(Y/N),Fast food (Y/N),Reg.Exercise(Y/N),BP _Systolic (mmHg),BP _Diastolic (mmHg),Follicle No. (L),Follicle No. (R),Avg. F size (L) (mm),Avg. F size (R) (mm),Endometrium (mm)
count,541.0,541.0,541.0,541.0,541.0,541.0,541.0,541.0,541.0,541.0,...,541.0,540.0,541.0,541.0,541.0,541.0,541.0,541.0,541.0,541.0
mean,271.0,271.0,0.327172,31.430684,59.637153,156.484835,24.307579,13.802218,73.247689,19.243993,...,0.489834,0.514815,0.247689,114.661738,76.927911,6.12939,6.641405,15.018115,15.451701,8.475915
std,156.317519,156.317519,0.469615,5.411006,11.028287,6.033545,4.055129,1.840812,4.430285,1.688629,...,0.500359,0.500244,0.43207,7.384556,5.574112,4.229294,4.436889,3.566839,3.318848,2.165381
min,1.0,1.0,0.0,20.0,31.0,137.0,12.4,11.0,13.0,16.0,...,0.0,0.0,0.0,12.0,8.0,0.0,0.0,0.0,0.0,0.0
25%,136.0,136.0,0.0,28.0,52.0,152.0,21.6,13.0,72.0,18.0,...,0.0,0.0,0.0,110.0,70.0,3.0,3.0,13.0,13.0,7.0
50%,271.0,271.0,0.0,31.0,59.0,156.0,24.2,14.0,72.0,18.0,...,0.0,1.0,0.0,110.0,80.0,5.0,6.0,15.0,16.0,8.5
75%,406.0,406.0,1.0,35.0,65.0,160.0,26.6,15.0,74.0,20.0,...,1.0,1.0,0.0,120.0,80.0,9.0,10.0,18.0,18.0,9.8
max,541.0,541.0,1.0,48.0,108.0,180.0,38.9,18.0,82.0,28.0,...,1.0,1.0,1.0,140.0,100.0,22.0,20.0,24.0,24.0,18.0


Sl. No                    0
Patient File No.          0
PCOS (Y/N)                0
 Age (yrs)                0
Weight (Kg)               0
Height(Cm)                0
BMI                       0
Blood Group               0
Pulse rate(bpm)           0
RR (breaths/min)          0
Hb(g/dl)                  0
Cycle(R/I)                0
Cycle length(days)        0
Marraige Status (Yrs)     1
Pregnant(Y/N)             0
No. of abortions          0
  I   beta-HCG(mIU/mL)    0
II    beta-HCG(mIU/mL)    0
FSH(mIU/mL)               0
LH(mIU/mL)                0
FSH/LH                    0
Hip(inch)                 0
Waist(inch)               0
Waist:Hip Ratio           0
TSH (mIU/L)               0
AMH(ng/mL)                0
PRL(ng/mL)                0
Vit D3 (ng/mL)            0
PRG(ng/mL)                0
RBS(mg/dl)                0
Weight gain(Y/N)          0
hair growth(Y/N)          0
Skin darkening (Y/N)      0
Hair loss(Y/N)            0
Pimples(Y/N)              0
Fast food (Y/N)     

In [3]:

# Display cleaned column names for reference
print(df.columns.tolist())

['Sl. No', 'Patient File No.', 'PCOS (Y/N)', ' Age (yrs)', 'Weight (Kg)', 'Height(Cm) ', 'BMI', 'Blood Group', 'Pulse rate(bpm) ', 'RR (breaths/min)', 'Hb(g/dl)', 'Cycle(R/I)', 'Cycle length(days)', 'Marraige Status (Yrs)', 'Pregnant(Y/N)', 'No. of abortions', '  I   beta-HCG(mIU/mL)', 'II    beta-HCG(mIU/mL)', 'FSH(mIU/mL)', 'LH(mIU/mL)', 'FSH/LH', 'Hip(inch)', 'Waist(inch)', 'Waist:Hip Ratio', 'TSH (mIU/L)', 'AMH(ng/mL)', 'PRL(ng/mL)', 'Vit D3 (ng/mL)', 'PRG(ng/mL)', 'RBS(mg/dl)', 'Weight gain(Y/N)', 'hair growth(Y/N)', 'Skin darkening (Y/N)', 'Hair loss(Y/N)', 'Pimples(Y/N)', 'Fast food (Y/N)', 'Reg.Exercise(Y/N)', 'BP _Systolic (mmHg)', 'BP _Diastolic (mmHg)', 'Follicle No. (L)', 'Follicle No. (R)', 'Avg. F size (L) (mm)', 'Avg. F size (R) (mm)', 'Endometrium (mm)']


In [4]:
# 5. Normalize column names
df.columns = df.columns.str.strip().str.replace(' ', '_')
print(df.columns.tolist())

['Sl._No', 'Patient_File_No.', 'PCOS_(Y/N)', 'Age_(yrs)', 'Weight_(Kg)', 'Height(Cm)', 'BMI', 'Blood_Group', 'Pulse_rate(bpm)', 'RR_(breaths/min)', 'Hb(g/dl)', 'Cycle(R/I)', 'Cycle_length(days)', 'Marraige_Status_(Yrs)', 'Pregnant(Y/N)', 'No._of_abortions', 'I___beta-HCG(mIU/mL)', 'II____beta-HCG(mIU/mL)', 'FSH(mIU/mL)', 'LH(mIU/mL)', 'FSH/LH', 'Hip(inch)', 'Waist(inch)', 'Waist:Hip_Ratio', 'TSH_(mIU/L)', 'AMH(ng/mL)', 'PRL(ng/mL)', 'Vit_D3_(ng/mL)', 'PRG(ng/mL)', 'RBS(mg/dl)', 'Weight_gain(Y/N)', 'hair_growth(Y/N)', 'Skin_darkening_(Y/N)', 'Hair_loss(Y/N)', 'Pimples(Y/N)', 'Fast_food_(Y/N)', 'Reg.Exercise(Y/N)', 'BP__Systolic_(mmHg)', 'BP__Diastolic_(mmHg)', 'Follicle_No._(L)', 'Follicle_No._(R)', 'Avg._F_size_(L)_(mm)', 'Avg._F_size_(R)_(mm)', 'Endometrium_(mm)']


In [5]:
# Correctly handle numeric columns with special characters and potential non-numeric data
numeric_columns = ['BMI', 'Age_(yrs)', 'Weight_(Kg)', 'Waist:Hip_Ratio', 
                   'I___beta-HCG(mIU/mL)', 'II____beta-HCG(mIU/mL)', 
                   'FSH(mIU/mL)', 'LH(mIU/mL)', 'AMH(ng/mL)', 
                   'Cycle_length(days)', 'Endometrium_(mm)', 
                   'BP__Systolic_(mmHg)', 'BP__Diastolic_(mmHg)']

for col in numeric_columns:
    df[col] = pd.to_numeric(df[col].replace(r'^\.+$', '', regex=True), errors='coerce')
    df[col] = df[col].fillna(df[col].median())

# Handle categorical columns and fill missing values with mode
categorical_columns = ['PCOS_(Y/N)', 'Pregnant(Y/N)', 'Weight_gain(Y/N)', 
                       'hair_growth(Y/N)', 'Skin_darkening_(Y/N)', 
                       'Hair_loss(Y/N)', 'Pimples(Y/N)', 
                       'Fast_food_(Y/N)', 'Reg.Exercise(Y/N)', 
                       'Blood_Group']

for col in categorical_columns:
    df[col] = df[col].fillna(df[col].mode()[0])

In [6]:
# Fill missing numeric values with median
df.fillna(df.median(), inplace=True)


In [8]:
print(df.columns.tolist())

['Sl._No', 'Patient_File_No.', 'PCOS_(Y/N)', 'Age_(yrs)', 'Weight_(Kg)', 'Height(Cm)', 'BMI', 'Blood_Group', 'Pulse_rate(bpm)', 'RR_(breaths/min)', 'Hb(g/dl)', 'Cycle(R/I)', 'Cycle_length(days)', 'Marraige_Status_(Yrs)', 'Pregnant(Y/N)', 'No._of_abortions', 'I___beta-HCG(mIU/mL)', 'II____beta-HCG(mIU/mL)', 'FSH(mIU/mL)', 'LH(mIU/mL)', 'FSH/LH', 'Hip(inch)', 'Waist(inch)', 'Waist:Hip_Ratio', 'TSH_(mIU/L)', 'AMH(ng/mL)', 'PRL(ng/mL)', 'Vit_D3_(ng/mL)', 'PRG(ng/mL)', 'RBS(mg/dl)', 'Weight_gain(Y/N)', 'hair_growth(Y/N)', 'Skin_darkening_(Y/N)', 'Hair_loss(Y/N)', 'Pimples(Y/N)', 'Fast_food_(Y/N)', 'Reg.Exercise(Y/N)', 'BP__Systolic_(mmHg)', 'BP__Diastolic_(mmHg)', 'Follicle_No._(L)', 'Follicle_No._(R)', 'Avg._F_size_(L)_(mm)', 'Avg._F_size_(R)_(mm)', 'Endometrium_(mm)', 'BMI_range']


In [10]:
# Feature engineering
df['BMI_range'] = pd.cut(df['BMI'], bins=[0, 18.5, 24.9, 29.9, float('inf')], labels=[0, 1, 2, 3])
df['Age_group'] = pd.cut(df['Age_(yrs)'], bins=[0, 30, 40, 50, float('inf')], labels=[0, 1, 2, 3])

# Select features and target for predicting fertility outcome based on PCOS and other factors
X = df.drop(['Pregnant(Y/N)', 'Sl._No', 'Patient_File_No.'], axis=1)
y = df['Pregnant(Y/N)']  # Target column

In [11]:
# Encoding categorical features
X = pd.get_dummies(X)


In [12]:
# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [13]:
# Scale the features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


In [14]:
# Initialize the models
models = {
    'XGBClassifier': XGBClassifier(use_label_encoder=False, eval_metric='logloss'),
    'LGBMClassifier': LGBMClassifier(),
    'CatBoostClassifier': CatBoostClassifier(verbose=0)
}

In [15]:
# Train and evaluate models
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f'{name} Accuracy: {accuracy:.4f}')


Parameters: { "use_label_encoder" } are not used.



XGBClassifier Accuracy: 0.8991
[LightGBM] [Info] Number of positive: 168, number of negative: 264
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000244 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1751
[LightGBM] [Info] Number of data points in the train set: 432, number of used features: 48
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.388889 -> initscore=-0.451985
[LightGBM] [Info] Start training from score -0.451985
LGBMClassifier Accuracy: 0.9083
CatBoostClassifier Accuracy: 0.9266


In [16]:
# Stacking classifiers for improved accuracy
estimators = [
    ('xgb', XGBClassifier(use_label_encoder=False, eval_metric='logloss')),
    ('lgbm', LGBMClassifier()),
    ('cat', CatBoostClassifier(verbose=0))
]

stack_clf = StackingClassifier(estimators=estimators, final_estimator=LogisticRegression(), cv=5)
stack_clf.fit(X_train, y_train)

Parameters: { "use_label_encoder" } are not used.



[LightGBM] [Info] Number of positive: 168, number of negative: 264
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000238 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1751
[LightGBM] [Info] Number of data points in the train set: 432, number of used features: 48
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.388889 -> initscore=-0.451985
[LightGBM] [Info] Start training from score -0.451985


Parameters: { "use_label_encoder" } are not used.



[LightGBM] [Info] Number of positive: 134, number of negative: 211
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000190 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1455
[LightGBM] [Info] Number of data points in the train set: 345, number of used features: 47
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.388406 -> initscore=-0.454018
[LightGBM] [Info] Start training from score -0.454018
[LightGBM] [Info] Number of positive: 134, number of negative: 211
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000188 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1464
[LightGBM] [Info] Number of data points in the train set: 345, number of used features: 48
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.388406 -> initscore=-0.454018
[LightGBM] [Info] Start training from score -0.454018
[LightGBM] [Info] Number

In [17]:
# Prediction and evaluation
y_pred_stack = stack_clf.predict(X_test)
accuracy_stack = accuracy_score(y_test, y_pred_stack)
print(f'Accuracy of the stacked model: {accuracy_stack:.4f}')

Accuracy of the stacked model: 0.9266


# Accuracy of the stacked model: 0.9266