In [31]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from imblearn.over_sampling import SMOTE
import optuna
import numpy as np



In [2]:
# Read the Dataset
df = pd.read_csv('../datasets/PCOS_data.csv')

In [3]:
# Data Cleaning and Preprocessing
df.columns = df.columns.str.strip().str.replace(' ', '_')

In [6]:
# Correctly handle numeric columns with special characters and potential non-numeric data
numeric_columns = ['BMI', 'Age_(yrs)', 'Weight_(Kg)', 'Waist:Hip_Ratio', 
                   'I___beta-HCG(mIU/mL)', 'II____beta-HCG(mIU/mL)', 
                   'FSH(mIU/mL)', 'LH(mIU/mL)', 'AMH(ng/mL)', 
                   'Cycle_length(days)', 'Endometrium_(mm)', 
                   'BP__Systolic_(mmHg)', 'BP__Diastolic_(mmHg)',
                   'Follicle_No._(L)', 'Follicle_No._(R)', 'Avg._F_size_(L)_(mm)', 'Avg._F_size_(R)_(mm)']

for col in numeric_columns:
    df[col] = pd.to_numeric(df[col].replace(r'^\.+$', '', regex=True), errors='coerce')
    df[col] = df[col].fillna(df[col].median())

# Handle categorical columns and fill missing values with mode
categorical_columns = ['PCOS_(Y/N)', 'Weight_gain(Y/N)', 
                       'hair_growth(Y/N)', 'Skin_darkening_(Y/N)', 
                       'Hair_loss(Y/N)', 'Pimples(Y/N)', 
                       'Fast_food_(Y/N)', 'Reg.Exercise(Y/N)', 
                       'Blood_Group']

for col in categorical_columns:
    df[col] = df[col].fillna(df[col].mode()[0])

# Fill missing numeric values with median
df.fillna(df.median(), inplace=True)

In [7]:
# Display cleaned dataset
print(df.isnull().sum())

Sl._No                    0
Patient_File_No.          0
PCOS_(Y/N)                0
Age_(yrs)                 0
Weight_(Kg)               0
Height(Cm)                0
BMI                       0
Blood_Group               0
Pulse_rate(bpm)           0
RR_(breaths/min)          0
Hb(g/dl)                  0
Cycle(R/I)                0
Cycle_length(days)        0
Marraige_Status_(Yrs)     0
Pregnant(Y/N)             0
No._of_abortions          0
I___beta-HCG(mIU/mL)      0
II____beta-HCG(mIU/mL)    0
FSH(mIU/mL)               0
LH(mIU/mL)                0
FSH/LH                    0
Hip(inch)                 0
Waist(inch)               0
Waist:Hip_Ratio           0
TSH_(mIU/L)               0
AMH(ng/mL)                0
PRL(ng/mL)                0
Vit_D3_(ng/mL)            0
PRG(ng/mL)                0
RBS(mg/dl)                0
Weight_gain(Y/N)          0
hair_growth(Y/N)          0
Skin_darkening_(Y/N)      0
Hair_loss(Y/N)            0
Pimples(Y/N)              0
Fast_food_(Y/N)     

In [8]:
# Define the features and target variable for ovulatory disorder prediction
X = df[['Age_(yrs)', 'BMI', 'Cycle_length(days)', 'PCOS_(Y/N)', 'AMH(ng/mL)', 'PRG(ng/mL)', 'I___beta-HCG(mIU/mL)']]
y = df['PCOS_(Y/N)']  # Assuming PCOS_(Y/N) indicates ovulatory disorders

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)


In [12]:
# Scale the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Apply SMOTE to handle class imbalance
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train_scaled, y_train)

# Check the class distribution after SMOTE
from collections import Counter
print(f"Class distribution after SMOTE: {Counter(y_train_smote)}")


Class distribution after SMOTE: Counter({0: 291, 1: 291})


In [13]:
# Define a function to train and evaluate models
def evaluate_model(model, X_train, X_test, y_train, y_test):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    return accuracy


In [20]:
# Import necessary libraries
import optuna
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

In [36]:
# Assuming necessary imports have been done above
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
import optuna

# Define the evaluation set (validation data)
eval_set = [(X_test_scaled, y_test)]

# Optuna hyperparameter tuning for XGBoost
def objective(trial):
    param = {
        'max_depth': trial.suggest_int('max_depth', 3, 20),
        'learning_rate': trial.suggest_float('learning_rate', 1e-4, 0.3),
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-3, 10),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-3, 10),
    }
    
    model = XGBClassifier(**param, random_state=42, use_label_encoder=False, eval_metric='logloss')
    model.fit(X_train_smote, y_train_smote, eval_set=eval_set, verbose=False)
    y_pred = model.predict(X_test_scaled)
    
    return accuracy_score(y_test, y_pred)

# Create the Optuna study
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=50)

# Get the best parameters
best_params = study.best_trial.params
print(f"Best XGBoost parameters: {best_params}")

# Train the XGBoost model with the best parameters
xgb_best = XGBClassifier(**best_params, random_state=42, use_label_encoder=False, eval_metric='logloss')
xgb_best.fit(X_train_smote, y_train_smote, eval_set=eval_set, verbose=False)

# Predict using the best model
y_pred_best = xgb_best.predict(X_test_scaled)

# Evaluate accuracy
best_accuracy = accuracy_score(y_test, y_pred_best)
print(f"Tuned XGBoost Model Accuracy: {best_accuracy * 100:.2f}%")


[I 2024-10-13 03:45:59,706] A new study created in memory with name: no-name-4d1dd513-593b-44f6-bd57-a68e46e64921
Parameters: { "use_label_encoder" } are not used.

[I 2024-10-13 03:45:59,774] Trial 0 finished with value: 1.0 and parameters: {'max_depth': 8, 'learning_rate': 0.2888152159956615, 'n_estimators': 358, 'subsample': 0.6156590737601131, 'colsample_bytree': 0.735716418711611, 'reg_alpha': 0.17325603778057172, 'reg_lambda': 1.5358291184300898}. Best is trial 0 with value: 1.0.
Parameters: { "use_label_encoder" } are not used.

[I 2024-10-13 03:45:59,801] Trial 1 finished with value: 1.0 and parameters: {'max_depth': 5, 'learning_rate': 0.17796981675766627, 'n_estimators': 144, 'subsample': 0.9008887709462658, 'colsample_bytree': 0.6639377516417587, 'reg_alpha': 2.9817481171005373, 'reg_lambda': 3.8773646151858285}. Best is trial 0 with value: 1.0.
Parameters: { "use_label_encoder" } are not used.

[I 2024-10-13 03:45:59,905] Trial 2 finished with value: 1.0 and parameters: {'m

Best XGBoost parameters: {'max_depth': 8, 'learning_rate': 0.2888152159956615, 'n_estimators': 358, 'subsample': 0.6156590737601131, 'colsample_bytree': 0.735716418711611, 'reg_alpha': 0.17325603778057172, 'reg_lambda': 1.5358291184300898}
Tuned XGBoost Model Accuracy: 100.00%


Parameters: { "use_label_encoder" } are not used.



In [38]:
# Train XGBoost with the best parameters (without early_stopping_rounds)
xgb_best = XGBClassifier(**best_params, random_state=42, use_label_encoder=False, eval_metric='logloss')

# Fit the model
xgb_best.fit(X_train_smote, y_train_smote, eval_set=[(X_test_scaled, y_test)], verbose=False)

# Evaluate the final model
y_pred_best = xgb_best.predict(X_test_scaled)

# Calculate and print the final accuracy
best_accuracy = accuracy_score(y_test, y_pred_best)
print(f"Tuned XGBoost Model Accuracy: {best_accuracy * 100:.2f}%")


Tuned XGBoost Model Accuracy: 100.00%


Parameters: { "use_label_encoder" } are not used.



In [39]:
from sklearn.model_selection import cross_val_score

# Perform cross-validation on the final tuned XGBoost model
cv_scores = cross_val_score(xgb_best, X_train_smote, y_train_smote, cv=5)
print(f"Cross-Validation Accuracy: {np.mean(cv_scores) * 100:.2f}%")


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



Cross-Validation Accuracy: 100.00%


In [42]:
# Display final results
print(f"Final XGBoost Model Accuracy: {best_accuracy * 100:.2f}%")
print(f"Final Cross-Validation Accuracy: {np.mean(cv_scores) * 100:.2f}%")


Final XGBoost Model Accuracy: 100.00%
Final Cross-Validation Accuracy: 100.00%


# SUPER SUS. NOT POSSIBLE. CHECK 
