In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, StackingRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
import optuna


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
#Read the Dataset
df = pd.read_csv('../datasets/PCOS_data.csv')
df.head()

# Display basic information and the first few rows
print(df.info())
display(df.head())

# Generate descriptive statistics
display(df.describe())

# Check for missing values
print(df.isnull().sum())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 541 entries, 0 to 540
Data columns (total 44 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Sl. No                  541 non-null    int64  
 1   Patient File No.        541 non-null    int64  
 2   PCOS (Y/N)              541 non-null    int64  
 3    Age (yrs)              541 non-null    int64  
 4   Weight (Kg)             541 non-null    float64
 5   Height(Cm)              541 non-null    float64
 6   BMI                     541 non-null    float64
 7   Blood Group             541 non-null    int64  
 8   Pulse rate(bpm)         541 non-null    int64  
 9   RR (breaths/min)        541 non-null    int64  
 10  Hb(g/dl)                541 non-null    float64
 11  Cycle(R/I)              541 non-null    int64  
 12  Cycle length(days)      541 non-null    int64  
 13  Marraige Status (Yrs)   540 non-null    float64
 14  Pregnant(Y/N)           541 non-null    in

Unnamed: 0,Sl. No,Patient File No.,PCOS (Y/N),Age (yrs),Weight (Kg),Height(Cm),BMI,Blood Group,Pulse rate(bpm),RR (breaths/min),...,Pimples(Y/N),Fast food (Y/N),Reg.Exercise(Y/N),BP _Systolic (mmHg),BP _Diastolic (mmHg),Follicle No. (L),Follicle No. (R),Avg. F size (L) (mm),Avg. F size (R) (mm),Endometrium (mm)
0,1,1,0,28,44.6,152.0,19.3,15,78,22,...,0,1.0,0,110,80,3,3,18.0,18.0,8.5
1,2,2,0,36,65.0,161.5,24.9,15,74,20,...,0,0.0,0,120,70,3,5,15.0,14.0,3.7
2,3,3,1,33,68.8,165.0,25.3,11,72,18,...,1,1.0,0,120,80,13,15,18.0,20.0,10.0
3,4,4,0,37,65.0,148.0,29.7,13,72,20,...,0,0.0,0,120,70,2,2,15.0,14.0,7.5
4,5,5,0,25,52.0,161.0,20.1,11,72,18,...,0,0.0,0,120,80,3,4,16.0,14.0,7.0


Unnamed: 0,Sl. No,Patient File No.,PCOS (Y/N),Age (yrs),Weight (Kg),Height(Cm),BMI,Blood Group,Pulse rate(bpm),RR (breaths/min),...,Pimples(Y/N),Fast food (Y/N),Reg.Exercise(Y/N),BP _Systolic (mmHg),BP _Diastolic (mmHg),Follicle No. (L),Follicle No. (R),Avg. F size (L) (mm),Avg. F size (R) (mm),Endometrium (mm)
count,541.0,541.0,541.0,541.0,541.0,541.0,541.0,541.0,541.0,541.0,...,541.0,540.0,541.0,541.0,541.0,541.0,541.0,541.0,541.0,541.0
mean,271.0,271.0,0.327172,31.430684,59.637153,156.484835,24.307579,13.802218,73.247689,19.243993,...,0.489834,0.514815,0.247689,114.661738,76.927911,6.12939,6.641405,15.018115,15.451701,8.475915
std,156.317519,156.317519,0.469615,5.411006,11.028287,6.033545,4.055129,1.840812,4.430285,1.688629,...,0.500359,0.500244,0.43207,7.384556,5.574112,4.229294,4.436889,3.566839,3.318848,2.165381
min,1.0,1.0,0.0,20.0,31.0,137.0,12.4,11.0,13.0,16.0,...,0.0,0.0,0.0,12.0,8.0,0.0,0.0,0.0,0.0,0.0
25%,136.0,136.0,0.0,28.0,52.0,152.0,21.6,13.0,72.0,18.0,...,0.0,0.0,0.0,110.0,70.0,3.0,3.0,13.0,13.0,7.0
50%,271.0,271.0,0.0,31.0,59.0,156.0,24.2,14.0,72.0,18.0,...,0.0,1.0,0.0,110.0,80.0,5.0,6.0,15.0,16.0,8.5
75%,406.0,406.0,1.0,35.0,65.0,160.0,26.6,15.0,74.0,20.0,...,1.0,1.0,0.0,120.0,80.0,9.0,10.0,18.0,18.0,9.8
max,541.0,541.0,1.0,48.0,108.0,180.0,38.9,18.0,82.0,28.0,...,1.0,1.0,1.0,140.0,100.0,22.0,20.0,24.0,24.0,18.0


Sl. No                    0
Patient File No.          0
PCOS (Y/N)                0
 Age (yrs)                0
Weight (Kg)               0
Height(Cm)                0
BMI                       0
Blood Group               0
Pulse rate(bpm)           0
RR (breaths/min)          0
Hb(g/dl)                  0
Cycle(R/I)                0
Cycle length(days)        0
Marraige Status (Yrs)     1
Pregnant(Y/N)             0
No. of abortions          0
  I   beta-HCG(mIU/mL)    0
II    beta-HCG(mIU/mL)    0
FSH(mIU/mL)               0
LH(mIU/mL)                0
FSH/LH                    0
Hip(inch)                 0
Waist(inch)               0
Waist:Hip Ratio           0
TSH (mIU/L)               0
AMH(ng/mL)                0
PRL(ng/mL)                0
Vit D3 (ng/mL)            0
PRG(ng/mL)                0
RBS(mg/dl)                0
Weight gain(Y/N)          0
hair growth(Y/N)          0
Skin darkening (Y/N)      0
Hair loss(Y/N)            0
Pimples(Y/N)              0
Fast food (Y/N)     

In [22]:
# Replace or remove all characters that might be problematic for LightGBM
df.columns = df.columns.str.replace(r'[\[\],:{}()\'"\s]+', '', regex=True)

In [23]:
# 5. Normalize column names
df.columns = df.columns.str.strip().str.replace(' ', '_')
print(df.columns.tolist())

['Sl._No', 'Patient_File_No.', 'PCOS_Y/N', 'Age_yrs', 'Weight_Kg', 'HeightCm', 'BMI', 'Blood_Group', 'Pulse_ratebpm', 'RR_breaths/min', 'Hbg/dl', 'CycleR/I', 'Cycle_lengthdays', 'Marraige_Status_Yrs', 'PregnantY/N', 'No._of_abortions', 'I___beta-HCGmIU/mL', 'II____beta-HCGmIU/mL', 'FSHmIU/mL', 'LHmIU/mL', 'FSH/LH', 'Hipinch', 'Waistinch', 'WaistHip_Ratio', 'TSH_mIU/L', 'AMHng/mL', 'PRLng/mL', 'Vit_D3_ng/mL', 'PRGng/mL', 'RBSmg/dl', 'Weight_gainY/N', 'hair_growthY/N', 'Skin_darkening_Y/N', 'Hair_lossY/N', 'PimplesY/N', 'Fast_food_Y/N', 'Reg.ExerciseY/N', 'BP__Systolic_mmHg', 'BP__Diastolic_mmHg', 'Follicle_No._L', 'Follicle_No._R', 'Avg._F_size_L_mm', 'Avg._F_size_R_mm', 'Endometrium_mm']


In [4]:
# Correctly handle numeric columns with special characters and potential non-numeric data
numeric_columns = ['BMI', 'Age_(yrs)', 'Weight_(Kg)', 'Waist:Hip_Ratio', 
                   'I___beta-HCG(mIU/mL)', 'II____beta-HCG(mIU/mL)', 
                   'FSH(mIU/mL)', 'LH(mIU/mL)', 'AMH(ng/mL)', 
                   'Cycle_length(days)', 'Endometrium_(mm)', 
                   'BP__Systolic_(mmHg)', 'BP__Diastolic_(mmHg)']

for col in numeric_columns:
    df[col] = pd.to_numeric(df[col].replace(r'^\.+$', '', regex=True), errors='coerce')
    df[col] = df[col].fillna(df[col].median())

# Handle categorical columns and fill missing values with mode
categorical_columns = ['PCOS_(Y/N)', 'Pregnant(Y/N)', 'Weight_gain(Y/N)', 
                       'hair_growth(Y/N)', 'Skin_darkening_(Y/N)', 
                       'Hair_loss(Y/N)', 'Pimples(Y/N)', 
                       'Fast_food_(Y/N)', 'Reg.Exercise(Y/N)', 
                       'Blood_Group']

for col in categorical_columns:
    df[col] = df[col].fillna(df[col].mode()[0])

In [12]:
# 5. Normalize column names
df.columns = df.columns.str.strip().str.replace(' ', '_')
print(df.columns.tolist())

['Sl._No', 'Patient_File_No.', 'PCOS_(Y/N)', 'Age_(yrs)', 'Weight_(Kg)', 'Height(Cm)', 'BMI', 'Blood_Group', 'Pulse_rate(bpm)', 'RR_(breaths/min)', 'Hb(g/dl)', 'Cycle(R/I)', 'Cycle_length(days)', 'Marraige_Status_(Yrs)', 'Pregnant(Y/N)', 'No._of_abortions', 'I___beta-HCG(mIU/mL)', 'II____beta-HCG(mIU/mL)', 'FSH(mIU/mL)', 'LH(mIU/mL)', 'FSH/LH', 'Hip(inch)', 'Waist(inch)', 'Waist:Hip_Ratio', 'TSH_(mIU/L)', 'AMH(ng/mL)', 'PRL(ng/mL)', 'Vit_D3_(ng/mL)', 'PRG(ng/mL)', 'RBS(mg/dl)', 'Weight_gain(Y/N)', 'hair_growth(Y/N)', 'Skin_darkening_(Y/N)', 'Hair_loss(Y/N)', 'Pimples(Y/N)', 'Fast_food_(Y/N)', 'Reg.Exercise(Y/N)', 'BP__Systolic_(mmHg)', 'BP__Diastolic_(mmHg)', 'Follicle_No._(L)', 'Follicle_No._(R)', 'Avg._F_size_(L)_(mm)', 'Avg._F_size_(R)_(mm)', 'Endometrium_(mm)']


In [5]:
# Encode categorical variables using LabelEncoder
labelencoder = LabelEncoder()
for column in categorical_columns:
    df[column] = labelencoder.fit_transform(df[column])

In [24]:
print(df.columns)


Index(['Sl._No', 'Patient_File_No.', 'PCOS_Y/N', 'Age_yrs', 'Weight_Kg',
       'HeightCm', 'BMI', 'Blood_Group', 'Pulse_ratebpm', 'RR_breaths/min',
       'Hbg/dl', 'CycleR/I', 'Cycle_lengthdays', 'Marraige_Status_Yrs',
       'PregnantY/N', 'No._of_abortions', 'I___beta-HCGmIU/mL',
       'II____beta-HCGmIU/mL', 'FSHmIU/mL', 'LHmIU/mL', 'FSH/LH', 'Hipinch',
       'Waistinch', 'WaistHip_Ratio', 'TSH_mIU/L', 'AMHng/mL', 'PRLng/mL',
       'Vit_D3_ng/mL', 'PRGng/mL', 'RBSmg/dl', 'Weight_gainY/N',
       'hair_growthY/N', 'Skin_darkening_Y/N', 'Hair_lossY/N', 'PimplesY/N',
       'Fast_food_Y/N', 'Reg.ExerciseY/N', 'BP__Systolic_mmHg',
       'BP__Diastolic_mmHg', 'Follicle_No._L', 'Follicle_No._R',
       'Avg._F_size_L_mm', 'Avg._F_size_R_mm', 'Endometrium_mm'],
      dtype='object')


In [27]:
features = ['Age_yrs', 'Weight_Kg', 'BMI', 'WaistHip_Ratio']  # Adjusted with leading spaces where necessary
target = 'Cycle_lengthdays'  # Ensure the target name is also correct


In [28]:
# Make sure column names are accessed correctly
X = df[features]
y = df[target]

In [29]:
# Proceed with the splitting and modeling
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [30]:
# Initialize models
models = {
    "Linear Regression": LinearRegression(),
    "Random Forest": RandomForestRegressor(random_state=42),
    "XGBoost": XGBRegressor(random_state=42),
    "LightGBM": LGBMRegressor(random_state=42),
    "CatBoost": CatBoostRegressor(verbose=0, random_state=42)
}

In [31]:
# Function to train and evaluate models
def train_evaluate(models, X_train, X_test, y_train, y_test):
    for name, model in models.items():
        model.fit(X_train, y_train)
        predictions = model.predict(X_test)
        mse = mean_squared_error(y_test, predictions)
        r2 = r2_score(y_test, predictions)
        print(f"{name} - MSE: {mse:.4f}, R2: {r2:.4f}")

train_evaluate(models, X_train, X_test, y_train, y_test)

Linear Regression - MSE: 2.3163, R2: -0.0070
Random Forest - MSE: 2.9695, R2: -0.2909
XGBoost - MSE: 4.1759, R2: -0.8154
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000052 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 181
[LightGBM] [Info] Number of data points in the train set: 378, number of used features: 4
[LightGBM] [Info] Start training from score 4.907407
LightGBM - MSE: 2.9489, R2: -0.2820
CatBoost - MSE: 3.1569, R2: -0.3724


In [34]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split

# Load the dataset
df = pd.read_csv('../datasets/PCOS_data.csv')

# Normalize column names
df.columns = df.columns.str.strip().str.replace(' ', '_').str.replace(':', '')

# Identify numeric and categorical columns
numeric_columns = df.select_dtypes(include=[np.number]).columns.tolist()
categorical_columns = df.select_dtypes(exclude=[np.number]).columns.tolist()

# Fill missing values
df[numeric_columns] = df[numeric_columns].apply(lambda x: x.fillna(x.median()), axis=0)
df[categorical_columns] = df[categorical_columns].apply(lambda x: x.fillna(x.mode().iloc[0]), axis=0)

# Encode categorical variables
for col in categorical_columns:
    df[col] = LabelEncoder().fit_transform(df[col])

# Feature scaling
scaler = StandardScaler()
df[numeric_columns] = scaler.fit_transform(df[numeric_columns])

# Split the data
X = df.drop('Cycle_length(days)', axis=1)  # Adjust based on your actual target column
y = df['Cycle_length(days)']  # Adjust based on your actual target column
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [35]:
print(df.columns.tolist())


['Sl._No', 'Patient_File_No.', 'PCOS_(Y/N)', 'Age_(yrs)', 'Weight_(Kg)', 'Height(Cm)', 'BMI', 'Blood_Group', 'Pulse_rate(bpm)', 'RR_(breaths/min)', 'Hb(g/dl)', 'Cycle(R/I)', 'Cycle_length(days)', 'Marraige_Status_(Yrs)', 'Pregnant(Y/N)', 'No._of_abortions', 'I___beta-HCG(mIU/mL)', 'II____beta-HCG(mIU/mL)', 'FSH(mIU/mL)', 'LH(mIU/mL)', 'FSH/LH', 'Hip(inch)', 'Waist(inch)', 'WaistHip_Ratio', 'TSH_(mIU/L)', 'AMH(ng/mL)', 'PRL(ng/mL)', 'Vit_D3_(ng/mL)', 'PRG(ng/mL)', 'RBS(mg/dl)', 'Weight_gain(Y/N)', 'hair_growth(Y/N)', 'Skin_darkening_(Y/N)', 'Hair_loss(Y/N)', 'Pimples(Y/N)', 'Fast_food_(Y/N)', 'Reg.Exercise(Y/N)', 'BP__Systolic_(mmHg)', 'BP__Diastolic_(mmHg)', 'Follicle_No._(L)', 'Follicle_No._(R)', 'Avg._F_size_(L)_(mm)', 'Avg._F_size_(R)_(mm)', 'Endometrium_(mm)']


In [36]:
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.metrics import r2_score, mean_squared_error

# Initialize models
models = {
    'XGBRegressor': XGBRegressor(random_state=42),
    'LGBMRegressor': LGBMRegressor(random_state=42),
    'CatBoostRegressor': CatBoostRegressor(random_state=42, verbose=0)
}

# Function to train and evaluate models
def train_evaluate(models, X_train, X_test, y_train, y_test):
    for name, model in models.items():
        model.fit(X_train, y_train)
        predictions = model.predict(X_test)
        mse = mean_squared_error(y_test, predictions)
        r2 = r2_score(y_test, predictions)
        print(f"{name} - MSE: {mse:.4f}, R2: {r2:.4f}")

train_evaluate(models, X_train, X_test, y_train, y_test)


XGBRegressor - MSE: 1.2537, R2: -0.1288
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000276 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2032
[LightGBM] [Info] Number of data points in the train set: 432, number of used features: 43
[LightGBM] [Info] Start training from score -0.011565
LGBMRegressor - MSE: 1.1918, R2: -0.0731
CatBoostRegressor - MSE: 0.9690, R2: 0.1275


In [37]:
import optuna

def objective(trial):
    model = XGBRegressor(
        n_estimators=trial.suggest_int('n_estimators', 100, 1000),
        max_depth=trial.suggest_int('max_depth', 3, 20),
        learning_rate=trial.suggest_uniform('learning_rate', 0.01, 0.3),
        subsample=trial.suggest_uniform('subsample', 0.5, 1.0),
        colsample_bytree=trial.suggest_uniform('colsample_bytree', 0.5, 1.0),
        random_state=42
    )
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    return np.sqrt(mean_squared_error(y_test, preds))

study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=100)
print('Best trial:', study.best_trial.params)


[I 2024-10-12 23:52:09,212] A new study created in memory with name: no-name-3ec34e68-b4b4-48de-ae82-5eb03225c0f6
  learning_rate=trial.suggest_uniform('learning_rate', 0.01, 0.3),
  subsample=trial.suggest_uniform('subsample', 0.5, 1.0),
  colsample_bytree=trial.suggest_uniform('colsample_bytree', 0.5, 1.0),
[I 2024-10-12 23:52:09,352] Trial 0 finished with value: 1.1278315093517435 and parameters: {'n_estimators': 288, 'max_depth': 8, 'learning_rate': 0.22807850203785465, 'subsample': 0.9535265210134225, 'colsample_bytree': 0.7288354148262012}. Best is trial 0 with value: 1.1278315093517435.
  learning_rate=trial.suggest_uniform('learning_rate', 0.01, 0.3),
  subsample=trial.suggest_uniform('subsample', 0.5, 1.0),
  colsample_bytree=trial.suggest_uniform('colsample_bytree', 0.5, 1.0),
[I 2024-10-12 23:52:09,591] Trial 1 finished with value: 1.1783971208121697 and parameters: {'n_estimators': 468, 'max_depth': 10, 'learning_rate': 0.2313522507726218, 'subsample': 0.547516569062942, 'c

Best trial: {'n_estimators': 762, 'max_depth': 6, 'learning_rate': 0.21474743269923785, 'subsample': 0.8902021509852228, 'colsample_bytree': 0.5358968130806232}


In [38]:
best_params = study.best_trial.params
model = XGBRegressor(**best_params, random_state=42)
model.fit(X_train, y_train)
predictions = model.predict(X_test)
mse = mean_squared_error(y_test, predictions)
r2 = r2_score(y_test, predictions)
print(f"Optimized XGBRegressor - MSE: {mse:.4f}, R2: {r2:.4f}")


Optimized XGBRegressor - MSE: 0.8403, R2: 0.2434
