In [16]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

In [8]:
data = pd.read_csv('PCOS_data.csv')


Step 1: Data Preprocessing


In [9]:
# Identify numeric and categorical columns based on dtype
numeric_features = data.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_features = data.select_dtypes(exclude=['int64', 'float64']).columns.tolist()

In [12]:
# Remove the target variable from features
if 'PCOS (Y/N)' in numeric_features:
    numeric_features.remove('PCOS (Y/N)')

# Pipeline for numerical features
numerical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

In [17]:
# Pipeline for categorical features
categorical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])


In [18]:
# Combine pipelines
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_pipeline, numeric_features),
        ('cat', categorical_pipeline, categorical_features)
    ])


In [19]:
# Prepare the features and target variable
X = data.drop('PCOS (Y/N)', axis=1)
y = data['PCOS (Y/N)']

In [20]:
# Splitting the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [21]:
# Fit the preprocessor and transform the data
X_train_preprocessed = preprocessor.fit_transform(X_train)
X_test_preprocessed = preprocessor.transform(X_test)


In [22]:
# Checking for any non-numeric data in numeric columns
for col in numeric_features:
    if data[col].dtype == 'object':
        print(f"Non-numeric data found in {col}: ", data[col].unique())


Step 1: Identify Hormone-Related Features
From your dataset, the hormone-related features include:

- FSH (mIU/mL): Follicle-stimulating hormone
- LH (mIU/mL): Luteinizing hormone
- FSH/LH Ratio: Ratio of FSH to LH, which is often used as an indicator in diagnosing PCOS
- PRL (ng/mL): Prolactin
- TSH (mIU/L): Thyroid-stimulating hormone (indirectly related but can impact overall hormonal balance)
- AMH (ng/mL): Anti-Müllerian hormone, which is often elevated in PCOS

In [27]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer

# Re-define the preprocessing pipeline
hormone_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),  # Impute with median
    ('scaler', StandardScaler())  # Scale the features
])

# Re-create the column transformer
preprocessor = ColumnTransformer(
    transformers=[
        ('hormones', hormone_pipeline, hormone_features)
    ])

# Reapply transformations
X_hormones = preprocessor.fit_transform(data)


In [24]:
# Print actual column names to check against hormone_features
print(data.columns)


Index(['Sl. No', 'Patient File No.', 'PCOS (Y/N)', ' Age (yrs)', 'Weight (Kg)',
       'Height(Cm) ', 'BMI', 'Blood Group', 'Pulse rate(bpm) ',
       'RR (breaths/min)', 'Hb(g/dl)', 'Cycle(R/I)', 'Cycle length(days)',
       'Marraige Status (Yrs)', 'Pregnant(Y/N)', 'No. of abortions',
       '  I   beta-HCG(mIU/mL)', 'II    beta-HCG(mIU/mL)', 'FSH(mIU/mL)',
       'LH(mIU/mL)', 'FSH/LH', 'Hip(inch)', 'Waist(inch)', 'Waist:Hip Ratio',
       'TSH (mIU/L)', 'AMH(ng/mL)', 'PRL(ng/mL)', 'Vit D3 (ng/mL)',
       'PRG(ng/mL)', 'RBS(mg/dl)', 'Weight gain(Y/N)', 'hair growth(Y/N)',
       'Skin darkening (Y/N)', 'Hair loss(Y/N)', 'Pimples(Y/N)',
       'Fast food (Y/N)', 'Reg.Exercise(Y/N)', 'BP _Systolic (mmHg)',
       'BP _Diastolic (mmHg)', 'Follicle No. (L)', 'Follicle No. (R)',
       'Avg. F size (L) (mm)', 'Avg. F size (R) (mm)', 'Endometrium (mm)'],
      dtype='object')


In [25]:
# Check for non-numeric entries in the hormone-related columns
for col in hormone_features:
    if data[col].dtype == 'object':
        print(f"Non-numeric values found in {col}: ", data[col].unique())


Non-numeric values found in AMH(ng/mL):  ['2.07' '1.53' '6.63' '1.22' '2.26' '6.74' '3.05' '1.54' '1' '1.61' '4.47'
 '1.67' '7.94' '2.38' '0.88' '0.69' '3.78' '1.92' '2.85' '2.13' '4.13'
 '2.5' '1.89' '0.26' '3.84' '3.56' '1.56' '1.69' '2.34' '1.58' '2.36'
 '3.64' '2.78' '0.33' '2.35' '3.88' '3.55' '4.33' '3.66' '4.5' '3.2' '2.1'
 '6.55' '1.2' '2.33' '3.22' '2.333' '2.31' '4.2' '3.21' '2.14' '2.3' '4.6'
 '5.8' '5.2' '4.63' '1.01' '2.58' '0.35' '5.23' '3.68' '2.55' '4.91'
 '1.03' '6.56' '3.91' '5.42' '1.65' '2.06' '1.81' '3.81' '3.65' '8.98'
 '1.7' '3.18' '2.75' '0.86' '2.29' '2.19' '8.46' '4.59' '1.04' '4.27'
 '3.86' '1.42' '10.07' '0.98' '4.07' '3.9' '10' '16.9' '17' '21.9' '1.6'
 '3.3' '21' '12.7' '1.8' '3.6' '15' '5' '17.9' '19.8' '9.2' '2.4' '5.14'
 '0.3' '11.48' '19.3' '8.8' '19' '4.3' '1.4' '12.6' '4.8' '17.1' '11.6'
 '18.4' '9.9' '3.7' '2.9' '2' '4' '15.9' '7.51' '10.04' '6.86' '7.02'
 '8.75' '5.27' '9' '3.41' '0.45' '2.53' '0.29' '2.6' '2.83' '2.01' '5.67'
 '1.68' '3.63' '3.49'

In [26]:
# Convert non-numeric entries to NaN
for col in hormone_features:
    data[col] = pd.to_numeric(data[col], errors='coerce')


In [29]:
from sklearn.linear_model import LogisticRegression

# Initialize the model with class weight adjustment
model = LogisticRegression(class_weight='balanced')  # This adjusts weights inversely proportional to class frequencies

# Fit the model on the training data
model.fit(X_train_preprocessed, y_train)

# Evaluate the model as before


model building


In [31]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X_hormones, data['PCOS (Y/N)'], test_size=0.2, random_state=42)

# Initialize and train the logistic regression model
model = LogisticRegression()
model.fit(X_train, y_train)

# Predict and evaluate
y_pred = model.predict(X_test)
print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("ROC AUC Score:", roc_auc_score(y_test, model.predict_proba(X_test)[:, 1]))


Classification Report:
               precision    recall  f1-score   support

           0       0.72      0.92      0.81        77
           1       0.40      0.12      0.19        32

    accuracy                           0.69       109
   macro avg       0.56      0.52      0.50       109
weighted avg       0.62      0.69      0.63       109

Confusion Matrix:
 [[71  6]
 [28  4]]
ROC AUC Score: 0.5848214285714286
