***Importing Libraries***

In [174]:
import pandas as pd
import numpy as  np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder,StandardScaler
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
import pickle
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from sklearn.metrics import f1_score, accuracy_score, roc_auc_score, classification_report, confusion_matrix
import warnings
warnings.filterwarnings("ignore")

***Reading Dataset***

In [171]:
df1=pd.read_csv("pcos_dataset.csv")
df=df1.copy()
df.head()

Unnamed: 0,Age,BMI,Menstrual_Irregularity,Testosterone_Level(ng/dL),Antral_Follicle_Count,PCOS_Diagnosis
0,24,34.7,1,25.2,20,0
1,37,26.4,0,57.1,25,0
2,32,23.6,0,92.7,28,0
3,28,28.8,0,63.1,26,0
4,25,22.1,1,59.8,8,0


***Exploratory Data Analysis***

In [179]:
df.isnull().sum()

Age                          0
BMI                          0
Menstrual_Irregularity       0
Testosterone_Level(ng/dL)    0
Antral_Follicle_Count        0
PCOS_Diagnosis               0
dtype: int64

In [204]:
plt.figure(figsize=(8, 5))
sns.heatmap(df.corr(), annot=True, cmap='coolwarm', fmt='.2f')
plt.title("Feature Correlation Heatmap")
plt.show()

In [206]:
df.hist(figsize=(10, 6), bins=10, edgecolor='black')
plt.suptitle("Feature Distributions")
plt.show()

In [208]:
plt.figure(figsize=(6, 4))
sns.boxplot(x=df['PCOS_Diagnosis'], y=df['Testosterone_Level(ng/dL)'])
plt.title("Testosterone Levels vs PCOS Diagnosis")
plt.xlabel("PCOS Diagnosis (0: No, 1: Yes)")
plt.ylabel("Testosterone Level (ng/dL)")
plt.show()

***Checking if data is balanced***

In [211]:
df["PCOS_Diagnosis"].value_counts()

PCOS_Diagnosis
0    801
1    199
Name: count, dtype: int64

In [213]:
smote= SMOTE(random_state=42)

In [215]:
X=df.drop(columns=["PCOS_Diagnosis"],axis=1)
y=df["PCOS_Diagnosis"]

In [217]:
X_resampled,y_resampled = smote.fit_resample(X,y)
y_resampled.value_counts()

PCOS_Diagnosis
0    801
1    801
Name: count, dtype: int64

***Splitting our dataset***

In [220]:
X_train,X_test,y_train,y_test = train_test_split(X_resampled,y_resampled,test_size=0.3, random_state=42)

***Scalling our data***

In [223]:
scaler = StandardScaler()

In [225]:
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.fit_transform(X_test)

***Testing Multiple Classification Models***

In [228]:
# Fit and test multiple models
models = {
    "Logistic Regression": LogisticRegression(),
    "Support Vector Classifier": SVC(probability=True),
    "Random Forest Classifier": RandomForestClassifier(),
    "K Nearest Neighbors": KNeighborsClassifier(),
    "Decision Tree Classifier": DecisionTreeClassifier(),
    "Gaussian Naive Bayes": GaussianNB(),
    "AdaBoost Classifier": AdaBoostClassifier(),
    "Gradient Boosting Classifier": GradientBoostingClassifier(),
    "XGBoost Classifier": XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')
}



# Loop through models
for name, model in models.items():
    print("=" * 50)
    print("Model:", name)
    
    model.fit(X_train_scaled, y_train)
    y_pred = model.predict(X_test_scaled)
    y_probs = model.predict_proba(X_test_scaled)[:, 1]  # For ROC-AUC

    # Metrics
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_probs)
    
    classification_rep = classification_report(y_test, y_pred)
    conf_matrix = confusion_matrix(y_test, y_pred)

    # Print results
    print(f"Accuracy: {accuracy:.4f}")
    print(f"F1-score: {f1:.4f}")
    print(f"ROC-AUC: {auc:.4f}")
    print("Classification Report:\n", classification_rep)
    print("Confusion Matrix:\n", conf_matrix)
    print("=" * 50)

Model: Logistic Regression
Accuracy: 0.9044
F1-score: 0.9073
ROC-AUC: 0.9659
Classification Report:
               precision    recall  f1-score   support

           0       0.94      0.86      0.90       243
           1       0.87      0.95      0.91       238

    accuracy                           0.90       481
   macro avg       0.91      0.90      0.90       481
weighted avg       0.91      0.90      0.90       481

Confusion Matrix:
 [[210  33]
 [ 13 225]]
Model: Support Vector Classifier
Accuracy: 0.9751
F1-score: 0.9753
ROC-AUC: 0.9983
Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.95      0.97       243
           1       0.96      1.00      0.98       238

    accuracy                           0.98       481
   macro avg       0.98      0.98      0.98       481
weighted avg       0.98      0.98      0.98       481

Confusion Matrix:
 [[232  11]
 [  1 237]]
Model: Random Forest Classifier
Accuracy: 0.9917
F1-sc

***Checking Models Results***

In [231]:
results = []

for name, model in models.items():
    y_pred = model.predict(X_test_scaled)
    y_probs = model.predict_proba(X_test_scaled)[:, 1]

    results.append({
        "Model": name,
        "Accuracy": accuracy_score(y_test, y_pred),
        "F1-Score": f1_score(y_test, y_pred),
        "ROC-AUC": roc_auc_score(y_test, y_probs)
    })

results_df = pd.DataFrame(results).sort_values(by="ROC-AUC", ascending=False)
print(results_df)


                          Model  Accuracy  F1-Score   ROC-AUC
2      Random Forest Classifier  0.991684  0.991667  0.999101
1     Support Vector Classifier  0.975052  0.975309  0.998340
6           AdaBoost Classifier  0.993763  0.993737  0.993827
7  Gradient Boosting Classifier  0.991684  0.991667  0.993827
3           K Nearest Neighbors  0.958420  0.959350  0.992046
4      Decision Tree Classifier  0.991684  0.991667  0.991770
0           Logistic Regression  0.904366  0.907258  0.965937
8            XGBoost Classifier  0.505198  0.000000  0.920289
5          Gaussian Naive Bayes  0.505198  0.000000  0.500000


***Choosing Best Perfoming Model***

In [234]:
best_model_name = results_df.iloc[0]["Model"]
best_model = models[best_model_name]
print(f"Best Model Selected: {best_model_name}")

Best Model Selected: Random Forest Classifier


***Refitting our data on a chosen best model***

In [237]:
best_model.fit(X_train_scaled, y_train)

In [241]:
y_pred = best_model.predict(X_test_scaled)
y_probs = best_model.predict_proba(X_test_scaled)[:, 1]

print(f"Final Model - {best_model_name}")
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print(f"F1-score: {f1_score(y_test, y_pred):.4f}")
print(f"ROC-AUC: {roc_auc_score(y_test, y_probs):.4f}")
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

Final Model - Random Forest Classifier
Accuracy: 0.9938
F1-score: 0.9937
ROC-AUC: 0.9990
Confusion Matrix:
 [[240   3]
 [  0 238]]


In [245]:
# # Select a few random indices from the test dataset

indices = np.random.choice(len(X_test_scaled), 10, replace=False)  # Get 10 random test samples

# Predict labels for these instances
predicted_labels = best_model.predict(X_test_scaled[indices])
actual_labels = y_test.iloc[indices].values  # Ensure y_test is indexed correctly

# Print results
for i, idx in enumerate(indices):
    print(f"Sample {i+1}:")
    print(f"Predicted: {predicted_labels[i]} | Actual: {actual_labels[i]}")
    print("-" * 30)

Sample 1:
Predicted: 0 | Actual: 0
------------------------------
Sample 2:
Predicted: 0 | Actual: 0
------------------------------
Sample 3:
Predicted: 1 | Actual: 1
------------------------------
Sample 4:
Predicted: 0 | Actual: 0
------------------------------
Sample 5:
Predicted: 0 | Actual: 0
------------------------------
Sample 6:
Predicted: 0 | Actual: 0
------------------------------
Sample 7:
Predicted: 1 | Actual: 1
------------------------------
Sample 8:
Predicted: 0 | Actual: 0
------------------------------
Sample 9:
Predicted: 1 | Actual: 1
------------------------------
Sample 10:
Predicted: 1 | Actual: 1
------------------------------


In [247]:
print(y_test.value_counts())

PCOS_Diagnosis
0    243
1    238
Name: count, dtype: int64


***Saving Our Model***

In [252]:
with open("pcos_model.pkl", "wb") as model_file:
    pickle.dump(best_model, model_file)

# Save the scaler
with open("scaler.pkl", "wb") as scaler_file:
    pickle.dump(scaler, scaler_file)


# Load the trained model and scaler
with open("pcos_model.pkl", "rb") as model_file:
    model = pickle.load(model_file)

with open("scaler.pkl", "rb") as scaler_file:
    scaler = pickle.load(scaler_file)