In [1]:
import pandas as pd

# Location of the PCOS diagnosis CSV inside the Kaggle input mount.
DATA = '/kaggle/input/pcos-diagnosis-dataset/pcos_dataset.csv'

df = pd.read_csv(DATA)
# Boolean view of the 0/1 diagnosis column; used as the label downstream.
df['PCOS'] = df['PCOS_Diagnosis'].eq(1)
df.head()

Unnamed: 0,Age,BMI,Menstrual_Irregularity,Testosterone_Level(ng/dL),Antral_Follicle_Count,PCOS_Diagnosis,PCOS
0,24,34.7,1,25.2,20,0,False
1,37,26.4,0,57.1,25,0,False
2,32,23.6,0,92.7,28,0,False
3,28,28.8,0,63.1,26,0,False
4,25,22.1,1,59.8,8,0,False


In [2]:
# Print each column's dtype and non-null count to check for missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 7 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Age                        1000 non-null   int64  
 1   BMI                        1000 non-null   float64
 2   Menstrual_Irregularity     1000 non-null   int64  
 3   Testosterone_Level(ng/dL)  1000 non-null   float64
 4   Antral_Follicle_Count      1000 non-null   int64  
 5   PCOS_Diagnosis             1000 non-null   int64  
 6   PCOS                       1000 non-null   bool   
dtypes: bool(1), float64(2), int64(4)
memory usage: 48.0 KB


We have exactly 1000 rows with no missing values, and every column is numeric. Could this be synthetic data? Let's project the features into two dimensions with t-SNE and look at the structure.

In [3]:
from sklearn.manifold import TSNE

# Feature columns, the boolean label column, and the reproducibility seed.
COLUMNS = [
    'Age',
    'BMI',
    'Menstrual_Irregularity',
    'Testosterone_Level(ng/dL)',
    'Antral_Follicle_Count',
]
TARGET = 'PCOS'
RANDOM_STATE = 2025

# Embed the feature columns (passed as-is, without scaling) with t-SNE; the
# result is stored as two coordinate columns, 'x' and 'y'.
reducer = TSNE(random_state=RANDOM_STATE)
embedding = reducer.fit_transform(X=df[COLUMNS])
tsne_df = pd.DataFrame(data=embedding, columns=['x', 'y'])

# Attach the labels as a plain list so they are assigned by position,
# row-for-row with the embedding, rather than by index alignment.
tsne_df[TARGET] = df[TARGET].to_list()

In [4]:
from plotly import express
from plotly import io

# Render plotly figures through an iframe.
io.renderers.default = 'iframe'

# Scatter of the t-SNE coordinates, colored by the label column.
tsne_fig = express.scatter(data_frame=tsne_df, x='x', y='y', color=TARGET)
tsne_fig

We do see some local clustering, so maybe there's a signal in our data. Let's build a model and see what happens.

In [5]:
# Count rows per class to see how balanced the label is.
df[TARGET].value_counts().to_dict()

{False: 801, True: 199}

Our classes are imbalanced (801 negative vs. 199 positive), so let's downsample the majority class to roughly balance them.

In [6]:
# Keep every positive row and draw a fixed-size, seeded random subset of the
# negative rows, then stack the two (positives first).
# NOTE(review): n=200 is hard-coded while the class counts show 199 positives,
# so the result is near-balanced rather than exactly balanced — confirm intent.
positive_df = df[df[TARGET]]
negative_df = df[~df[TARGET]].sample(n=200, random_state=RANDOM_STATE)
sample_df = pd.concat([positive_df, negative_df], axis='index')
sample_df.shape

(399, 7)

In [7]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split

# Stratified 80/20 train/test split of the downsampled frame.
# NOTE(review): the split seed is the literal 2024 rather than RANDOM_STATE
# (2025); it is kept unchanged so the reported metrics stay reproducible —
# confirm whether the difference is intentional.
X_train, X_test, y_train, y_test = train_test_split(
    sample_df[COLUMNS],
    sample_df[TARGET],
    test_size=0.2,
    random_state=2024,
    stratify=sample_df[TARGET],
)

# Fit a logistic regression with a very tight tolerance and a raised iteration cap.
logreg = LogisticRegression(max_iter=1000, tol=1e-12).fit(X_train, y_train)

# Predict once and reuse the result for every metric instead of re-running
# inference on the test set for each report line.
y_pred = logreg.predict(X=X_test)

print('model fit in {} iterations'.format(logreg.n_iter_[0]))
print('accuracy: {:5.4f}'.format(accuracy_score(y_true=y_test, y_pred=y_pred)))
print('f1: {:5.4f}'.format(f1_score(average='weighted', y_true=y_test, y_pred=y_pred)))
print(classification_report(y_true=y_test, y_pred=y_pred))

model fit in 43 iterations
accuracy: 0.9500
f1: 0.9500
              precision    recall  f1-score   support

       False       0.95      0.95      0.95        40
        True       0.95      0.95      0.95        40

    accuracy                           0.95        80
   macro avg       0.95      0.95      0.95        80
weighted avg       0.95      0.95      0.95        80

