In [33]:
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

In [34]:
# Loading dataset

df = pd.read_csv('/home/momtahin/Documents/Project: PCOS/data/train.csv')

In [35]:
df.info()

df.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 210 entries, 0 to 209
Data columns (total 14 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   ID                     210 non-null    int64  
 1   Age                    209 non-null    object 
 2   Weight_kg              208 non-null    float64
 3   PCOS                   210 non-null    object 
 4   Hormonal_Imbalance     208 non-null    object 
 5   Hyperandrogenism       207 non-null    object 
 6   Hirsutism              205 non-null    object 
 7   Conception_Difficulty  209 non-null    object 
 8   Insulin_Resistance     209 non-null    object 
 9   Exercise_Frequency     208 non-null    object 
 10  Exercise_Type          208 non-null    object 
 11  Exercise_Duration      208 non-null    object 
 12  Sleep_Hours            208 non-null    object 
 13  Exercise_Benefit       209 non-null    object 
dtypes: float64(1), int64(1), object(12)
memory usage: 23.1+ KB

Unnamed: 0,ID,Weight_kg
count,210.0,208.0
mean,104.5,56.159615
std,60.765944,12.572768
min,0.0,20.0
25%,52.25,48.0
50%,104.5,55.0
75%,156.75,64.0
max,209.0,116.0


In [36]:
# import dtale

# d = dtale.show(df)
# d.open_browser()


# # Analysis of the dataset:
# # Features which are useful for the model based upon
# #   1. Correlation analysis
# #       1. |r| > 0.8 → Remove one of the correlated features (redundant).  
# #       2. 0.3 ≤ |r| ≤ 0.8 → Keep if useful for the model.  
# #       3. |r| < 0.3 → Likely not useful, consider removing.
# #   2. Class imbalance
# #   3. How well the features are defined

# # Selected features:
# #     1. Weight_kg
# #     2. Menstrual_Irregularity
# #     3. Hormonal_Imbalance
# #     4. Hyperandrogenism
# #     5. Hirsutism
# #     6. Stress_Level


In [37]:
df_selected_features = df[['Weight_kg', 'Hormonal_Imbalance', 'Hyperandrogenism', 'Hirsutism', 'PCOS']]

df_selected_features.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 210 entries, 0 to 209
Data columns (total 5 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Weight_kg           208 non-null    float64
 1   Hormonal_Imbalance  208 non-null    object 
 2   Hyperandrogenism    207 non-null    object 
 3   Hirsutism           205 non-null    object 
 4   PCOS                210 non-null    object 
dtypes: float64(1), object(4)
memory usage: 8.3+ KB


In [38]:
# d = dtale.show(df_selected_features)
# d.open_browser()

In [39]:
# Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(df_selected_features.drop('PCOS', axis=1), df_selected_features['PCOS'], test_size=0.15, random_state=42)


In [40]:

# Preprocessing

numeric_features=['Weight_kg']
categorical_features=['Hormonal_Imbalance', 'Hyperandrogenism', 'Hirsutism']



numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())
])
categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
])

# Model for probability prediction
rf = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier())
])

# Fit the model
rf.fit(X_train, y_train)

In [43]:
# Predictions
y_pred = rf.predict(X_test)

# Model Evaluation
print('Accuracy:', accuracy_score(y_test, y_pred))
print('Confusion Matrix:\n', confusion_matrix(y_test, y_pred))

# Cross Validation
cv_score = cross_val_score(rf, X_train, y_train, cv=5)
print('Cross Validation Score:', cv_score.mean())

Accuracy: 0.78125
Confusion Matrix:
 [[18  6]
 [ 1  7]]
Cross Validation Score: 0.8033333333333333


In [51]:
# creating a submission csv file for the test data set

df_test = pd.read_csv('/home/momtahin/Documents/Project: PCOS/data/test.csv')

predictions = rf.predict_proba(df_test)

submission = pd.DataFrame({"ID": df_test["ID"], "PCOS": predictions[:,1]})  
submission.to_csv("submission.csv", index=False)

array([0.        , 0.9       , 0.78      , 0.12      , 0.7       ,
       0.3377381 , 0.        , 0.9       , 0.9       , 0.        ,
       0.03578571, 0.3377381 , 0.18      , 0.3377381 , 0.18      ,
       0.        , 0.        , 0.        , 0.        , 0.04      ,
       0.18      , 0.        , 0.64      , 0.01      , 0.        ,
       0.03      , 0.47      , 0.3377381 , 0.        , 0.01      ,
       0.03861905, 0.        , 0.        , 0.03578571, 0.        ,
       0.18      , 0.95      , 0.9       , 0.3377381 , 0.        ,
       0.02      , 0.        , 0.02      , 0.        , 0.        ,
       0.93      , 0.34      , 0.96      , 0.9       , 0.63      ,
       0.03      , 0.13      , 0.        , 0.92      , 0.01      ,
       0.93      , 0.71      , 0.97      , 0.68      , 0.        ,
       0.01      , 0.        , 0.38      , 0.        , 0.        ,
       0.12      , 0.        , 0.91      , 0.009     , 0.        ,
       0.33      , 0.52      , 0.        , 0.01      , 0.     