# 1.2 Construção do Modelo

In [4]:
import pandas as pd
import numpy as np
from scipy.stats import pearsonr

### 1.2.1 Abrindo o dataset e tratando os dados

In [76]:
data = pd.read_csv('data/PCOS_data.csv')

In [77]:
data.head(2)

Unnamed: 0,Sl. No,Patient File No.,PCOS (Y/N),Age (yrs),Weight (Kg),Height(Cm),BMI,Blood Group,Pulse rate(bpm),RR (breaths/min),...,Fast food (Y/N),Reg.Exercise(Y/N),BP _Systolic (mmHg),BP _Diastolic (mmHg),Follicle No. (L),Follicle No. (R),Avg. F size (L) (mm),Avg. F size (R) (mm),Endometrium (mm),Unnamed: 44
0,1,1,0,28,44.6,152.0,19.3,15,78,22,...,1.0,0,110,80,3,3,18.0,18.0,8.5,
1,2,2,0,36,65.0,161.5,24.9,15,74,20,...,0.0,0,120,70,3,5,15.0,14.0,3.7,


In [78]:
del data["Unnamed: 44"]

In [79]:
data.head(2)

Unnamed: 0,Sl. No,Patient File No.,PCOS (Y/N),Age (yrs),Weight (Kg),Height(Cm),BMI,Blood Group,Pulse rate(bpm),RR (breaths/min),...,Pimples(Y/N),Fast food (Y/N),Reg.Exercise(Y/N),BP _Systolic (mmHg),BP _Diastolic (mmHg),Follicle No. (L),Follicle No. (R),Avg. F size (L) (mm),Avg. F size (R) (mm),Endometrium (mm)
0,1,1,0,28,44.6,152.0,19.3,15,78,22,...,0,1.0,0,110,80,3,3,18.0,18.0,8.5
1,2,2,0,36,65.0,161.5,24.9,15,74,20,...,0,0.0,0,120,70,3,5,15.0,14.0,3.7


In [80]:
data.dropna(inplace=True)

In [81]:
data = data.replace({'1.99.': 1.99})

In [82]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 539 entries, 0 to 540
Data columns (total 44 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Sl. No                  539 non-null    int64  
 1   Patient File No.        539 non-null    int64  
 2   PCOS (Y/N)              539 non-null    int64  
 3    Age (yrs)              539 non-null    int64  
 4   Weight (Kg)             539 non-null    float64
 5   Height(Cm)              539 non-null    float64
 6   BMI                     539 non-null    float64
 7   Blood Group             539 non-null    int64  
 8   Pulse rate(bpm)         539 non-null    int64  
 9   RR (breaths/min)        539 non-null    int64  
 10  Hb(g/dl)                539 non-null    float64
 11  Cycle(R/I)              539 non-null    int64  
 12  Cycle length(days)      539 non-null    int64  
 13  Marraige Status (Yrs)   539 non-null    float64
 14  Pregnant(Y/N)           539 non-null    in

In [83]:
data[data['AMH(ng/mL)'] == "a"]

Unnamed: 0,Sl. No,Patient File No.,PCOS (Y/N),Age (yrs),Weight (Kg),Height(Cm),BMI,Blood Group,Pulse rate(bpm),RR (breaths/min),...,Pimples(Y/N),Fast food (Y/N),Reg.Exercise(Y/N),BP _Systolic (mmHg),BP _Diastolic (mmHg),Follicle No. (L),Follicle No. (R),Avg. F size (L) (mm),Avg. F size (R) (mm),Endometrium (mm)
305,306,306,0,37,56.0,152.0,24.2,13,74,20,...,1,0.0,1,120,70,4,5,17.0,16.0,5.6


In [84]:
data.drop([305], inplace=True)

In [85]:
data['II    beta-HCG(mIU/mL)'] = pd.to_numeric(data['II    beta-HCG(mIU/mL)'])

In [86]:
data['AMH(ng/mL)'] = pd.to_numeric(data['AMH(ng/mL)'])

In [87]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 538 entries, 0 to 540
Data columns (total 44 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Sl. No                  538 non-null    int64  
 1   Patient File No.        538 non-null    int64  
 2   PCOS (Y/N)              538 non-null    int64  
 3    Age (yrs)              538 non-null    int64  
 4   Weight (Kg)             538 non-null    float64
 5   Height(Cm)              538 non-null    float64
 6   BMI                     538 non-null    float64
 7   Blood Group             538 non-null    int64  
 8   Pulse rate(bpm)         538 non-null    int64  
 9   RR (breaths/min)        538 non-null    int64  
 10  Hb(g/dl)                538 non-null    float64
 11  Cycle(R/I)              538 non-null    int64  
 12  Cycle length(days)      538 non-null    int64  
 13  Marraige Status (Yrs)   538 non-null    float64
 14  Pregnant(Y/N)           538 non-null    in

### Usando a correlação de Pearson para encontrar os melhores atributos preditores

In [88]:
X = data.drop(columns=['PCOS (Y/N)'], axis=1)
y = data["PCOS (Y/N)"]

In [91]:
correlations = []
for col in X.columns:
    corr, _ = pearsonr(X[col], y)
    correlations.append(corr)

# criar um dicionário de features e seus coeficientes de correlação
corr_dict = dict(zip(X.columns, correlations))

# ordena o dicionário por coeficiente de correlação em ordem decrescente
sorted_corr_dict = {k: v for k, v in sorted(corr_dict.items(), key=lambda item: item[1], reverse=True)}

N = 20
selected_features = list(sorted_corr_dict.keys())[:N]

# use the selected features for machine learning modeling
X_selected = X[selected_features]

In [92]:
X_selected

Unnamed: 0,Follicle No. (R),Follicle No. (L),Skin darkening (Y/N),hair growth(Y/N),Weight gain(Y/N),Cycle(R/I),Fast food (Y/N),Pimples(Y/N),AMH(ng/mL),Weight (Kg),BMI,Hair loss(Y/N),Waist(inch),Hip(inch),Avg. F size (L) (mm),Endometrium (mm),Avg. F size (R) (mm),Pulse rate(bpm),Hb(g/dl),Vit D3 (ng/mL)
0,3,3,0,0,0,2,1.0,0,2.07,44.6,19.3,0,30,36,18.0,8.5,18.0,78,10.48,17.1
1,5,3,0,0,0,2,0.0,0,1.53,65.0,24.9,0,32,38,15.0,3.7,14.0,74,11.70,61.3
2,15,13,0,0,0,2,1.0,1,6.63,68.8,25.3,1,36,40,18.0,10.0,20.0,72,11.80,49.7
3,2,2,0,0,0,2,0.0,0,1.22,65.0,29.7,0,36,42,15.0,7.5,14.0,72,12.00,33.4
4,4,3,0,0,0,2,0.0,0,2.26,52.0,20.1,1,30,37,16.0,7.0,14.0,72,10.00,43.8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
536,0,1,0,0,0,2,0.0,0,1.70,50.0,18.5,0,26,28,17.5,6.7,10.0,72,11.00,36.6
537,7,9,0,0,1,2,0.0,0,5.60,63.2,25.3,0,32,34,19.0,8.2,18.0,72,10.80,23.0
538,0,1,0,0,0,2,0.0,0,3.70,54.0,23.4,0,28,30,18.0,7.3,9.0,74,10.80,22.5
539,6,7,0,0,0,4,0.0,1,5.20,50.0,22.2,0,26,28,18.0,11.5,16.0,74,12.00,22.4
