In [1]:
from sklearn.tree import DecisionTreeClassifier
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.feature_selection import SelectKBest, f_classif
from typing import Counter
import pandas as pd
import numpy as np

In [19]:
file_path = './dataset/PCOS_data.csv'
data = pd.read_csv(file_path)

#data.info(), data.head(), data.describe()

In [5]:
data_cleaned = data.replace(',', '.', regex=True)

# convertendo colunas para os tipos numéricos apropriados
for col in data_cleaned.columns:
  try:
    data_cleaned[col] = pd.to_numeric(data_cleaned[col])
  except ValueError:
    pass

# verificando e remover duplicatas
duplicates = data_cleaned.duplicated().sum()
data_cleaned = data_cleaned.drop_duplicates()

# verificando valores de cada classe para balanceamento
class_distribution = data_cleaned['PCOS (Y/N)'].value_counts()

duplicates, class_distribution

(0,
 PCOS (Y/N)
 0    364
 1    177
 Name: count, dtype: int64)

In [7]:
# separando características (X) e rótulos (y)
X = data_cleaned.drop(columns=['PCOS (Y/N)'])
y = data_cleaned['PCOS (Y/N)']

# balanceamento dos dados com SMOTE
smote = SMOTE(random_state=42)
X_balanced, y_balanced = smote.fit_resample(X, y)

print(f"Distribuição das classes após SMOTE: {Counter(y_balanced)}")

# dividindo os dados em treino e teste
X_train, X_test, y_train, y_test = train_test_split(X_balanced, y_balanced, test_size=0.3, random_state=42)

# normalizando os dados
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

Distribuição das classes após SMOTE: Counter({0: 364, 1: 364})


In [23]:
# Obtendo a importâncias dos atributos 
importances = decision_tree.feature_importances_ 

# Criando um DataFrame para visualizar as importâncias 
feature_importances = pd.DataFrame({ 'Feature': X.columns, 'Importance': importances })

# Ordenando as importâncias em ordem decrescente 
feature_importances = feature_importances.sort_values(by='Importance', ascending=False)

top10_relevant_features = feature_importances.head(10)

print("Importâncias dos Atributos:") 
print(top10_relevant_features)

Importâncias dos Atributos:
                 Feature  Importance
37      Follicle No. (R)    0.502088
22            AMH(ng/mL)    0.061635
36      Follicle No. (L)    0.057572
2            Height(Cm)     0.037839
38  Avg. F size (L) (mm)    0.032476
15           FSH(mIU/mL)    0.031943
21           TSH (mIU/L)    0.031547
20       Waist:Hip Ratio    0.028826
28      hair growth(Y/N)    0.027522
27      Weight gain(Y/N)    0.025549


In [9]:
# treinando o modelo
decision_tree = DecisionTreeClassifier(random_state=42) 
decision_tree.fit(X_train_scaled, y_train)

y_pred_all_features_decision_tree = decision_tree.predict(X_test_scaled)
cv_scores_all_features_decision_tree = cross_val_score(decision_tree, X_train_scaled, y_train, cv=5, scoring='accuracy')
mean_accuracy_all_decision_tree = np.mean(cv_scores_all_features_decision_tree)
std_accuracy_all_decision_tree = np.std(cv_scores_all_features_decision_tree)

print("Relatório de Classificação com todas as características (Decision Tree):\n", classification_report(y_test, y_pred_all_features_decision_tree))
print("Matriz de Confusão com todas as características (Decision Tree):\n", confusion_matrix(y_test, y_pred_all_features_decision_tree))
print(f"Validação cruzada com todas as características (Decision Tree):\nAcurácia média: {mean_accuracy_all_decision_tree:.4f}, Desvio padrão: {std_accuracy_all_decision_tree:.4f}")

Relatório de Classificação com todas as características (decision_tree):
               precision    recall  f1-score   support

           0       0.79      0.85      0.82       110
           1       0.83      0.77      0.80       109

    accuracy                           0.81       219
   macro avg       0.81      0.81      0.81       219
weighted avg       0.81      0.81      0.81       219

Matriz de Confusão com todas as características (decision_tree):
 [[93 17]
 [25 84]]
Validação cruzada com todas as características (decision_tree):
Acurácia média: 0.8369, Desvio padrão: 0.0546


### Treinando com as K Melhores

In [25]:
# seleção de características com SelectKBest
selector = SelectKBest(score_func=f_classif, k=10)
X_new = selector.fit_transform(X, y)
selected_features = X.columns[selector.get_support()]

print("Características selecionadas:", selected_features)

Características selecionadas: Index(['Weight (Kg)', 'Cycle(R/I)', 'AMH(ng/mL)', 'Weight gain(Y/N)',
       'hair growth(Y/N)', 'Skin darkening (Y/N)', 'Pimples(Y/N)',
       'Fast food (Y/N)', 'Follicle No. (L)', 'Follicle No. (R)'],
      dtype='object')


In [27]:
# balanceamento dos dados com SMOTE para as características selecionadas
X_balanced_selected, y_balanced_selected = smote.fit_resample(X_new, y)
print(f"Distribuição das classes após SMOTE: {Counter(y_balanced)}")

# divisão em treino e teste para as características selecionadas
X_train_selected, X_test_selected, y_train_selected, y_test_selected = train_test_split(X_balanced_selected, y_balanced_selected, test_size=0.3, random_state=42)

# normalizando os dados
X_train_scaled_selected = scaler.fit_transform(X_train_selected)
X_test_scaled_selected = scaler.transform(X_test_selected)

Distribuição das classes após SMOTE: Counter({0: 364, 1: 364})


In [29]:
decision_tree.fit(X_train_scaled_selected, y_train_selected)
y_pred_selected_decision_tree = decision_tree.predict(X_test_scaled_selected)

cv_scores_selected_features_decision_tree = cross_val_score(decision_tree, X_train_scaled_selected, y_train_selected, cv=5, scoring='accuracy')
mean_accuracy_selected_decision_tree = np.mean(cv_scores_selected_features_decision_tree)
std_accuracy_selected_decision_tree = np.std(cv_scores_selected_features_decision_tree)

print("Relatório de Classificação com as melhores características (Decision Tree):\n", classification_report(y_test_selected, y_pred_selected_decision_tree))
print("Matriz de Confusão com as melhores características (Decision Tree):\n", confusion_matrix(y_test_selected, y_pred_selected_decision_tree))
print(f"Validação cruzada com as melhores características (Decision Tree):\nAcurácia média: {mean_accuracy_selected_decision_tree:.4f}, Desvio padrão: {std_accuracy_selected_decision_tree:.4f}")


Relatório de Classificação com as melhores características (decision_tree):
               precision    recall  f1-score   support

           0       0.88      0.90      0.89       110
           1       0.90      0.88      0.89       109

    accuracy                           0.89       219
   macro avg       0.89      0.89      0.89       219
weighted avg       0.89      0.89      0.89       219

Matriz de Confusão com as melhores características (decision_tree):
 [[99 11]
 [13 96]]
Validação cruzada com as melhores características (decision_tree):
Acurácia média: 0.8703, Desvio padrão: 0.0256
