In [None]:
import pandas as pd
import numpy as np
import matplotlib. pyplot as plt
import seaborn as sns
from sklearn. datasets import load_breast_cancer
from sklearn. model_selection import train_test_split
from sklearn. preprocessing import StandardScaler

# 1. Load the data: scikit-learn's breast-cancer dataset, with the feature
#    matrix wrapped in a DataFrame so every column keeps its feature name.
data = load_breast_cancer()
X = pd.DataFrame(data=data.data, columns=data.feature_names)
y = data.target

# 2. Train/test split: 30% of the rows are held out for testing; the fixed
#    random_state makes the split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# 3. Standardisation (noted as essential for PCA, RFE and Lasso).
#    The scaler is fitted on the training rows only and then applied to both
#    partitions, so no statistic from the test set leaks into the transform.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print(f"Dataset carregado: {X.shape[1]} features para análise.")

In [None]:
from sklearn.feature_selection import SelectKBest, f_classif

# Filter method: keep the 5 features with the highest ANOVA F-statistic
# (f_classif). Other scoring functions can be supplied through `score_func`.
selector_filter = SelectKBest(k=5, score_func=f_classif).fit(X_train, y_train)

# Boolean mask over the input columns: True where the feature was kept.
mask = selector_filter.get_support()

# Indexing with the mask returns the kept names in their original column
# order, not ranked by score.
selected_genes_filter = np.asarray(data.feature_names)[mask]

print("--- FILTER METHOD ---")
print(f"Top 5 Genes mais importantes: {selected_genes_filter[:5]}")

In [None]:
from sklearn.feature_selection import RFE, RFECV
from sklearn.linear_model import LogisticRegression

# Wrapper method: recursive feature elimination driven by a logistic
# regression used as the base estimator.
estimator = LogisticRegression(solver='liblinear', max_iter=2000)

# Keep 5 features, eliminating one feature per iteration (step=1).
# A fractional step such as 0.1 would instead drop 10% of the features per
# iteration, which is faster; RFECV(estimator=estimator, step=0.1) is the
# cross-validated variant of the same procedure.
rfe = RFE(estimator=estimator, step=1, n_features_to_select=5)
rfe.fit(X_train_scaled, y_train)

# The fitted selector exposes a boolean mask of the surviving features.
selected_genes_rfe = np.asarray(data.feature_names)[rfe.get_support()]

print("--- WRAPPER METHOD (RFE) ---")
print(f"Top 5 Genes mais importantes: {selected_genes_rfe}")

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Embedded method: fit a random forest and read the feature importances it
# computes as a by-product of training.
rf = RandomForestClassifier(random_state=42, n_estimators=100)
rf.fit(X_train, y_train)

# Feature positions ordered from most to least important.
importances = rf.feature_importances_
indices = importances.argsort()[::-1]

print("--- EMBEDDED METHOD (Random Forest) ---")
print("Top 5 Genes mais importantes:")
for feature_idx in indices[:5]:
    print(f"\t'{data.feature_names[feature_idx]}': {importances[feature_idx]:.4f}")

In [None]:
from sklearn.decomposition import PCA

# Project the standardised training data onto its first two principal
# components (PCA is always run on the scaled data here).
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_train_scaled)

# Scatter of the two components; class label drives both colour and marker.
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(
    x=X_pca[:, 0],
    y=X_pca[:, 1],
    hue=y_train,
    style=y_train,
    palette='viridis',
    s=100,
    ax=ax,
)
ax.set_title("PCA: Projeção 2D dos Pacientes")
ax.set_xlabel("PC1 (Maior Variância)")
ax.set_ylabel("PC2 (Segunda Maior)")
plt.show()

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

# 1. Fit a PCA without limiting the number of components
#    (n_components=None keeps min(n_samples, n_features) components).
pca_full = PCA(n_components=None)
pca_full.fit(X_train_scaled)

# 2. Cumulative explained variance: entry i is the share of variance captured
#    by the first i + 1 components.
cumulative_variance = np.cumsum(pca_full.explained_variance_ratio_)

# 3. Smallest number of components whose cumulative variance reaches the
#    threshold (e.g. 90%).
limite_desejado = 0.90
threshold_reached = cumulative_variance >= limite_desejado
if threshold_reached.any():
    # argmax returns the first True position (0-based); +1 turns it into a count.
    n_components_ideal = int(np.argmax(threshold_reached)) + 1
else:
    # argmax on an all-False mask would return 0 and silently report a single
    # component; if the threshold is never reached (e.g. 1.0 with float
    # round-off), every component is needed.
    n_components_ideal = len(cumulative_variance)

print(f"Para preservar {limite_desejado*100}% da informação, precisamos de {n_components_ideal} componentes.")

# 4. Plot cumulative variance against the number of components.
#    The x values are given explicitly as 1..n so the axis really is a
#    component count; without them matplotlib would use 0..n-1 and every
#    point (and the cut-off line) would sit one component too far left.
component_counts = np.arange(1, len(cumulative_variance) + 1)

plt.figure(figsize=(10, 5))
plt.plot(component_counts, cumulative_variance, marker='o', linestyle='--', color='b')
plt.axhline(y=limite_desejado, color='r', linestyle='-', label=f'Corte de {limite_desejado*100}%')
plt.axvline(x=n_components_ideal, color='r', linestyle='--')
plt.xlabel('Número de Componentes')
plt.ylabel('Variância Explicada Acumulada')
plt.title('Scree Plot: Quantos componentes guardar?')
plt.legend()
plt.grid(True)
plt.show()