# 🤖 Metody zespołowe (Ensemble Methods)

W tym laboratorium poznajemy **metody zespołowe (ensemble methods)**, które łączą wiele modeli bazowych, aby zwiększyć dokładność predykcji.

### Zakres ćwiczenia
- Równoległe i sekwencyjne metody zespołowe
- **Hard / Soft Voting**
- **Bagging** i jego warianty (z bootstrapem i bez niego, czyli *pasting*)
- **Random Forest**
- **AdaBoost**
- **Gradient Boosting**
- Sampling cech i ranking dokładności estymatorów

Celem ćwiczenia jest zrozumienie, jak łączenie klasyfikatorów wpływa na dokładność i generalizację modelu.


In [None]:
# -*- coding: utf-8 -*-
import numpy as np
import pandas as pd
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import VotingClassifier, BaggingClassifier, RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score
import pickle


## 📊 Przygotowanie danych

In [None]:
# Load scikit-learn's breast cancer dataset as pandas objects; the first
# experiments use only two of its feature columns.
data_breast_cancer = datasets.load_breast_cancer(as_frame=True)
X = data_breast_cancer.data.loc[:, ['mean texture', 'mean symmetry']]
y = data_breast_cancer.target

# Hold out 20% of the rows for testing; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
print(f"Zbiór treningowy: {X_train.shape}, Zbiór testowy: {X_test.shape}")


## 🗳️ Hard / Soft Voting Classifiers

In [None]:
# Base classifiers; the same three are used on their own and inside both
# voting ensembles.
tree_clf = DecisionTreeClassifier(random_state=42)
log_clf = LogisticRegression(random_state=42, max_iter=1000)
knn_clf = KNeighborsClassifier()

# (name, estimator) pairs handed to each VotingClassifier.
base_estimators = (('lr', log_clf), ('tree', tree_clf), ('knn', knn_clf))


def _train_test_accuracy(clf):
    """Return the (train, test) accuracy pair of an already fitted classifier."""
    return (
        accuracy_score(y_train, clf.predict(X_train)),
        accuracy_score(y_test, clf.predict(X_test)),
    )


# --- Hard voting: majority vote over predicted labels ---
voting_clf = VotingClassifier(estimators=list(base_estimators), voting='hard')
voting_clf.fit(X_train, y_train)
hard_train_acc, hard_test_acc = _train_test_accuracy(voting_clf)
print(f"Hard Voting - train: {hard_train_acc:.4f}, test: {hard_test_acc:.4f}")

# --- Soft voting: vote over predicted class probabilities ---
voting_clf_soft = VotingClassifier(estimators=list(base_estimators), voting='soft')
voting_clf_soft.fit(X_train, y_train)
soft_train_acc, soft_test_acc = _train_test_accuracy(voting_clf_soft)
print(f"Soft Voting - train: {soft_train_acc:.4f}, test: {soft_test_acc:.4f}")

# --- Accuracy of the individual models ---
# The base classifiers are fitted here explicitly before being scored and
# saved on their own.
for base_clf in (tree_clf, log_clf, knn_clf):
    base_clf.fit(X_train, y_train)

# Order: tree, logistic regression, kNN, hard voting, soft voting.
acc_list = [_train_test_accuracy(clf) for clf in (tree_clf, log_clf, knn_clf)]
acc_list.append((hard_train_acc, hard_test_acc))
acc_list.append((soft_train_acc, soft_test_acc))

with open("acc_vote.pkl", "wb") as out_file:
    pickle.dump(acc_list, out_file)

# Persist the fitted classifiers in the same order as the accuracies.
clf_list = [tree_clf, log_clf, knn_clf, voting_clf, voting_clf_soft]
with open("vote.pkl", "wb") as out_file:
    pickle.dump(clf_list, out_file)


## 🧩 Bagging i warianty

In [None]:
def _tree_ensemble(**sampling):
    """Build a 30-tree BaggingClassifier (seed 42) with the given sampling options."""
    return BaggingClassifier(
        estimator=DecisionTreeClassifier(),
        n_estimators=30,
        random_state=42,
        **sampling,
    )


# Bagging draws training rows with replacement, pasting without; the "_50"
# variants give every tree only half of the training rows.
bag_clf = _tree_ensemble(bootstrap=True)
bag_clf_50 = _tree_ensemble(bootstrap=True, max_samples=0.5)
pas_clf = _tree_ensemble(bootstrap=False)
pas_clf_50 = _tree_ensemble(bootstrap=False, max_samples=0.5)

# Random Forest, AdaBoost and Gradient Boosting with the same ensemble size.
rfc = RandomForestClassifier(n_estimators=30, random_state=42)
ada_clf = AdaBoostClassifier(n_estimators=30, random_state=42)
gb_clf = GradientBoostingClassifier(n_estimators=30, random_state=42)

# Fit everything in the order in which the results are stored.
models = [bag_clf, bag_clf_50, pas_clf, pas_clf_50, rfc, ada_clf, gb_clf]
for model in models:
    model.fit(X_train, y_train)

# One (train accuracy, test accuracy) tuple per model.
train_scores = [accuracy_score(y_train, m.predict(X_train)) for m in models]
test_scores = [accuracy_score(y_test, m.predict(X_test)) for m in models]
acc_list2 = list(zip(train_scores, test_scores))

with open("acc_bag.pkl", "wb") as out_file:
    pickle.dump(acc_list2, out_file)

with open("bag.pkl", "wb") as out_file:
    pickle.dump(models, out_file)

acc_list2


## 🎯 Sampling cech i ranking estymatorów

In [None]:
# Feature sampling: this experiment uses all dataset columns, split 80/20
# with the same seed as the two-feature experiments.
X_train2, X_test2, y_train2, y_test2 = train_test_split(
    data_breast_cancer.data,
    data_breast_cancer.target,
    test_size=0.2,
    random_state=42,
)

# Each of the 30 trees is trained on 2 features drawn without replacement
# (bootstrap_features=False) and on a bootstrap sample of half the rows.
bag_sam = BaggingClassifier(
    estimator=DecisionTreeClassifier(),
    n_estimators=30,
    max_features=2,
    bootstrap=True,
    max_samples=0.5,
    bootstrap_features=False,
    # Fixed seed, like every other randomized estimator in this notebook, so
    # the saved accuracies and the per-estimator ranking are reproducible.
    random_state=42,
)
bag_sam.fit(X_train2, y_train2)

# Accuracy of the whole ensemble: [train, test].
acc_list3 = [
    accuracy_score(y_train2, bag_sam.predict(X_train2)),
    accuracy_score(y_test2, bag_sam.predict(X_test2)),
]
with open("acc_fea.pkl", "wb") as f:
    pickle.dump(acc_list3, f)

with open("fea.pkl", "wb") as f:
    pickle.dump([bag_sam], f)

# Accuracy ranking of the individual estimators.
# Rows are collected in a list and the DataFrame is built once, instead of
# enlarging it row by row with .loc, which reallocates on every insert.
all_columns = data_breast_cancer.data.columns
rank_rows = []
for est, features in zip(bag_sam.estimators_, bag_sam.estimators_features_):
    # estimators_features_ holds the column indices this estimator was fitted
    # on; map them back to column names to select the matching inputs.
    feature_names = all_columns[features]
    # Plain arrays are passed because the sub-estimators are evaluated on a
    # column subset rather than on the full named frame.
    train_acc = accuracy_score(y_train2, est.predict(X_train2[feature_names].values))
    test_acc = accuracy_score(y_test2, est.predict(X_test2[feature_names].values))
    rank_rows.append([train_acc, test_acc, feature_names.values])

df_rank = pd.DataFrame(rank_rows, columns=['train_accuracy', 'test_accuracy', 'features'])

# Best estimators first: by test accuracy, ties broken by train accuracy.
df_rank.sort_values(by=['test_accuracy', 'train_accuracy'], ascending=False, inplace=True)

with open("acc_fea_rank.pkl", "wb") as f:
    pickle.dump(df_rank, f)

df_rank.head()
