In [None]:
import warnings

import numpy as np
import pandas as pd

from sklearn.pipeline import Pipeline

from sklearn.feature_selection import SelectKBest, chi2, mutual_info_classif
from sklearn.feature_selection import SelectFromModel
from sklearn.feature_selection import RFE

from sklearn.preprocessing import PolynomialFeatures

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

warnings.filterwarnings("ignore", category=DeprecationWarning)

In [None]:
# String labels of `famhist` and the integer codes they are replaced with.
map_dict = {'Present': 1, 'Absent': 0}

df = pd.read_csv('heart.csv')

# Target vector: the `chd` column copied into a plain NumPy array.
y = df['chd'].to_numpy(copy=True)

# Feature frame: every column except the target, with `famhist` mapped through map_dict.
X = (
    df
    .drop(columns='chd')
    .assign(famhist=lambda frame: frame['famhist'].map(map_dict))
)
X.head()

In [None]:
# Hold out 20% of the rows for testing, stratified on the target.
# The seed is pinned so that Restart & Run All reproduces the same split
# (and therefore the same scores in every cell below).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

In [None]:
# Dimensions of the training feature frame.
X_train.shape

In [None]:
# Preview the first rows instead of rendering the whole training frame.
X_train.head()

## Strojenie hiperparametrów

### Grid Search

In [None]:
# Hyperparameter search space for the SVC; the randomized search further down
# reuses this same `parameters` object.
parameters = dict(
    kernel=['linear', 'rbf'],
    C=[1, 3, 6, 10],
    gamma=['auto', 'scale'],
)

model_scv = SVC()

# Exhaustive search: every combination in `parameters` is scored with 3-fold CV.
clf_grid = GridSearchCV(estimator=model_scv, param_grid=parameters, cv=3)

clf_grid.fit(X_train, y_train)

In [None]:
# Parameter combinations tried by the grid search, best mean CV score first.
(
    pd.DataFrame(clf_grid.cv_results_)
    .loc[:, ['params', 'mean_test_score']]
    .sort_values(by='mean_test_score', ascending=False)
)

### Random search

In [None]:
# Randomized search: evaluate only 5 candidates sampled from the `parameters`
# space instead of every combination.
model_scv = SVC()

# The seed is pinned so the same 5 candidates are sampled on every run.
clf_rand = RandomizedSearchCV(
    model_scv, parameters, cv=3, n_iter=5, random_state=42
)

clf_rand.fit(X_train, y_train)

In [None]:
# Candidates sampled by the randomized search, best mean CV score first.
(
    pd.DataFrame(clf_rand.cv_results_)
    .loc[:, ['params', 'mean_test_score']]
    .sort_values(by='mean_test_score', ascending=False)
)

### Znalezienie najlepszych parametrów lasu losowego

Wskazówki:
- Przejrzyj dokumentację, aby ocenić które hiperparametry wydają się warte optymalizowania,
- Nie optymalizuj liczby drzew, ustal jakąś sensowną ale małą, tak żeby się szybko liczyło,
- Ogranicz liczbę iteracji (`n_iter`) oraz walidacji (`cv`).


In [None]:
# Write your code here



## Selekcja zmiennych

In [None]:
# Polynomial feature expansion of degree 3, applied to the training features below.
pf = PolynomialFeatures(degree=3)

In [None]:
# Fit the expansion on the training features and transform them in one step.
X_features = pf.fit_transform(X_train)

In [None]:
# `get_feature_names` was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer `get_feature_names_out`, falling back only on older releases.
if hasattr(pf, "get_feature_names_out"):
    poly_feature_names = pf.get_feature_names_out(X_train.columns)
else:
    poly_feature_names = pf.get_feature_names(X_train.columns)
poly_feature_names

In [None]:
# Shape of the expanded feature matrix returned by `pf.fit_transform`.
X_features.shape

In [None]:
def feature_names(selector):
    """Return the names of the polynomial features kept by a fitted selector.

    Relies on the notebook-level ``pf`` (the fitted polynomial expansion) and
    ``X_train`` (whose columns name the expansion's inputs).

    Parameters
    ----------
    selector
        A fitted selector exposing ``get_support()``, whose result is used to
        index the full array of expanded feature names.

    Returns
    -------
    numpy.ndarray
        String array holding the names of the selected features.
    """
    # `get_feature_names` was deprecated in scikit-learn 1.0 and removed in 1.2;
    # prefer `get_feature_names_out` and fall back only on older releases.
    if hasattr(pf, "get_feature_names_out"):
        all_names = pf.get_feature_names_out(X_train.columns)
    else:
        all_names = pf.get_feature_names(X_train.columns)
    # Force a string dtype so the result looks the same with either API.
    return np.asarray(all_names, dtype=str)[selector.get_support()]

### SelectKBest

In [None]:
# Univariate selector that scores features with `chi2` and keeps the 25 best.
# NOTE(review): chi2 expects non-negative feature values — confirm X_features has none below zero.
chi2_selector = SelectKBest(chi2, k=25)

In [None]:
# Fit the selector on the expanded training features and display the reduced matrix.
chi2_selector.fit_transform(X_features, y_train)

In [None]:
# Names of the expanded features kept by the chi2 selector.
feature_names(chi2_selector)

### Mutual information

In [None]:
# Same top-25 selection, scored with `mutual_info_classif` instead of chi2.
# NOTE(review): mutual_info_classif is used without a fixed random_state, so the
# selected set may vary between runs — confirm whether that matters here.
mi_selector = SelectKBest(mutual_info_classif, k=25)

In [None]:
# Fit the mutual-information selector on the expanded training features.
mi_selector.fit(X_features, y_train)

In [None]:
# Names of the expanded features kept by the mutual-information selector.
feature_names(mi_selector)

### Recursive Feature Elimination

In [None]:
# Recursive feature elimination driven by a logistic regression: 5 features are
# removed per iteration (step=5) until 25 remain.
# `RFE` is imported together with the other scikit-learn names in the imports cell.
estimator = LogisticRegression(solver='liblinear')
rfe_selector = RFE(estimator, n_features_to_select=25, step=5).fit(X_features, y_train)

In [None]:
# Names of the expanded features kept by recursive feature elimination.
feature_names(rfe_selector)

### Select From Model

In [None]:
# Model-based selector backed by an L1-penalised logistic regression;
# threshold="mean" uses the mean feature importance as the cut-off.
model_selector = SelectFromModel(
    estimator=LogisticRegression(solver="liblinear", penalty="l1", C=2.5e-5),
    threshold="mean",
)

In [None]:
# Fit the model-based selector on the expanded training features and display the reduced matrix.
model_selector.fit_transform(X_features, y_train)


In [None]:
# Names of the expanded features kept by the model-based selector.
feature_names(model_selector)