In [35]:
import pandas as pd
import numpy as np

from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.feature_selection import (VarianceThreshold,
                                       SelectKBest,
                                       f_classif,
                                       SelectFromModel,
                                       SequentialFeatureSelector)

### 1. Сгенерируйте данные.

In [36]:
# Generate a synthetic classification dataset using make_classification's
# default sizes. The fixed random_state makes the generated data identical on
# every Restart & Run All, so downstream scores do not change merely because
# the data was re-sampled.
x_data_generated, y_data_generated = make_classification(scale=1, random_state=42)

### 2. Постройте модель логистической регрессии и оцените среднюю точность.

In [37]:
# Baseline: cross-validated accuracy of a default logistic regression trained
# on every generated feature.
initial_fold_scores = cross_val_score(
    LogisticRegression(),
    x_data_generated,
    y_data_generated,
    scoring='accuracy',
)
initial_accuracy = initial_fold_scores.mean()
print(f'Accuracy: {initial_accuracy}')

Accuracy: 0.9099999999999999


### 3. Используйте статистические методы для отбора признаков:

#### (a) Выберите признаки на основе матрицы корреляции.

In [38]:
# Hold the features and the label in one frame so that a single corr() call
# yields every feature-to-target correlation.
data = pd.DataFrame(data=x_data_generated)
data['target'] = y_data_generated

# Pairwise correlations between all columns, including 'target'.
correlation_matrix = data.corr()

In [39]:
# Rank every column by the absolute value of its correlation with the target.
correlation_with_target = correlation_matrix['target'].abs().sort_values(ascending=False)

# Exclude the target by label rather than by position: slicing from index 1
# assumes the target always sorts first, which is not guaranteed when a
# feature ties with it at |corr| == 1.
ranked_labels = correlation_with_target.index
ranked_feature_labels = ranked_labels[ranked_labels != 'target']
selected_features_corr = ranked_feature_labels[:10]  # ten most correlated features

# NOTE(review): the ranking uses labels from the full dataset, so a later
# cross-validation on these columns is optimistically biased; consider doing
# the selection inside each CV fold.
x_corr_selected = data[selected_features_corr]

In [40]:
# Re-run the baseline evaluation, this time on the correlation-selected columns.
corr_fold_scores = cross_val_score(
    LogisticRegression(),
    x_corr_selected,
    y_data_generated,
    scoring='accuracy',
)
corr_accuracy = corr_fold_scores.mean()
print(f'Accuracy withou low corr featuress: {corr_accuracy}')

Accuracy withou low corr featuress: 0.96


#### (b) Отсеките низковариативные признаки (VarianceThreshold).

In [41]:
# Variance filter with the selector's default threshold (no argument given).
# In the recorded run the results table shows all 20 columns surviving it.
selector = VarianceThreshold()
x_var_threshold = selector.fit_transform(X=x_data_generated)

#### (c) Повторите п. 2 на отобранных признаках в п. 3a, п. 3b.

In [42]:
# Same evaluation as the baseline, on the variance-filtered feature matrix.
var_threshold_fold_scores = cross_val_score(
    LogisticRegression(),
    x_var_threshold,
    y_data_generated,
    scoring='accuracy',
)
var_threshold_accuracy = var_threshold_fold_scores.mean()
print(f'Accuracy with VarianceThreshold: {var_threshold_accuracy}')

Accuracy with VarianceThreshold: 0.9099999999999999


### 4. Осуществите отбор признаков на основе дисперсионного анализа:

#### (a) Выберите 5 лучших признаков с помощью скоринговой функции для классификации f_classif (SelectKBest(f_classif, k=5)).

In [43]:
# Univariate selection: score each feature with f_classif and keep the five
# highest-scoring ones.
selector = SelectKBest(score_func=f_classif, k=5)
x_kbest = selector.fit_transform(X=x_data_generated, y=y_data_generated)

#### (b) Повторите п. 2 на отобранных признаках.

In [44]:
# Same evaluation as the baseline, on the five f_classif-selected features.
kbest_fold_scores = cross_val_score(
    LogisticRegression(),
    x_kbest,
    y_data_generated,
    scoring='accuracy',
)
kbest_accuracy = kbest_fold_scores.mean()
print(f'Accuracy with f_classif-selected features: {kbest_accuracy}')

Accuracy with f_classif-selected features: 0.96


### 5. Отбор с использованием моделей:

#### (a) Реализуйте отбор признаков с помощью логистической регрессии. Отобранные признаки подайте далее на вход в саму логистическую регрессию (SelectFromModel). Используйте L1 регуляризацию.

In [45]:
# Model-based selection with an L1-penalised logistic regression.
# The liblinear solver shuffles the data internally, so random_state is pinned
# to make the fitted coefficients (and hence the selected columns) repeatable.
model = LogisticRegression(penalty='l1', solver='liblinear', random_state=42)

# SelectFromModel fits the estimator and keeps the features that pass its
# default threshold.
selector = SelectFromModel(model)
x_l1_selected = selector.fit_transform(x_data_generated, y_data_generated)

#### (b) Реализуйте отбор признаков с помощью модели RandomForest и встроенного атрибута feature_importances_.

In [46]:
# Fit a random forest and read its per-feature importances.
# The forest is randomised, so random_state is pinned; otherwise the top-10
# set of columns can change from one run to the next.
model = RandomForestClassifier(random_state=42)
model.fit(x_data_generated, y_data_generated)
importances = model.feature_importances_

# argsort is ascending, so the last ten positions hold the column indices of
# the ten largest importances.
indices = np.argsort(importances)[-10:]

In [47]:
# Keep every row, but only the columns picked by the random-forest ranking.
rf_column_selector = (slice(None), indices)
x_rf_selected = x_data_generated[rf_column_selector]

#### (c) Повторите п. 2 на отобранных признаках в п. 5a, п. 5b.

In [48]:
# Same evaluation as the baseline, on the features kept by the L1 model.
l1_fold_scores = cross_val_score(
    LogisticRegression(),
    x_l1_selected,
    y_data_generated,
    scoring='accuracy',
)
l1_accuracy = l1_fold_scores.mean()
print(f'Accuracy with L1 regularization-selected features: {l1_accuracy}')

Accuracy with L1 regularization-selected features: 0.97


In [49]:
# Same evaluation as the baseline, on the random-forest-selected features.
rf_fold_scores = cross_val_score(
    LogisticRegression(),
    x_rf_selected,
    y_data_generated,
    scoring='accuracy',
)
rf_accuracy = rf_fold_scores.mean()
print(f'Acuracy with RandomForest-selected features: {rf_accuracy}')

Acuracy with RandomForest-selected features: 0.99


### 6. Перебор признаков:

#### (a) SequentialFeatureSelector.

In [50]:
# Sequential (wrapper) selection: search for ten features using a default
# logistic regression as the scoring estimator, with the selector's default
# search settings.
model = LogisticRegression()
sfs = SequentialFeatureSelector(estimator=model, n_features_to_select=10)
x_sfs_selected = sfs.fit_transform(X=x_data_generated, y=y_data_generated)

#### (b) Повторите п. 2 на отобранных признаках.

In [51]:
# Same evaluation as the baseline, on the sequentially selected features.
sfs_fold_scores = cross_val_score(
    LogisticRegression(),
    x_sfs_selected,
    y_data_generated,
    scoring='accuracy',
)
sfs_accuracy = sfs_fold_scores.mean()
print(f'Accuracy with SequentialFeatureSelector-selected features: {sfs_accuracy}')

Accuracy with SequentialFeatureSelector-selected features: 0.99


### 7. Сформулируйте выводы по проделанной работе:

#### (a) Сделайте таблицу вида |способ выбора признаков|количество признаков|средняя точность модели|.

In [52]:
# Summary table: one row per selection method, holding the method label, the
# number of columns it kept, and the mean cross-validated accuracy.
# Each row is written as a single tuple so the three values of a method stay
# together instead of being spread over three parallel lists.
summary_columns = ['Способ отбора признаков', 'Кол-во признаков', 'Средняя accuracy']
summary_rows = [
    ('Исходные признаки', x_data_generated.shape[1], initial_accuracy),
    ('Матрица корреляции', x_corr_selected.shape[1], corr_accuracy),
    ('VarianceThreshold', x_var_threshold.shape[1], var_threshold_accuracy),
    ('f_classif', x_kbest.shape[1], kbest_accuracy),
    ('L1 регуляризация', x_l1_selected.shape[1], l1_accuracy),
    ('RandomForest', x_rf_selected.shape[1], rf_accuracy),
    ('SequentialFeatureSelector', x_sfs_selected.shape[1], sfs_accuracy),
]
results = pd.DataFrame(summary_rows, columns=summary_columns)

In [53]:
# Display the comparison table with the highest mean accuracy first.
results.sort_values(
    by='Средняя accuracy',
    ascending=False,
)

Unnamed: 0,Способ отбора признаков,Кол-во признаков,Средняя accuracy
5,RandomForest,10,0.99
6,SequentialFeatureSelector,10,0.99
4,L1 регуляризация,10,0.97
1,Матрица корреляции,10,0.96
3,f_classif,5,0.96
0,Исходные признаки,20,0.91
2,VarianceThreshold,20,0.91
