[Публичная ссылка на ноутбук](https://colab.research.google.com/drive/1Dhg5Fkr_10OgwtSNq1FqHu_oNcJzhQVB?usp=sharing)

# Отбор признаков

В прошлом семестре приходилось отбирать признаки вручную: по статистическим связям (матрица корреляций), по уникальности значений каждого признака и т. д.

Для этой задачи в sklearn существуют автоматизированные методы

In [None]:
import itertools

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score

from sklearn.feature_selection import (SelectKBest, 
                                       chi2, f_classif, f_regression,
                                       RFE)
from sklearn.decomposition import PCA
from sklearn.ensemble import ExtraTreesClassifier

## 0 Подготовка данных

Загрузим данные, подготовим их к работе

In [None]:
# Location of the preprocessed CSV and the column names assigned on load
# (the file's own first row is skipped via skiprows=1).
url = 'https://raw.githubusercontent.com/otverskoj/First-steps-in-Data-Analysis/main/datasets/classification/occupancy_detection_preprocessed.csv'
names = ['date', 'temperature', 'humidity', 'light', 'co2', 'humidity_ratio', 'occupancy']

df = pd.read_csv(url, names=names, skiprows=1)
df = df.reset_index(drop=True)
# The 'date' column is not used in the rest of the notebook
df = df.drop(columns=['date'])
df.head()

Unnamed: 0,temperature,humidity,light,co2,humidity_ratio,occupancy
0,23.7,26.272,585.2,749.2,0.004764,1
1,23.718,26.29,578.4,760.4,0.004773,1
2,23.73,26.23,572.666667,769.666667,0.004765,1
3,23.7225,26.125,493.75,774.75,0.004744,1
4,23.754,26.2,488.6,779.0,0.004767,1


In [None]:
# Every column except the last is a feature; the last column is the target.
X = df.iloc[:, :-1].values
y = df.iloc[:, -1].values

# Stratified 75/25 hold-out split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, stratify=y
)

# Fit the scaler on the training part only, then apply it to both parts
scaler = StandardScaler()
scaler.fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

## 1 Качество работы алгоритмов до отбора признаков

Протестируем алгоритмы классификации до отбора признаков, зафиксируем качество

In [None]:
def make_report(model_name, feature_selection, f1_score, df=None):
    """Build a one-row report and optionally append it to an existing report.

    Args:
        model_name: Value stored in the ``model`` column.
        feature_selection: Value stored in the ``feature_selection`` column.
        f1_score: Value stored in the ``f1_score`` column. Inside this
            function the parameter shadows any outer name ``f1_score``.
        df: Existing report DataFrame to extend, or ``None`` to start a new
            report.

    Returns:
        A new DataFrame. When ``df`` is ``None`` it holds only the new row;
        otherwise it holds the rows of ``df`` followed by the new row, with
        the index renumbered from 0. ``df`` itself is not modified.
    """
    cols = ['model', 'feature_selection', 'f1_score']
    local_df = pd.DataFrame([[model_name, feature_selection, f1_score]], columns=cols)
    if df is None:
        return local_df
    # DataFrame.append was removed in pandas 2.0; pd.concat is the supported
    # way to stack the frames and works on older pandas versions as well.
    return pd.concat([df, local_df], ignore_index=True)

In [None]:
def write_to_report(model_names, feature_selection, scores, report_df=None):
    """Add one report row per (model name, score) pair and return the report.

    ``model_names`` and ``scores`` are paired with ``zip``; every row gets the
    same ``feature_selection`` label. Rows are added through ``make_report``,
    starting from ``report_df`` (or from a fresh report when it is ``None``).
    If there are no pairs, ``report_df`` is returned as passed in.
    """
    accumulated = report_df
    for pair in zip(model_names, scores):
        name, value = pair
        accumulated = make_report(name, feature_selection, value, df=accumulated)
    return accumulated

In [None]:
def get_avarage_f1_score(clf, X, y):
    """Return the mean F1 score of ``clf`` over a 5-fold stratified split.

    For each fold a StandardScaler is fitted on the training rows only and
    applied to both the training and the held-out rows; ``clf`` is then
    refitted on the scaled training rows and scored on the scaled held-out
    rows with ``f1_score``.

    Args:
        clf: Estimator exposing ``fit(X, y)`` and ``predict(X)``. It is
            refitted in place on every fold.
        X: Feature array indexable by integer index arrays.
        y: Target array indexable by integer index arrays.

    Returns:
        The arithmetic mean of the per-fold F1 scores.
    """
    splitter = StratifiedKFold(n_splits=5)
    fold_scores = []
    for fit_idx, eval_idx in splitter.split(X, y):
        X_fit, X_eval = X[fit_idx], X[eval_idx]
        y_fit, y_eval = y[fit_idx], y[eval_idx]

        # Scaling statistics come from the training fold only
        fold_scaler = StandardScaler()
        fold_scaler.fit(X_fit)
        X_fit_scaled = fold_scaler.transform(X_fit)
        X_eval_scaled = fold_scaler.transform(X_eval)

        clf.fit(X_fit_scaled, y_fit)
        predictions = clf.predict(X_eval_scaled)
        fold_scores.append(f1_score(y_eval, predictions))
    return np.mean(fold_scores)

Протестируем работу алгоритма логистической регрессии и дерева решений

In [None]:
# Baseline: both classifiers evaluated on all features, no selection applied.
lg_clf = LogisticRegression(penalty='l1', solver='liblinear')
dt_clf = DecisionTreeClassifier()

logreg_f1_score = get_avarage_f1_score(lg_clf, X, y)
dectree_f1_score = get_avarage_f1_score(dt_clf, X, y)

# No existing report is passed, so this call starts a new one
report_df = write_to_report(
    ('LogisticRegression', 'DecisionTreeClassifier'),
    'No',
    (logreg_f1_score, dectree_f1_score),
)
report_df

Unnamed: 0,model,feature_selection,f1_score
0,LogisticRegression,No,0.976511
1,DecisionTreeClassifier,No,0.800973


## 2 Одномерный отбор признаков

In [None]:
# params = {
#     'score_func': (chi2, f_classif, f_regression),
#     'k': np.arange(3, len(df.columns))
# }
# selector_params = itertools.product(*[v for _, v in params.items()])
# print(params.keys().index())

In [None]:
# def select_features(selector, params, X, y, return_scores=True):
#     selector_params = itertools.product(*[v for _, v in params.items()])
#     best_avg_lg_score, best_avg_dt_score = -1, -1

#     for params in selector_params:  
#         k_best = SelectKBest(score_func=score_func, k=k)
#         fitted_k_best = k_best.fit(X, y)
#         features = fitted_k_best.transform(X)
        
#         lr_clf = LogisticRegression(penalty='l1', solver='liblinear')
#         score = get_avarage_f1_score(lr_clf, features, y)
#         best_avg_lg_score = score if score > best_avg_lg_score else best_avg_lg_score
        
#         dt_clf = DecisionTreeClassifier()
#         score = get_avarage_f1_score(dt_clf, features, y)
#         best_avg_dt_score = score if score > best_avg_dt_score else best_avg_dt_score
    
#     return features, best_avg_lg_score, best_avg_dt_score

In [None]:
# Print NumPy floating-point values with 3 digits of precision
np.set_printoptions(precision=3)

In [None]:
# Scoring functions to try with SelectKBest
score_funcs = (chi2, f_classif, f_regression)
# Feature counts from 3 up to df.shape[1] - 1 (df also holds the target column)
ks = np.arange(3, df.shape[1])
# NOTE: itertools.product returns a one-shot iterator; rebuild it before
# iterating over the grid a second time.
params = itertools.product(score_funcs, ks)
# Best mean F1 seen so far for each model; -1 is below any possible F1 value
best_avg_lg_score = best_avg_dt_score = -1

In [None]:
# Try every (score function, k) pair and keep the best mean F1 per model.
for score_func, k in params:
    # NOTE(review): the selector is fitted on the full data set before
    # cross-validation, so the held-out folds influence which features are kept.
    k_best = SelectKBest(score_func=score_func, k=k)
    fitted_k_best = k_best.fit(X, y)
    features = fitted_k_best.transform(X)

    lr_clf = LogisticRegression(penalty='l1', solver='liblinear')
    score = get_avarage_f1_score(lr_clf, features, y)
    if score > best_avg_lg_score:
        best_avg_lg_score = score

    dt_clf = DecisionTreeClassifier()
    score = get_avarage_f1_score(dt_clf, features, y)
    if score > best_avg_dt_score:
        best_avg_dt_score = score

In [None]:
# Append the best univariate-selection scores of both models to the report
report_df = write_to_report(
    ('LogisticRegression', 'DecisionTreeClassifier'),
    'Yes/UFS',
    (best_avg_lg_score, best_avg_dt_score),
    report_df,
)
report_df

Unnamed: 0,model,feature_selection,f1_score
0,LogisticRegression,No,0.976511
1,DecisionTreeClassifier,No,0.800973
2,LogisticRegression,Yes/UFS,0.976834
3,DecisionTreeClassifier,Yes/UFS,0.856621


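Видим, что качество дерева решений заметно выросло (0.801 → 0.857), а качество логистической регрессии практически не изменилось (0.9765 → 0.9768)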

## 3 Рекурсивное исключение признаков

In [None]:
# Numbers of features to keep: from 3 up to df.shape[1] - 1
# (df also holds the target column)
nums = np.arange(3, df.shape[1])
# Best mean F1 seen so far for each model; -1 is below any possible F1 value
best_avg_lg_score = best_avg_dt_score = -1

In [None]:
# For each target feature count, run recursive feature elimination with each
# model as the ranking estimator and keep the best mean F1 per model.
for n in nums:
    # n_features_to_select is keyword-only in scikit-learn >= 1.0, so passing
    # it positionally raises TypeError; a plain int avoids any strictness
    # about NumPy integer types in parameter validation.
    n_selected = int(n)

    lr_clf = LogisticRegression(penalty='l1', solver='liblinear')
    rfe = RFE(lr_clf, n_features_to_select=n_selected)
    fitted_rfe = rfe.fit(X, y)
    features = fitted_rfe.transform(X)

    score = get_avarage_f1_score(lr_clf, features, y)
    best_avg_lg_score = max(best_avg_lg_score, score)

    dt_clf = DecisionTreeClassifier()
    rfe = RFE(dt_clf, n_features_to_select=n_selected)
    fitted_rfe = rfe.fit(X, y)
    features = fitted_rfe.transform(X)

    score = get_avarage_f1_score(dt_clf, features, y)
    best_avg_dt_score = max(best_avg_dt_score, score)

In [None]:
# Append the best RFE scores of both models to the report
report_df = write_to_report(
    ('LogisticRegression', 'DecisionTreeClassifier'),
    'Yes/RFE',
    (best_avg_lg_score, best_avg_dt_score),
    report_df,
)
report_df

Unnamed: 0,model,feature_selection,f1_score
0,LogisticRegression,No,0.976511
1,DecisionTreeClassifier,No,0.800973
2,LogisticRegression,Yes/UFS,0.976834
3,DecisionTreeClassifier,Yes/UFS,0.856621
4,LogisticRegression,Yes/RFE,0.976818
5,DecisionTreeClassifier,Yes/RFE,0.853561


С RFE качество улучшилось примерно так же, как и с UFS

## 4 Метод главных компонент

In [None]:
# Numbers of principal components to try: 2, 3 and 4
nums = tuple(range(2, 5))
# Best mean F1 seen so far for each model; -1 is below any possible F1 value
best_avg_lg_score = best_avg_dt_score = -1

In [None]:
# Project the data onto n principal components for each n and keep the best
# mean F1 per model.
for n in nums:
    pca = PCA(n)
    # fit() returns the estimator itself, so both names refer to one object
    fitted_pca = pca.fit(X, y)
    features = fitted_pca.transform(X)

    lr_clf = LogisticRegression(penalty='l1', solver='liblinear')
    score = get_avarage_f1_score(lr_clf, features, y)
    if score > best_avg_lg_score:
        best_avg_lg_score = score

    dt_clf = DecisionTreeClassifier()
    score = get_avarage_f1_score(dt_clf, features, y)
    if score > best_avg_dt_score:
        best_avg_dt_score = score

In [None]:
# Append the best PCA scores of both models to the report
report_df = write_to_report(
    ('LogisticRegression', 'DecisionTreeClassifier'),
    'Yes/PCA',
    (best_avg_lg_score, best_avg_dt_score),
    report_df,
)
report_df

Unnamed: 0,model,feature_selection,f1_score
0,LogisticRegression,No,0.976511
1,DecisionTreeClassifier,No,0.800973
2,LogisticRegression,Yes/UFS,0.976834
3,DecisionTreeClassifier,Yes/UFS,0.856621
4,LogisticRegression,Yes/RFE,0.976818
5,DecisionTreeClassifier,Yes/RFE,0.853561
6,LogisticRegression,Yes/PCA,0.977262
7,DecisionTreeClassifier,Yes/PCA,0.890993


С использованием PCA качество стало ещё выше, чем с RFE и UFS

## 5 Отбор на основе важности признаков

In [None]:
# Numbers of features to keep: from 2 up to df.shape[1] - 1
# (df also holds the target column)
n_highest_features = np.arange(2, df.shape[1])
# Best mean F1 seen so far for each model; -1 is below any possible F1 value
best_avg_lg_score = best_avg_dt_score = -1

In [None]:
# Rank features by ExtraTreesClassifier importance, then evaluate both models
# on the top-n features for every n and keep the best mean F1 per model.
etc = ExtraTreesClassifier()
etc.fit(X, y)
# np.argsort sorts ascending, so reverse it to get indices ordered from the
# most to the least important feature. Without the reversal the slice below
# would pick the n LEAST important features.
sorted_indices = np.argsort(etc.feature_importances_)[::-1]
for n in n_highest_features:
    # Columns of the n most important features
    features = X[:, sorted_indices[:n]]

    lr_clf = LogisticRegression(penalty='l1', solver='liblinear')
    score = get_avarage_f1_score(lr_clf, features, y)
    best_avg_lg_score = max(best_avg_lg_score, score)

    dt_clf = DecisionTreeClassifier()
    score = get_avarage_f1_score(dt_clf, features, y)
    best_avg_dt_score = max(best_avg_dt_score, score)

In [None]:
# Append the best importance-based selection scores of both models to the report
report_df = write_to_report(
    ('LogisticRegression', 'DecisionTreeClassifier'),
    'Yes/ETC',
    (best_avg_lg_score, best_avg_dt_score),
    report_df,
)
report_df

Unnamed: 0,model,feature_selection,f1_score
0,LogisticRegression,No,0.976511
1,DecisionTreeClassifier,No,0.800973
2,LogisticRegression,Yes/UFS,0.976834
3,DecisionTreeClassifier,Yes/UFS,0.856621
4,LogisticRegression,Yes/RFE,0.976818
5,DecisionTreeClassifier,Yes/RFE,0.853561
6,LogisticRegression,Yes/PCA,0.977262
7,DecisionTreeClassifier,Yes/PCA,0.890993
8,LogisticRegression,Yes/ETC,0.976511
9,DecisionTreeClassifier,Yes/ETC,0.809934


С использованием ETC качество практически не изменилось по сравнению с результатами без отбора признаков

## 6 Вывод

После проведённых экспериментов получилось, что лучшую прибавку к качеству дал PCA, а худшую – отбор на основе важности признаков с использованием ExtraTreesClassifier

## TODO

* сделать вывод наилучших признаков, выбранных каждым методом

* добавить столбец с лучшими выбранными признаками в report_df