[Публичная ссылка на ноутбук](https://colab.research.google.com/drive/1Dhg5Fkr_10OgwtSNq1FqHu_oNcJzhQVB?usp=sharing)

# Отбор признаков

В прошлом семестре приходилось руками отбирать признаки через статистические связи (матрица корреляций), уникальность значений каждого признака и тд.

Для этой задачи в sklearn существуют автоматизированные методы

In [8]:
import itertools

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score

from sklearn.feature_selection import (SelectKBest, 
                                       chi2, f_classif, f_regression,
                                       RFE)
from sklearn.decomposition import PCA
from sklearn.ensemble import ExtraTreesClassifier

## 0 Подготовка данных

Загрузим данные, подготовим их к работе

In [9]:
url = 'https://raw.githubusercontent.com/otverskoj/First-steps-in-Data-Analysis/main/datasets/classification/occupancy_detection_preprocessed.csv'
names = ['date', 'temperature', 'humidity', 'light', 'co2', 'humidity_ratio', 'occupancy']
df = pd.read_csv(url, names=names, skiprows=1).reset_index(drop=True).drop(['date'], axis=1)
df.head()

Unnamed: 0,temperature,humidity,light,co2,humidity_ratio,occupancy
0,23.7,26.272,585.2,749.2,0.004764,1
1,23.718,26.29,578.4,760.4,0.004773,1
2,23.73,26.23,572.666667,769.666667,0.004765,1
3,23.7225,26.125,493.75,774.75,0.004744,1
4,23.754,26.2,488.6,779.0,0.004767,1


In [10]:
X, y = df.iloc[:, :-1].values, df.iloc[:, -1].values

X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.25, 
                                                    stratify=y)

scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

## 1 Качество работы алгоритмов до отбора признаков

Протестируем алгоритмы классификации до отбора признаков, зафиксируем качество

In [11]:
def make_report(model_name, feature_selection, f1_score, df=None):
    '''Make report of models error before/after feature selection'''
    cols = ['model', 'feature_selection', 'f1_score']
    data = [
        [model_name, feature_selection, f1_score]
    ]
    local_df = pd.DataFrame(data, columns=cols)
    return local_df if df is None else df.append(local_df, ignore_index=True)

In [12]:
def write_to_report(model_names, feature_selection, scores, report_df=None):
    for model_name, score in zip(model_names, scores):
        report_df = make_report(model_name, feature_selection, score, report_df)
    return report_df

In [13]:
def get_avarage_f1_score(clf, X, y):
    skf = StratifiedKFold(n_splits=5)
    scores = []
    for train_index, test_index in skf.split(X, y):
        scaler = StandardScaler().fit(X[train_index])

        X_train = scaler.transform(X[train_index])
        X_test = scaler.transform(X[test_index])

        y_train, y_test = y[train_index], y[test_index]
        
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        
        scores.append(f1_score(y_test, y_pred))
    return np.mean(scores)

Протестируем работу алгоритма логистической регрессии и дерева решений

In [14]:
lg_clf = LogisticRegression(penalty='l1', solver='liblinear')
logreg_f1_score = get_avarage_f1_score(lg_clf, X, y)

dt_clf = DecisionTreeClassifier()
dectree_f1_score = get_avarage_f1_score(dt_clf, X, y)

report_df = write_to_report(model_names=('LogisticRegression', 'DecisionTreeClassifier'),
                            feature_selection='No',
                            scores=(logreg_f1_score, dectree_f1_score))
report_df

Unnamed: 0,model,feature_selection,f1_score
0,LogisticRegression,No,0.976511
1,DecisionTreeClassifier,No,0.803453


## 2 Одномерный отбор признаков

In [15]:
np.set_printoptions(precision=3)

In [16]:
score_funcs = (chi2, f_classif, f_regression)
ks = np.arange(3, len(df.columns))
params = itertools.product(score_funcs, ks)
best_avg_lg_score, best_avg_dt_score = -1, -1

In [17]:
for score_func, k in params:
    k_best = SelectKBest(score_func=score_func, k=k)
    fitted_k_best = k_best.fit(X, y)
    features = fitted_k_best.transform(X)
    
    lr_clf = LogisticRegression(penalty='l1', solver='liblinear')
    score = get_avarage_f1_score(lr_clf, features, y)
    best_avg_lg_score = score if score > best_avg_lg_score else best_avg_lg_score

    dt_clf = DecisionTreeClassifier()
    score = get_avarage_f1_score(dt_clf, features, y)
    best_avg_dt_score = score if score > best_avg_dt_score else best_avg_dt_score

In [18]:
report_df = write_to_report(model_names=('LogisticRegression', 'DecisionTreeClassifier'),
                            feature_selection='Yes/UFS',
                            scores=(best_avg_lg_score, best_avg_dt_score),
                            report_df=report_df)
report_df

Unnamed: 0,model,feature_selection,f1_score
0,LogisticRegression,No,0.976511
1,DecisionTreeClassifier,No,0.803453
2,LogisticRegression,Yes/UFS,0.976834
3,DecisionTreeClassifier,Yes/UFS,0.864717


Видим, что качество работы моделей улучшилось

## 3 Рекурсивное исключение признаков

In [0]:
nums = np.arange(3, len(df.columns))
best_avg_lg_score, best_avg_dt_score = -1, -1

In [0]:
for n in nums:
    lr_clf = LogisticRegression(penalty='l1', solver='liblinear')
    rfe = RFE(lr_clf, n)
    fitted_rfe = rfe.fit(X, y)
    features = fitted_rfe.transform(X)

    score = get_avarage_f1_score(lr_clf, features, y)
    best_avg_lg_score = score if score > best_avg_lg_score else best_avg_lg_score

    dt_clf = DecisionTreeClassifier()
    rfe = RFE(dt_clf, n)
    fitted_rfe = rfe.fit(X, y)
    features = fitted_rfe.transform(X)

    score = get_avarage_f1_score(dt_clf, features, y)
    best_avg_dt_score = score if score > best_avg_dt_score else best_avg_dt_score

In [0]:
report_df = write_to_report(model_names=('LogisticRegression', 'DecisionTreeClassifier'),
                            feature_selection='Yes/RFE',
                            scores=(best_avg_lg_score, best_avg_dt_score),
                            report_df=report_df)
report_df

С RFE качество улучшилось примерно как в UFS

## 4 Метод главных компонент

In [0]:
nums = (2, 3, 4)
best_avg_lg_score, best_avg_dt_score = -1, -1

In [0]:
for n in nums:
    pca = PCA(n)
    fitted_pca = pca.fit(X, y)
    features = fitted_pca.transform(X)
    
    lr_clf = LogisticRegression(penalty='l1', solver='liblinear')
    score = get_avarage_f1_score(lr_clf, features, y)
    best_avg_lg_score = score if score > best_avg_lg_score else best_avg_lg_score

    dt_clf = DecisionTreeClassifier()
    score = get_avarage_f1_score(dt_clf, features, y)
    best_avg_dt_score = score if score > best_avg_dt_score else best_avg_dt_score

In [0]:
report_df = write_to_report(model_names=('LogisticRegression', 'DecisionTreeClassifier'),
                            feature_selection='Yes/PCA',
                            scores=(best_avg_lg_score, best_avg_dt_score),
                            report_df=report_df)
report_df

С использованием PCA качество стало ещё больше, чем с RFE и UFS

## 5 Отбор на основе важности признаков

In [0]:
n_highest_features = np.arange(2, len(df.columns))
best_avg_lg_score, best_avg_dt_score = -1, -1

In [0]:
etc = ExtraTreesClassifier()
etc.fit(X, y)
sorted_indices = np.argsort(etc.feature_importances_)
for n in n_highest_features:
    features = X[:, sorted_indices[:n]]
    
    lr_clf = LogisticRegression(penalty='l1', solver='liblinear')
    score = get_avarage_f1_score(lr_clf, features, y)
    best_avg_lg_score = score if score > best_avg_lg_score else best_avg_lg_score

    dt_clf = DecisionTreeClassifier()
    score = get_avarage_f1_score(dt_clf, features, y)
    best_avg_dt_score = score if score > best_avg_dt_score else best_avg_dt_score

In [0]:
report_df = write_to_report(model_names=('LogisticRegression', 'DecisionTreeClassifier'),
                            feature_selection='Yes/ETC',
                            scores=(best_avg_lg_score, best_avg_dt_score),
                            report_df=report_df)
report_df

С использованием ETC качество не изменилось

## 6 Вывод

После проведённых экспериментов получилось, что лучшую прибавку к качеству дал PCA, а худшую – отбор на основе важности признаков с использованием ExtraTreesClassifier

## TODO

* сделать вывод наилучших признаков, выбранных каждым методом

* добавить столбец с лучшими выбранными признаками в report_df

## For Nura

In [2]:
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

from sklearn.feature_selection import (SelectKBest, 
                                       f_classif,
                                       RFE)
from sklearn.decomposition import PCA
from sklearn.ensemble import ExtraTreesClassifier

In [3]:
file_path = "./bank_data_set.csv"
df = pd.read_csv(file_path, index_col=0)
df.head()

Unnamed: 0,age,default,balance,housing,loan,day,duration,campaign,pdays,previous,...,month_jun,month_mar,month_may,month_nov,month_oct,month_sep,poutcome_failure,poutcome_other,poutcome_success,poutcome_unknown
0,30,0,1787,0,0,19,79,1,-1,0,...,0,0,0,0,1,0,0,0,0,1
1,33,0,4789,1,1,11,220,1,339,4,...,0,0,1,0,0,0,1,0,0,0
2,35,0,1350,1,0,16,185,1,330,1,...,0,0,0,0,0,0,1,0,0,0
3,30,0,1476,1,1,3,199,4,-1,0,...,1,0,0,0,0,0,0,0,0,1
4,59,0,0,1,0,5,226,1,-1,0,...,0,0,1,0,0,0,0,0,0,1


In [13]:
df.columns

Index(['age', 'default', 'balance', 'housing', 'loan', 'day', 'duration',
       'campaign', 'pdays', 'previous', 'y', 'job_admin.', 'job_blue-collar',
       'job_entrepreneur', 'job_housemaid', 'job_management', 'job_retired',
       'job_self-employed', 'job_services', 'job_student', 'job_technician',
       'job_unemployed', 'job_unknown', 'marital_divorced', 'marital_married',
       'marital_single', 'education_primary', 'education_secondary',
       'education_tertiary', 'education_unknown', 'contact_cellular',
       'contact_telephone', 'contact_unknown', 'month_apr', 'month_aug',
       'month_dec', 'month_feb', 'month_jan', 'month_jul', 'month_jun',
       'month_mar', 'month_may', 'month_nov', 'month_oct', 'month_sep',
       'poutcome_failure', 'poutcome_other', 'poutcome_success',
       'poutcome_unknown'],
      dtype='object')

In [4]:
X, y = df.drop('y', axis=1).values, df.iloc[:, 10].values

scaler = StandardScaler()
scaler.fit(X)
scaled_X = scaler.transform(X)

X_train, X_test, y_train, y_test = train_test_split(scaled_X, y, 
                                                    test_size=0.25, 
                                                    stratify=y)

In [5]:
logreg_clf = LogisticRegression()
logreg_clf.fit(X_train, y_train)
print("Точность логистической регрессии без отбора признаков: ", logreg_clf.score(X_test, y_test))

tree_clf = DecisionTreeClassifier()
tree_clf.fit(X_train, y_train)
print("Точность дерева решений без отбора признаков: ", tree_clf.score(X_test, y_test))

Точность логистической регрессии без отбора признаков:  0.8885941644562334
Точность дерева решений без отбора признаков:  0.8709106984969054


## 1. Одномерный отбор признаков 

Отбор признаков на основе статистической связи (корреляции) с целевой переменной

In [20]:
logreg_scores, tree_scores = [], []
for k in range(3, 11):
    k_best = SelectKBest(score_func=f_classif, k=k)
    fitted = k_best.fit(scaled_X, y)
    features = fitted.transform(scaled_X)

    X_train, X_test, y_train, y_test = train_test_split(features, y,
                                                        test_size=0.25,
                                                        stratify=y)

    logreg_clf = LogisticRegression()
    logreg_clf.fit(X_train, y_train)
    logreg_scores.append((k, logreg_clf.score(X_test, y_test)))

    tree_clf = DecisionTreeClassifier()
    tree_clf.fit(X_train, y_train)
    tree_scores.append((k, tree_clf.score(X_test, y_test)))

avg_logreg_score = sum([x[1] for x in logreg_scores]) / len(logreg_scores)
avg_tree_score = sum([x[1] for x in tree_scores]) / len(tree_scores)

In [21]:
print(f"Точность логистической регрессии при одномерном отборе признаков: {avg_logreg_score}")
print(f"Точность дерева решений при одномерном отборе признаков: {avg_tree_score}")

Точность логистической регрессии при одномерном отборе признаков: 0.9022988505747127
Точность дерева решений при одномерном отборе признаков: 0.876105216622458


## 2. Рекурсивное исключение признаков

Работа метода заключается в рекурсивном удалении признаком с наименьшей "стоимостью". Сначала каждый признак получает свою оценку "стоимости". Затем та часть признаков, что имеет наименьшие "стоимости", удаляется. Процедура рекурсивно повторяется для нового набора признаков. 

In [26]:
logreg_scores, tree_scores = [], []
for n in range(3, len(df.columns)):
    logreg_clf = LogisticRegression()
    rfe = RFE(logreg_clf, n)
    fitted = rfe.fit(scaled_X, y)
    features = fitted.transform(scaled_X)
    X_train, X_test, y_train, y_test = train_test_split(features, y,
                                                        test_size=0.25,
                                                        stratify=y)
    logreg_clf.fit(X_train, y_train)
    logreg_scores.append((n, logreg_clf.score(X_test, y_test)))

    tree_clf = DecisionTreeClassifier()
    rfe = RFE(tree_clf, n)
    fitted = rfe.fit(scaled_X, y)
    features = fitted.transform(scaled_X)
    X_train, X_test, y_train, y_test = train_test_split(features, y,
                                                        test_size=0.25,
                                                        stratify=y)
    tree_clf.fit(X_train, y_train)
    tree_scores.append((n, tree_clf.score(X_test, y_test)))

avg_logreg_score = sum([x[1] for x in logreg_scores]) / len(logreg_scores)
avg_tree_score = sum([x[1] for x in tree_scores]) / len(tree_scores)

In [27]:
print(f"Точность логистической регрессии при рекурсивном исключении признаков: {avg_logreg_score}")
print(f"Точность дерева решений при рекурсивном исключении признаков: {avg_tree_score}")

Точность логистической регрессии при одномерном отборе признаков: 0.9021258601468496
Точность дерева решений при одномерном отборе признаков: 0.8636451005266599


## 3. Метод главных компонент

Данный метод пытается спроецировать исходное пространство признаков в новое n-мерное пространство

In [8]:
logreg_scores, tree_scores = [], []
for n in range(2, 5):
    pca = PCA(n)
    fitted = pca.fit(scaled_X, y)
    features = fitted.transform(scaled_X)

    X_train, X_test, y_train, y_test = train_test_split(features, y,
                                                        test_size=0.25,
                                                        stratify=y)
    
    logreg_clf = LogisticRegression()
    logreg_clf.fit(X_train, y_train)
    logreg_scores.append((n, logreg_clf.score(X_test, y_test)))

    tree_clf = DecisionTreeClassifier()
    tree_clf.fit(X_train, y_train)
    tree_scores.append((n, tree_clf.score(X_test, y_test)))

avg_logreg_score = sum([x[1] for x in logreg_scores]) / len(logreg_scores)
avg_tree_score = sum([x[1] for x in tree_scores]) / len(tree_scores)

In [9]:
print(f"Точность логистической регрессии при отборе признаков методом главных компонент: {avg_logreg_score}")
print(f"Точность дерева решений при отборе признаков методом главных компонент: {avg_tree_score}")

Точность логистической регрессии при отборе признаков методом главных компонент: 0.8850574712643678
Точность дерева решений при отборе признаков методом главных компонент: 0.8028293545534925


## 4. Отбор на основе важности признаков

Ансамблевые алгоритмы на основе деревьев решений, такие как случайный лес (random forest), позволяют оценить важность признаков.

In [10]:
logreg_scores, tree_scores = [], []
n_highest_features = range(2, len(df.columns))
etc = ExtraTreesClassifier()
etc.fit(X, y)
sorted_indices = np.argsort(etc.feature_importances_)
for n in n_highest_features:
    features = scaled_X[:, sorted_indices[:n]]
    
    X_train, X_test, y_train, y_test = train_test_split(features, y,
                                                        test_size=0.25,
                                                        stratify=y)
    
    logreg_clf = LogisticRegression()
    logreg_clf.fit(X_train, y_train)
    logreg_scores.append((n, logreg_clf.score(X_test, y_test)))

    tree_clf = DecisionTreeClassifier()
    tree_clf.fit(X_train, y_train)
    tree_scores.append((n, tree_clf.score(X_test, y_test)))

avg_logreg_score = sum([x[1] for x in logreg_scores]) / len(logreg_scores)
avg_tree_score = sum([x[1] for x in tree_scores]) / len(tree_scores)

In [11]:
print(f"Точность логистической регрессии при отборе признаков методом главных компонент: {avg_logreg_score}")
print(f"Точность дерева решений при отборе признаков методом главных компонент: {avg_tree_score}")

Точность логистической регрессии при отборе признаков методом главных компонент: 0.8862802641232576
Точность дерева решений при отборе признаков методом главных компонент: 0.8684086761856391
