# Темы семинара: отбор признаков и приложения

- Фильтрационные методы
- Оберточные методы
- Встроенные методы
- Нелинейные классификаторы
- Сохранение модели

In [1]:
import numpy as np
import pandas as pd

In [5]:
POKEMON = "./Pokemon.csv"

In [6]:
data = pd.read_csv(POKEMON)

In [7]:
data.head(5)

Unnamed: 0,#,Name,Type 1,Type 2,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation,Legendary
0,1,Bulbasaur,Grass,Poison,318,45,49,49,65,65,45,1,False
1,2,Ivysaur,Grass,Poison,405,60,62,63,80,80,60,1,False
2,3,Venusaur,Grass,Poison,525,80,82,83,100,100,80,1,False
3,3,VenusaurMega Venusaur,Grass,Poison,625,80,100,123,122,120,80,1,False
4,4,Charmander,Fire,,309,39,52,43,60,50,65,1,False


Columns description (it's crucial!)


- #: ID for each pokemon
- Name: Name of each pokemon
- Type 1: Each pokemon has a type, this determines weakness/resistance to attacks
- Type 2: Some pokemon are dual type and have 2
- Total: sum of all stats that come after this, a general guide to how strong a pokemon is
- HP: hit points, or health, defines how much damage a pokemon can withstand before fainting
- Attack: the base modifier for normal attacks (eg. Scratch, Punch)
- Defense: the base damage resistance against normal attacks
- SP Atk: special attack, the base modifier for special attacks (e.g. fire blast, bubble beam)
- SP Def: the base damage resistance against special attacks
- Speed: determines which pokemon attacks first each round

In [8]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 800 entries, 0 to 799
Data columns (total 13 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   #           800 non-null    int64 
 1   Name        800 non-null    object
 2   Type 1      800 non-null    object
 3   Type 2      414 non-null    object
 4   Total       800 non-null    int64 
 5   HP          800 non-null    int64 
 6   Attack      800 non-null    int64 
 7   Defense     800 non-null    int64 
 8   Sp. Atk     800 non-null    int64 
 9   Sp. Def     800 non-null    int64 
 10  Speed       800 non-null    int64 
 11  Generation  800 non-null    int64 
 12  Legendary   800 non-null    bool  
dtypes: bool(1), int64(9), object(3)
memory usage: 75.9+ KB


In [9]:
# fillna and drop useless cols

display(data.isnull().sum())
data['Type 2'] = data['Type 2'].fillna('No 2nd type')

#               0
Name            0
Type 1          0
Type 2        386
Total           0
HP              0
Attack          0
Defense         0
Sp. Atk         0
Sp. Def         0
Speed           0
Generation      0
Legendary       0
dtype: int64

In [10]:
display(data.isnull().sum())

#             0
Name          0
Type 1        0
Type 2        0
Total         0
HP            0
Attack        0
Defense       0
Sp. Atk       0
Sp. Def       0
Speed         0
Generation    0
Legendary     0
dtype: int64

In [11]:
data.drop(columns=['#'], inplace=True)

In [12]:
data.drop(columns=['Name'], inplace=True)

In [13]:
data

Unnamed: 0,Type 1,Type 2,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation,Legendary
0,Grass,Poison,318,45,49,49,65,65,45,1,False
1,Grass,Poison,405,60,62,63,80,80,60,1,False
2,Grass,Poison,525,80,82,83,100,100,80,1,False
3,Grass,Poison,625,80,100,123,122,120,80,1,False
4,Fire,No 2nd type,309,39,52,43,60,50,65,1,False
...,...,...,...,...,...,...,...,...,...,...,...
795,Rock,Fairy,600,50,100,150,100,150,50,6,True
796,Rock,Fairy,700,50,160,110,160,110,110,6,True
797,Psychic,Ghost,600,80,110,60,150,130,70,6,True
798,Psychic,Dark,680,80,160,60,170,130,80,6,True


In [14]:
X = data.drop(columns='Legendary')
y = data['Legendary'].astype('int')

In [15]:
y.value_counts()

Legendary
0    735
1     65
Name: count, dtype: int64

In [16]:
y.value_counts(normalize=True)

Legendary
0    0.91875
1    0.08125
Name: proportion, dtype: float64

# Make some default pipeline

In [17]:
from sklearn.feature_selection import SelectKBest, SelectPercentile
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from category_encoders.leave_one_out import LeaveOneOutEncoder
from sklearn.svm import SVC
from sklearn.model_selection import cross_validate
import sklearn

In [18]:
data.head(2)

Unnamed: 0,Type 1,Type 2,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation,Legendary
0,Grass,Poison,318,45,49,49,65,65,45,1,False
1,Grass,Poison,405,60,62,63,80,80,60,1,False


In [31]:
# define cat_cols

cat_cols = ['Type 1', 'Type 2']

default_pipeline = Pipeline([
    ('cat_encoder_', LeaveOneOutEncoder(cols=cat_cols)),
    ('scaler_', StandardScaler()),
    ('model_', SVC(kernel='linear'))]
)

In [32]:
cv_res1 = cross_validate(default_pipeline,
                        X,
                        y,
                        cv=5,
                        scoring='f1',
                        n_jobs=-1,
                        return_train_score=True
                       )

In [33]:
cv_res1

{'fit_time': array([0.00920796, 0.00688624, 0.00825119, 0.00699186, 0.00631714]),
 'score_time': array([0.00357413, 0.00229406, 0.00329494, 0.00213289, 0.00208688]),
 'test_score': array([0.5       , 0.72727273, 0.47619048, 0.38095238, 0.64864865]),
 'train_score': array([0.71287129, 0.56097561, 0.6744186 , 0.7311828 , 0.72727273])}

In [34]:
cv_res1['test_score'].mean()

np.float64(0.5466128466128466)

# Make pipeline more complicated

In [35]:
# difficult pipeline

pipe_dif = Pipeline([
    ('cat_encoder_', LeaveOneOutEncoder(cols=cat_cols)),
    ('poly_featurizer_', PolynomialFeatures(degree=4)),
    ('scaler_', StandardScaler()),
    ('model_', SVC(kernel='linear'))]
)

In [36]:
cv_res2 = cross_validate(pipe_dif,
                        X,
                        y,
                        cv=5,
                        scoring='f1',
                        n_jobs=-1,
                        return_train_score=True
                       )

cv_res2

{'fit_time': array([0.02871203, 0.02355599, 0.02981114, 0.02984619, 0.02354813]),
 'score_time': array([0.00528884, 0.00476909, 0.00475407, 0.00479102, 0.00478482]),
 'test_score': array([0.375     , 0.88888889, 0.5       , 0.66666667, 0.53658537]),
 'train_score': array([0.95145631, 0.89583333, 0.97142857, 0.96153846, 0.98076923])}

In [37]:
cv_res2['test_score'].mean()

np.float64(0.5934281842818427)

# Introduce feature selectors

In [38]:
data_tr = pipe_dif[:-1]

In [39]:
data_tr

In [40]:
X.shape

(800, 10)

In [41]:
X_tr = data_tr.fit_transform(X, y)
print(f'data shape after transformation is {X_tr.shape}')

data shape after transformation is (800, 1001)


1k признаков - многовато, добавим в пайплайн селектор

## Фильтрационные методы

Суть таких методов в том, чтобы для каждого признака посчитать некоторую метрику "связи" с целевым признаком. И в результате оставить топ-K признаков согласно выбранной метрике.

 - статистика хи-квадрат
 - метрика mutual information

In [43]:
from sklearn.feature_selection import f_classif, chi2, mutual_info_classif

In [47]:
k_best = 50

pipe = Pipeline([
    ('cat_encoder_', LeaveOneOutEncoder(cols=cat_cols)),
    ('poly_featurizer_', PolynomialFeatures(degree=4)),
    ('scaler_', StandardScaler()),
    ('selector_', SelectKBest(score_func=mutual_info_classif, k=k_best)),
    ('model_', SVC(kernel='linear'))]
)

In [48]:
cv_res = cross_validate(pipe, X, y, cv=5, scoring='f1', return_train_score=True)
cv_res

{'fit_time': array([0.84150004, 0.82081389, 0.84545207, 0.81301594, 0.80627513]),
 'score_time': array([0.00324011, 0.00345707, 0.0028379 , 0.00295901, 0.0031321 ]),
 'test_score': array([0.14285714, 0.72      , 0.71428571, 0.42105263, 0.58823529]),
 'train_score': array([0.78431373, 0.69565217, 0.75247525, 0.76      , 0.81632653])}

In [49]:
# k best нужно подбирать

cv_res['test_score'].mean()

np.float64(0.5172861565678903)

## Жадный метод отбора

In [51]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

In [52]:
rfe = RFE(LogisticRegression(max_iter=1000), n_features_to_select=k_best, step=30)

In [53]:
X_tr.shape

(800, 1001)

In [54]:
res = rfe.fit_transform(X_tr, y)
display(res.shape)
res

(800, 50)

array([[-1.39963712, -0.46932423, -0.98555744, ..., -0.41626469,
        -0.52135831, -0.81966779],
       [-1.39963712, -0.39249109, -0.48479877, ..., -0.10954899,
        -0.18003271, -0.81698899],
       [-1.39963712, -0.2900469 ,  0.42451538, ...,  0.67405053,
         0.73602435, -0.81341726],
       ...,
       [ 1.61195431,  1.21460205, -0.16049792, ...,  4.39362648,
         2.1076934 ,  4.18343684],
       [ 1.61195431,  1.48809144, -0.16049792, ...,  5.8291638 ,
         2.51608978,  4.18343684],
       [ 1.61195431, -0.05301918,  1.36562373, ...,  1.1636675 ,
         0.19750623,  2.6404483 ]], shape=(800, 50))

In [55]:
pipe_rfe = Pipeline([
    ('cat_encoder_', LeaveOneOutEncoder(cols=cat_cols)),
    ('poly_featurizer_', PolynomialFeatures(degree=4)),
    ('scaler_', StandardScaler()),
    ('selector_', RFE(LogisticRegression(max_iter=1000),
                      n_features_to_select=30,
                      step=30
                     )),
    ('model_', SVC(kernel='linear'))])

In [56]:
cv_res3 = cross_validate(pipe_rfe, X, y, cv=5, scoring='f1', return_train_score=True)
cv_res3

{'fit_time': array([0.61824989, 0.71821809, 0.54749393, 0.63686132, 0.58288789]),
 'score_time': array([0.00287199, 0.00388193, 0.00298619, 0.00460386, 0.00301695]),
 'test_score': array([0.44444444, 0.8       , 0.70967742, 0.56      , 0.66666667]),
 'train_score': array([0.96226415, 0.86      , 0.91089109, 0.91836735, 0.89795918])}

In [57]:
cv_res3['test_score'].mean()

np.float64(0.6361577060931899)

## С помощью L1 регуляризации

In [60]:
from sklearn.feature_selection import SelectFromModel

In [61]:
sel = SelectFromModel(LogisticRegression(penalty='l1', max_iter=1000, solver='liblinear'), threshold=1e-5)

In [62]:
# пример

res = sel.fit_transform(X_tr, y)
display(res.shape)
res

(800, 49)

array([[-1.20562657, -0.44705251, -0.94218651, ..., -0.52135831,
        -0.72668962, -0.81966779],
       [-1.20562657, -0.37458929, -0.94218651, ..., -0.18003271,
        -0.70651232, -0.81698899],
       [-1.20562657, -0.24576578, -0.94218651, ...,  0.73602435,
        -0.6731154 , -0.81341726],
       ...,
       [-1.20562657,  0.66347512, -0.94218651, ...,  2.1076934 ,
         4.87819432,  4.18343684],
       [-1.20562657,  1.03195307, -0.94218651, ...,  2.51608978,
         4.87819432,  4.18343684],
       [-1.20562657, -0.13354138, -0.94218651, ...,  0.19750623,
         1.93926564,  2.6404483 ]], shape=(800, 49))

In [63]:
pipe_lasso =  Pipeline([
    ('cat_encoder_', LeaveOneOutEncoder(cols=cat_cols)),
    ('poly_featurizer_', PolynomialFeatures(degree=4)),
    ('scaler_', StandardScaler()),
    ('selector_', SelectFromModel(LogisticRegression(penalty='l1', max_iter=1000, solver='liblinear'),
                                  threshold=1e-5)),
    ('model_', SVC(kernel='linear'))])

In [67]:
cv_res4 = cross_validate(pipe_lasso, X, y, cv=5, scoring='f1', return_train_score=True)
cv_res4

{'fit_time': array([0.06391382, 0.05412292, 0.0469439 , 0.04279613, 0.05199099]),
 'score_time': array([0.00377131, 0.00351119, 0.00287914, 0.00332093, 0.00260305]),
 'test_score': array([0.44444444, 0.88      , 0.68965517, 0.58333333, 0.68421053]),
 'train_score': array([0.92307692, 0.875     , 0.95238095, 0.97029703, 0.92929293])}

In [68]:
cv_res4['test_score'].mean()

np.float64(0.6563286953014721)

# Нелинейные классификаторы

- SVM с ядром
- Наивный байесовский классификатор
- Метод k ближайших соседей

In [92]:
pipe_lasso2 =  Pipeline([
    ('cat_encoder_', LeaveOneOutEncoder(cols=cat_cols)),
    ('poly_featurizer_', PolynomialFeatures(degree=4)),
    ('scaler_', StandardScaler()),
    ('selector_', SelectFromModel(LogisticRegression(penalty='l1', max_iter=1000, solver='liblinear'),
                                  threshold=1e-5)),
    ('model_', SVC(kernel='rbf', gamma=0.01))])

In [93]:
cv_res5 = cross_validate(pipe_lasso2, X, y, cv=5, scoring='f1', return_train_score=True)
cv_res5

{'fit_time': array([0.05931997, 0.05366015, 0.05068707, 0.04466891, 0.05061579]),
 'score_time': array([0.00372195, 0.0046649 , 0.00379586, 0.0036211 , 0.00335121]),
 'test_score': array([0.52631579, 0.83333333, 0.43478261, 0.33333333, 0.7027027 ]),
 'train_score': array([0.83870968, 0.71264368, 0.86021505, 0.87755102, 0.875     ])}

In [94]:
cv_res5['test_score'].mean()

np.float64(0.5660935535077412)

Обучите наивный байесовский классификатор и метод k ближайших соседей вместо SVM в пайплайне выше.

In [95]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB, GaussianNB

In [128]:
# your code here
pipe_knn =  Pipeline([
    ('cat_encoder_', LeaveOneOutEncoder(cols=cat_cols)),
    ('poly_featurizer_', PolynomialFeatures(degree=4)),
    ('scaler_', StandardScaler()),
    ('selector_', SelectFromModel(LogisticRegression(penalty='l1', max_iter=1000, solver='liblinear'),
                                  threshold=1e-5)),
    ('model_', KNeighborsClassifier(n_neighbors=5))])

In [129]:
pipe_knn

In [130]:
cv_res5 = cross_validate(pipe_knn, X, y, cv=5, scoring='f1', return_train_score=True)
cv_res5

{'fit_time': array([0.06420779, 0.05651116, 0.05158997, 0.04623485, 0.05223799]),
 'score_time': array([0.00606227, 0.00564504, 0.00591707, 0.00604987, 0.00579906]),
 'test_score': array([0.47058824, 0.72727273, 0.38095238, 0.33333333, 0.55172414]),
 'train_score': array([0.77083333, 0.77083333, 0.71111111, 0.78651685, 0.78787879])}

In [131]:
cv_res5['test_score'].mean() #50

np.float64(0.4927741629567187)

С помощью GridSearch подберите гиперпараметр KNN (число соседей) внутри пайплайна.

In [132]:
from sklearn.model_selection import GridSearchCV

In [133]:
pipe_knn =  Pipeline([
    ('cat_encoder_', LeaveOneOutEncoder(cols=cat_cols)),
    ('poly_featurizer_', PolynomialFeatures(degree=4)),
    ('scaler_', StandardScaler()),
    ('selector_', SelectFromModel(LogisticRegression(penalty='l1', max_iter=1000, solver='liblinear'),
                                  threshold=1e-5)),
    ('gridsearch_', GridSearchCV(estimator=KNeighborsClassifier(),
             param_grid={'n_neighbors' : range(1,51)}))])

In [134]:
cv_res5 = cross_validate(pipe_knn, X, y, cv=5, scoring='f1', return_train_score=True)
cv_res5

{'fit_time': array([0.47104597, 0.46585393, 0.47804999, 0.44499993, 0.47837281]),
 'score_time': array([0.00616598, 0.00524211, 0.00546622, 0.00627208, 0.00611615]),
 'test_score': array([0.26666667, 0.63157895, 0.3       , 0.33333333, 0.47619048]),
 'train_score': array([0.45070423, 0.41176471, 0.66666667, 0.69047619, 0.65      ])}

In [135]:
cv_res5['test_score'].mean()

np.float64(0.4015538847117794)

# Сохранение и загрузка модели

In [136]:
import pickle

Обучим лучшую модель на всех данных и сохраним её в файл.

In [142]:
pipe_knn.fit(X, y)
model = pipe_knn

In [143]:
filename = 'best_model.pickle'
pickle.dump(model, open(filename, 'wb'))

Загрузка модели из файла

In [144]:
loaded_model = pickle.load(open(filename, 'rb'))

In [145]:
loaded_model.predict(X)

array([0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

Второй вариант

In [146]:
import joblib

filename = 'best_model.joblib'
joblib.dump(model, filename)

loaded_model = joblib.load(filename)

In [147]:
loaded_model.predict(X)

array([0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [None]:
# Сохранить список библиотек с версиями
!pip freeze > requirements.txt

# Если Anaconda/Miniconda
# Создание виртуального окружения и установка зависимостей
# !conda create -n "main313" python=3.13
# !conda activate main313
# !pip install -r requirements.txt #установит все библиотеки из requirements.txt

Почитать подробнее про сохранение модели в файл и загрузку из файла тут: https://machinelearningmastery.com/save-load-machine-learning-models-python-scikit-learn/