In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import os

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import RobustScaler
from sklearn.feature_selection import SelectKBest, SelectPercentile
from sklearn.feature_selection import mutual_info_classif,chi2
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.metrics import roc_auc_score, mean_squared_error

from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import CondensedNearestNeighbour

import category_encoders as ce

plt.style.use('seaborn-colorblind')
%matplotlib inline

In [3]:
columns = [
    'age',
    'workclass',
    'fnlwgt',
    'education',
    'education-num',
    'marital-status',
    'occupation',
    'relationship',
    'race',
    'sex',
    'capital-gain',
    'capital-loss',
    'hours-per-week',
    'native-country',
    'label'
]


In [130]:
data = pd.read_csv('adult.data', names=columns,index_col=False)

In [5]:
data.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,label
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [25]:
data.isnull().sum()

age               2399
workclass         2399
fnlwgt            2399
education         2399
education-num     2399
marital-status    2399
occupation        2399
relationship      2399
race              2399
sex               2399
capital-gain      2399
capital-loss      2399
hours-per-week    2399
native-country    2399
label             2399
dtype: int64

Почистим пробелы

In [139]:
object_columns = data.dtypes[data.dtypes == 'object'].index.values

In [132]:
data[object_columns]=data[object_columns].apply(lambda s: s.str.strip())

Заменим ? на None

In [133]:
for column in object_columns:
    data[column][data[column] == '?'] = None
data.label = data.label.map({'<=50K': 0, '>50K':1})

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Проверим есть ли пропущенные заначения

In [41]:
def check_missing(data):    
    result = pd.concat([data.isnull().sum(),data.isnull().mean()],axis=1)
    result = result.rename(index=str,columns={0:'total missing',1:'proportion'})
    return result

In [42]:
check_missing(data=data)

Unnamed: 0,total missing,proportion
age,0,0.0
workclass,1836,0.056386
fnlwgt,0,0.0
education,0,0.0
education-num,0,0.0
marital-status,0,0.0
occupation,1843,0.056601
relationship,0,0.0
race,0,0.0
sex,0,0.0


Удаление пропущенных значений

In [43]:
def drop_missing(data,axis=0):
    data_copy = data.copy(deep=True)
    data_copy = data_copy.dropna(axis=axis,inplace=False)
    return data_copy

In [44]:
data2 = drop_missing(data=data)
print(f"Данных было:{data.shape}")
print(f"Данных стало:{data2.shape}")

Данных было:(32561, 15)
Данных стало:(30162, 15)


Добавление переменной оценки пропущенных значений

In [45]:
def add_var_denote_NA(data,NA_col=[]):
    data_copy = data.copy(deep=True)
    for i in NA_col:
        if data_copy[i].isnull().sum()>0:
            data_copy[i+'_is_NA'] = np.where(data_copy[i].isnull(),1,0)
        else:
            print(f"Нет пропущенных значений {i}")          
    return data_copy

In [63]:
data3 = add_var_denote_NA(data=data,NA_col=object_columns)

for col in object_columns:
    if hasattr(data3, f"{col}_is_NA"):
        print(f"Пропуски в {col}_is_NA")
        print(data3[f"{col}_is_NA"].value_counts())
data3.head(8)

Нет пропущенных значений education
Нет пропущенных значений marital-status
Нет пропущенных значений relationship
Нет пропущенных значений race
Нет пропущенных значений sex
Нет пропущенных значений label
Пропуски в workclass_is_NA
0    30725
1     1836
Name: workclass_is_NA, dtype: int64
Пропуски в occupation_is_NA
0    30718
1     1843
Name: occupation_is_NA, dtype: int64
Пропуски в native-country_is_NA
0    31978
1      583
Name: native-country_is_NA, dtype: int64


Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,label,workclass_is_NA,occupation_is_NA,native-country_is_NA
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K,0,0,0
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K,0,0,0
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K,0,0,0
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K,0,0,0
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K,0,0,0
5,37,Private,284582,Masters,14,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,40,United-States,<=50K,0,0,0
6,49,Private,160187,9th,5,Married-spouse-absent,Other-service,Not-in-family,Black,Female,0,0,16,Jamaica,<=50K,0,0,0
7,52,Self-emp-not-inc,209642,HS-grad,9,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,45,United-States,>50K,0,0,0


Заполнение пропусков выборочным значением

In [66]:
def impute_NA_with_arbitrary(data,impute_value,NA_col=[]):  
    data_copy = data.copy(deep=True)
    for i in NA_col:
        if data_copy[i].isnull().sum()>0:
            data_copy[i+'_'+str(impute_value)] = data_copy[i].fillna(impute_value)
        else:
            print(f"Нет пропущенных значений {i}")
    return data_copy

In [69]:
data4 = impute_NA_with_arbitrary(data=data,impute_value=-999,NA_col=object_columns)
data4.head(8)

Нет пропущенных значений education
Нет пропущенных значений marital-status
Нет пропущенных значений relationship
Нет пропущенных значений race
Нет пропущенных значений sex
Нет пропущенных значений label


Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,label,workclass_-999,occupation_-999,native-country_-999
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K,State-gov,Adm-clerical,United-States
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K,Self-emp-not-inc,Exec-managerial,United-States
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K,Private,Handlers-cleaners,United-States
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K,Private,Handlers-cleaners,United-States
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K,Private,Prof-specialty,Cuba
5,37,Private,284582,Masters,14,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,40,United-States,<=50K,Private,Exec-managerial,United-States
6,49,Private,160187,9th,5,Married-spouse-absent,Other-service,Not-in-family,Black,Female,0,0,16,Jamaica,<=50K,Private,Other-service,Jamaica
7,52,Self-emp-not-inc,209642,HS-grad,9,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,45,United-States,>50K,Self-emp-not-inc,Exec-managerial,United-States


Заполнение пропусков случайными значениями

In [70]:
def impute_NA_with_random(data,NA_col=[],random_state=0):
    data_copy = data.copy(deep=True)
    for i in NA_col:
        if data_copy[i].isnull().sum()>0:
            data_copy[i+'_random'] = data_copy[i]
            random_sample = data_copy[i].dropna().sample(data_copy[i].isnull().sum(), random_state=random_state)
            random_sample.index = data_copy[data_copy[i].isnull()].index
            data_copy.loc[data_copy[i].isnull(), str(i)+'_random'] = random_sample
        else:
            print(f"Нет пропущенных значений {i}")
    return data_copy 

In [71]:
data7 = impute_NA_with_random(data=data,NA_col=object_columns)
data7.head(8)

Нет пропущенных значений education
Нет пропущенных значений marital-status
Нет пропущенных значений relationship
Нет пропущенных значений race
Нет пропущенных значений sex
Нет пропущенных значений label


Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,label,workclass_random,occupation_random,native-country_random
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K,State-gov,Adm-clerical,United-States
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K,Self-emp-not-inc,Exec-managerial,United-States
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K,Private,Handlers-cleaners,United-States
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K,Private,Handlers-cleaners,United-States
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K,Private,Prof-specialty,Cuba
5,37,Private,284582,Masters,14,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,40,United-States,<=50K,Private,Exec-managerial,United-States
6,49,Private,160187,9th,5,Married-spouse-absent,Other-service,Not-in-family,Black,Female,0,0,16,Jamaica,<=50K,Private,Other-service,Jamaica
7,52,Self-emp-not-inc,209642,HS-grad,9,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,45,United-States,>50K,Self-emp-not-inc,Exec-managerial,United-States


# Выбросы

Детекция с помощью выборочных значений

In [72]:
def outlier_detect_arbitrary(data,col,upper_fence,lower_fence):
    para = (upper_fence, lower_fence)
    tmp = pd.concat([data[col]>upper_fence,data[col]<lower_fence],axis=1)
    outlier_index = tmp.any(axis=1)
    print('Количество выбросов в данных:',outlier_index.value_counts()[1])
    print('Доля выбросов:',outlier_index.value_counts()[1]/len(outlier_index))    
    return outlier_index, para

In [142]:
continuous_columns = data.dtypes[data.dtypes == 'int64'].index.values

In [76]:
for col in continuous_columns:
    print(f"Признак: {col}")
    index,para = outlier_detect_arbitrary(data=data,col=col,upper_fence=80,lower_fence=14)
    print('Верхняя граница:',para[0],'\nНижняя граница:',para[1],'\n')

Признак: age
Количество выбросов в данных: 99
Доля выбросов: 0.0030404471607137374
Верхняя граница: 80 
Нижняя граница: 14 

Признак: fnlwgt
Количество выбросов в данных: 32561
Доля выбросов: 1.0
Верхняя граница: 80 
Нижняя граница: 14 

Признак: education-num
Количество выбросов в данных: 29849
Доля выбросов: 0.9167101747489328
Верхняя граница: 80 
Нижняя граница: 14 

Признак: capital-gain
Количество выбросов в данных: 32561
Доля выбросов: 1.0
Верхняя граница: 80 
Нижняя граница: 14 

Признак: capital-loss
Количество выбросов в данных: 32561
Доля выбросов: 1.0
Верхняя граница: 80 
Нижняя граница: 14 

Признак: hours-per-week
Количество выбросов в данных: 1151
Доля выбросов: 0.03534903719173244
Верхняя граница: 80 
Нижняя граница: 14 



Интерквартильное расстояние

In [77]:
def outlier_detect_IQR(data,col,threshold=3):    
    IQR = data[col].quantile(0.75) - data[col].quantile(0.25)
    Lower_fence = data[col].quantile(0.25) - (IQR * threshold)
    Upper_fence = data[col].quantile(0.75) + (IQR * threshold)
    para = (Upper_fence, Lower_fence)
    tmp = pd.concat([data[col]>Upper_fence,data[col]<Lower_fence],axis=1)
    outlier_index = tmp.any(axis=1)
    print('Количество выбросов в данных:',outlier_index.value_counts()[1])
    print('Доля выбросов:',outlier_index.value_counts()[1]/len(outlier_index))
    return outlier_index, para

In [79]:
for col in continuous_columns:
    print(f"Признак: {col}")
    index,para = outlier_detect_IQR(data=data,col=col,threshold=2)
    print('Верхняя граница:',para[0],'\nНижняя граница:',para[1], '\n')

Признак: age
Количество выбросов в данных: 43
Доля выбросов: 0.0013205982617241485
Верхняя граница: 88.0 
Нижняя граница: -12.0 

Признак: fnlwgt
Количество выбросов в данных: 480
Доля выбросов: 0.014741561991339332
Верхняя граница: 475499.0 
Нижняя граница: -120621.0 

Признак: education-num
Количество выбросов в данных: 219
Доля выбросов: 0.00672583765854857
Верхняя граница: 18.0 
Нижняя граница: 3.0 

Признак: capital-gain
Количество выбросов в данных: 2712
Доля выбросов: 0.08328982525106723
Верхняя граница: 0.0 
Нижняя граница: 0.0 

Признак: capital-loss
Количество выбросов в данных: 1519
Доля выбросов: 0.0466509013850926
Верхняя граница: 0.0 
Нижняя граница: 0.0 

Признак: hours-per-week
Количество выбросов в данных: 6828
Доля выбросов: 0.209698719326802
Верхняя граница: 55.0 
Нижняя граница: 30.0 



Среднее-среднеквадратичное отклонение

In [80]:
def outlier_detect_mean_std(data,col,threshold=3):
    Upper_fence = data[col].mean() + threshold * data[col].std()
    Lower_fence = data[col].mean() - threshold * data[col].std()   
    para = (Upper_fence, Lower_fence)   
    tmp = pd.concat([data[col]>Upper_fence,data[col]<Lower_fence],axis=1)
    outlier_index = tmp.any(axis=1)
    print('Количество выбросов в данных:',outlier_index.value_counts()[1])
    print('Доля выбросов:',outlier_index.value_counts()[1]/len(outlier_index))
    return outlier_index, para

In [82]:
for col in continuous_columns:
    print(f"Признак: {col}")
    index,para = outlier_detect_mean_std(data=data,col=col,threshold=2)
    print('Верхняя граница:',para[0],'\nНижняя граница:',para[1], '\n')

Признак: age
Количество выбросов в данных: 1158
Доля выбросов: 0.03556401830410614
Верхняя граница: 65.86251186248347 
Нижняя граница: 11.3007816481581 

Признак: fnlwgt
Количество выбросов в данных: 1249
Доля выбросов: 0.03835877276496422
Верхняя граница: 400878.3219061295 
Нижняя граница: -21321.58888195947 

Признак: education-num
Количество выбросов в данных: 1611
Доля выбросов: 0.04947636743343264
Верхняя граница: 15.226120004449875 
Нижняя граница: 4.935238676180325 

Признак: capital-gain
Количество выбросов в данных: 255
Доля выбросов: 0.00783145480789902
Верхняя граница: 15848.233013389408 
Нижняя граница: -13692.935325971945 

Признак: capital-loss
Количество выбросов в данных: 1485
Доля выбросов: 0.04560670741070606
Верхняя граница: 893.2242670329586 
Нижняя граница: -718.6166075630406 

Признак: hours-per-week
Количество выбросов в данных: 2203
Доля выбросов: 0.06765762722275115
Верхняя граница: 65.13231321555668 
Нижняя граница: 15.742598488629309 



Медианы абсолютного отклонения (MAD)

In [83]:
def outlier_detect_MAD(data,col,threshold=3.5):
    median = data[col].median()
    median_absolute_deviation = np.median([np.abs(y - median) for y in data[col]])
    modified_z_scores = pd.Series([0.6745 * (y - median) / median_absolute_deviation for y in data[col]])
    outlier_index = np.abs(modified_z_scores) > threshold
    print('Количество выбросов в данных:',outlier_index.value_counts()[1])
    print('Доля выбросов:',outlier_index.value_counts()[1]/len(outlier_index))
    return outlier_index

In [85]:
for col in continuous_columns:
    print(f"Признак: {col}")
    index = outlier_detect_MAD(data=data,col=col,threshold=3.5)

Признак: age
Количество выбросов в данных: 43
Доля выбросов: 0.0013205982617241485
Признак: fnlwgt
Количество выбросов в данных: 414
Доля выбросов: 0.012714597217530174
Признак: education-num
Количество выбросов в данных: 1611
Доля выбросов: 0.04947636743343264
Признак: capital-gain


  after removing the cwd from sys.path.
  after removing the cwd from sys.path.


Количество выбросов в данных: 2712
Доля выбросов: 0.08328982525106723
Признак: capital-loss
Количество выбросов в данных: 1519
Доля выбросов: 0.0466509013850926
Признак: hours-per-week
Количество выбросов в данных: 6001
Доля выбросов: 0.1843002364792236


Замена выброса выборочным значением

In [86]:
def impute_outlier_with_arbitrary(data,outlier_index,value,col=[]):
    data_copy = data.copy(deep=True)
    for i in col:
        data_copy.loc[outlier_index,i] = value
    return data_copy

In [87]:
data2 = impute_outlier_with_arbitrary(data=data,outlier_index=index,
                                         value=-999,col=continuous_columns)

In [91]:
data2[data2['education-num'] == -999].head(8)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,label
1,-999,Self-emp-not-inc,-999,Bachelors,-999,Married-civ-spouse,Exec-managerial,Husband,White,Male,-999,-999,-999,United-States,<=50K
6,-999,Private,-999,9th,-999,Married-spouse-absent,Other-service,Not-in-family,Black,Female,-999,-999,-999,Jamaica,<=50K
10,-999,Private,-999,Some-college,-999,Married-civ-spouse,Exec-managerial,Husband,Black,Male,-999,-999,-999,United-States,>50K
20,-999,Private,-999,Doctorate,-999,Married-civ-spouse,Prof-specialty,Husband,White,Male,-999,-999,-999,United-States,>50K
21,-999,Private,-999,HS-grad,-999,Separated,Other-service,Unmarried,Black,Female,-999,-999,-999,United-States,<=50K
27,-999,,-999,Some-college,-999,Married-civ-spouse,,Husband,Asian-Pac-Islander,Male,-999,-999,-999,South,>50K
28,-999,Private,-999,HS-grad,-999,Divorced,Exec-managerial,Not-in-family,White,Male,-999,-999,-999,United-States,<=50K
34,-999,State-gov,-999,Some-college,-999,Married-civ-spouse,Other-service,Husband,Black,Male,-999,-999,-999,United-States,<=50K


# Шкалирование данных

In [138]:
X_train, X_test, y_train, y_test = train_test_split(data.drop(labels=['label'], axis=1), data.label, test_size=0.2,
                                                    random_state=0)
X_train.shape, X_test.shape

((26048, 14), (6513, 14))

In [116]:
X_train[continuous_columns].head()

Unnamed: 0,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week
15282,36,174308,7,0,0,40
24870,35,198202,9,0,0,54
18822,38,52963,13,0,0,50
26404,50,138270,9,0,0,40
7842,68,116903,11,0,2149,40


Нормализация данных

In [102]:
X_train_copy = X_train.copy(deep=True)
for col in continuous_columns:
    ss = StandardScaler().fit(X_train[[col]])
    X_train_copy[f"{col}_zscore"] = ss.transform(X_train_copy[[col]])
X_train_copy.head(6)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,...,capital-loss,hours-per-week,native-country,label,age_zscore,fnlwgt_zscore,education-num_zscore,capital-gain_zscore,capital-loss_zscore,hours-per-week_zscore
15282,36,Private,174308,11th,7,Divorced,Transport-moving,Not-in-family,White,Male,...,0,40,United-States,0,-0.189281,-0.143997,-1.192884,-0.145891,-0.216579,-0.034732
24870,35,Private,198202,HS-grad,9,Never-married,Exec-managerial,Not-in-family,White,Female,...,0,54,United-States,0,-0.26266,0.082715,-0.415106,-0.145891,-0.216579,1.10745
18822,38,Private,52963,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Female,...,0,50,United-States,0,-0.042523,-1.29535,1.140451,-0.145891,-0.216579,0.781113
26404,50,Private,138270,HS-grad,9,Married-civ-spouse,Sales,Wife,Black,Female,...,0,40,United-States,0,0.838021,-0.485935,-0.415106,-0.145891,-0.216579,-0.034732
7842,68,Self-emp-not-inc,116903,Assoc-voc,11,Married-civ-spouse,Prof-specialty,Husband,White,Male,...,2149,40,United-States,0,2.158837,-0.688671,0.362672,-0.145891,5.092257,-0.034732
4890,51,Self-emp-not-inc,149220,Some-college,10,Married-civ-spouse,Farming-fishing,Husband,White,Male,...,0,75,United-States,0,0.9114,-0.382039,-0.026217,-0.145891,-0.216579,2.820723


Мин-Макс шкалирование

In [103]:
X_train_copy = X_train.copy(deep=True)
for col in continuous_columns:
    mms = MinMaxScaler().fit(X_train[[col]])
    X_train_copy[f"{col}_minmax"] = mms.transform(X_train_copy[[col]])
X_train_copy.head(6)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,...,capital-loss,hours-per-week,native-country,label,age_minmax,fnlwgt_minmax,education-num_minmax,capital-gain_minmax,capital-loss_minmax,hours-per-week_minmax
15282,36,Private,174308,11th,7,Divorced,Transport-moving,Not-in-family,White,Male,...,0,40,United-States,0,0.260274,0.110039,0.4,0.0,0.0,0.397959
24870,35,Private,198202,HS-grad,9,Never-married,Exec-managerial,Not-in-family,White,Female,...,0,54,United-States,0,0.246575,0.126266,0.533333,0.0,0.0,0.540816
18822,38,Private,52963,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Female,...,0,50,United-States,0,0.287671,0.027627,0.8,0.0,0.0,0.5
26404,50,Private,138270,HS-grad,9,Married-civ-spouse,Sales,Wife,Black,Female,...,0,40,United-States,0,0.452055,0.085563,0.533333,0.0,0.0,0.397959
7842,68,Self-emp-not-inc,116903,Assoc-voc,11,Married-civ-spouse,Prof-specialty,Husband,White,Male,...,2149,40,United-States,0,0.69863,0.071052,0.666667,0.0,0.493343,0.397959
4890,51,Self-emp-not-inc,149220,Some-college,10,Married-civ-spouse,Farming-fishing,Husband,White,Male,...,0,75,United-States,0,0.465753,0.093,0.6,0.0,0.0,0.755102


Робустное шкалирование

In [104]:
X_train_copy = X_train.copy(deep=True)

for col in continuous_columns:
    rs = RobustScaler().fit(X_train[[col]])
    X_train_copy[f'{col}_robust'] = rs.transform(X_train_copy[[col]])
X_train_copy.head(6)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,...,capital-loss,hours-per-week,native-country,label,age_robust,fnlwgt_robust,education-num_robust,capital-gain_robust,capital-loss_robust,hours-per-week_robust
15282,36,Private,174308,11th,7,Divorced,Transport-moving,Not-in-family,White,Male,...,0,40,United-States,0,-0.052632,-0.030971,-1.0,0.0,0.0,0.0
24870,35,Private,198202,HS-grad,9,Never-married,Exec-managerial,Not-in-family,White,Female,...,0,54,United-States,0,-0.105263,0.16955,-0.333333,0.0,0.0,2.8
18822,38,Private,52963,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Female,...,0,50,United-States,0,0.052632,-1.049314,1.0,0.0,0.0,2.0
26404,50,Private,138270,HS-grad,9,Married-civ-spouse,Sales,Wife,Black,Female,...,0,40,United-States,0,0.684211,-0.333407,-0.333333,0.0,0.0,0.0
7842,68,Self-emp-not-inc,116903,Assoc-voc,11,Married-civ-spouse,Prof-specialty,Husband,White,Male,...,2149,40,United-States,0,1.631579,-0.512721,0.333333,0.0,2149.0,0.0
4890,51,Self-emp-not-inc,149220,Some-college,10,Married-civ-spouse,Farming-fishing,Husband,White,Male,...,0,75,United-States,0,0.736842,-0.241513,0.0,0.0,0.0,7.0


# Энкодинг переменных

One-Hot encoding

Энкодинг значением

In [143]:
ord_enc = ce.OrdinalEncoder(cols=object_columns).fit(X_train,y_train)

In [145]:
data4 = ord_enc.transform(data.drop(labels=['label'], axis=1))
data4.head(5)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country
0,39,5.0,77516,3,13,2,3.0,1,1,1,2174,0,40,1.0
1,50,2.0,83311,3,13,3,2.0,3,1,1,0,0,13,1.0
2,38,1.0,215646,2,9,1,13.0,1,1,1,0,0,40,1.0
3,53,1.0,234721,1,7,3,13.0,3,2,1,0,0,40,1.0
4,28,1.0,338409,3,13,3,5.0,2,2,2,0,0,40,25.0


Таргет энкодин

In [146]:
target_enc = ce.TargetEncoder(cols=object_columns).fit(X_train,y_train)

In [148]:
data2 = target_enc.transform(data.drop(labels=['label'], axis=1))
data2.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country
0,39,0.258687,77516,0.418435,13,0.044455,0.132589,0.100709,0.254738,0.305831,2174,0,40,0.24511
1,50,0.28628,83311,0.418435,13,0.447245,0.484047,0.449796,0.254738,0.305831,0,0,13,0.24511
2,38,0.216999,215646,0.15929,9,0.102637,0.062387,0.100709,0.254738,0.305831,0,0,40,0.24511
3,53,0.216999,234721,0.046739,7,0.447245,0.062387,0.449796,0.123618,0.305831,0,0,40,0.24511
4,28,0.216999,338409,0.418435,13,0.447245,0.447449,0.470973,0.123618,0.10596,0,0,40,0.253333


WOE энкодинг

In [149]:
woe_enc = ce.WOEEncoder(cols=object_columns).fit(X_train,y_train)

In [155]:
data3 = woe_enc.transform(data.drop(labels=['label'], axis=1))
data3.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country
0,39,0.103243,77516,0.824578,13,-1.911694,-0.722539,-1.034425,0.080237,0.334045,2174,0,40,0.028877
1,50,0.241139,83311,0.824578,13,0.94185,1.089829,0.952167,0.080237,0.334045,0,0,13,0.028877
2,38,-0.129434,215646,-0.509296,9,-1.012199,-1.542933,-1.034425,0.080237,0.334045,0,0,40,0.028877
3,53,-0.129434,234721,-1.839834,7,0.94185,-1.542933,0.952167,-0.802247,0.334045,0,0,40,0.028877
4,28,-0.129434,338409,0.824578,13,0.94185,0.942771,1.037574,-0.802247,-0.978098,0,0,40,0.106304


# Feature Selection

In [157]:
X_train, X_test, y_train, y_test = train_test_split(data3, 
                                                    data.label, test_size=0.2,
                                                    random_state=0)
X_train.shape, X_test.shape

((26048, 14), (6513, 14))

Константные значения

In [158]:
def constant_feature_detect(data,threshold=0.98):    
    data_copy = data.copy(deep=True)
    quasi_constant_feature = []
    for feature in data_copy.columns:
        predominant = (data_copy[feature].value_counts() / np.float(
                      len(data_copy))).sort_values(ascending=False).values[0]
        if predominant >= threshold:
            quasi_constant_feature.append(feature)
    print(len(quasi_constant_feature),'константные переменные')    
    return quasi_constant_feature

In [159]:
quasi_constant_feature = constant_feature_detect(data=X_train,threshold=0.9)

2 константные переменные


In [160]:
print(quasi_constant_feature)

['capital-gain', 'capital-loss']


Корреляционная фильтрация

In [161]:
def corr_feature_detect(data,threshold=0.8):
    
    corrmat = data.corr()
    corrmat = corrmat.abs().unstack() 
    corrmat = corrmat.sort_values(ascending=False)
    corrmat = corrmat[corrmat >= threshold]
    corrmat = corrmat[corrmat < 1] 
    corrmat = pd.DataFrame(corrmat).reset_index()
    corrmat.columns = ['feature1', 'feature2', 'corr']
   
    grouped_feature_ls = []
    correlated_groups = []
    
    for feature in corrmat.feature1.unique():
        if feature not in grouped_feature_ls:
    
            correlated_block = corrmat[corrmat.feature1 == feature]
            grouped_feature_ls = grouped_feature_ls + list(
                correlated_block.feature2.unique()) + [feature]
    
            correlated_groups.append(correlated_block)
    return correlated_groups

In [162]:
corr = corr_feature_detect(data=X_train,threshold=0.9)
for i in corr:
    print(i,'\n')

    feature1       feature2      corr
0  education  education-num  0.961104 



Взаимная информация

In [163]:
def mutual_info(X,y,select_k=10):

    if select_k >= 1:
        sel_ = SelectKBest(mutual_info_classif, k=select_k).fit(X,y)
        col = X.columns[sel_.get_support()]
        
    elif 0 < select_k < 1:
        sel_ = SelectPercentile(mutual_info_classif, percentile=select_k*100).fit(X,y)
        col = X.columns[sel_.get_support()]   
        
    else:
        raise ValueError("select_k должно быть положительным значением")
    
    return col

In [164]:
mi = mutual_info(X=X_train,y=y_train,select_k=3)
print(mi)

Index(['marital-status', 'relationship', 'capital-gain'], dtype='object')


Одномерный ROC-AUC или MSE анализ

In [167]:
def univariate_roc_auc(X_train,y_train,X_test,y_test,threshold):

    roc_values = []
    for feature in X_train.columns:
        clf = DecisionTreeClassifier()
        clf.fit(X_train[feature].to_frame(), y_train)
        y_scored = clf.predict_proba(X_test[feature].to_frame())
        roc_values.append(roc_auc_score(y_test, y_scored[:, 1]))
    roc_values = pd.Series(roc_values)
    roc_values.index = X_train.columns
    print(roc_values.sort_values(ascending=False))
    print(len(roc_values[roc_values > threshold]), len(X_train.columns))
    keep_col = roc_values[roc_values > threshold]
    return keep_col

In [174]:
uni_roc_auc = univariate_roc_auc(X_train=X_train,y_train=y_train,
                                   X_test=X_test,y_test=y_test,threshold=0.7)
print(uni_roc_auc)

relationship      0.774262
marital-status    0.762386
occupation        0.727582
education-num     0.715439
education         0.715439
age               0.715133
hours-per-week    0.666561
capital-gain      0.617201
sex               0.609164
fnlwgt            0.579508
workclass         0.577530
capital-loss      0.560963
native-country    0.531911
race              0.530812
dtype: float64
6 14
age               0.715133
education         0.715439
education-num     0.715439
marital-status    0.762386
occupation        0.727582
relationship      0.774262
dtype: float64


In [169]:
def univariate_mse(X_train,y_train,X_test,y_test,threshold):

    mse_values = []
    for feature in X_train.columns:
        clf = DecisionTreeRegressor()
        clf.fit(X_train[feature].to_frame(), y_train)
        y_scored = clf.predict(X_test[feature].to_frame())
        mse_values.append(mean_squared_error(y_test, y_scored))
    mse_values = pd.Series(mse_values)
    mse_values.index = X_train.columns
    print(mse_values.sort_values(ascending=False))
    print(len(mse_values[mse_values > threshold]), len(X_train.columns))
    keep_col = mse_values[mse_values > threshold]
    return keep_col

In [175]:
uni_mse = univariate_mse(X_train=X_train,y_train=y_train,
                            X_test=X_test,y_test=y_test,threshold=0.2)
print(uni_mse)

fnlwgt            0.293782
native-country    0.183419
race              0.183388
workclass         0.179303
sex               0.177675
hours-per-week    0.172259
capital-loss      0.170466
age               0.164539
occupation        0.162512
education-num     0.159530
education         0.159530
capital-gain      0.154159
marital-status    0.149383
relationship      0.148520
dtype: float64
1 14
fnlwgt    0.293782
dtype: float64


# Oversampling/undersampling

In [176]:
len(y_train)

26048

In [177]:
sm = SMOTE(random_state=42)

In [178]:
X_res, y_res = sm.fit_resample(X_train, y_train)

In [179]:
len(y_res)

39604

In [180]:
cn = CondensedNearestNeighbour(random_state=42)

In [181]:
X_res, y_res = cn.fit_resample(X_train, y_train)

In [182]:
len(y_res)

12888