In [1]:
from utils import *

# Feature selection

In [2]:
data = pd.read_csv(os.path.join('..', 'data', 'processed', 'points_aggregated_data.csv'), delimiter=';')
columns = list(filter(lambda x: x not in (useless_columns + filter_columns + ['dpi', 'sens']), list(data.columns)))
data = data[columns + ['sex', 'respondent_id']]

In [3]:
# pick one random train test configuration
train_ids = [31,  3, 13, 28, 24, 30, 29, 32, 12, 10,  7,  5, 20, 16, 17, 15, 11, 25, 26,  4,  8, 22, 18, 27]
test_ids = [6, 14, 33, 23, 19,  2, 21,  9]

train = data[data['respondent_id'].isin(train_ids)]
test = data[data['respondent_id'].isin(test_ids)]

train = train.iloc[:,:109]
test = test.iloc[:,:109]

In [4]:
#prepare pipelines for ordinal and continous data
ordinal_pipe = Pipeline(steps=[
    ('encoder', OrdinalEncoder())
])

continous_pipe = Pipeline(steps=[
    ('transformer', PowerTransformer(method='yeo-johnson', standardize=True))
])

preprocessor = ColumnTransformer([
    ('continous', continous_pipe, [i for i in range(108)]),
    ('ordinal', ordinal_pipe, [108]),
])

processed_train = pd.DataFrame(preprocessor.fit_transform(train), columns=columns + ['sex'])
processed_test = pd.DataFrame(preprocessor.fit_transform(test), columns=columns + ['sex'])

## CatBoost Selection

In [5]:
model = CatBoostClassifier(logging_level='Silent', iterations=400, depth=4, learning_rate=0.05)

predictors_train = processed_train.iloc[:, [i for i in range(108)]]
target_train = processed_train.iloc[:, 108]

predictors_valid = processed_test.iloc[:, [i for i in range(108)]]
target_valid = processed_test.iloc[:, 108]

features = model.select_features(
    Pool(predictors_train, label=target_train),
    features_for_select='0-107', 
    num_features_to_select=15, 
    train_final_model=False, 
    plot=True,
    eval_set=Pool(predictors_valid, label=target_valid)
)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

In [6]:
predictors_train.columns[features['selected_features']].values

array(['velocity_q95', 'velocity_x_median', 'velocity_y_mean_weighted',
       'acceleration_q5', 'acceleration_x_min', 'acceleration_x_q5',
       'acceleration_y_q5', 'acceleration_negative_mean',
       'velocity_angular_mean_weighted', 'velocity_angular_mean',
       'velocity_angular_std', 'curvature_min', 'curvature_q5',
       'curvature_mean', 'velocity_smooth_mean_weighted'], dtype=object)