In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split
import catboost
from catboost import EShapCalcType, EFeaturesSelectionAlgorithm

In [2]:
features = pd.read_csv('all_features.csv', index_col=0)
targets = pd.read_csv('za_klasifikaciju.csv', index_col=0)

targets = targets[['Valence', 'Arousal', 'Dominance', 'Liking']]
targets[targets < 4.5] = 0
targets[targets >= 4.5] = 1

In [7]:
features.isna().sum().sum()
features.fillna(0, inplace=True)

In [12]:
x_train, x_test, y_train, y_test = train_test_split(features, targets['Valence'], test_size=0.3)

In [13]:
len(features.columns)

1489

In [26]:
def find_n_best(features, c, n_features=50):
    x_train, x_test, y_train, y_test = train_test_split(features, targets[c], test_size=0.3)

    train_pool = catboost.Pool(data=x_train, label=y_train)
    test_pool = catboost.Pool(data=x_test, label=y_test)

    cat_params = {
        'loss_function':'Logloss',
        'eval_metric':'F1',
        'learning_rate':0.001,
        'depth':5,
        'subsample': 0.8
    }

    model = catboost.CatBoostClassifier(
        num_boost_round=5000,
        early_stopping_rounds=15
    )


    summary = model.select_features(
        train_pool,
        eval_set=test_pool,
        features_for_select='0-1488',
        num_features_to_select=50,
        steps=3,
        algorithm=EFeaturesSelectionAlgorithm.RecursiveByShapValues,
        shap_calc_type=EShapCalcType.Regular,
        train_final_model=True,
        logging_level='Silent',
        plot=False
    )
    return summary['selected_features_names']

In [19]:
summary.keys()

dict_keys(['selected_features', 'eliminated_features_names', 'eliminated_features', 'selected_features_names'])

In [31]:
cat_params = {
    'loss_function':'Logloss',
    'eval_metric':'F1',
    'learning_rate':0.001,
    'depth':5,
    'subsample': 0.8
}

for c in ['Valence', 'Arousal', 'Dominance', 'Liking']:

    feats_c = find_n_best(features, c, 200)
    cat_crossval = catboost.Pool(data=features[feats_c], label=targets[c])

    cat_cv = catboost.cv(pool=cat_crossval,
                        params=cat_params,
                        num_boost_round=5000,
                        nfold=10,
                        verbose_eval=0,
                        early_stopping_rounds=15
                        )
    ind_max = np.argmax(cat_cv['test-F1-mean'])
    print(f'{c}: test F1 mean = {cat_cv.loc[ind_max, "test-F1-mean"]}, std = {cat_cv.loc[ind_max, "test-F1-std"]}')

Stopped by overfitting detector  (15 iterations wait)
Valence: test F1 mean = 0.775064817587701, std = 0.002949963887987618
Stopped by overfitting detector  (15 iterations wait)
Arousal: test F1 mean = 0.779790896307484, std = 0.0015909335084335135
Stopped by overfitting detector  (15 iterations wait)
Dominance: test F1 mean = 0.8035778943302516, std = 0.0038008260569480953
Stopped by overfitting detector  (15 iterations wait)
Liking: test F1 mean = 0.8210360129714968, std = 0.001601198792484367


In [30]:
cat_params = {
    'loss_function':'Logloss',
    'eval_metric':'Accuracy',
    'learning_rate':0.001,
    'depth':5,
    'subsample': 0.8
}

for c in ['Valence', 'Arousal', 'Dominance', 'Liking']:

    feats_c = find_n_best(features, c, 200)
    cat_crossval = catboost.Pool(data=features[feats_c], label=targets[c])

    cat_cv = catboost.cv(pool=cat_crossval,
                        params=cat_params,
                        num_boost_round=5000,
                        nfold=10,
                        verbose_eval=0,
                        early_stopping_rounds=15
                        )
    ind_max = np.argmax(cat_cv['test-Accuracy-mean'])
    print(f'{c}: test Accuracy mean = {cat_cv.loc[ind_max, "test-Accuracy-mean"]}, std = {cat_cv.loc[ind_max, "test-Accuracy-std"]}')

Custom logger is already specified. Specify more than one logger at same time is not thread safe.

Stopped by overfitting detector  (15 iterations wait)
Valence: test Accuracy mean = 0.6328156473173412, std = 0.004557253919327737
Stopped by overfitting detector  (15 iterations wait)
Arousal: test Accuracy mean = 0.6421970850119025, std = 0.010403661531627728
Stopped by overfitting detector  (15 iterations wait)
Dominance: test Accuracy mean = 0.6718528257187326, std = 0.008005929215926811
Stopped by overfitting detector  (15 iterations wait)
Liking: test Accuracy mean = 0.6953125, std = 0.0
