In [1]:
import os
import numpy as np
import pickle
from catboost import CatBoostClassifier, cv, Pool
from sklearn.preprocessing import MinMaxScaler
import pandas
import catboost as cb
from sklearn.model_selection import KFold
from paramsearch import paramsearch
from itertools import product,chain
from tqdm import tqdm_notebook as tqdm



MSLR_PATH = '../../../MSLR-WEB10K'
MSLR_DUMPS_PATH = '../../../mslr_dumps'

params = {'depth':[3,1,2,6,4,5,7,8,9,10],
          'iterations': [10],
          'learning_rate':[0.03,0.001,0.01,0.1,0.2,0.3], 
          'l2_leaf_reg':[3,1,5,10,100],
          'border_count':[32,5,10,20,50,100,200],
          'ctr_border_count':[50,5,10,20,100,200],
          'thread_count':4}


# this function does 3-fold crossvalidation with catboostclassifier          
def crossvaltest(params,train_set,train_label,cat_dims,n_splits=3):
    kf = KFold(n_splits=n_splits,shuffle=True) 
    res = []
    for train_index, test_index in tqdm(kf.split(train_set)):
        train = train_set[train_index,:]
        test = train_set[test_index,:]
        
        labels = train_label[train_index]
        test_labels = train_label[test_index]
        
        clf = cb.CatBoostClassifier(**params)
        clf.fit(train, np.ravel(labels), cat_features=cat_dims)
        
        res.append(np.mean(clf.predict(test)==np.ravel(test_labels)))
    return np.mean(res)
  
# this function runs grid search on several parameters
def catboost_param_tune(params,train_set,train_label,cat_dims=None,n_splits=3):
    ps = paramsearch(params)
    # search 'border_count', 'l2_leaf_reg' etc. individually 
    #   but 'iterations','learning_rate' together
    for prms in tqdm(chain(ps.grid_search(['border_count']),
                      ps.grid_search(['ctr_border_count']),
                      ps.grid_search(['l2_leaf_reg']),
                      ps.grid_search(['iterations','learning_rate']),
                      ps.grid_search(['depth']))):
        print(prms)
        res = crossvaltest(prms,train_set,train_label,cat_dims,n_splits)
        # save the crossvalidation result so that future iterations can reuse the best parameters
        ps.register_result(res,prms)
        print(res,prms, 'best:',ps.bestscore(),ps.bestparam())
    return ps.bestparam()

def parse_file(g):
    features = []
    answer = []

    for line in g:
        line_splitted = line.strip().split(' ')
        answer.append(int(line_splitted[0]))

        features_vector = []
        for element in line_splitted[1:]:
            element_splitted = element.split(':')
            features_vector.append(float(element_splitted[1]))
        features.append(features_vector)
    return {'X': np.array(features, dtype=np.float32), 'y': np.asarray(answer, dtype=np.int8)}


def get_mslr_data():
    for fold in os.listdir(MSLR_PATH):
        print(fold)
        cur_fold = {}

        with open(os.path.join(MSLR_PATH, fold, 'train.txt')) as f:
            cur_fold['train'] = parse_file(f)
        print('train')
        with open(os.path.join(MSLR_PATH, fold, 'test.txt')) as f:
            cur_fold['test'] = parse_file(f)
        print('test')
        with open(os.path.join(MSLR_PATH, fold, 'vali.txt')) as f:
            cur_fold['vali'] = parse_file(f)
        print('vali')
        yield fold, cur_fold


def init_folds():
    for fold_name, data in get_mslr_data():
        with open(os.path.join(MSLR_DUMPS_PATH, fold_name), 'wb') as f:
            pickle.dump(data, f)
            

def get_features_mapping(amount):
    output = []
    i = 1
    while len(output) < amount:
        if i % 5 != 0 and i % 5 != 4:
            output.append(i)
        i += 1
    return output

def load_folds():
    for fold_name in os.listdir(MSLR_DUMPS_PATH):
        with open(os.path.join(MSLR_DUMPS_PATH, fold_name), 'rb') as f:
            yield fold_name, pickle.load(f)


def filter_features(np_array):
#     exclude = [0] + list(range(101, 126)) + [128] + list(range(131, 137))
#     return np.delete(np_array, exclude, axis=1)
    include = get_features_mapping(19 * 3)
    include.remove(16)
    include.remove(17)
    include.remove(18)
    return MinMaxScaler().fit_transform(np_array[:, include])

def modify_answers(np_array):
    np_array[np_array == 1] = 0
    np_array[np_array == 2] = 1

    np_array[np_array == 3] = 1
    np_array[np_array == 4] = 1

    return np_array

def oversample(X, y):
    repeat_factor = int(len(y[y==0]) / len(y[y==1]))
    new_X = np.concatenate((X[y==0], X[y==1].repeat(repeat_factor, axis=0)), axis=0)
    new_y = np.concatenate((y[y==0], y[y==1].repeat(repeat_factor, axis=0)), axis=0)
    return new_X, new_y



In [7]:
def choose_best_model():
    fold_name, fold = next(load_folds())
    train = fold['train']
    eval_set = fold['vali']
    train['X'] = filter_features(train['X'])
    eval_set['X'] = filter_features(eval_set['X'])
    train['y'] = modify_answers(train['y'])
    eval_set['y'] = modify_answers(eval_set['y'])
    
    X, y = train['X'], train['y']
    bestparams = catboost_param_tune(params, X, y, [])
    return bestparams
    

def train_model():
    model = cb.CatBoostClassifier(depth=10, iterations=5000, learning_rate=0.005, thread_count=4,)
                                  #save_snapshot=True)

    baseline = None
    for fold_name, fold in load_folds():
        print(fold_name)
        train = fold['train']
        eval_set = fold['vali']
        train['X'] = filter_features(train['X'])
        eval_set['X'] = filter_features(eval_set['X'])
        train['y'] = modify_answers(train['y'])
        eval_set['y'] = modify_answers(eval_set['y'])
        
        X, y = oversample(train['X'], train['y'])
        pool = Pool(X, y)
        eval_pool = Pool(eval_set['X'], eval_set['y'])
#         if baseline is not None:
#             pool.set_baseline(baseline)
#             eval_pool.set_baseline(eval_baseline)
        print('Started training')

        model.fit(
            X, y,
            eval_set=(eval_set['X'], eval_set['y']),
            verbose=True,
            plot=True,
        )
#         baseline = model.predict(pool, prediction_type='RawFormulaVal')
#         eval_baseline = model.predict(eval_pool, prediction_type='RawFormulaVal')
        break
        
    return model

# bestparams = choose_best_model()
# trained_model = train_model()
trained_model = cb.CatBoostClassifier().load_model('catboost_model')

In [8]:
fold_name, fold = next(load_folds())
test = fold['test']
test['X'] = filter_features(test['X'])
test['y'] = modify_answers(test['y'])

In [9]:
print(trained_model.score(test['X'], test['y']))


0.828651720168


In [None]:
from sklearn.model_selection import cross_val_score
print(cross_val_score(trained_model, test['X'], test['y'], scoring='accuracy'))

In [10]:
from sklearn.metrics import confusion_matrix
true_values = test['y']
predicted_values = trained_model.predict(test['X'])
print(confusion_matrix(true_values, predicted_values))

[[199042   4968]
 [ 36555   1766]]


In [14]:
trained_model.save_model()
import pickle
with open('catboost_model.pkl', 'wb') as f:
    pickle.dump(trained_model, f)

In [7]:
trained_model = CatBoostClassifier(iterations=50, loss_function='Logloss', depth=10,
                               learning_rate=0.1, class_weights=[1, 4], border=0.7)
result = trained_model.get_feature_importance(X=test['X'], y=test['y'])

CatboostError: catboost/libs/algo/calc_fstr.cpp:197: train and test datasets should have the same feature count

In [9]:
X = test['X']

In [21]:
y = test['y']
X[y==0].shape)
X[y==1].repeat(5, axis=0).shape

(204010, 137)


(191605, 137)

In [22]:
np.concatenate((X[y==0], X[y==1].repeat(5, axis=0)), axis=0).shape


(395615, 137)

In [23]:
np.concatenate((y[y==0], y[y==1].repeat(5, axis=0)), axis=0).shape


(395615,)

In [25]:
oversample(X, y)

5


(array([[  7.00000000e+00,   2.00000000e+00,   0.00000000e+00, ...,
           0.00000000e+00,   0.00000000e+00,   0.00000000e+00],
        [  7.00000000e+00,   3.00000000e+00,   0.00000000e+00, ...,
           0.00000000e+00,   0.00000000e+00,   0.00000000e+00],
        [  7.00000000e+00,   3.00000000e+00,   0.00000000e+00, ...,
           0.00000000e+00,   0.00000000e+00,   0.00000000e+00],
        ..., 
        [  2.99770000e+04,   1.00000000e+00,   1.00000000e+00, ...,
           0.00000000e+00,   1.96082000e+05,   5.20749321e+01],
        [  2.99770000e+04,   1.00000000e+00,   1.00000000e+00, ...,
           0.00000000e+00,   1.96082000e+05,   5.20749321e+01],
        [  2.99770000e+04,   1.00000000e+00,   1.00000000e+00, ...,
           0.00000000e+00,   1.96082000e+05,   5.20749321e+01]], dtype=float32),
 array([0, 0, 0, ..., 1, 1, 1], dtype=int8))

In [18]:
init_folds()

Fold4
train
test
vali
Fold1
train
test
vali
Fold2
train
test
vali
Fold5
train
test
vali
Fold3
train
test
vali


In [14]:
feature_importance = trained_model.get_feature_importance(test['X'], test['y'])

In [15]:
include = get_features_mapping(19 * 3)
include.remove(16)
include.remove(17)
include.remove(18)

In [16]:
for i, j in zip(include, feature_importance):
    print(i, j)

1 0.9998886275626093
2 0.2590262886852381
3 1.5815194319609835
6 1.7245759343857268
7 0.3081338221392781
8 2.1757460401305195
11 9.037517992333369
12 2.497815930751388
13 5.7415687907977535
21 0.3065074590379698
22 0.1335382409910288
23 0.7733841812830988
26 3.5882196952950736
27 0.12349250443778978
28 0.17906268494424832
31 3.366307156509057
32 0.047821198663354346
33 0.43155616655759577
36 5.1465092875457685
37 0.10777963572620804
38 1.5148102595132078
41 5.532787082067535
42 0.10541833685374692
43 1.5713743425168287
46 0.46643049391922875
47 0.673655579786749
48 4.903354293687211
51 5.187003757001774
52 0.09072530909912854
53 1.8052731779311808
56 3.374532806191194
57 0.271573130609776
58 3.0502204849715517
61 3.883619285448456
62 0.18256110306662035
63 3.3311673924852068
66 3.462068806137715
67 0.19211426733561593
68 1.436162263089875
71 0.0011436471150118332
72 0.8581667941147135
73 1.7298842137320798
76 0.005556345709979016
77 0.20749461860060692
78 2.3253795105379007
81 3.427669