In [1]:
import pandas as pd
import numpy as np
#import faiss
import time
import matplotlib.pyplot as plt

from tqdm.notebook import tqdm
from sklearn.model_selection import train_test_split
#from sklearn.linear_model import LogisticRegression
#from sklearn.ensemble import RandomForestClassifier
#from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score
from sklearn.utils.class_weight import compute_class_weight
from sklearn.model_selection import GridSearchCV, cross_val_score
from catboost import CatBoostClassifier
import warnings
import gc
# Silence every UserWarning for the rest of the notebook.
warnings.simplefilter("ignore", UserWarning)


# Directory prefix (note the trailing slash) that the CSV file names in this
# notebook are appended to.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
PATH = '/home/mglz/projects/master/stock_matching/data/'

# Load data #1

## Train

In [2]:
# Size of each consecutive row group; passed as `n` to best_candidates_search.
# presumably the number of FAISS neighbours stored per query — TODO confirm
n_neigbours = 50

In [3]:
# Read the three training tables from PATH, each using its first CSV column as
# the row index: feature matrix, labels, and the FAISS index/distance table.
features_train, target_train, idx_d_train = (
    pd.read_csv(PATH + csv_name, index_col=0)
    for csv_name in (
        'all_features_train.csv',
        'target_train.csv',
        'faiss_idx_and_distances_train.csv',
    )
)

In [4]:
# Add column '1' of the FAISS index/distance table to the features as
# 'distance' (Series assignment aligns on the row index).
# presumably '0' is the neighbour index and '1' the distance — TODO confirm
features_train['distance'] = idx_d_train['1']
#features_train['index'] = idx_d_train['0']
#features_train['target'] = target_train

### Stratified sample

# CatBoost ranking

## Train model

In [5]:
# Convert the feature table and the label table into NumPy arrays.
# NOTE(review): target_train is a DataFrame, so y is 2-D (one column per label
# column) rather than a flat vector.
X, y = np.array(features_train), np.array(target_train)

In [6]:
# The source DataFrames are no longer needed now that X and y hold the data as
# NumPy arrays; drop them and run a garbage-collection pass.
del features_train, target_train, idx_d_train
gc.collect()

0

In [7]:
# Hold out 20% of the rows for evaluation. The split is stratified on the
# labels and uses a fixed seed so it can be reproduced.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)

In [8]:
# Show (rows, columns) of the training split, then of the hold-out split.
print(X_train.shape, X_test.shape, sep='\n')

(2000000, 145)
(500000, 145)


In [9]:
# CatBoost classifier trained on Logloss: 1000 iterations, tree depth 5,
# fixed seed, progress logged every 100 iterations.
# Options currently not set (kept here for reference):
#   bootstrap_type='Bernoulli', leaf_estimation_method='Newton',
#   boosting_type='Ordered'
model = CatBoostClassifier(
    loss_function='Logloss',
    iterations=1000,
    depth=5,
    random_seed=42,
    verbose=100,
)
model.fit(X_train, y_train)

Learning rate set to 0.264529
0:	learn: 0.2418792	total: 259ms	remaining: 4m 18s
100:	learn: 0.0077572	total: 19.1s	remaining: 2m 50s
200:	learn: 0.0054017	total: 37.8s	remaining: 2m 30s
300:	learn: 0.0042486	total: 56s	remaining: 2m 9s
400:	learn: 0.0037426	total: 1m 13s	remaining: 1m 50s
500:	learn: 0.0033403	total: 1m 31s	remaining: 1m 31s
600:	learn: 0.0030645	total: 1m 49s	remaining: 1m 12s
700:	learn: 0.0028404	total: 2m 6s	remaining: 54s
800:	learn: 0.0026108	total: 2m 24s	remaining: 35.9s
900:	learn: 0.0024200	total: 2m 41s	remaining: 17.8s
999:	learn: 0.0022051	total: 2m 59s	remaining: 0us


<catboost.core.CatBoostClassifier at 0x7fcfe2dcead0>

In [10]:
# The training split is not used again after fitting; drop it and run a
# garbage-collection pass before evaluating.
del X_train, y_train
gc.collect()

0

In [11]:
# Extra garbage-collection pass (the previous cell already ran one).
gc.collect()

0

In [12]:
# Predict labels for the hold-out split and report precision, then accuracy,
# one value per line.
pred = model.predict(X_test)
print(precision_score(y_test, pred), accuracy_score(y_test, pred), sep='\n')

0.9384296091317883
0.998682


In [13]:
# Evaluation is finished; drop the hold-out split and its predictions, then
# run a garbage-collection pass.
del X_test, y_test, pred
gc.collect()

0

## Searching best candidates

In [14]:
# Wrap the full feature array (all rows, before the train/test split) in a
# DataFrame so score and label columns can be attached. Columns get default
# integer labels because X is a plain array here.
X = pd.DataFrame(X)

In [15]:
# Score every row with column 1 of predict_proba (probability of the second
# class; presumably the positive "match" class — TODO confirm), then attach
# the labels. Order matters: 'prob' must be computed before 'target' is added,
# otherwise the label column would be passed to the model as a feature.
# NOTE(review): not re-runnable as-is — on a second run X already carries the
# extra 'prob'/'target' columns when it is handed to predict_proba.
# NOTE(review): X contains the rows the model was fitted on, so their scores
# are in-sample.
X['prob'] = model.predict_proba(X)[:, 1]
X['target'] = y

In [16]:
def best_candidates_search(df, n, cand_num, type):
    """Keep the ``cand_num`` highest-``'prob'`` rows of every consecutive ``n``-row group.

    ``df`` is cut, in positional order, into ``len(df) // n`` non-overlapping
    groups of ``n`` rows each; a trailing partial group is ignored. Every group
    is sorted by its ``'prob'`` column in descending order and its first
    ``cand_num`` rows are kept.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to select from; must contain a ``'prob'`` column.
    n : int
        Number of rows per group. Must be positive.
    cand_num : int
        Number of rows kept from each group.
    type : str
        Dataset label such as ``'train'`` or ``'valid'``. Currently unused (it
        only served a CSV export that is disabled). The name shadows the
        builtin but is kept so existing calls continue to work.

    Returns
    -------
    pandas.DataFrame
        The kept rows stacked in group order. The frame is rebuilt from a
        plain NumPy array, so the original index and column labels are
        replaced by default integer labels. If ``df`` has fewer than ``n``
        rows the result has zero rows and as many columns as ``df``.

    Raises
    ------
    ValueError
        If ``n`` is not positive.
    """
    if n <= 0:
        raise ValueError(f'n must be a positive group size, got {n!r}')

    n_groups = len(df) // n
    n_columns = df.shape[1]

    # .iloc makes the row slicing explicitly positional, independent of what
    # kind of index the frame carries.
    best_candidates_list = [
        df.iloc[n * i:n * (i + 1)]
        .sort_values(by='prob', ascending=False)
        .iloc[:cand_num]
        for i in tqdm(range(n_groups))
    ]
    print('lists created...')

    # This removes only the function's own reference. The caller's variable
    # still keeps the frame alive until the caller deletes it as well.
    del df
    gc.collect()
    print('origin DF deleted...')

    if best_candidates_list:
        new_df = pd.DataFrame(np.concatenate(best_candidates_list))
    else:
        # np.concatenate rejects an empty list; return an empty frame with the
        # same number of columns instead of failing.
        new_df = pd.DataFrame(np.empty((0, n_columns)))
    print('new DF created!')
    return new_df

In [17]:
# Keep the 5 highest-scored rows out of every group of n_neigbours rows.
best5_train = best_candidates_search(
    df=X,
    n=n_neigbours,
    cand_num=5,
    type='train',
)

  0%|          | 0/50000 [00:00<?, ?it/s]

lists created...
origin DF deleted...
new DF created!


In [18]:
# best5_train now holds the selected rows; drop the full scored frame and run
# a garbage-collection pass.
del X
gc.collect()

0

# Load data #2

## Validation 

In [19]:
# Read the three validation tables from PATH, each using its first CSV column
# as the row index: feature matrix, labels, and the FAISS index/distance table.
features_valid, target_valid, idx_d_valid = (
    pd.read_csv(PATH + csv_name, index_col=0)
    for csv_name in (
        'all_features_valid.csv',
        'target_valid.csv',
        'faiss_idx_and_distances_valid.csv',
    )
)

In [20]:
# Add column '1' of the FAISS index/distance table to the validation features
# as 'distance' (Series assignment aligns on the row index).
# presumably '1' is the neighbour distance — TODO confirm
features_valid['distance'] = idx_d_valid['1']

In [21]:
# The needed column has been copied into features_valid; drop the
# index/distance table and run a garbage-collection pass.
del idx_d_valid
gc.collect()

0

## Searching best candidates

In [22]:
# Score every validation row with column 1 of predict_proba (probability of
# the second class; presumably the positive "match" class — TODO confirm),
# then attach the labels. Order matters: 'prob' must be computed before
# 'target' is added, otherwise the label would be passed to the model as a
# feature.
# NOTE(review): not re-runnable as-is — on a second run features_valid already
# carries the extra 'prob'/'target' columns when it is handed to predict_proba.
# NOTE(review): target_valid is a DataFrame, so the second assignment relies
# on it having exactly one column — TODO confirm.
features_valid['prob'] = model.predict_proba(features_valid)[:, 1]
features_valid['target'] = target_valid

In [23]:
# Keep the 5 highest-scored rows out of every group of n_neigbours rows.
best5_valid = best_candidates_search(
    df=features_valid,
    n=n_neigbours,
    cand_num=5,
    type='valid',
)

  0%|          | 0/50000 [00:00<?, ?it/s]

lists created...
origin DF deleted...
new DF created!


In [24]:
# best5_valid now holds the selected rows; drop the full scored validation
# frame and run a garbage-collection pass.
del features_valid
gc.collect()

0