In [1]:
import pandas as pd
import numpy as np
import faiss
import time
import matplotlib.pyplot as plt

from tqdm.notebook import tqdm
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score
from sklearn.utils.class_weight import compute_class_weight
from sklearn.model_selection import GridSearchCV, cross_val_score
from catboost import CatBoostClassifier
import warnings
import gc
# Silence UserWarning messages emitted by the libraries used below.
warnings.simplefilter("ignore", UserWarning)


# Directory prefix the CSV file names are appended to (note the trailing slash).
# NOTE(review): absolute, machine-specific path — consider making it configurable.
PATH = '/home/maria_grig/projects/master/stock_matching/'

# Load data

## Base

### Base dataset

In [2]:
# Load the base table; the first CSV column becomes the row index
# (those index labels are later used as the ids of the base items).
df_base = pd.read_csv(PATH + 'base.csv', index_col=0)
#df_base.head()

### Create index

In [3]:
# Search settings.
n_cells = 20      # number of cells the IVF index partitions the base vectors into
n_neigbours = 50  # how many candidates are requested per query in the searches below

# Inverted-file index over L2 distance: a flat L2 quantizer assigns vectors to cells.
dims = df_base.shape[1]
quantizer = faiss.IndexFlatL2(dims)
idx_l2 = faiss.IndexIVFFlat(quantizer, dims, n_cells)

In [4]:
# Convert the base frame to a contiguous float32 matrix once and reuse it for both
# training the cell centroids and adding the vectors (the original converted it twice).
base_vectors = np.ascontiguousarray(df_base.values, dtype='float32')

idx_l2.train(base_vectors)
idx_l2.add(base_vectors)

# The index keeps its own copy of the vectors, so the temporary matrix can be released.
del base_vectors

In [5]:
# Map the positional row number inside the FAISS index to the label of that row in df_base.
base_index = dict(enumerate(df_base.index.to_list()))

#del df_base

# Train

In [6]:
# Load the training queries; the first CSV column becomes the row index.
# The frame still contains the "Target" column at this point (it is split off in the next cell).
df_train = pd.read_csv(PATH + 'train.csv', index_col=0)
#df_train.head()


In [7]:
# Split the label column off the feature frame (df_train is modified in place, as before).
# The guard makes the cell safe to re-run: once "Target" has been removed, a second
# execution keeps the existing `targets` instead of raising KeyError.
if "Target" in df_train.columns:
    targets = df_train.pop("Target")

In [8]:
# Number of cells visited per query. The bare `idx_l2.nprobe` expression that used to
# precede this line had no effect (it was not the last line of the cell) and was dropped.
idx_l2.nprobe = 5

In [9]:
# Retrieve n_neigbours candidates for every training query and report the wall-clock time
# (the float32 conversion of the queries is included in the measurement).
start = time.time()
vecs, idx = idx_l2.search(
    np.ascontiguousarray(df_train.values).astype('float32'),
    n_neigbours,
)
elapsed = time.time() - start
print('search took {}'.format(elapsed))

search took 893.4275171756744


In [10]:
# Share of training queries whose true match is among the retrieved candidates.
start = time.time()

# idx_list[n] is 1.0 when the target of query n was retrieved, else 0.0.
# Later cells read idx_list; previously it only existed in commented-out code.
idx_list = np.zeros(len(targets))
for n, (target, el) in enumerate(zip(targets.values.tolist(), idx.tolist())):
    # FAISS pads result rows with -1 when it finds fewer than k neighbours;
    # -1 is not a key of base_index, so those slots are skipped.
    if target in {base_index[r] for r in el if r >= 0}:
        idx_list[n] = 1

acc = int(idx_list.sum())
print('search took {}'.format(time.time() - start))

print('neighbours found: {} %'.format(100 * acc / len(idx)))

search took 1.9366586208343506
neighbours found: 16.817 %


In [11]:
# Force a garbage-collection pass; the value shown as the cell output is gc.collect()'s return value.
gc.collect()

0

In [12]:
# Plain NumPy copies of the two feature tables, addressed by row position from here on
# (the index labels are not part of the values, so no reset_index is needed).
base = df_base.to_numpy(copy=True)
train = df_train.to_numpy(copy=True)

In [13]:
# Drop the DataFrames now that `base` and `train` hold the same values as arrays.
# NOTE(review): cells further down still reference df_base / df_train.
del df_base, df_train
gc.collect()

0

## Train 1st model

In [14]:
# Build the pairwise training set: for each of the first queries, one row per retrieved
# candidate made of [query features | candidate features], labelled 1 when the candidate
# is the query's true match and 0 otherwise.
n_pair_queries = min(10000, len(train))  # never run past the available queries

# Hoisted out of the loop: the original rebuilt this list for every single candidate.
target_ids = targets.values.tolist()

df_list = []
target_list = []
for i in tqdm(range(n_pair_queries)):
    neighbours = idx[i]
    # Repeat the query vector once per candidate so it can sit next to each candidate row.
    query_rows = np.tile(train[i], (len(neighbours), 1))
    candidate_rows = base[neighbours]
    labels = np.array([1 if base_index[j] == target_ids[i] else 0 for j in neighbours])
    target_list.append(labels)
    df_list.append(np.concatenate((query_rows, candidate_rows), axis=1))

  0%|          | 0/10000 [00:00<?, ?it/s]

In [15]:
# Class balance of the pairwise labels (0 = wrong candidate, 1 = true match).
pd.Series(np.concatenate(target_list), name=0).value_counts()

0
0    498292
1      1708
Name: count, dtype: int64

In [16]:
# Stack the per-query blocks into one label vector and one feature matrix.
# np.concatenate already returns a fresh array, so the extra np.array(...) wrapper
# (a second full copy of the data) is not needed.
y = np.concatenate(target_list)
X = np.concatenate(df_list)

In [17]:
# Release the per-query lists now that X and y hold the stacked data.
# (The original deleted `target`, a leftover loop scalar, instead of `target_list`.)
del target_list, df_list
gc.collect()

0

In [18]:
# 80/20 split of the pair rows, stratified on the label, with a fixed seed.
# NOTE(review): rows are split individually, so pairs of one query can land on both sides.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [19]:
# Shapes of the two feature matrices, train first.
for split in (X_train, X_test):
    print(split.shape)

(400000, 144)
(100000, 144)


In [20]:
# Fit a CatBoost classifier on the pair rows; apart from verbose=200 (training log every
# 200 iterations) no constructor arguments are set.
model = CatBoostClassifier(verbose=200)
model.fit(X_train, y_train)

Learning rate set to 0.13305
0:	learn: 0.3870796	total: 90ms	remaining: 1m 29s
200:	learn: 0.0030360	total: 7.92s	remaining: 31.5s
400:	learn: 0.0018904	total: 15.4s	remaining: 23s
600:	learn: 0.0013192	total: 22.8s	remaining: 15.1s
800:	learn: 0.0009522	total: 30.2s	remaining: 7.51s
999:	learn: 0.0007047	total: 37.7s	remaining: 0us


<catboost.core.CatBoostClassifier at 0x7f6354306350>

In [21]:
# Hard class predictions on the held-out rows and their precision for class 1.
pred = model.predict(X_test)
precision_score(y_true=y_test, y_pred=pred)

0.9214285714285714

In [22]:
# Probability of class 1 for every row of X (this covers both the train and the test rows).
prob = model.predict_proba(X)[:, 1]

In [None]:
# X is a NumPy array, so `X['prob'] = prob` raises IndexError (string keys are not valid
# array indices). Attach the scores to a DataFrame view of the features instead and
# leave X itself untouched so the prediction cell stays re-runnable.
X_scored = pd.DataFrame(X)
X_scored['prob'] = prob

In [None]:
# prob holds n_neigbours consecutive scores per query, in the same row order as X.
# The number of groups therefore has to come from prob; the original used len(pred),
# which only covers the shuffled test split. It also printed one blank line per group
# and threw every slice away.
n_groups = len(prob) // n_neigbours

# Scores of the last five candidates of each query, kept for inspection.
tail_probs = [
    prob[n_neigbours * k:n_neigbours * (k + 1)][-5:].tolist()
    for k in range(n_groups)
]

In [None]:
# Relative frequency of the values in idx_list, printed and drawn as a bar chart.
# idx_list has to be populated before this cell runs.
fig, ax = plt.subplots(figsize=(3, 3))
class_frequency = pd.Series(idx_list.astype(int)).value_counts(normalize=True)
print(class_frequency)
class_frequency.plot(kind='bar', ax=ax);

### Predictions & Metrics

In [None]:
# Collects one mean cross-validation accuracy per model, in the order the models are evaluated below.
scores_list = []

In [None]:
# 4-fold cross-validated accuracy of a logistic regression that predicts, from the query
# features, whether the query's true match was retrieved (labels come from idx_list).
# `train` replaces df_train.reset_index(drop=True): df_train is deleted earlier in the
# notebook and `train` is the NumPy copy of exactly those values.
model_lr = LogisticRegression()
scores_lr = cross_val_score(model_lr, train,
                            pd.Series(idx_list.astype(int)), cv=4, scoring='accuracy')


In [None]:
# Mean accuracy over the folds, rounded to 3 decimals, recorded and displayed.
# NOTE: re-running this cell appends the same score to scores_list again.
score = round(scores_lr.mean(),3)
scores_list.append(score)
score

In [None]:
# 4-fold cross-validated accuracy of a depth-limited decision tree on the same task.
# random_state is fixed (matching the seed of the train/test split) so the score is
# reproducible; `train` stands in for df_train, which is deleted earlier in the notebook.
model_dt = DecisionTreeClassifier(max_depth=9, min_samples_leaf=5, min_samples_split=6,
                                  random_state=42)

scores_dt = cross_val_score(model_dt, train,
                            pd.Series(idx_list.astype(int)), cv=4, scoring='accuracy')


In [None]:
# Mean accuracy over the folds, rounded to 3 decimals, recorded and displayed.
# NOTE: re-running this cell appends the same score to scores_list again.
score = round(scores_dt.mean(),3)
scores_list.append(score)
score

In [None]:
# 4-fold cross-validated accuracy of a small random forest on the same task.
# random_state is fixed so the score is reproducible; `train` stands in for df_train,
# which is deleted earlier in the notebook.
model_rf = RandomForestClassifier(n_estimators=25, max_depth=11, min_samples_split=8,
                                  min_samples_leaf=1, random_state=42)

scores_rf = cross_val_score(model_rf, train,
                            pd.Series(idx_list.astype(int)), cv=4, scoring='accuracy')


In [None]:
# Mean accuracy over the folds, rounded to 3 decimals, recorded and displayed.
# NOTE: re-running this cell appends the same score to scores_list again.
score = round(scores_rf.mean(),3)
scores_list.append(score)
score

In [None]:
# NOTE(review): despite the "lgbm" name this is a second RandomForestClassifier
# (115 trees, depth 10), not a LightGBM model — confirm which one was intended.
# random_state is fixed so the score is reproducible; `train` stands in for df_train,
# which is deleted earlier in the notebook.
model_lgbm = RandomForestClassifier(n_estimators=115, max_depth=10, random_state=42)

scores_lgbm = cross_val_score(model_lgbm, train,
                            pd.Series(idx_list.astype(int)), cv=4, scoring='accuracy')


In [None]:
# Mean accuracy over the folds, rounded to 3 decimals, recorded and displayed.
# NOTE: re-running this cell appends the same score to scores_list again.
score = round(scores_lgbm.mean(),3)
scores_list.append(score)
score

In [None]:
# 4-fold cross-validated accuracy of a small CatBoost model (200 iterations, depth 3).
# `train` stands in for df_train.reset_index(drop=True): df_train is deleted earlier in
# the notebook and `train` is the NumPy copy of exactly those values.
model_cb = CatBoostClassifier(verbose=200, iterations=200, depth=3)

scores_cb = cross_val_score(model_cb, train,
                            pd.Series(idx_list.astype(int)), cv=4, scoring='accuracy')


In [None]:
# Mean accuracy over the folds, rounded to 3 decimals, recorded and displayed.
# NOTE: re-running this cell appends the same score to scores_list again.
score = round(scores_cb.mean(),3)
scores_list.append(score)
score

In [None]:
# Display the collected mean accuracies (one entry per executed score cell).
scores_list

In [None]:
# Accuracy table, best model first. The labels must be in the same order in which the
# scores were appended to scores_list.
# NOTE(review): the 'LGBM' row is produced by model_lgbm, which is a RandomForestClassifier.
cols = ['Log Reg','Dec Tree','Rand Forest','LGBM','CatBoost']
metrics = {'accuracy': scores_list}
pd.DataFrame(metrics, index=cols).sort_values('accuracy', ascending=False)

## Validation

In [None]:
# Load the validation queries; the first CSV column becomes the row index.
df_valid = pd.read_csv(PATH + 'validation.csv', index_col=0)
df_valid.head()

In [None]:
# Load the expected answers for the validation queries; the first CSV column becomes the
# row index. Unlike `targets`, this stays a DataFrame (no single column is selected).
valid_targets = pd.read_csv(PATH + 'validation_answer.csv', index_col=0)
valid_targets.head()

In [None]:
# Rebuild the IVF index for validation. The dimensionality is read from `base`
# (the NumPy copy of the base table) because df_base is deleted earlier in the notebook.
# NOTE(review): this index uses 7 cells whereas the training-phase index used 20, and a
# freshly created index does not carry over the nprobe = 5 set earlier — confirm both.
dims = base.shape[1]
n_cells = 7 # how many cells
quantizer = faiss.IndexFlatL2(dims)
idx_l2 = faiss.IndexIVFFlat(quantizer, dims, n_cells)

In [None]:
# Train and fill the index from `base` (df_base is deleted earlier in the notebook),
# converting to a contiguous float32 matrix once instead of once per call.
base_vectors = np.ascontiguousarray(base, dtype='float32')

idx_l2.train(base_vectors)
idx_l2.add(base_vectors)

# The index keeps its own copy of the vectors, so the temporary matrix can be released.
del base_vectors

In [None]:
# Row-position -> base-label mapping. df_base is deleted earlier in the notebook, and the
# mapping built right after the first index was created comes from the same table in the
# same row order, so it is only rebuilt when df_base is actually available.
if 'df_base' in globals():
    base_index = dict(enumerate(df_base.index.to_list()))

In [None]:
# Retrieve n_neigbours candidates for every validation query and report the wall-clock time
# (the float32 conversion of the queries is included in the measurement).
start = time.time()
vecs, idx = idx_l2.search(
    np.ascontiguousarray(df_valid.values).astype('float32'),
    n_neigbours,
)
elapsed = time.time() - start
print('search took {}'.format(elapsed))

In [None]:
# First value returned by the validation search (presumably the distances, one row per query).
vecs

In [None]:
# Second value returned by the validation search: row positions in the index, translated to labels via base_index.
idx

In [None]:
# Share of validation queries whose expected answer is among the retrieved candidates.
start = time.time()

# valid_targets is a DataFrame, so valid_targets.values.tolist() yields one-element lists;
# a list never equals a base label, which made every membership test below fail.
# Take the answer column as a flat list instead.
# Assumes the first (only) data column holds the expected base label — TODO confirm.
valid_ids = valid_targets.iloc[:, 0].tolist()

# idx_list_valid[n] is 1.0 when the answer of query n was retrieved, else 0.0.
idx_list_valid = np.zeros(len(valid_ids))
for n, (target, el) in enumerate(zip(valid_ids, idx.tolist())):
    # FAISS pads result rows with -1 when it finds fewer than k neighbours;
    # -1 is not a key of base_index, so those slots are skipped.
    if target in {base_index[r] for r in el if r >= 0}:
        idx_list_valid[n] = 1

acc = int(idx_list_valid.sum())
print('search took {}'.format(time.time() - start))

print('neighbours found: {} %'.format(100 * acc / len(idx)))

In [None]:
# Validation features with a fresh 0..n-1 index, and the per-query hit flags as labels.
X_valid = df_valid.reset_index(drop=True)
y_valid = idx_list_valid

In [None]:
# Refit the small CatBoost model on all training queries, with idx_list as labels.
# NOTE(review): df_train is deleted earlier in the notebook (`del df_train`), so this cell
# raises NameError on a top-to-bottom run; idx_list must also have been populated.
model_cb.fit(df_train.reset_index(drop=True), 
                            pd.Series(idx_list.astype(int)))

In [None]:
# Class predictions of the refitted CatBoost model for the validation queries.
y_valid_pred = model_cb.predict(X_valid)

In [None]:
# Display the predicted classes.
y_valid_pred

In [None]:
# Distinct predicted classes — a quick check of whether the model ever predicts more than one class.
pd.Series(y_valid_pred.astype(int)).unique()

In [None]:
# Relative frequency of the predicted classes on validation, printed and drawn as a bar chart.
fig, ax = plt.subplots(figsize=(3, 3))
class_frequency = pd.Series(y_valid_pred.astype(int)).value_counts(normalize=True)
print(class_frequency)
class_frequency.plot(kind='bar', ax=ax);

In [None]:
# NOTE(review): `bb` is not defined in any visible cell, so this line raises NameError —
# presumably a leftover or a deliberate "stop here" marker; confirm and remove.
bb