In [2]:
import pandas as pd
import os
from tqdm import tqdm_notebook

In [97]:
# Destination folder for the combined train/test feather files saved by this notebook.
save_folder = '../../data/combined_dataset/'
# Create the folder (and any missing parents) from Python instead of shelling out to
# `mkdir -p`: this works on every platform and is safe for paths containing spaces.
os.makedirs(save_folder, exist_ok=True)

In [4]:
# Paths of the feather files with the base ("normal") train and test datasets.
train_normal_filename, test_normal_filename = (
    f'../../data/prepaired_dataset/{split}_v3.fth' for split in ('train', 'test')
)

In [6]:
# Folder holding the per-feature quantile CSV files for both splits.
quantiles_folder = '../../data/coms_sep/quantiles_data'
# os.listdir() returns entries in arbitrary order, so list the folder once and sort it:
# the file order fixes the column order of the merged quantile frames, and it must be
# reproducible between runs.
# NOTE(review): train and test columns line up only if the two splits' file names
# differ solely in the train/test token - confirm against the folder contents.
quantiles_listing = sorted(os.listdir(quantiles_folder))
# Keep the files of each split, skipping every name that contains 'amount.csv'.
quantiles_train_files = [os.path.join(quantiles_folder, name) for name in quantiles_listing
                         if 'qnts_train' in name and 'amount.csv' not in name]
quantiles_test_files = [os.path.join(quantiles_folder, name) for name in quantiles_listing
                        if 'qnts_test' in name and 'amount.csv' not in name]
# Both splits are expected to provide the same number of quantile files.
assert len(quantiles_train_files) == len(quantiles_test_files)

---

загружаем обычный датасет с подсчитанными признаками

In [7]:
# Read the base train and test datasets from their feather files.
train_normal, test_normal = map(
    pd.read_feather, (train_normal_filename, test_normal_filename)
)

In [54]:
# Cast event_time to str in both base frames; it is later used as a merge key
# together with user_id.
for normal_frame in (train_normal, test_normal):
    normal_frame['event_time'] = normal_frame['event_time'].astype(str)

---

загружаем woe

In [8]:
# Read the train and test WoE tables; the first CSV column is used as the index.
train_woe, test_woe = (
    pd.read_csv(f'../../data/coms_sep/{split}_woe.csv', index_col=0, low_memory=False)
    for split in ('train', 'test')
)

  mask |= (ar1 == a)


---

объединяем все датасеты с квантилями в один датафрейм

In [9]:
def unite_quantiles(files, keys=('user_id', 'short_date'), fill_value=-1000):
    """Merge several quantile CSV files into one frame keyed by ``keys``.

    Every file is read with its first column as the index and must contain the
    key columns. The result has one row per distinct key combination found in
    any file, sorted by the keys, with the value columns of every file attached
    by a left merge. Key combinations missing from a file get ``fill_value`` in
    that file's columns.

    Parameters
    ----------
    files : iterable of str
        Paths of the CSV files to combine.
    keys : sequence of str, optional
        Names of the key columns shared by all files.
    fill_value : scalar, optional
        Value written where a key combination has no data in a file.

    Returns
    -------
    pandas.DataFrame
        Key columns followed by the value columns of each file, in file order.

    Raises
    ------
    ValueError
        If ``files`` is empty.
    """
    keys = list(keys)
    files = list(files)
    if not files:
        raise ValueError('unite_quantiles needs at least one quantile file')

    # De-duplicate every frame on the keys before merging (first row wins).
    # Otherwise each repeated key multiplies the rows of the result again at
    # every subsequent merge.
    quantile_frames = [
        pd.read_csv(path, low_memory=False, index_col=0).drop_duplicates(keys)
        for path in files
    ]

    # Skeleton of the result: every distinct key combination, in sorted order.
    merged = (
        pd.concat([frame[keys] for frame in quantile_frames])
        .drop_duplicates()
        .sort_values(keys)
        .reset_index(drop=True)
    )

    value_cols = []
    for frame in tqdm_notebook(quantile_frames):
        value_cols.extend(col for col in frame.columns if col not in keys)
        merged = merged.merge(frame, how='left', on=keys)

    # Fill once at the end; only value columns are touched, never the keys.
    if value_cols:
        merged[value_cols] = merged[value_cols].fillna(fill_value)
    return merged

In [10]:
%%time
# Build one merged quantile frame per split from its list of quantile files.
train_quantiles, test_quantiles = [
    unite_quantiles(split_files)
    for split_files in (quantiles_train_files, quantiles_test_files)
]

  mask |= (ar1 == a)






CPU times: user 1min 24s, sys: 52.8 s, total: 2min 17s
Wall time: 2min 17s


In [88]:
%%time
# Keep a single row per (user_id, short_date) in each quantile frame, modifying the frames in place.
for quantiles_frame in (train_quantiles, test_quantiles):
    quantiles_frame.drop_duplicates(subset=['user_id', 'short_date'], inplace=True)

CPU times: user 2.85 s, sys: 2.21 s, total: 5.06 s
Wall time: 5.05 s


----

соединяем normal с woe

In [22]:
# Drop the columns that are not needed from the WoE table, then prefix every
# remaining column except the merge keys (event_time, user_id) with "woe_".
train_woe = (
    train_woe
    .drop(columns=['label', 'event_description', 'short_date'])
    .rename(columns=lambda col: col if col in ('event_time', 'user_id') else f'woe_{col}')
)

In [61]:
# Drop event_description from the test WoE table, then prefix every remaining
# column except the merge keys (event_time, user_id) with "woe_".
test_woe = (
    test_woe
    .drop(columns=['event_description'])
    .rename(columns=lambda col: col if col in ('event_time', 'user_id') else f'woe_{col}')
)

In [75]:
# Keep a single WoE row per (user_id, event_time) in each split, modifying the frames in place.
for woe_frame in (train_woe, test_woe):
    woe_frame.drop_duplicates(subset=['user_id', 'event_time'], inplace=True)

In [76]:
%%time
# Attach the WoE columns to the base train set; rows without a WoE match get 0.0.
tr_with_woe = pd.merge(train_normal, train_woe, on=['user_id', 'event_time'], how='left')
tr_woe_cols = [col for col in tr_with_woe.columns if col.startswith('woe_')]
tr_woe_filled = tr_with_woe.loc[:, tr_woe_cols].fillna(0.0)
tr_with_woe.loc[:, tr_woe_cols] = tr_woe_filled

CPU times: user 16.1 s, sys: 14.2 s, total: 30.3 s
Wall time: 30.3 s


In [77]:
%%time
# Attach the WoE columns to the base test set; rows without a WoE match get 0.0.
te_with_woe = pd.merge(test_normal, test_woe, on=['user_id', 'event_time'], how='left')
te_woe_cols = [col for col in te_with_woe.columns if col.startswith('woe_')]
te_woe_filled = te_with_woe.loc[:, te_woe_cols].fillna(0.0)
te_with_woe.loc[:, te_woe_cols] = te_woe_filled

CPU times: user 7.79 s, sys: 4.25 s, total: 12 s
Wall time: 12 s


In [81]:
# The left merge with the WoE tables must not add or drop rows.
assert len(tr_with_woe) == len(train_normal)
assert len(te_with_woe) == len(test_normal)

----

добавляем квантили

In [92]:
%%time
# Attach the quantile columns to the train set by (user_id, short_date);
# rows without quantile data get -1000 in the quantile columns.
tr_combined = pd.merge(tr_with_woe, train_quantiles, on=['user_id', 'short_date'], how='left')
tr_combined_quant_cols = [col for col in tr_combined.columns if '_days_quantile_' in col]
tr_quant_filled = tr_combined.loc[:, tr_combined_quant_cols].fillna(-1000)
tr_combined.loc[:, tr_combined_quant_cols] = tr_quant_filled

CPU times: user 20.1 s, sys: 26.3 s, total: 46.4 s
Wall time: 46.3 s


In [93]:
%%time
# Attach the quantile columns to the test set by (user_id, short_date);
# rows without quantile data get -1000 in the quantile columns.
te_combined = pd.merge(te_with_woe, test_quantiles, on=['user_id', 'short_date'], how='left')
te_combined_quant_cols = [col for col in te_combined.columns if '_days_quantile_' in col]
te_quant_filled = te_combined.loc[:, te_combined_quant_cols].fillna(-1000)
te_combined.loc[:, te_combined_quant_cols] = te_quant_filled

CPU times: user 7.9 s, sys: 10.4 s, total: 18.3 s
Wall time: 18.3 s


In [95]:
# The left merge with the quantile frames must not add or drop rows.
assert len(tr_combined) == len(train_normal)
assert len(te_combined) == len(test_normal)

----

сохраняем все

In [98]:
%%time
# Write the combined train and test frames as feather files into save_folder.
for split_name, combined_frame in (('train', tr_combined), ('test', te_combined)):
    combined_frame.to_feather(f'{save_folder}{split_name}_v1.fth')

CPU times: user 7.08 s, sys: 8.84 s, total: 15.9 s
Wall time: 11.1 s


In [100]:
# Show the columns present in only one of the two combined frames (train-only, test-only).
tr_columns, te_columns = set(tr_combined.columns), set(te_combined.columns)
tr_columns.difference(te_columns), te_columns.difference(tr_columns)

({'label'}, set())