In [104]:
import pandas as pd
import numpy as np
import datetime
import tqdm
import matplotlib.pyplot as plt
import pytictoc
import multiprocessing

In [2]:
data_train = pd.read_csv('../data/coms_sep/train.csv')

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
import os
import re
FIRST_N = 100
N_THREADS = 5
test_folder = '../data/raw_splits/test/'
test_files = sorted([x for x in os.listdir(test_folder) if not '.pkl' in x], key = lambda x: int(re.sub('[^0-9]', '', x)))
test_files = [os.path.join(test_folder, x) for x in test_files]
print(f'Length of test files is {len(test_files)}')
train_folder = '../data/raw_splits/train/'
train_files = sorted([x for x in os.listdir(train_folder) if not '.pkl' in x], key = lambda x: int(re.sub('[^0-9]', '', x)))
train_files = [os.path.join(train_folder, x) for x in train_files]
print(f'Length of train files is {len(train_files)}')

Length of test files is 26
Length of train files is 51


In [4]:
cat_feats = ['cat_new_ip', 'cat_new_prov', 'op_type', 'relative', 'cdf_s_127', 'cdf_s_135',
             'cdf_s_130', 'cdf_s_129', 'cdf_s_134', 'cdf_s_133', 'know_recip_card_age', 'one_region']
num_feats = ['amount', 'client_age', 'age_diff', 'cumulative_sum_total', 'cumulative_sum_total', 'data_i_120',
             'know_recip_power', 'data_i_120', 'recip_card_age', 'krp_pow2', 'log_amount']
feats = cat_feats + num_feats

In [5]:
target = data_train['label']
features = data_train[feats]

In [6]:
# как уже говорились при работе с данными, имеющими временной аспект крайне важно отсекать транзакции,
# идущие после оцениваемого события

# кроме того зачастую бывает полезно создавать фичи не за весь доступный горизонт данных,
# а за различные временные окна


def timeframe_trans(
    trans, 
    date_threshold=None,
    depth=None):
    """
    Функция отбирающая тразакции в заданном временном окне
    :param trans: весь массив транзакций
    :param date_threshold: дата и время отсечки, если не указан, то datetime.now()
    :param depth: глубина периода от даты отсечки, если не указан, то от текущего момента
    :return:

    """
    if date_threshold and depth:
        interested_trans = trans[trans.event_time.between(
            date_threshold - depth, date_threshold)]
    elif date_threshold and depth is None:
        interested_trans = trans[trans.event_time <= date_threshold]
    elif date_threshold is None and depth:
        interested_trans = trans[trans.event_time.between(
            datetime.datetime.now() - depth, datetime.datetime.now())]
    else:
        interested_trans = trans

    return interested_trans


def calc_quants_per_user(
    trans, 
    col_name='amount_original', 
    quants=np.arange(0.1, 1.1, .1), 
    feat_name='amount_q_', 
    depth=None, 
    start_date=datetime.date(2017, 10, 29),
    end_date=datetime.date(2017, 11, 29)):
    """
    Функция рассчитывающая квантили для пользователя с округлением до дня
    """

    
    time_delta = end_date - start_date
    # можно переопределить и выбрать другой временной интервал
    list_of_days = [
        start_date + datetime.timedelta(days=i) for i in range(time_delta.days + 1)]

    rez_df = pd.DataFrame()
    
    for day in list_of_days:
        if depth is None:
            cur_cutoff = trans[trans.event_time < day]
        else:
            cur_cutoff = trans[(trans.event_time < day) & (
                trans.event_time > (day - depth))]

        if cur_cutoff.shape[0] >= 1:
            res = cur_cutoff[col_name].quantile(quants)
        else:
            res = [0 for i in quants]
        for i in zip(quants, res):
            rez_df.loc[day, feat_name + str(i[0])] = i[1]

    return rez_df


def all_usr_quants(
    users_list,
    chunks_names,
    number_of_splits=900,
    col_name='amount_original',
    quants=np.arange(0.1, 1.1, .1),
    feat_name='amount_q_',
    depth=None,
    start_date=datetime.date(2017, 10, 29),
    end_date=datetime.date(2017, 11, 29)):
    """
    Расчет квантилей по всем пользователям со сплитованием на подмножества
    """
    rez_df = pd.DataFrame()
    usr_chunks = np.array_split(users_list, number_of_splits)

    for chunk in tqdm.tqdm_notebook(usr_chunks):
        trans_data = load_data(chunks_names, fields = ['user_id', 'event_time', col_name],
                               query = f"user_id in ({str(users_list)[1:-1]})")
#         trans_data = (df_from_sql("select user_id, event_time, {col} from rsa_event_log where user_id in ({users})"
#                                   .format(col = col_name, users=str(list(chunk))[1:-1]), parse_dates=['event_time']))
        chunk_df =  trans_data.groupby('user_id').apply(lambda x: 
                        # В даннос случае x - dataframe с транзакциями 1ого пользователя
                        calc_quants_per_user(x, col_name=col_name, feat_name=feat_name, quants=quants, depth = depth, start_date=start_date, end_date=end_date)
                    )
        rez_df = pd.concat([rez_df, chunk_df])
        
    return rez_df

In [7]:
import numpy as np
import pandas as pd
# import datetime


def transform_cols (df, dict_col_types = None):
    # Расширяйте для необходимых столбцов и их явной типизации
    if dict_col_types is None:
        dict_col_types = {
        'amount_original':(float, 0.0),
        'channel_indicator_desc':(str, u'null'),
        'event_description':(str, u'null'),
        'short_date':(int, 0),
        'cdf_s_20':(str, u'null'),
        'cdf_s_126':(str, u'null'),
        'cdf_s_127':(int, 30),
        'cdf_s_129':(int, 30),
        'cdf_s_138':(str, u'null'),
        'cdf_s_130':(int, 30),
        'cdf_s_133':(int, 30),
        'cdf_s_134':(int, 30),
        'cdf_s_135':(int, 30),
        'cdf_s_140':(float, 0.0),
        'cdf_s_218':(str, u'null'),
        'cdf_s_294':(int, 0),
        'cdf_s_299':(str, u'null'),
        'data_s_65':(int, 0),
        'data_i_120':(int, 0),
        'data_i_154':(float, -150)
        }
                
    if df.shape[0] == 0:
        return df
    
    df.replace(u'null', np.nan, inplace=True)

    for i in dict_col_types:
        if i in df.columns:
            change_type, fill_value = dict_col_types[i]
            df[i] = df[i].fillna(fill_value).astype(change_type)
    
    return df

def load_data(chunk_fnames, fields=None, query=None, sample='train', dict_col_types=None):
    df = pd.DataFrame({})
    if isinstance(chunk_fnames, str):
        chunk_fnames = [chunk_fnames]
        
    for filename in tqdm.tqdm_notebook(chunk_fnames):
        chunk_df = pd.read_feather(filename)
            
        if fields is None:
            fields = chunk_df.columns.tolist()
        
        transormed = transform_cols(chunk_df)
        
        if query:
            transormed = transormed.query(query)
 
        df = pd.concat([df, transormed[fields]], ignore_index=True)
    return df

In [8]:
def get_data(chunk_name):
    chunk_df = pd.read_feather(chunk_name)[['user_id', 'event_time',
                                           'amount_original',
                                           'channel_indicator_desc']]
    return chunk_df

In [9]:
import multiprocessing

In [175]:
%%time
# create features matrix
with multiprocessing.Pool(processes=min(1, FIRST_N)) as pool:
    results = pool.map(get_data, train_files)

CPU times: user 28 s, sys: 16 s, total: 44 s
Wall time: 7min 28s


In [177]:
quantile_df = pd.DataFrame()
for i, df in enumerate(results):
    quantile_df = pd.concat([quantile_df, df])

In [182]:
quantile_df = quantile_df[quantile_df['amount_original'].values != "null"]

In [185]:
quantile_df[:5]

Unnamed: 0,user_id,event_time,amount_original,channel_indicator_desc
0,100197,2017-08-25 22:47:42,10000.0,MOBILEAPI
2,100197,2017-05-07 09:46:44,1600.0,MOBILEAPI
4,100197,2017-09-13 14:30:56,18200.0,MOBILEAPI
5,100197,2017-10-07 16:14:31,568.0,MOBILEAPI
6,100197,2017-08-03 10:46:32,8900.0,MOBILEAPI


In [9]:
# quantile_df.to_csv("../data/coms_sep/for_quantiles.csv")
prequants = pd.read_csv("../data/coms_sep/for_quantiles.csv", index_col=0)

  interactivity=interactivity, compiler=compiler, result=result)
  mask |= (ar1 == a)


In [10]:
prequants[:5]

Unnamed: 0,user_id,event_time,amount_original,channel_indicator_desc
0,100197,2017-08-25 22:47:42,10000.0,MOBILEAPI
2,100197,2017-05-07 09:46:44,1600.0,MOBILEAPI
4,100197,2017-09-13 14:30:56,18200.0,MOBILEAPI
5,100197,2017-10-07 16:14:31,568.0,MOBILEAPI
6,100197,2017-08-03 10:46:32,8900.0,MOBILEAPI


In [11]:
prequants['event_time'] = pd.to_datetime(prequants['event_time'])
prequants['user_id'] = prequants['user_id'].astype(str)

In [12]:
prequants.set_index('user_id', inplace=True)
prequants.sort_index(inplace=True)

In [18]:
users = np.unique(prequants.index).tolist()
train_users = np.unique(data_train.user_id.astype(str)).tolist()
users = np.array(users)[np.in1d(users, train_users)].tolist()

In [85]:
%%time
prequants.loc[users[90000]][:3]

CPU times: user 1.13 ms, sys: 0 ns, total: 1.13 ms
Wall time: 1.06 ms


Unnamed: 0_level_0,event_time,amount_original,channel_indicator_desc
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
48190236,2017-08-31 08:55:13,500.0,MOBILEAPI
48190236,2017-11-12 16:41:15,3000.0,MOBILEAPI
48190236,2017-07-09 10:47:39,44.51,MOBILEAPI


In [88]:
%%time
calc_quants_per_user(prequants.loc[[users[90000]]])[:3]

CPU times: user 1.38 s, sys: 102 µs, total: 1.38 s
Wall time: 1.37 s


Unnamed: 0,amount_q_0.1,amount_q_0.2,amount_q_0.30000000000000004,amount_q_0.4,amount_q_0.5,amount_q_0.6,amount_q_0.7000000000000001,amount_q_0.8,amount_q_0.9,amount_q_1.0
2017-10-29,100.0,100.0,100.0,200.0,200.0,300.0,600.0,2000.0,5000.0,10000.0
2017-10-30,100.0,100.0,100.0,200.0,200.0,300.0,600.0,2000.0,5000.0,10000.0
2017-10-31,100.0,100.0,100.0,200.0,200.0,300.0,950.0,2000.0,5000.0,10000.0


Весьма интересно, если в loc передать лист, а не строку, то время увеличивается на порядок.

In [99]:
days=30
depth = datetime.timedelta(days=days)
col_name = 'amount_original'
feat_name = 'amount_q_' + str(days) + "_"
quants = np.arange(0.1, 1.1, 0.1)
start_date = datetime.date(2017, 10, 29)
end_date = datetime.date(2017, 11, 29)

In [100]:
def mini_quant(x):
    out = calc_quants_per_user(
       x,
       col_name=col_name,
       feat_name=feat_name,
       quants=quants,
       depth = depth,
       start_date=start_date,
       end_date=end_date)
    return out

In [122]:
def get_qunats(users_id):
    try:
        out = prequants.loc[users_id].groupby(level=0).apply(mini_quant)
    except:
        out = prequants[:0]
    return out

In [123]:
user_sample = np.array(users)[np.random.choice(range(len(users)), replace=False, size=100)].tolist()

In [124]:
%%time
# create features matrix
with multiprocessing.Pool(processes=25) as pool:
    results_30 = pool.map(get_qunats, users)

CPU times: user 3min 13s, sys: 13.2 s, total: 3min 27s
Wall time: 57min 48s


In [129]:
tmp = results_30[0]

In [None]:
pd.concat([quan_30_df, df])

In [132]:
%%time
quan_30_df = pd.DataFrame()
for i, df in enumerate(results_30):
    quan_30_df = pd.concat([quan_30_df, df.reset_index()])

KeyboardInterrupt: 

In [125]:
len(results_30)

185500

In [341]:
days=90
depth = datetime.timedelta(days=days)
col_name = 'amount_original'
feat_name = 'amount_q_' + str(days) + "_"
quants = np.arange(0.1, 1.1, 0.1)
start_date = datetime.date(2017, 10, 29)
end_date = datetime.date(2017, 11, 29)

In [339]:
%%time
get_qunats(user_splits[1])

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  


CPU times: user 11.4 s, sys: 64.8 ms, total: 11.5 s
Wall time: 11.4 s


Unnamed: 0_level_0,Unnamed: 1_level_0,amount_q_90_0.1,amount_q_90_0.2,amount_q_90_0.30000000000000004,amount_q_90_0.4,amount_q_90_0.5,amount_q_90_0.6,amount_q_90_0.7000000000000001,amount_q_90_0.8,amount_q_90_0.9,amount_q_90_1.0
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
10019205,2017-10-29,50.00,86.00,102.000,170.000,250.000,370.000,616.200,1000.000,1400.000,6000.0
10019205,2017-10-30,50.00,86.00,102.000,170.000,250.000,370.000,616.200,1000.000,1400.000,6000.0
10019205,2017-10-31,50.00,86.00,102.000,170.000,250.000,370.000,530.000,968.000,1400.000,6000.0
10019205,2017-11-01,50.00,86.00,102.000,170.000,250.000,370.000,530.000,968.000,1400.000,6000.0
10019205,2017-11-02,50.00,70.00,102.000,170.000,250.000,340.000,560.000,976.000,1427.964,6000.0
10019205,2017-11-03,50.00,70.00,102.000,170.000,250.000,327.000,500.000,960.000,1400.000,6000.0
10019205,2017-11-04,50.00,70.00,102.000,170.000,250.000,327.000,500.000,960.000,1400.000,6000.0
10019205,2017-11-05,50.00,72.00,102.000,158.000,248.500,316.200,500.000,948.000,1390.000,6000.0
10019205,2017-11-06,50.00,70.00,102.000,162.000,247.000,310.800,500.000,912.000,1360.000,6000.0
10019205,2017-11-07,50.00,70.00,102.000,150.000,225.000,316.200,500.000,900.000,1290.000,6000.0


In [342]:
%%time
# create features matrix
with multiprocessing.Pool(processes=10) as pool:
    results_90 = pool.map(get_qunats, user_splits)













Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  
Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  
Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  
Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-rein

KeyError: "None of [['26047589', '26047743', '26047808', '26048225', '26048328', '26048689', '26049060', '2604910', '26049574', '26049646', '26049990', '2605012', '26050123', '2605022', '2605032', '26050589', '260507', '26050846', '26051020', '26051178', '26051379', '26051660', '26051933', '26052190', '26053146', '26053272', '26053609', '26053824', '26054507', '2605459', '26055753', '26055763', '26056265', '260563', '2605702', '26057377', '26057700', '26057713', '2605843', '26059048', '26059893', '26060908', '26061089', '26061797', '26061985', '26062013', '26062169', '26062509', '26063153', '26064245']] are in the [index]"

In [None]:
np.in1