In [4]:
import numpy as np 
import pandas as pd 
import warnings
import time
import sys
import datetime
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
warnings.simplefilter(action='ignore')
import gc
import dateutil.relativedelta

# Notebook display settings: do not truncate rows or columns when a frame is
# rendered, and let DataFrame.info() list up to 200 columns.
# NOTE(review): with max_rows=None, displaying a large frame without .head()
# will render every row.
pd.set_option('display.width',None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_info_columns', 200)

In [5]:
# def reduce_mem_usage(df, verbose=True):
#     numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
#     start_mem = df.memory_usage().sum() / 1024**2    
#     for col in df.columns:
#         col_type = df[col].dtypes
#         if col_type in numerics:
#             c_min = df[col].min()
#             c_max = df[col].max()
#             if str(col_type)[:3] == 'int':
#                 if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
#                     df[col] = df[col].astype(np.int8)
#                 elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
#                     df[col] = df[col].astype(np.int16)
#                 elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
#                     df[col] = df[col].astype(np.int32)
#                 elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
#                     df[col] = df[col].astype(np.int64)  
#             else:
#                 if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
#                     df[col] = df[col].astype(np.float16)
#                 elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
#                     df[col] = df[col].astype(np.float32)
#                 else:
#                     df[col] = df[col].astype(np.float64)    
#     end_mem = df.memory_usage().sum() / 1024**2
#     if verbose: print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
#     return df

In [6]:
# Load both transaction tables; purchase_date is parsed to datetime at read time.
historical_transactions = pd.read_csv('../input/historical_transactions.csv', parse_dates=['purchase_date'])
new_transactions = pd.read_csv('../input/new_merchant_transactions.csv', parse_dates=['purchase_date'])

In [7]:
np.percentile(historical_transactions['purchase_amount'], [1, 5, 50, 95, 99])

array([-0.74324133, -0.73819242, -0.68834948, -0.16861249,  1.22084097])

In [8]:
np.percentile(new_transactions['purchase_amount'], [1, 5, 50, 95, 99])

array([-0.74239984, -0.73638924, -0.67484064, -0.08574128,  1.4628174 ])

In [9]:
# historical_transactions['purchase_amount'] = historical_transactions['purchase_amount'].apply(lambda x: min(x, 1.5))

In [10]:
# new_transactions['purchase_amount'] = new_transactions['purchase_amount'].apply(lambda x: min(x, 1.5))

In [11]:
# Encode missing values with a -1 sentinel and turn category_3 letters into codes.
#
# The mapping also contains the already-encoded values (0, 1, 2, -1) so that
# re-running this cell is harmless. With a letters-only mapping, a second run
# would map every numeric value to NaN and then fill the whole column with -1.
category_3_codes = {'A': 0, 'B': 1, 'C': 2, 0: 0, 1: 1, 2: 2, -1: -1}

for df in (historical_transactions, new_transactions):
    df['category_2'] = df['category_2'].fillna(-1)
    df['category_3'] = df['category_3'].map(category_3_codes).fillna(-1)
    # merchant_id is a string column, so its sentinel is the string '-1'.
    df['merchant_id'] = df['merchant_id'].fillna('-1')

In [12]:
historical_transactions['category_3'].unique()

array([ 0.,  1.,  2., -1.])

In [13]:
historical_transactions['purchase_date'].max()

Timestamp('2018-02-28 23:59:51')

In [14]:
historical_transactions['purchase_date'].min()

Timestamp('2017-01-01 00:00:08')

In [15]:
new_transactions['purchase_date'].max()

Timestamp('2018-04-30 23:59:59')

In [16]:
new_transactions['purchase_date'].min()

Timestamp('2017-03-01 03:24:51')

In [19]:
%%time
def read_data(input_file):
    """Read a CSV and return it with ``first_active_month`` parsed as datetimes.

    Parameters
    ----------
    input_file : path or file-like object accepted by ``pd.read_csv``.

    Returns
    -------
    DataFrame with the same columns as the file; ``first_active_month`` is
    converted with ``pd.to_datetime`` (missing values become ``NaT``).
    """
    raw = pd.read_csv(input_file)
    parsed_month = pd.to_datetime(raw['first_active_month'])
    return raw.assign(first_active_month=parsed_month)

# Card-level tables; first_active_month is parsed inside read_data.
train = read_data('../input/train.csv')
test = read_data('../input/test.csv')

# Keep a handle on the target column of the training table.
target = train['target']

gc.collect()

CPU times: user 453 ms, sys: 68 µs, total: 453 ms
Wall time: 452 ms


In [20]:
test[test['card_id']=='C_ID_c27b4f80f7']

Unnamed: 0,first_active_month,card_id,feature_1,feature_2,feature_3
11578,NaT,C_ID_c27b4f80f7,5,2,1


In [21]:
# The test row for card C_ID_c27b4f80f7 has no first_active_month (NaT in the
# cell output above); impute 2017-03-01.
# NOTE(review): the date is hard-coded; it was presumably derived from the
# card's earliest historical purchase, as in the retired line below — confirm.
# tmp = historical_transactions[historical_transactions['card_id']=='C_ID_c27b4f80f7']['purchase_date'].min()
#
# Use one .loc assignment. The chained form test[col][mask] = value may write
# into a temporary copy and leave `test` unchanged.
test.loc[test['card_id'] == 'C_ID_c27b4f80f7', 'first_active_month'] = pd.Timestamp(2017, 3, 1)

In [22]:
# historical_transactions = historical_transactions.head(200)
# new_transactions = new_transactions.head(200)
# train = train.head(200)
# test = test.head(200)

In [23]:
def sub_month_lag(x):
    """Shift a row's ``purchase_date_first`` back by ``month_lag_first`` months.

    ``x`` is a row (mapping-like) holding both fields. Because the lag is
    subtracted, a negative lag moves the date forward in time.
    """
    lag_in_months = x['month_lag_first']
    month_shift = dateutil.relativedelta.relativedelta(months=lag_in_months)
    return x['purchase_date_first'] - month_shift

def a2p(a, p):
    return (a.dt.date - p.dt.date).dt.days

# def a2r(a, r):
#     return (a.dt.year-r.dt.year)*12 + (a.dt.month - r.dt.month)

In [24]:
def binarize(df):
    """Convert the 'Y'/'N' flag columns of ``df`` to 1/0, in place.

    Columns handled: ``authorized_flag`` and ``category_1``. The frame is
    modified in place and also returned, so ``df = binarize(df)`` keeps working.

    The mapping includes 1 and 0 themselves, so calling the function again on
    an already-converted frame (for example when a notebook cell is re-run)
    leaves the columns unchanged instead of turning every value into NaN.
    Values outside the mapping still become NaN.
    """
    yes_no_codes = {'Y': 1, 'N': 0, 1: 1, 0: 0}
    for col in ('authorized_flag', 'category_1'):
        df[col] = df[col].map(yes_no_codes)
    return df

# Convert authorized_flag / category_1 from 'Y'/'N' to 1/0 in both tables.
historical_transactions = binarize(historical_transactions)
new_transactions = binarize(new_transactions)

In [25]:
# Share of authorized transactions per card, attached to train and test as
# the column `authorized_flag_mean`.
agg_fun = {'authorized_flag': ['mean']}

auth_mean = historical_transactions.groupby(['card_id']).agg(agg_fun)
# Flatten the (column, statistic) MultiIndex into "column_statistic" names.
auth_mean.columns = ['_'.join(name_parts).strip() for name_parts in auth_mean.columns.values]
auth_mean = auth_mean.reset_index()

train = train.merge(auth_mean, on='card_id', how='left')
test = test.merge(auth_mean, on='card_id', how='left')

In [26]:
# historical_transactions.count()

In [27]:
# historical_transactions = historical_transactions[historical_transactions['authorized_flag'] == 1]

In [28]:
# historical_transactions.count()

In [29]:
# Card id -> first_active_month lookup columns from each card-level table.
a_train = train[['card_id', 'first_active_month']]
a_test = test[['card_id', 'first_active_month']]

In [30]:
# Stack the train and test lookups into one table and drop exact duplicate
# (card_id, first_active_month) pairs.
a_tmp = pd.concat([a_train,a_test]).drop_duplicates().reset_index(drop=True)

In [31]:
# Attach first_active_month to every transaction row (left join keeps all rows).
# NOTE(review): re-running this cell merges again and produces suffixed
# duplicate columns; run it once per kernel session.
historical_transactions = pd.merge(historical_transactions, a_tmp, on='card_id', how='left')
new_transactions = pd.merge(new_transactions, a_tmp, on='card_id', how='left')

In [32]:
historical_transactions.head()

Unnamed: 0,authorized_flag,card_id,city_id,category_1,installments,category_3,merchant_category_id,merchant_id,month_lag,purchase_amount,purchase_date,category_2,state_id,subsector_id,first_active_month
0,1,C_ID_4e6213e9bc,88,0,0,0.0,80,M_ID_e020e9b302,-8,-0.703331,2017-06-25 15:33:07,1.0,16,37,2016-06-01
1,1,C_ID_4e6213e9bc,88,0,0,0.0,367,M_ID_86ec983688,-7,-0.733128,2017-07-15 12:10:45,1.0,16,16,2016-06-01
2,1,C_ID_4e6213e9bc,88,0,0,0.0,80,M_ID_979ed661fc,-6,-0.720386,2017-08-09 22:04:29,1.0,16,37,2016-06-01
3,1,C_ID_4e6213e9bc,88,0,0,0.0,560,M_ID_e6d5ae8ea6,-5,-0.735352,2017-09-02 10:06:26,1.0,16,34,2016-06-01
4,1,C_ID_4e6213e9bc,88,0,0,0.0,80,M_ID_e020e9b302,-11,-0.722865,2017-03-10 01:14:19,1.0,16,37,2016-06-01


In [33]:
# Make sure purchase_date is datetime64.
# NOTE(review): both tables were read with parse_dates=['purchase_date'], so
# this conversion looks redundant — confirm before removing.
historical_transactions['purchase_date'] = pd.to_datetime(historical_transactions['purchase_date'])
new_transactions['purchase_date'] = pd.to_datetime(new_transactions['purchase_date'])

In [34]:
def get_r(history):
    """Derive a per-card ``reference_date`` and ``now_date`` from transactions.

    For each ``card_id`` the first row's ``purchase_date`` and ``month_lag``
    are taken ('first' follows the row order of ``history``; it is not the
    earliest date). ``sub_month_lag`` shifts that date by the lag, then:

    * ``reference_date`` is moved to the last day of its month,
    * ``now_date`` is the last day of the month four months after
      ``reference_date``.

    The time-of-day of the original purchase is carried through both columns.

    Returns a DataFrame with columns ``card_id``, ``reference_date`` and
    ``now_date``.
    """
    first_rows = history.groupby(['card_id']).agg({
        'month_lag': ['first'],
        'purchase_date': ['first'],
    })
    first_rows.columns = ['_'.join(name_parts).strip() for name_parts in first_rows.columns.values]
    first_rows = first_rows.reset_index()

    # relativedelta applies the month shift first, then day=1, then days=-1,
    # which lands on the last day of the month preceding the shifted month.
    to_month_end = dateutil.relativedelta.relativedelta(day=1, months=+1, days=-1)
    to_month_end_plus_four = dateutil.relativedelta.relativedelta(day=1, months=+5, days=-1)

    lag_adjusted = first_rows.apply(sub_month_lag, axis=1)
    first_rows['reference_date'] = lag_adjusted.apply(lambda ts: ts + to_month_end)
    first_rows['now_date'] = first_rows['reference_date'].apply(lambda ts: ts + to_month_end_plus_four)

    return first_rows.drop(columns=['month_lag_first', 'purchase_date_first'])

In [35]:
# Per-card reference_date / now_date, derived from the historical transactions only.
hist_r = get_r(historical_transactions)

In [36]:
hist_r.head()

Unnamed: 0,card_id,reference_date,now_date
0,C_ID_00007093c1,2018-02-28 19:09:42,2018-06-30 19:09:42
1,C_ID_0001238066,2018-02-28 12:14:16,2018-06-30 12:14:16
2,C_ID_0001506ef0,2018-02-28 08:25:45,2018-06-30 08:25:45
3,C_ID_0001793786,2017-10-31 15:13:07,2018-02-28 15:13:07
4,C_ID_000183fdda,2018-02-28 22:13:09,2018-06-30 22:13:09


In [37]:
# Attach each card's reference_date / now_date to every transaction row.
# The same hist_r table is used for both frames; with a left join, cards
# missing from hist_r get NaT in these columns.
historical_transactions = pd.merge(historical_transactions, hist_r, on='card_id', how='left')
new_transactions = pd.merge(new_transactions, hist_r, on='card_id', how='left')

In [38]:
historical_transactions.head()

Unnamed: 0,authorized_flag,card_id,city_id,category_1,installments,category_3,merchant_category_id,merchant_id,month_lag,purchase_amount,purchase_date,category_2,state_id,subsector_id,first_active_month,reference_date,now_date
0,1,C_ID_4e6213e9bc,88,0,0,0.0,80,M_ID_e020e9b302,-8,-0.703331,2017-06-25 15:33:07,1.0,16,37,2016-06-01,2018-02-28 15:33:07,2018-06-30 15:33:07
1,1,C_ID_4e6213e9bc,88,0,0,0.0,367,M_ID_86ec983688,-7,-0.733128,2017-07-15 12:10:45,1.0,16,16,2016-06-01,2018-02-28 15:33:07,2018-06-30 15:33:07
2,1,C_ID_4e6213e9bc,88,0,0,0.0,80,M_ID_979ed661fc,-6,-0.720386,2017-08-09 22:04:29,1.0,16,37,2016-06-01,2018-02-28 15:33:07,2018-06-30 15:33:07
3,1,C_ID_4e6213e9bc,88,0,0,0.0,560,M_ID_e6d5ae8ea6,-5,-0.735352,2017-09-02 10:06:26,1.0,16,34,2016-06-01,2018-02-28 15:33:07,2018-06-30 15:33:07
4,1,C_ID_4e6213e9bc,88,0,0,0.0,80,M_ID_e020e9b302,-11,-0.722865,2017-03-10 01:14:19,1.0,16,37,2016-06-01,2018-02-28 15:33:07,2018-06-30 15:33:07


In [39]:
# Row-level features, added in place to both transaction tables.
for df in [historical_transactions, new_transactions]:
    # Signed day differences between calendar dates (first argument minus second).
    df['a2p'] = a2p(df['first_active_month'], df['purchase_date'])
    df['p2r'] = a2p(df['purchase_date'], df['reference_date'])
    df['p2now'] = a2p(df['purchase_date'], df['now_date'])

    # Purchase amount scaled down by distance (in months) from the reference month.
    df['p_vs_m'] = df['purchase_amount'] / (df['month_lag'].abs() + 1)

    # -1 and 999 are treated as "unknown" installment counts.
    # Assign the result back instead of calling replace(..., inplace=True) on
    # df["installments"]: the in-place call acts on a column selection and is
    # not guaranteed to update df. np.nan replaces the np.NaN alias, which was
    # removed in NumPy 2.0.
    df['installments'] = df['installments'].replace([-1, 999], np.nan)
    df['p_vs_i'] = df['purchase_amount'] / (df['installments'].abs() + 1)

    # Calendar parts of the purchase timestamp.
    purchase_dt = df['purchase_date'].dt
    df['quarter'] = purchase_dt.quarter
    df['month'] = purchase_dt.month
    # .dt.weekofyear was removed in pandas 2.0; use the ISO calendar week where
    # available and fall back to the old accessor on older pandas.
    # NOTE(review): the int64 cast assumes purchase_date has no NaT — confirm.
    if hasattr(purchase_dt, 'isocalendar'):
        df['weekofyear'] = purchase_dt.isocalendar().week.astype('int64')
    else:
        df['weekofyear'] = purchase_dt.weekofyear
    df['dayofweek'] = purchase_dt.dayofweek
    df['day'] = purchase_dt.day
    df['hour'] = purchase_dt.hour

    df['is_month_start'] = purchase_dt.is_month_start.astype(int)
    # Saturday (5) and Sunday (6).
    df['weekend'] = (purchase_dt.weekday >= 5).astype(int)

In [40]:
historical_transactions['a2p'].plot.line()

<matplotlib.axes._subplots.AxesSubplot at 0x7f32064cd630>

In [41]:
new_transactions['a2p'].plot.line()

<matplotlib.axes._subplots.AxesSubplot at 0x7f32064cd630>

In [42]:
historical_transactions.head()

Unnamed: 0,authorized_flag,card_id,city_id,category_1,installments,category_3,merchant_category_id,merchant_id,month_lag,purchase_amount,purchase_date,category_2,state_id,subsector_id,first_active_month,reference_date,now_date,a2p,p2r,p2now,p_vs_m,p_vs_i,quarter,month,weekofyear,dayofweek,day,hour,is_month_start,weekend
0,1,C_ID_4e6213e9bc,88,0,0.0,0.0,80,M_ID_e020e9b302,-8,-0.703331,2017-06-25 15:33:07,1.0,16,37,2016-06-01,2018-02-28 15:33:07,2018-06-30 15:33:07,-389,-248,-370,-0.078148,-0.703331,2,6,25,6,25,15,0,1
1,1,C_ID_4e6213e9bc,88,0,0.0,0.0,367,M_ID_86ec983688,-7,-0.733128,2017-07-15 12:10:45,1.0,16,16,2016-06-01,2018-02-28 15:33:07,2018-06-30 15:33:07,-409,-228,-350,-0.091641,-0.733128,3,7,28,5,15,12,0,1
2,1,C_ID_4e6213e9bc,88,0,0.0,0.0,80,M_ID_979ed661fc,-6,-0.720386,2017-08-09 22:04:29,1.0,16,37,2016-06-01,2018-02-28 15:33:07,2018-06-30 15:33:07,-434,-203,-325,-0.102912,-0.720386,3,8,32,2,9,22,0,0
3,1,C_ID_4e6213e9bc,88,0,0.0,0.0,560,M_ID_e6d5ae8ea6,-5,-0.735352,2017-09-02 10:06:26,1.0,16,34,2016-06-01,2018-02-28 15:33:07,2018-06-30 15:33:07,-458,-179,-301,-0.122559,-0.735352,3,9,35,5,2,10,0,1
4,1,C_ID_4e6213e9bc,88,0,0.0,0.0,80,M_ID_e020e9b302,-11,-0.722865,2017-03-10 01:14:19,1.0,16,37,2016-06-01,2018-02-28 15:33:07,2018-06-30 15:33:07,-282,-355,-477,-0.060239,-0.722865,1,3,10,4,10,1,0,0


In [43]:
def aggregate_transactions_hist(history_):
    """Collapse historical transactions into one feature row per ``card_id``.

    Side effect: ``history_`` is not copied. A ``<col>_p_mean`` column (mean
    ``purchase_amount`` of all rows sharing that column's value) is added to
    the caller's frame for each mean-encoded column.

    Returns a DataFrame with ``card_id``, ``transactions_count``, the
    flattened ``<column>_<statistic>`` aggregates, year/quarter/month parts of
    the first-active, reference and "now" dates, and the day gaps ``a2r``,
    ``r2now`` and ``a2now`` between those dates.
    """
    spread = ['mean', 'median', 'max', 'min', 'std']
    encoded_cols = [
        'category_2', 'category_3',
        'state_id', 'city_id', 'subsector_id', 'merchant_category_id', 'merchant_id',
        'quarter', 'month', 'weekofyear', 'dayofweek', 'day', 'hour',
    ]

    # Insertion order fixes the output column order, so build the spec in sequence.
    agg_func = {}
    for flag_col in ('is_month_start', 'weekend', 'category_1'):
        agg_func[flag_col] = ['mean']
    for id_col in encoded_cols:
        agg_func[id_col] = ['nunique']
    for gap_col in ('a2p', 'p2r', 'p2now', 'month_lag'):
        agg_func[gap_col] = list(spread)
    for amount_col in ('purchase_amount', 'installments'):
        agg_func[amount_col] = ['sum'] + spread
    for ratio_col in ('p_vs_m', 'p_vs_i'):
        agg_func[ratio_col] = list(spread)
    agg_func['purchase_date'] = ['max', 'min']
    for date_col in ('first_active_month', 'reference_date', 'now_date'):
        agg_func[date_col] = ['first']

    history = history_  # alias: columns added below appear on the caller's frame

    # Mean-encode each categorical/calendar column with the average purchase
    # amount of its value, then summarise that encoding per card.
    # (Original author's note: mean encoding improved 0.697 to 0.694.)
    for col in encoded_cols:
        encoded_name = col + '_p_mean'
        history[encoded_name] = history.groupby([col])['purchase_amount'].transform('mean')
        agg_func[encoded_name] = list(spread)

    agg_history = history.groupby(['card_id']).agg(agg_func)
    agg_history.columns = ['_'.join(name_parts).strip() for name_parts in agg_history.columns.values]
    agg_history = agg_history.reset_index()

    # Year / quarter / month of the three per-card anchor dates.
    anchors = (
        ('first', 'first_active_month_first'),
        ('re', 'reference_date_first'),
        ('now', 'now_date_first'),
    )
    for prefix, source_col in anchors:
        for part in ('year', 'quarter', 'month'):
            agg_history[prefix + '_' + part] = getattr(agg_history[source_col].dt, part)

    # Day gaps between the anchor dates (first argument minus second).
    agg_history['a2r'] = a2p(agg_history['first_active_month_first'], agg_history['reference_date_first'])
    agg_history['r2now'] = a2p(agg_history['reference_date_first'], agg_history['now_date_first'])
    agg_history['a2now'] = a2p(agg_history['first_active_month_first'], agg_history['now_date_first'])

    agg_history = agg_history.drop(
        columns=['first_active_month_first', 'reference_date_first', 'now_date_first']
    )

    counts = history.groupby('card_id').size().reset_index(name='transactions_count')

    return pd.merge(counts, agg_history, on='card_id', how='left')

gc.collect()

33

In [44]:
%%time
history = aggregate_transactions_hist(historical_transactions)

# Prefix every feature with "hist_" so it cannot collide with the features
# built from the new-merchant transactions; the join key stays as is.
history = history.rename(columns=lambda name: name if name == 'card_id' else 'hist_' + name)

# Days between a card's first and last historical purchase.
history['hist_p2p'] = (history['hist_purchase_date_max'] - history['hist_purchase_date_min']).dt.days
# NOTE(review): hist_day_nunique counts distinct day-of-month values (at most
# 31), not distinct calendar dates — confirm this is the intended "sleep".
history['hist_sleep'] = history['hist_p2p'] - history['hist_day_nunique']

# Ratios per transaction.
for numerator in ('hist_p2p', 'hist_sleep'):
    history[numerator + '_vs_count'] = history[numerator] / history['hist_transactions_count']

# Ratios per day of activity; the +1 keeps the denominator non-zero.
hist_span_plus_one = history['hist_p2p'].abs() + 1
for feature, numerator in (
    ('hist_count_vs_p2p', 'hist_transactions_count'),
    ('hist_sleep_vs_p2p', 'hist_sleep'),
    ('hist_p_vs_p2p', 'hist_purchase_amount_sum'),
    ('hist_i_vs_p2p', 'hist_installments_sum'),
):
    history[feature] = history[numerator] / hist_span_plus_one

gc.collect()

CPU times: user 5min 36s, sys: 28.5 s, total: 6min 5s
Wall time: 6min 4s


In [45]:
history.head()

Unnamed: 0,card_id,hist_transactions_count,hist_is_month_start_mean,hist_weekend_mean,hist_category_1_mean,hist_category_2_nunique,hist_category_3_nunique,hist_state_id_nunique,hist_city_id_nunique,hist_subsector_id_nunique,hist_merchant_category_id_nunique,hist_merchant_id_nunique,hist_quarter_nunique,hist_month_nunique,hist_weekofyear_nunique,hist_dayofweek_nunique,hist_day_nunique,hist_hour_nunique,hist_a2p_mean,hist_a2p_median,hist_a2p_max,hist_a2p_min,hist_a2p_std,hist_p2r_mean,hist_p2r_median,hist_p2r_max,hist_p2r_min,hist_p2r_std,hist_p2now_mean,hist_p2now_median,hist_p2now_max,hist_p2now_min,hist_p2now_std,hist_month_lag_mean,hist_month_lag_median,hist_month_lag_max,hist_month_lag_min,hist_month_lag_std,hist_purchase_amount_sum,hist_purchase_amount_mean,hist_purchase_amount_median,hist_purchase_amount_max,hist_purchase_amount_min,hist_purchase_amount_std,hist_installments_sum,hist_installments_mean,hist_installments_median,hist_installments_max,hist_installments_min,hist_installments_std,hist_p_vs_m_mean,hist_p_vs_m_median,hist_p_vs_m_max,hist_p_vs_m_min,hist_p_vs_m_std,hist_p_vs_i_mean,hist_p_vs_i_median,hist_p_vs_i_max,hist_p_vs_i_min,hist_p_vs_i_std,hist_purchase_date_max,hist_purchase_date_min,hist_category_2_p_mean_mean,hist_category_2_p_mean_median,hist_category_2_p_mean_max,hist_category_2_p_mean_min,hist_category_2_p_mean_std,hist_category_3_p_mean_mean,hist_category_3_p_mean_median,hist_category_3_p_mean_max,hist_category_3_p_mean_min,hist_category_3_p_mean_std,hist_state_id_p_mean_mean,hist_state_id_p_mean_median,hist_state_id_p_mean_max,hist_state_id_p_mean_min,hist_state_id_p_mean_std,hist_city_id_p_mean_mean,hist_city_id_p_mean_median,hist_city_id_p_mean_max,hist_city_id_p_mean_min,hist_city_id_p_mean_std,hist_subsector_id_p_mean_mean,hist_subsector_id_p_mean_median,hist_subsector_id_p_mean_max,hist_subsector_id_p_mean_min,hist_subsector_id_p_mean_std,hist_merchant_category_id_p_mean_mean,hist_merchant_category_id_p_mean_median,hist_merchant_cat
egory_id_p_mean_max,hist_merchant_category_id_p_mean_min,hist_merchant_category_id_p_mean_std,hist_merchant_id_p_mean_mean,hist_merchant_id_p_mean_median,hist_merchant_id_p_mean_max,hist_merchant_id_p_mean_min,hist_merchant_id_p_mean_std,hist_quarter_p_mean_mean,hist_quarter_p_mean_median,hist_quarter_p_mean_max,hist_quarter_p_mean_min,hist_quarter_p_mean_std,hist_month_p_mean_mean,hist_month_p_mean_median,hist_month_p_mean_max,hist_month_p_mean_min,hist_month_p_mean_std,hist_weekofyear_p_mean_mean,hist_weekofyear_p_mean_median,hist_weekofyear_p_mean_max,hist_weekofyear_p_mean_min,hist_weekofyear_p_mean_std,hist_dayofweek_p_mean_mean,hist_dayofweek_p_mean_median,hist_dayofweek_p_mean_max,hist_dayofweek_p_mean_min,hist_dayofweek_p_mean_std,hist_day_p_mean_mean,hist_day_p_mean_median,hist_day_p_mean_max,hist_day_p_mean_min,hist_day_p_mean_std,hist_hour_p_mean_mean,hist_hour_p_mean_median,hist_hour_p_mean_max,hist_hour_p_mean_min,hist_hour_p_mean_std,hist_first_year,hist_first_quarter,hist_first_month,hist_re_year,hist_re_quarter,hist_re_month,hist_now_year,hist_now_quarter,hist_now_month,hist_a2r,hist_r2now,hist_a2now,hist_p2p,hist_sleep,hist_p2p_vs_count,hist_sleep_vs_count,hist_count_vs_p2p,hist_sleep_vs_p2p,hist_p_vs_p2p,hist_i_vs_p2p
0,C_ID_00007093c1,149,0.067114,0.167785,0.187919,3,2,3,4,13,18,29,4,12,39,7,28,18,-197.838926,-185.0,-13,-391,104.216256,-194.161074,-207.0,-1,-379,104.216256,-316.161074,-329.0,-123,-501,104.216256,-5.852349,-6.0,0,-12,3.453114,-76.845041,-0.515739,-0.587627,1.507069,-0.728876,0.298141,192.0,1.288591,1.0,6.0,1.0,0.7649,-0.128244,-0.085235,0.215296,-0.721363,0.149488,-0.251965,-0.290808,0.376767,-0.364438,0.128764,2018-02-27 05:14:57,2017-02-14 14:00:43,0.077946,0.180375,0.180375,-0.354523,0.210037,-0.322315,-0.404556,0.106023,-0.404556,0.188321,0.166921,0.288618,0.288618,-0.354523,0.251892,0.289764,0.512332,0.512332,-0.359418,0.344451,-0.162991,-0.245685,2.180532,-0.678829,0.618114,-0.124108,-0.278704,7.143294,-0.698283,1.138669,-0.325462,-0.569692,10.689679,-0.699339,1.639868,0.191358,-0.118619,1.13676,-0.209863,0.585363,0.176675,-0.132023,3.939885,-0.273128,1.119046,1.242541,-0.141471,17.115643,-0.468301,4.717097,-0.144504,-0.207237,1.755822,-0.31751,0.332254,-0.074209,-0.140629,5.85962,-0.377722,0.706294,-0.180561,-0.245731,0.455703,-0.49118,0.205602,2017,1,2,2018,1,2,2018,2,6,-392,-122,-514,377,349,2.530201,2.342282,0.39418,0.92328,-0.203294,0.507937
1,C_ID_0001238066,123,0.00813,0.422764,0.01626,3,3,6,18,17,29,65,3,6,23,7,30,20,-112.243902,-113.0,-27,-179,38.533005,-67.756098,-67.0,-1,-153,38.533005,-189.756098,-189.0,-123,-275,38.533005,-1.813008,-2.0,0,-5,1.28898,-72.447201,-0.589002,-0.648184,0.768095,-0.734887,0.190235,201.0,1.675,1.0,10.0,1.0,1.444564,-0.276554,-0.216983,0.384048,-0.728681,0.198098,-0.27045,-0.321857,0.109728,-0.367443,0.104074,2018-02-27 16:18:59,2017-09-28 22:25:14,0.075478,0.14957,0.14957,-0.354523,0.147333,-0.26043,-0.404556,0.106023,-0.404556,0.22958,0.175225,0.317017,0.317017,-0.354523,0.255755,0.003668,-0.162348,3.606916,-0.600283,0.625634,-0.082208,-0.245685,2.180532,-0.489419,0.63944,-0.117235,-0.278704,7.143294,-0.49024,0.842623,-0.548661,-0.631314,1.963618,-0.729574,0.309966,-0.188681,-0.175738,-0.118619,-0.209863,0.019022,-0.185765,-0.155056,-0.12716,-0.273128,0.061335,-0.19554,-0.186812,0.102304,-0.41125,0.149304,0.124096,-0.078892,1.755822,-0.31751,0.683614,0.35416,-0.136731,5.85962,-0.377722,1.650578,-0.199263,-0.274255,0.455703,-0.49118,0.215156,2017,3,9,2018,1,2,2018,2,6,-180,-122,-302,151,121,1.227642,0.98374,0.809211,0.796053,-0.476626,1.322368
2,C_ID_0001506ef0,66,0.015152,0.484848,0.0,2,2,2,3,12,19,28,4,11,24,7,25,15,-442.909091,-498.0,-197,-596,128.642107,-164.090909,-109.0,-11,-410,128.642107,-286.090909,-231.0,-133,-532,128.642107,-4.833333,-3.0,0,-13,4.2375,-34.601879,-0.524271,-0.703707,1.493545,-0.740491,0.472284,1.0,0.015152,0.0,1.0,0.0,0.123091,-0.145832,-0.148056,1.493545,-0.716855,0.290651,-0.518903,-0.701077,1.493545,-0.740491,0.472171,2018-02-17 12:33:56,2017-01-14 16:16:01,0.179442,0.180375,0.180375,0.14957,0.005321,0.350313,0.361926,0.361926,-0.404556,0.094347,0.443202,0.447146,0.447146,0.317017,0.022477,-0.131357,-0.166294,0.87204,-0.166294,0.180948,-0.16275,-0.263905,2.180532,-0.678829,0.52026,-0.050185,-0.321717,3.683839,-0.690491,0.918747,-0.549813,-0.606789,-0.122304,-0.714471,0.154936,-0.105477,-0.175738,1.13676,-0.209863,0.31906,-0.178745,-0.155056,-0.035313,-0.273128,0.061421,-0.184669,-0.23409,0.340314,-0.468301,0.157361,-0.024851,-0.078892,1.755822,-0.31751,0.463745,0.095709,-0.136731,5.85962,-0.377722,1.274332,-0.09729,-0.079571,0.455703,-0.416907,0.166919,2016,3,7,2018,1,2,2018,2,6,-607,-122,-729,398,373,6.030303,5.651515,0.165414,0.934837,-0.086722,0.002506
3,C_ID_0001793786,216,0.027778,0.171296,0.009259,4,2,4,10,24,48,119,4,10,33,7,31,21,-186.634259,-203.0,-20,-303,70.951936,-116.365741,-100.0,0,-283,70.951936,-236.365741,-220.0,-120,-403,70.951936,-3.328704,-3.0,0,-9,2.306373,-36.786013,-0.170306,-0.487911,4.554145,-0.745405,0.836046,5.0,0.023148,0.0,1.0,0.0,0.150723,-0.067379,-0.099693,2.215434,-0.724518,0.316355,-0.162449,-0.464485,4.554145,-0.737892,0.832714,2017-10-31 20:20:18,2017-01-21 10:15:21,-0.225265,-0.354523,0.180375,-0.354523,0.169652,0.344183,0.361926,0.361926,-0.404556,0.115527,-0.262957,-0.354523,0.347767,-0.354523,0.22197,-0.192103,-0.203658,0.87204,-0.572273,0.330844,0.019453,-0.231083,2.180532,-0.678829,0.53832,-0.074063,-0.278704,7.449148,-0.690491,1.000189,-0.520178,-0.604636,1.788496,-0.73767,0.286119,0.278602,-0.118619,1.13676,-0.209863,0.602622,0.42117,-0.132023,3.939885,-0.246819,1.417095,0.653003,-0.225431,17.115643,-0.468301,3.641753,-0.136587,-0.156621,1.755822,-0.31751,0.275264,0.037457,-0.146322,5.85962,-0.377722,1.074445,-0.101952,-0.245731,26.71146,-0.49118,1.842569,2017,1,1,2017,4,10,2018,1,2,-303,-120,-423,283,252,1.310185,1.166667,0.760563,0.887324,-0.129528,0.017606
4,C_ID_000183fdda,144,0.090278,0.229167,0.027778,5,3,7,9,21,36,73,3,7,27,7,30,19,-90.666667,-98.5,25,-177,55.937403,-89.333333,-81.5,-3,-205,55.937403,-211.333333,-203.5,-125,-327,55.937403,-2.451389,-2.0,0,-6,1.895264,-68.837938,-0.478041,-0.661294,2.764788,-0.737892,0.524948,268.0,1.914286,1.0,10.0,1.0,2.093105,-0.208121,-0.171625,1.382394,-0.731881,0.317779,-0.249072,-0.333213,0.468031,-0.368946,0.164483,2018-02-25 20:57:08,2017-08-07 09:49:14,0.159638,0.180375,0.180375,-0.354523,0.094982,-0.260505,-0.404556,0.106023,-0.404556,0.229242,-0.349108,-0.398643,0.347767,-0.398643,0.172264,-0.416594,-0.51265,0.87204,-0.51265,0.316388,-0.033192,-0.231083,2.180532,-0.489419,0.437752,-0.072763,-0.278704,3.683839,-0.690491,0.629339,-0.504566,-0.605047,1.393186,-0.721197,0.317104,-0.177902,-0.175738,-0.118619,-0.209863,0.033017,-0.166872,-0.135695,-0.121147,-0.273128,0.052208,-0.158039,-0.152121,0.196065,-0.384181,0.139945,-0.007413,-0.156621,1.755822,-0.31751,0.540797,0.000417,-0.152958,5.85962,-0.377722,1.001908,-0.243994,-0.32213,0.455703,-0.49118,0.218348,2017,3,9,2018,1,2,2018,2,6,-180,-122,-302,202,172,1.402778,1.194444,0.70936,0.847291,-0.339103,1.320197


In [46]:
# Attach the hist_* features to the card-level tables, then free the
# intermediate frame.
train = pd.merge(train, history, on='card_id', how='left')
test = pd.merge(test, history, on='card_id', how='left')
del history; gc.collect()

14

In [47]:
def aggregate_transactions_new(history_):
    """Collapse new-merchant transactions into one feature row per ``card_id``.

    Side effect: ``history_`` is not copied. A ``<col>_p_mean`` column (mean
    ``purchase_amount`` of all rows sharing that column's value) is added to
    the caller's frame for each mean-encoded column.

    Returns a DataFrame with ``card_id``, ``transactions_count`` and the
    flattened ``<column>_<statistic>`` aggregates.
    """
    spread = ['mean', 'median', 'max', 'min', 'std']
    encoded_cols = [
        'category_2', 'category_3',
        'state_id', 'city_id', 'subsector_id', 'merchant_category_id', 'merchant_id',
        'quarter', 'month', 'weekofyear', 'dayofweek', 'day', 'hour',
    ]

    # Insertion order fixes the output column order, so build the spec in sequence.
    agg_func = {}
    for flag_col in ('is_month_start', 'weekend', 'category_1'):
        agg_func[flag_col] = ['mean']
    for id_col in encoded_cols:
        agg_func[id_col] = ['nunique']
    for gap_col in ('a2p', 'p2r', 'p2now', 'month_lag'):
        agg_func[gap_col] = list(spread)
    for amount_col in ('purchase_amount', 'installments'):
        agg_func[amount_col] = ['sum'] + spread
    for ratio_col in ('p_vs_m', 'p_vs_i'):
        agg_func[ratio_col] = list(spread)
    agg_func['purchase_date'] = ['max', 'min']

    history = history_  # alias: columns added below appear on the caller's frame

    # Mean-encode each categorical/calendar column with the average purchase
    # amount of its value, then summarise that encoding per card.
    # (Original author's note: mean encoding improved 0.697 to 0.694.)
    for col in encoded_cols:
        encoded_name = col + '_p_mean'
        history[encoded_name] = history.groupby([col])['purchase_amount'].transform('mean')
        agg_func[encoded_name] = list(spread)

    agg_history = history.groupby(['card_id']).agg(agg_func)
    agg_history.columns = ['_'.join(name_parts).strip() for name_parts in agg_history.columns.values]
    agg_history = agg_history.reset_index()

    counts = history.groupby('card_id').size().reset_index(name='transactions_count')

    return pd.merge(counts, agg_history, on='card_id', how='left')

gc.collect()

0

In [48]:
%%time
new = aggregate_transactions_new(new_transactions)

# Prefix every feature with "new_" so it cannot collide with the features
# built from the historical transactions; the join key stays as is.
new = new.rename(columns=lambda name: name if name == 'card_id' else 'new_' + name)

# Days between a card's first and last new-merchant purchase.
new['new_p2p'] = (new['new_purchase_date_max'] - new['new_purchase_date_min']).dt.days
# NOTE(review): new_day_nunique counts distinct day-of-month values (at most
# 31), not distinct calendar dates — confirm this is the intended "sleep".
new['new_sleep'] = new['new_p2p'] - new['new_day_nunique']

# Ratios per transaction.
for numerator in ('new_p2p', 'new_sleep'):
    new[numerator + '_vs_count'] = new[numerator] / new['new_transactions_count']

# Ratios per day of activity; the +1 keeps the denominator non-zero.
new_span_plus_one = new['new_p2p'].abs() + 1
for feature, numerator in (
    ('new_count_vs_p2p', 'new_transactions_count'),
    ('new_sleep_vs_p2p', 'new_sleep'),
    ('new_p_vs_p2p', 'new_purchase_amount_sum'),
    ('new_i_vs_p2p', 'new_installments_sum'),
):
    new[feature] = new[numerator] / new_span_plus_one

gc.collect()

CPU times: user 29.7 s, sys: 499 ms, total: 30.2 s
Wall time: 30.2 s


In [49]:
new.head()

Unnamed: 0,card_id,new_transactions_count,new_is_month_start_mean,new_weekend_mean,new_category_1_mean,new_category_2_nunique,new_category_3_nunique,new_state_id_nunique,new_city_id_nunique,new_subsector_id_nunique,new_merchant_category_id_nunique,new_merchant_id_nunique,new_quarter_nunique,new_month_nunique,new_weekofyear_nunique,new_dayofweek_nunique,new_day_nunique,new_hour_nunique,new_a2p_mean,new_a2p_median,new_a2p_max,new_a2p_min,new_a2p_std,new_p2r_mean,new_p2r_median,new_p2r_max,new_p2r_min,new_p2r_std,new_p2now_mean,new_p2now_median,new_p2now_max,new_p2now_min,new_p2now_std,new_month_lag_mean,new_month_lag_median,new_month_lag_max,new_month_lag_min,new_month_lag_std,new_purchase_amount_sum,new_purchase_amount_mean,new_purchase_amount_median,new_purchase_amount_max,new_purchase_amount_min,new_purchase_amount_std,new_installments_sum,new_installments_mean,new_installments_median,new_installments_max,new_installments_min,new_installments_std,new_p_vs_m_mean,new_p_vs_m_median,new_p_vs_m_max,new_p_vs_m_min,new_p_vs_m_std,new_p_vs_i_mean,new_p_vs_i_median,new_p_vs_i_max,new_p_vs_i_min,new_p_vs_i_std,new_purchase_date_max,new_purchase_date_min,new_category_2_p_mean_mean,new_category_2_p_mean_median,new_category_2_p_mean_max,new_category_2_p_mean_min,new_category_2_p_mean_std,new_category_3_p_mean_mean,new_category_3_p_mean_median,new_category_3_p_mean_max,new_category_3_p_mean_min,new_category_3_p_mean_std,new_state_id_p_mean_mean,new_state_id_p_mean_median,new_state_id_p_mean_max,new_state_id_p_mean_min,new_state_id_p_mean_std,new_city_id_p_mean_mean,new_city_id_p_mean_median,new_city_id_p_mean_max,new_city_id_p_mean_min,new_city_id_p_mean_std,new_subsector_id_p_mean_mean,new_subsector_id_p_mean_median,new_subsector_id_p_mean_max,new_subsector_id_p_mean_min,new_subsector_id_p_mean_std,new_merchant_category_id_p_mean_mean,new_merchant_category_id_p_mean_median,new_merchant_category_id_p_mean_max,new_merchant_category_id_p_mean_min,new_merchant_category_id_p_mean_s
td,new_merchant_id_p_mean_mean,new_merchant_id_p_mean_median,new_merchant_id_p_mean_max,new_merchant_id_p_mean_min,new_merchant_id_p_mean_std,new_quarter_p_mean_mean,new_quarter_p_mean_median,new_quarter_p_mean_max,new_quarter_p_mean_min,new_quarter_p_mean_std,new_month_p_mean_mean,new_month_p_mean_median,new_month_p_mean_max,new_month_p_mean_min,new_month_p_mean_std,new_weekofyear_p_mean_mean,new_weekofyear_p_mean_median,new_weekofyear_p_mean_max,new_weekofyear_p_mean_min,new_weekofyear_p_mean_std,new_dayofweek_p_mean_mean,new_dayofweek_p_mean_median,new_dayofweek_p_mean_max,new_dayofweek_p_mean_min,new_dayofweek_p_mean_std,new_day_p_mean_mean,new_day_p_mean_median,new_day_p_mean_max,new_day_p_mean_min,new_day_p_mean_std,new_hour_p_mean_mean,new_hour_p_mean_median,new_hour_p_mean_max,new_hour_p_mean_min,new_hour_p_mean_std,new_p2p,new_sleep,new_p2p_vs_count,new_sleep_vs_count,new_count_vs_p2p,new_sleep_vs_p2p,new_p_vs_p2p,new_i_vs_p2p
0,C_ID_00007093c1,2,0.0,0.0,0.0,2,1,2,2,2,2,2,1,1,2,2,2,2,-429.0,-429.0,-426,-432,4.242641,37.0,37.0,40,34,4.242641,-85.0,-85.0,-82,-88,4.242641,2.0,2.0,2,2,0.0,-1.328524,-0.664262,-0.664262,-0.656749,-0.671775,0.010625,2.0,1.0,1.0,1.0,1.0,0.0,-0.221421,-0.221421,-0.218916,-0.223925,0.003542,-0.332131,-0.332131,-0.328374,-0.335888,0.005313,2018-04-09 16:23:59,2018-04-03 11:13:35,-0.560047,-0.560047,-0.550852,-0.569242,0.013004,-0.606486,-0.606486,-0.606486,-0.606486,0.0,-0.560864,-0.560864,-0.557145,-0.564584,0.00526,-0.5553,-0.5553,-0.546606,-0.563993,0.012294,-0.457642,-0.457642,-0.365697,-0.549586,0.130029,-0.446722,-0.446722,-0.379453,-0.513992,0.095133,-0.612127,-0.612127,-0.552479,-0.671775,0.084355,-0.547688,-0.547688,-0.547688,-0.547688,0.0,-0.545173,-0.545173,-0.545173,-0.545173,0.0,-0.54655,-0.54655,-0.546106,-0.546995,0.000629,-0.533363,-0.533363,-0.532678,-0.534049,0.000969,-0.548294,-0.548294,-0.543283,-0.553305,0.007087,-0.530753,-0.530753,-0.527109,-0.534396,0.005153,6,4,3.0,2.0,0.285714,0.571429,-0.189789,0.285714
1,C_ID_0001238066,26,0.076923,0.461538,0.076923,3,3,4,8,9,15,26,2,2,9,6,14,16,-208.961538,-204.0,-181,-241,17.752703,28.961538,24.0,61,1,17.752703,-93.038462,-98.0,-61,-121,17.752703,1.346154,1.0,2,1,0.485165,-14.850055,-0.571156,-0.649235,-0.078318,-0.740897,0.173436,43.0,1.72,1.0,10.0,1.0,2.051828,-0.252919,-0.241448,-0.039159,-0.370449,0.09616,-0.272389,-0.328374,-0.01958,-0.370449,0.106812,2018-04-30 19:57:30,2018-03-01 16:48:27,-0.543855,-0.569242,-0.369453,-0.569242,0.064565,-0.482744,-0.606486,0.037708,-0.606486,0.258619,-0.542311,-0.564584,-0.369453,-0.584144,0.063803,-0.544066,-0.559198,-0.346173,-0.594003,0.075843,-0.603397,-0.615356,-0.49771,-0.658111,0.05564,-0.60543,-0.623829,-0.45045,-0.692982,0.052979,-0.571206,-0.586766,-0.227141,-0.709414,0.113192,-0.547728,-0.54775,-0.547688,-0.54775,3e-05,-0.547057,-0.548054,-0.545173,-0.548054,0.001398,-0.547513,-0.546873,-0.539581,-0.558738,0.005859,-0.560634,-0.54742,-0.532678,-0.600648,0.023691,-0.554606,-0.556371,-0.540449,-0.56823,0.010635,-0.560891,-0.558133,-0.527109,-0.596229,0.021618,60,46,2.307692,1.769231,0.42623,0.754098,-0.243444,0.704918
2,C_ID_0001506ef0,2,0.0,0.0,0.0,1,1,1,1,2,2,2,1,1,2,2,2,2,-626.0,-626.0,-623,-629,4.242641,19.0,19.0,22,16,4.242641,-103.0,-103.0,-100,-106,4.242641,1.0,1.0,1,1,0.0,-1.447354,-0.723677,-0.723677,-0.715352,-0.732001,0.011773,0.0,0.0,0.0,0.0,0.0,0.0,-0.361838,-0.361838,-0.357676,-0.366001,0.005886,-0.723677,-0.723677,-0.715352,-0.732001,0.011773,2018-03-22 09:14:30,2018-03-16 22:21:58,-0.550852,-0.550852,-0.550852,-0.550852,0.0,-0.631014,-0.631014,-0.631014,-0.631014,0.0,-0.559833,-0.559833,-0.559833,-0.559833,0.0,-0.580327,-0.580327,-0.580327,-0.580327,0.0,-0.651508,-0.651508,-0.628338,-0.674678,0.032767,-0.636352,-0.636352,-0.631763,-0.64094,0.006489,-0.654375,-0.654375,-0.591382,-0.717368,0.089086,-0.54775,-0.54775,-0.54775,-0.54775,0.0,-0.548054,-0.548054,-0.548054,-0.548054,0.0,-0.545045,-0.545045,-0.543217,-0.546873,0.002585,-0.542845,-0.542845,-0.53827,-0.54742,0.00647,-0.5488,-0.5488,-0.54123,-0.556371,0.010706,-0.565447,-0.565447,-0.548746,-0.582148,0.023618,5,3,2.5,1.5,0.333333,0.5,-0.241226,0.0
3,C_ID_0001793786,31,0.0,0.451613,0.0,5,1,5,7,14,21,31,1,2,6,6,13,10,-336.290323,-330.0,-318,-364,15.616644,33.290323,27.0,61,15,15.616644,-86.709677,-93.0,-59,-105,15.616644,1.322581,1.0,2,1,0.475191,-0.22962,-0.007407,-0.372748,3.129932,-0.737892,0.947223,0.0,0.0,0.0,0.0,0.0,0.0,0.006536,-0.14726,1.564966,-0.363311,0.449568,-0.007407,-0.372748,3.129932,-0.737892,0.947223,2017-12-31 17:35:56,2017-11-15 15:44:20,-0.549223,-0.55564,-0.369453,-0.569242,0.048644,-0.631014,-0.631014,-0.631014,-0.631014,0.0,-0.539474,-0.547887,-0.369453,-0.564584,0.047435,-0.535044,-0.546606,-0.500178,-0.551449,0.0188,-0.586427,-0.615356,-0.365551,-0.658111,0.08173,-0.578262,-0.616204,-0.118237,-0.692982,0.109035,-0.428241,-0.577258,0.755743,-0.737331,0.354734,-0.569132,-0.569132,-0.569132,-0.569132,0.0,-0.565011,-0.563817,-0.563817,-0.567519,0.001759,-0.563166,-0.572546,-0.540079,-0.583396,0.01718,-0.561853,-0.54742,-0.532678,-0.600648,0.029539,-0.549151,-0.541716,-0.533251,-0.576709,0.014086,-0.55826,-0.55082,-0.530985,-0.603752,0.024195,46,33,1.483871,1.064516,0.659574,0.702128,-0.004886,0.0
4,C_ID_000183fdda,11,0.0,0.181818,0.0,1,3,2,2,6,9,11,2,2,7,6,9,8,-200.181818,-190.0,-182,-241,21.798248,20.181818,10.0,61,2,21.798248,-101.818182,-112.0,-61,-120,21.798248,1.272727,1.0,2,1,0.467099,-6.590778,-0.599162,-0.665765,-0.10768,-0.732332,0.182877,17.0,1.7,1.0,4.0,1.0,1.05935,-0.277157,-0.315602,-0.035893,-0.366166,0.09668,-0.275101,-0.327661,-0.09192,-0.366166,0.101957,2018-04-30 14:59:53,2018-03-02 12:26:26,-0.550852,-0.550852,-0.550852,-0.550852,0.0,-0.314005,-0.606486,0.037708,-0.606486,0.336037,-0.546287,-0.545604,-0.545604,-0.553123,0.002267,-0.552149,-0.55222,-0.551443,-0.55222,0.000234,-0.573536,-0.652973,-0.297452,-0.658111,0.113614,-0.587847,-0.648222,-0.358444,-0.692982,0.105064,-0.607778,-0.637794,-0.407699,-0.732332,0.096114,-0.547733,-0.54775,-0.547688,-0.54775,2.9e-05,-0.547268,-0.548054,-0.545173,-0.548054,0.001346,-0.547632,-0.546106,-0.539581,-0.558738,0.00551,-0.543242,-0.53827,-0.532678,-0.569421,0.014043,-0.549423,-0.54679,-0.533251,-0.56823,0.01209,-0.550905,-0.55082,-0.527109,-0.582148,0.018319,59,50,5.363636,4.545455,0.183333,0.833333,-0.109846,0.283333


In [50]:
# Attach the per-card `new_*` aggregate columns to both frames.
# A left join keeps every train/test row; cards with no row in `new`
# receive NaN in all of the joined columns.
train = train.merge(new, how='left', on='card_id')
test = test.merge(new, how='left', on='card_id')

# `new` is not used after the join: drop the name and trigger a collection.
del new
gc.collect()

28

In [51]:
# Binary outlier flag: 1 where `target` is below -30, otherwise 0.
# A missing target compares False, so such rows are flagged 0.
train['outliers'] = (train['target'] < -30).astype(np.int64)
train['outliers'].value_counts()

0    199710
1      2207
Name: outliers, dtype: int64

In [52]:
# Feature specs: (output column, hist_* column, new_* column).
# The list order fixes the order in which columns are appended to the frames.
_diff_specs = [
    ('c_p2p_diff', 'hist_p2p_vs_count', 'new_p2p_vs_count'),
    ('c_sleep_diff', 'hist_sleep_vs_count', 'new_sleep_vs_count'),
    ('c_p_diff', 'hist_purchase_amount_mean', 'new_purchase_amount_mean'),
    ('c_i_diff', 'hist_installments_mean', 'new_installments_mean'),
    ('p2p_count_diff', 'hist_count_vs_p2p', 'new_count_vs_p2p'),
    ('p2p_sleep_diff', 'hist_sleep_vs_p2p', 'new_sleep_vs_p2p'),
    ('p2p_p_diff', 'hist_p_vs_p2p', 'new_p_vs_p2p'),
    ('p2p_i_diff', 'hist_i_vs_p2p', 'new_i_vs_p2p'),
]
# Purchase-date columns that are converted to numeric timestamps below.
_date_cols = ['hist_purchase_date_max', 'hist_purchase_date_min',
              'new_purchase_date_max', 'new_purchase_date_min']

for df in [train, test]:
    ### absolute change: historical value minus new-period value
    for out_col, hist_col, new_col in _diff_specs:
        df[out_col] = df[hist_col] - df[new_col]
    ### relative change: the difference above divided by the historical value
    # NOTE(review): a zero hist_* denominator yields NaN/inf here; left as-is.
    for out_col, hist_col, new_col in _diff_specs:
        df[out_col + '_vs'] = df[out_col] / df[hist_col]
    ### disabled experiments (kept verbatim for reference)
#     df['count_sum'] = df['hist_transactions_count'] + df['new_transactions_count']
#     df['p_sum'] = df['hist_purchase_amount_sum'] + df['new_purchase_amount_sum']
#     df['i_sum'] = df['hist_installments_sum'] + df['new_installments_sum']
#     df['p_sum_vs_count'] = df['p_sum'] / df['count_sum'] # mean
#     df['i_sum_vs_count'] = df['i_sum'] / df['count_sum'] # mean
    ###
#     df['gap'] = a2p(df['hist_purchase_date_max'], df['new_purchase_date_min'])
    ###
#     df['time'] = a2p(df['hist_purchase_date_min'], df['new_purchase_date_max'])
#     df['sleep_sum'] = df['hist_sleep'] + df['new_sleep']

#     df['time_vs_count'] = df['time'] / df['count_sum']
#     df['sleep_sum_vs_count'] = df['sleep_sum'] / df['count_sum']

#     df['count_sum_vs_time'] = df['count_sum'] / (df['time'].abs() + 1)
#     df['sleep_sum_vs_time'] = df['sleep_sum'] / (df['time'].abs() + 1)
#     df['p_sum_vs_time'] = df['p_sum'] / (df['time'].abs() + 1)
#     df['i_sum_vs_time'] = df['i_sum'] / (df['time'].abs() + 1)
    ###

    ### purchase dates -> float (int64 nanosecond value scaled by 1e-9, i.e. seconds)
    for f in _date_cols:
        if pd.api.types.is_numeric_dtype(df[f]):
            # Already converted by a previous run of this cell; converting again
            # would rescale the values by another factor of 1e-9.
            continue
        missing = df[f].isna()
        # Casting NaT to int64 produces the min-int64 sentinel, which would turn
        # into a bogus timestamp near -9.2e9. Cards without a row in the joined
        # aggregates have NaT here, so put NaN back for those rows.
        df[f] = (df[f].astype(np.int64) * 1e-9).mask(missing)

In [53]:
# Distribution of `hist_p2p` over the training cards, 50 bins.
train['hist_p2p'].plot(kind='hist', bins=50)

<matplotlib.axes._subplots.AxesSubplot at 0x7f32064cd630>

In [54]:
# Distribution of `new_p2p` over the training cards, 50 bins.
train['new_p2p'].plot(kind='hist', bins=50)

<matplotlib.axes._subplots.AxesSubplot at 0x7f32064cd630>

In [55]:
# Distribution of `hist_sleep` over the training cards, 50 bins.
train['hist_sleep'].plot(kind='hist', bins=50)

<matplotlib.axes._subplots.AxesSubplot at 0x7f32064cd630>

In [56]:
# Distribution of `new_sleep` over the training cards, 50 bins.
train['new_sleep'].plot(kind='hist', bins=50)

<matplotlib.axes._subplots.AxesSubplot at 0x7f32064cd630>

In [57]:
# Distribution of the hist-minus-new `c_p2p_diff` feature, 50 bins.
train['c_p2p_diff'].plot(kind='hist', bins=50)

<matplotlib.axes._subplots.AxesSubplot at 0x7f32064cd630>

In [58]:
# train['c_sleep_diff'].plot.hist(bins=50)

In [59]:
# Histogram of `c_p_diff` with values capped at 10 (presumably so that a few
# large differences do not stretch the x-axis). `clip` is the vectorised
# equivalent of applying `min(x, 10)` element by element; NaN stays NaN.
train['c_p_diff'].clip(upper=10).plot.hist(bins=50)

<matplotlib.axes._subplots.AxesSubplot at 0x7f32064cd630>

In [60]:
# Distribution of the hist-minus-new `c_i_diff` feature, 50 bins.
train['c_i_diff'].plot(kind='hist', bins=50)

<matplotlib.axes._subplots.AxesSubplot at 0x7f32064cd630>

In [61]:
# train['p2p_sleep_diff'].plot.hist(bins=50)

In [62]:
# train['p2p_count_diff'].plot.hist(bins=50)

In [63]:
# Histogram of `p2p_p_diff` with values capped at 10 (presumably so that a few
# large differences do not stretch the x-axis). `clip` is the vectorised
# equivalent of applying `min(x, 10)` element by element; NaN stays NaN.
train['p2p_p_diff'].clip(upper=10).plot.hist(bins=50)

<matplotlib.axes._subplots.AxesSubplot at 0x7f32064cd630>

In [64]:
# Distribution of the hist-minus-new `p2p_i_diff` feature, 50 bins.
train['p2p_i_diff'].plot(kind='hist', bins=50)

<matplotlib.axes._subplots.AxesSubplot at 0x7f32064cd630>

In [65]:
# Preview the first five training rows, including the newly added feature columns.
train.head(5)

Unnamed: 0,first_active_month,card_id,feature_1,feature_2,feature_3,target,authorized_flag_mean,hist_transactions_count,hist_is_month_start_mean,hist_weekend_mean,hist_category_1_mean,hist_category_2_nunique,hist_category_3_nunique,hist_state_id_nunique,hist_city_id_nunique,hist_subsector_id_nunique,hist_merchant_category_id_nunique,hist_merchant_id_nunique,hist_quarter_nunique,hist_month_nunique,hist_weekofyear_nunique,hist_dayofweek_nunique,hist_day_nunique,hist_hour_nunique,hist_a2p_mean,hist_a2p_median,hist_a2p_max,hist_a2p_min,hist_a2p_std,hist_p2r_mean,hist_p2r_median,hist_p2r_max,hist_p2r_min,hist_p2r_std,hist_p2now_mean,hist_p2now_median,hist_p2now_max,hist_p2now_min,hist_p2now_std,hist_month_lag_mean,hist_month_lag_median,hist_month_lag_max,hist_month_lag_min,hist_month_lag_std,hist_purchase_amount_sum,hist_purchase_amount_mean,hist_purchase_amount_median,hist_purchase_amount_max,hist_purchase_amount_min,hist_purchase_amount_std,hist_installments_sum,hist_installments_mean,hist_installments_median,hist_installments_max,hist_installments_min,hist_installments_std,hist_p_vs_m_mean,hist_p_vs_m_median,hist_p_vs_m_max,hist_p_vs_m_min,hist_p_vs_m_std,hist_p_vs_i_mean,hist_p_vs_i_median,hist_p_vs_i_max,hist_p_vs_i_min,hist_p_vs_i_std,hist_purchase_date_max,hist_purchase_date_min,hist_category_2_p_mean_mean,hist_category_2_p_mean_median,hist_category_2_p_mean_max,hist_category_2_p_mean_min,hist_category_2_p_mean_std,hist_category_3_p_mean_mean,hist_category_3_p_mean_median,hist_category_3_p_mean_max,hist_category_3_p_mean_min,hist_category_3_p_mean_std,hist_state_id_p_mean_mean,hist_state_id_p_mean_median,hist_state_id_p_mean_max,hist_state_id_p_mean_min,hist_state_id_p_mean_std,hist_city_id_p_mean_mean,hist_city_id_p_mean_median,hist_city_id_p_mean_max,hist_city_id_p_mean_min,hist_city_id_p_mean_std,hist_subsector_id_p_mean_mean,hist_subsector_id_p_mean_median,hist_subsector_id_p_mean_max,hist_subsector_id_p_mean_min,hist_subsector_id_p_mean_std,hist_merchant_cate
gory_id_p_mean_mean,hist_merchant_category_id_p_mean_median,hist_merchant_category_id_p_mean_max,hist_merchant_category_id_p_mean_min,hist_merchant_category_id_p_mean_std,hist_merchant_id_p_mean_mean,hist_merchant_id_p_mean_median,hist_merchant_id_p_mean_max,hist_merchant_id_p_mean_min,hist_merchant_id_p_mean_std,hist_quarter_p_mean_mean,hist_quarter_p_mean_median,hist_quarter_p_mean_max,hist_quarter_p_mean_min,hist_quarter_p_mean_std,hist_month_p_mean_mean,hist_month_p_mean_median,hist_month_p_mean_max,hist_month_p_mean_min,hist_month_p_mean_std,hist_weekofyear_p_mean_mean,hist_weekofyear_p_mean_median,hist_weekofyear_p_mean_max,hist_weekofyear_p_mean_min,hist_weekofyear_p_mean_std,hist_dayofweek_p_mean_mean,hist_dayofweek_p_mean_median,hist_dayofweek_p_mean_max,hist_dayofweek_p_mean_min,hist_dayofweek_p_mean_std,hist_day_p_mean_mean,hist_day_p_mean_median,hist_day_p_mean_max,hist_day_p_mean_min,hist_day_p_mean_std,hist_hour_p_mean_mean,hist_hour_p_mean_median,hist_hour_p_mean_max,hist_hour_p_mean_min,hist_hour_p_mean_std,hist_first_year,hist_first_quarter,hist_first_month,hist_re_year,hist_re_quarter,hist_re_month,hist_now_year,hist_now_quarter,hist_now_month,hist_a2r,hist_r2now,hist_a2now,hist_p2p,hist_sleep,hist_p2p_vs_count,hist_sleep_vs_count,hist_count_vs_p2p,hist_sleep_vs_p2p,hist_p_vs_p2p,hist_i_vs_p2p,new_transactions_count,new_is_month_start_mean,new_weekend_mean,new_category_1_mean,new_category_2_nunique,new_category_3_nunique,new_state_id_nunique,new_city_id_nunique,new_subsector_id_nunique,new_merchant_category_id_nunique,new_merchant_id_nunique,new_quarter_nunique,new_month_nunique,new_weekofyear_nunique,new_dayofweek_nunique,new_day_nunique,new_hour_nunique,new_a2p_mean,new_a2p_median,new_a2p_max,new_a2p_min,new_a2p_std,new_p2r_mean,new_p2r_median,new_p2r_max,new_p2r_min,new_p2r_std,new_p2now_mean,new_p2now_median,new_p2now_max,new_p2now_min,new_p2now_std,new_month_lag_mean,new_month_lag_median,new_month_lag_max,new_month_lag_min,new_month_lag_std,ne
w_purchase_amount_sum,new_purchase_amount_mean,new_purchase_amount_median,new_purchase_amount_max,new_purchase_amount_min,new_purchase_amount_std,new_installments_sum,new_installments_mean,new_installments_median,new_installments_max,new_installments_min,new_installments_std,new_p_vs_m_mean,new_p_vs_m_median,new_p_vs_m_max,new_p_vs_m_min,new_p_vs_m_std,new_p_vs_i_mean,new_p_vs_i_median,new_p_vs_i_max,new_p_vs_i_min,new_p_vs_i_std,new_purchase_date_max,new_purchase_date_min,new_category_2_p_mean_mean,new_category_2_p_mean_median,new_category_2_p_mean_max,new_category_2_p_mean_min,new_category_2_p_mean_std,new_category_3_p_mean_mean,new_category_3_p_mean_median,new_category_3_p_mean_max,new_category_3_p_mean_min,new_category_3_p_mean_std,new_state_id_p_mean_mean,new_state_id_p_mean_median,new_state_id_p_mean_max,new_state_id_p_mean_min,new_state_id_p_mean_std,new_city_id_p_mean_mean,new_city_id_p_mean_median,new_city_id_p_mean_max,new_city_id_p_mean_min,new_city_id_p_mean_std,new_subsector_id_p_mean_mean,new_subsector_id_p_mean_median,new_subsector_id_p_mean_max,new_subsector_id_p_mean_min,new_subsector_id_p_mean_std,new_merchant_category_id_p_mean_mean,new_merchant_category_id_p_mean_median,new_merchant_category_id_p_mean_max,new_merchant_category_id_p_mean_min,new_merchant_category_id_p_mean_std,new_merchant_id_p_mean_mean,new_merchant_id_p_mean_median,new_merchant_id_p_mean_max,new_merchant_id_p_mean_min,new_merchant_id_p_mean_std,new_quarter_p_mean_mean,new_quarter_p_mean_median,new_quarter_p_mean_max,new_quarter_p_mean_min,new_quarter_p_mean_std,new_month_p_mean_mean,new_month_p_mean_median,new_month_p_mean_max,new_month_p_mean_min,new_month_p_mean_std,new_weekofyear_p_mean_mean,new_weekofyear_p_mean_median,new_weekofyear_p_mean_max,new_weekofyear_p_mean_min,new_weekofyear_p_mean_std,new_dayofweek_p_mean_mean,new_dayofweek_p_mean_median,new_dayofweek_p_mean_max,new_dayofweek_p_mean_min,new_dayofweek_p_mean_std,new_day_p_mean_mean,new_day_p_mean_median,new_day_p_m
ean_max,new_day_p_mean_min,new_day_p_mean_std,new_hour_p_mean_mean,new_hour_p_mean_median,new_hour_p_mean_max,new_hour_p_mean_min,new_hour_p_mean_std,new_p2p,new_sleep,new_p2p_vs_count,new_sleep_vs_count,new_count_vs_p2p,new_sleep_vs_p2p,new_p_vs_p2p,new_i_vs_p2p,outliers,c_p2p_diff,c_sleep_diff,c_p_diff,c_i_diff,p2p_count_diff,p2p_sleep_diff,p2p_p_diff,p2p_i_diff,c_p2p_diff_vs,c_sleep_diff_vs,c_p_diff_vs,c_i_diff_vs,p2p_count_diff_vs,p2p_sleep_diff_vs,p2p_p_diff_vs,p2p_i_diff_vs
0,2017-06-01,C_ID_92a2005557,5,2,1,-0.820283,0.95,260,0.026923,0.346154,0.0,2,2,3,7,21,41,95,4,9,35,7,31,23,-139.323077,-137.5,-26,-269,74.281861,-132.676923,-134.5,-3,-246,74.281861,-254.676923,-256.5,-125,-368,74.281861,-3.911538,-4.0,0,-8,2.397687,-165.968739,-0.638341,-0.698042,2.258395,-0.739395,0.212139,4.0,0.015385,0.0,1.0,0.0,0.123314,-0.190644,-0.122982,0.752798,-0.738944,0.178324,-0.63318,-0.696997,2.258395,-0.739395,0.215299,1519551000.0,1498573000.0,0.146639,0.14957,0.14957,-0.104457,0.027181,0.350134,0.361926,0.361926,-0.404556,0.094518,0.31124,0.317017,0.317017,-0.272357,0.053997,0.817639,0.87204,0.87204,-0.538153,0.25201,0.727576,-0.1375,2.180532,-0.678829,1.141637,0.192325,-0.235195,7.449148,-0.691346,1.686744,-0.112555,-0.451573,6.305928,-0.729832,1.519546,-0.141544,-0.175738,1.13676,-0.209863,0.142832,-0.145904,-0.12716,-0.09554,-0.273128,0.053426,-0.137921,-0.141471,0.196065,-0.41125,0.16129,0.125998,-0.156621,1.755822,-0.31751,0.701612,-0.032638,-0.178445,5.85962,-0.377722,0.919694,0.089587,-0.118198,26.71146,-0.49118,2.359636,2017,2,6,2018,1,2,2018,2,6,-272,-122,-394,242,211,0.930769,0.811538,1.069959,0.868313,-0.682999,0.016461,23.0,0.0,0.26087,0.0,1.0,1.0,1.0,3.0,10.0,14.0,23.0,2.0,2.0,7.0,7.0,17.0,8.0,-303.26087,-303.0,-277.0,-332.0,16.591941,31.26087,31.0,60.0,5.0,16.591941,-90.73913,-91.0,-62.0,-117.0,16.591941,1.478261,1.0,2.0,1.0,0.510754,-13.244202,-0.575835,-0.58118,-0.296112,-0.724368,0.135812,0.0,0.0,0.0,0.0,0.0,0.0,-0.240423,-0.238952,-0.102586,-0.362184,0.072041,-0.575835,-0.58118,-0.296112,-0.724368,0.135812,1525001000.0,1520259000.0,-0.569242,-0.569242,-0.569242,-0.569242,0.0,-0.631014,-0.631014,-0.631014,-0.631014,0.0,-0.564584,-0.564584,-0.564584,-0.564584,0.0,-0.554127,-0.546606,-0.546606,-0.605382,0.017191,-0.625,-0.658111,-0.297452,-0.720933,0.0891,-0.607737,-0.631455,-0.194155,-0.720933,0.113723,-0.562181,-0.569305,-0.114595,-0.710157,0.138622,-0.54772,-0.54775,-0.547688,-0.54775,3.2e-05,-0.546676,-0.548054,-0.545173,-0.5480
54,0.001471,-0.547299,-0.544412,-0.539581,-0.558738,0.006451,-0.551089,-0.54742,-0.532678,-0.600648,0.022879,-0.551153,-0.549001,-0.531164,-0.576709,0.012799,-0.537761,-0.534288,-0.527109,-0.551282,0.009281,54.0,37.0,2.347826,1.608696,0.418182,0.672727,-0.240804,0.0,0,-1.417057,-0.797157,-0.062506,0.015385,0.651777,0.195585,-0.442195,0.016461,-1.522458,-0.982279,0.09792,1.0,0.609161,0.225248,0.647432,1.0
1,2017-01-01,C_ID_3d0044924f,4,1,0,0.392913,0.968571,350,0.04,0.377143,0.088571,2,3,3,9,24,57,142,4,12,50,7,31,24,-226.942857,-213.5,-5,-395,116.976167,-168.057143,-181.5,0,-390,116.976167,-288.057143,-301.5,-120,-510,116.976167,-5.031429,-5.0,0,-12,3.804934,-210.006336,-0.600018,-0.70859,4.630299,-0.7424,0.384967,545.0,1.566092,1.0,10.0,1.0,1.50262,-0.20915,-0.104141,0.388568,-0.737892,0.227495,-0.291613,-0.35437,1.543433,-0.3712,0.151257,1517438000.0,1483720000.0,0.104922,0.14957,0.14957,-0.354523,0.14343,-0.296877,-0.404556,0.106023,-0.404556,0.208284,0.245398,0.317017,0.317017,-0.354523,0.200215,0.663954,0.87204,0.87204,-0.583225,0.456752,0.358988,-0.190243,2.180532,-0.489419,0.969429,0.319361,-0.278704,7.143294,-0.696095,1.930573,-0.390536,-0.60645,19.761014,-0.731393,1.20487,0.079278,-0.175738,1.13676,-0.209863,0.516392,0.046079,-0.135695,3.939885,-0.273128,0.910172,0.027258,-0.209456,17.115643,-0.468301,1.846683,0.123919,-0.117756,1.755822,-0.31751,0.695703,0.044029,-0.178445,5.85962,-0.377722,1.151931,-0.046121,-0.259993,26.71146,-0.49118,2.043329,2017,1,1,2018,1,1,2018,2,5,-395,-120,-515,390,359,1.114286,1.025714,0.895141,0.918159,-0.537101,1.393862,6.0,0.166667,0.0,0.0,1.0,1.0,1.0,1.0,4.0,5.0,6.0,1.0,2.0,4.0,4.0,4.0,5.0,-422.5,-421.5,-396.0,-453.0,26.402651,27.5,26.5,58.0,1.0,26.402651,-92.5,-93.5,-62.0,-119.0,26.402651,1.5,1.5,2.0,1.0,0.547723,-4.355735,-0.725956,-0.732633,-0.701858,-0.73941,0.014326,6.0,1.0,1.0,1.0,1.0,0.0,-0.303226,-0.305276,-0.233953,-0.369705,0.070442,-0.362978,-0.366316,-0.350929,-0.369705,0.007163,1522393000.0,1517505000.0,-0.569242,-0.569242,-0.569242,-0.569242,0.0,-0.606486,-0.606486,-0.606486,-0.606486,0.0,-0.564584,-0.564584,-0.564584,-0.564584,0.0,-0.546606,-0.546606,-0.546606,-0.546606,0.0,-0.545669,-0.621847,-0.098744,-0.658111,0.219807,-0.547403,-0.623829,-0.095813,-0.692982,0.223058,-0.587752,-0.643117,-0.183152,-0.719306,0.202168,-0.54775,-0.54775,-0.54775,-0.54775,0.0,-0.547308,-0.547308,-0.546561,-0.548054,0.000818,-0.54
9433,-0.548745,-0.542877,-0.558738,0.005394,-0.536524,-0.534049,-0.532678,-0.54742,0.00572,-0.545906,-0.54035,-0.531164,-0.56823,0.0163,-0.545826,-0.543724,-0.541533,-0.553624,0.005167,56.0,52.0,9.333333,8.666667,0.105263,0.912281,-0.076416,0.105263,0,-8.219048,-7.640952,0.125938,0.566092,0.789878,0.005878,-0.460684,1.288599,-7.376068,-7.449396,-0.20989,0.361468,0.882406,0.006402,0.857724,0.924481
2,2016-08-01,C_ID_d639edf6cd,2,2,0,0.688056,0.953488,43,0.0,0.255814,0.0,2,1,2,5,7,8,13,4,10,22,7,19,14,-304.302326,-264.0,-163,-575,117.06338,-271.697674,-312.0,-1,-413,117.06338,-393.697674,-434.0,-123,-535,117.06338,-8.604651,-10.0,0,-13,3.842987,-29.167391,-0.678311,-0.698868,-0.145847,-0.730138,0.08738,0.0,0.0,0.0,0.0,0.0,0.0,-0.098675,-0.063768,-0.0452,-0.661287,0.103856,-0.678311,-0.698868,-0.145847,-0.730138,0.08738,1519759000.0,1484123000.0,-0.080826,-0.104457,0.14957,-0.104457,0.074659,0.361926,0.361926,0.361926,0.361926,0.0,-0.096862,-0.139312,0.317017,-0.139312,0.134116,-0.153615,-0.162348,0.87204,-0.575966,0.174145,-0.404135,-0.489419,0.750316,-0.489419,0.209394,-0.400542,-0.49024,0.796024,-0.49024,0.25628,-0.518794,-0.58631,2.314236,-0.68332,0.444498,0.150787,-0.175738,1.13676,-0.209863,0.585625,0.503803,-0.132023,3.939885,-0.246819,1.534467,1.80795,-0.233158,17.115643,-0.468301,5.620803,-0.062923,-0.156621,1.755822,-0.31751,0.413613,0.246028,-0.152958,5.85962,-0.377722,1.562317,-0.283442,-0.325941,0.455703,-0.49118,0.195386,2016,3,8,2018,1,2,2018,2,6,-576,-122,-698,412,393,9.581395,9.139535,0.104116,0.951574,-0.070623,0.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,-635.0,-635.0,-635.0,-635.0,,59.0,59.0,59.0,59.0,,-63.0,-63.0,-63.0,-63.0,,2.0,2.0,2.0,2.0,,-0.700326,-0.700326,-0.700326,-0.700326,-0.700326,,0.0,0.0,0.0,0.0,0.0,,-0.233442,-0.233442,-0.233442,-0.233442,,-0.700326,-0.700326,-0.700326,-0.700326,,1524937000.0,1524937000.0,-0.549015,-0.549015,-0.549015,-0.549015,,-0.631014,-0.631014,-0.631014,-0.631014,,-0.547887,-0.547887,-0.547887,-0.547887,,-0.545403,-0.545403,-0.545403,-0.545403,,-0.538326,-0.538326,-0.538326,-0.538326,,-0.689889,-0.689889,-0.689889,-0.689889,,-0.693672,-0.693672,-0.693672,-0.693672,,-0.547688,-0.547688,-0.547688,-0.547688,,-0.545173,-0.545173,-0.545173,-0.545173,,-0.539581,-0.539581,-0.539581,-0.539581,,-0.569421,-0.569421,-0.569421,-0.569421,,-0.54934,-0.54934,-0.54934,-0.54934,,-0.541533,-0.5415
33,-0.541533,-0.541533,,0.0,-1.0,0.0,-1.0,1.0,-1.0,-0.700326,0.0,0,9.581395,10.139535,0.022014,0.0,-0.895884,1.951574,0.629702,0.0,1.0,1.109415,-0.032454,,-8.604651,2.050891,-8.916364,
3,2017-09-01,C_ID_186d6a6901,4,3,0,0.142495,1.0,77,0.0,0.142857,0.155844,3,3,5,7,13,25,50,3,6,20,7,25,16,-81.883117,-55.0,-25,-180,53.380209,-98.116883,-125.0,0,-155,53.380209,-220.116883,-247.0,-122,-277,53.380209,-2.831169,-4.0,0,-5,1.802065,-49.491364,-0.642745,-0.707989,1.445596,-0.740897,0.261624,86.0,1.146667,1.0,3.0,1.0,0.484722,-0.259738,-0.145412,0.361399,-0.740897,0.233968,-0.315587,-0.354671,0.722798,-0.370449,0.140188,1519818000.0,1506443000.0,-0.141348,-0.158951,0.14957,-0.354523,0.144064,-0.346114,-0.404556,0.106023,-0.404556,0.161839,-0.181744,-0.228982,0.317017,-0.354523,0.199747,-0.13796,-0.228982,0.87204,-0.387329,0.351249,0.135718,-0.207941,2.180532,-0.489419,0.770859,0.420718,-0.263905,7.143294,-0.561234,2.039278,-0.453346,-0.625347,5.229575,-0.737291,0.723303,-0.177328,-0.175738,-0.118619,-0.209863,0.028403,-0.158728,-0.135695,-0.12716,-0.273128,0.047615,-0.147424,-0.135573,0.025166,-0.384181,0.136687,-0.057641,-0.156621,1.755822,-0.31751,0.48724,0.008477,-0.146322,5.85962,-0.377722,0.974175,-0.160146,-0.133343,0.326119,-0.49118,0.194005,2017,3,9,2018,1,2,2018,2,6,-180,-122,-302,154,129,2.0,1.675325,0.496774,0.832258,-0.319299,0.554839,7.0,0.0,0.428571,0.142857,2.0,2.0,2.0,2.0,5.0,6.0,7.0,2.0,2.0,5.0,4.0,7.0,5.0,-215.285714,-219.0,-187.0,-229.0,15.195551,35.285714,39.0,49.0,7.0,15.195551,-86.714286,-83.0,-73.0,-115.0,15.195551,1.714286,2.0,2.0,1.0,0.48795,-4.654372,-0.66491,-0.69161,-0.56674,-0.734135,0.065882,6.0,1.0,1.0,1.0,1.0,0.0,-0.252163,-0.238952,-0.188913,-0.350238,0.052891,-0.340636,-0.348022,-0.290823,-0.367068,0.027202,1524049000.0,1520424000.0,-0.530703,-0.557578,-0.369453,-0.557578,0.071104,-0.514983,-0.606486,0.034033,-0.606486,0.242093,-0.53514,-0.562755,-0.369453,-0.562755,0.073061,-0.531814,-0.562755,-0.346173,-0.562755,0.08186,-0.605846,-0.658111,-0.49771,-0.664686,0.069691,-0.604549,-0.631455,-0.487311,-0.692982,0.084932,-0.584559,-0.641744,-0.294025,-0.724446,0.145081,-0.547705,-0.547688,-0.547688,-0.54775,3e-05,-0.545996,-0.
545173,-0.545173,-0.548054,0.001406,-0.545242,-0.546106,-0.5431,-0.546995,0.001688,-0.553371,-0.534049,-0.533353,-0.600648,0.026851,-0.556162,-0.555847,-0.54549,-0.563469,0.007009,-0.54737,-0.55082,-0.527109,-0.580837,0.021264,41.0,34.0,5.857143,4.857143,0.166667,0.809524,-0.110818,0.142857,0,-3.857143,-3.181818,0.022165,0.146667,0.330108,0.022734,-0.208481,0.411982,-1.928571,-1.899225,-0.034485,0.127907,0.664502,0.027316,0.652932,0.742525
4,2017-11-01,C_ID_cdbd2c0db2,1,3,0,-0.159749,0.962406,133,0.037594,0.315789,0.112782,4,2,6,6,17,26,67,2,4,17,7,30,22,-67.398496,-68.0,-11,-119,32.095487,-51.601504,-51.0,0,-108,32.095487,-173.601504,-173.0,-122,-230,32.095487,-1.285714,-1.0,0,-3,1.0267,-48.687656,-0.366073,-0.689807,7.193041,-0.746156,1.352094,182.0,1.368421,1.0,12.0,1.0,1.896862,-0.102105,-0.287412,7.193041,-0.737892,1.289387,-0.276975,-0.344904,0.553311,-0.373078,0.184864,1519850000.0,1510445000.0,-0.157401,-0.158951,0.14957,-0.354523,0.107599,-0.377684,-0.404556,0.106023,-0.404556,0.114442,-0.215913,-0.228982,0.317017,-0.354523,0.142638,-0.188491,-0.228982,0.87204,-0.402736,0.275255,-0.064151,-0.23584,2.582718,-0.489419,0.513529,-0.09071,-0.171107,7.143294,-0.690491,0.97246,-0.582087,-0.67901,0.601468,-0.737291,0.229406,-0.196521,-0.209863,-0.175738,-0.209863,0.016715,-0.198934,-0.246819,-0.12716,-0.273128,0.058268,-0.198722,-0.23409,0.102304,-0.41125,0.129125,0.086637,-0.156621,1.755822,-0.31751,0.668032,-0.03976,-0.190643,5.85962,-0.377722,0.909439,0.046895,-0.133343,26.71146,-0.49118,2.342355,2017,4,11,2018,1,2,2018,2,6,-119,-122,-241,108,78,0.81203,0.586466,1.220183,0.715596,-0.446676,1.669725,36.0,0.055556,0.333333,0.055556,4.0,3.0,5.0,5.0,10.0,17.0,36.0,2.0,2.0,8.0,7.0,22.0,14.0,-150.805556,-151.5,-121.0,-178.0,16.98204,31.805556,32.5,59.0,2.0,16.98204,-90.194444,-89.5,-63.0,-120.0,16.98204,1.555556,2.0,2.0,1.0,0.503953,-19.926237,-0.553507,-0.607447,0.450886,-0.739395,0.223821,36.0,1.028571,1.0,2.0,1.0,0.169031,-0.228974,-0.232778,0.150295,-0.369697,0.103839,-0.280579,-0.305835,0.225443,-0.369697,0.110812,1524941000.0,1519992000.0,-0.546467,-0.557578,-0.369453,-0.569242,0.043728,-0.5708,-0.606486,0.037708,-0.606486,0.149227,-0.548724,-0.562755,-0.369453,-0.564584,0.044416,-0.548534,-0.562755,-0.346173,-0.573628,0.050338,-0.560087,-0.658111,0.440745,-0.674678,0.252482,-0.532402,-0.631455,1.222615,-0.700957,0.332561,-0.519243,-0.621565,1.242607,-0.728822,0.352453,-0.547715,-0.547688,-0.54768
8,-0.54775,3.1e-05,-0.546453,-0.545173,-0.545173,-0.548054,0.001452,-0.547632,-0.546489,-0.539581,-0.558738,0.005974,-0.552228,-0.54742,-0.532678,-0.600648,0.022172,-0.550247,-0.54679,-0.535331,-0.576709,0.011504,-0.550688,-0.551282,-0.527109,-0.58415,0.019482,57.0,35.0,1.583333,0.972222,0.62069,0.603448,-0.343556,0.62069,0,-0.771303,-0.385756,0.187434,0.33985,0.599494,0.112148,-0.10312,1.049035,-0.949846,-0.657764,-0.512013,0.248352,0.491314,0.15672,0.230861,0.628268


In [66]:
# Preview the first five test rows, including the newly added feature columns.
test.head(5)

Unnamed: 0,first_active_month,card_id,feature_1,feature_2,feature_3,authorized_flag_mean,hist_transactions_count,hist_is_month_start_mean,hist_weekend_mean,hist_category_1_mean,hist_category_2_nunique,hist_category_3_nunique,hist_state_id_nunique,hist_city_id_nunique,hist_subsector_id_nunique,hist_merchant_category_id_nunique,hist_merchant_id_nunique,hist_quarter_nunique,hist_month_nunique,hist_weekofyear_nunique,hist_dayofweek_nunique,hist_day_nunique,hist_hour_nunique,hist_a2p_mean,hist_a2p_median,hist_a2p_max,hist_a2p_min,hist_a2p_std,hist_p2r_mean,hist_p2r_median,hist_p2r_max,hist_p2r_min,hist_p2r_std,hist_p2now_mean,hist_p2now_median,hist_p2now_max,hist_p2now_min,hist_p2now_std,hist_month_lag_mean,hist_month_lag_median,hist_month_lag_max,hist_month_lag_min,hist_month_lag_std,hist_purchase_amount_sum,hist_purchase_amount_mean,hist_purchase_amount_median,hist_purchase_amount_max,hist_purchase_amount_min,hist_purchase_amount_std,hist_installments_sum,hist_installments_mean,hist_installments_median,hist_installments_max,hist_installments_min,hist_installments_std,hist_p_vs_m_mean,hist_p_vs_m_median,hist_p_vs_m_max,hist_p_vs_m_min,hist_p_vs_m_std,hist_p_vs_i_mean,hist_p_vs_i_median,hist_p_vs_i_max,hist_p_vs_i_min,hist_p_vs_i_std,hist_purchase_date_max,hist_purchase_date_min,hist_category_2_p_mean_mean,hist_category_2_p_mean_median,hist_category_2_p_mean_max,hist_category_2_p_mean_min,hist_category_2_p_mean_std,hist_category_3_p_mean_mean,hist_category_3_p_mean_median,hist_category_3_p_mean_max,hist_category_3_p_mean_min,hist_category_3_p_mean_std,hist_state_id_p_mean_mean,hist_state_id_p_mean_median,hist_state_id_p_mean_max,hist_state_id_p_mean_min,hist_state_id_p_mean_std,hist_city_id_p_mean_mean,hist_city_id_p_mean_median,hist_city_id_p_mean_max,hist_city_id_p_mean_min,hist_city_id_p_mean_std,hist_subsector_id_p_mean_mean,hist_subsector_id_p_mean_median,hist_subsector_id_p_mean_max,hist_subsector_id_p_mean_min,hist_subsector_id_p_mean_std,hist_merchant_category_id
_p_mean_mean,hist_merchant_category_id_p_mean_median,hist_merchant_category_id_p_mean_max,hist_merchant_category_id_p_mean_min,hist_merchant_category_id_p_mean_std,hist_merchant_id_p_mean_mean,hist_merchant_id_p_mean_median,hist_merchant_id_p_mean_max,hist_merchant_id_p_mean_min,hist_merchant_id_p_mean_std,hist_quarter_p_mean_mean,hist_quarter_p_mean_median,hist_quarter_p_mean_max,hist_quarter_p_mean_min,hist_quarter_p_mean_std,hist_month_p_mean_mean,hist_month_p_mean_median,hist_month_p_mean_max,hist_month_p_mean_min,hist_month_p_mean_std,hist_weekofyear_p_mean_mean,hist_weekofyear_p_mean_median,hist_weekofyear_p_mean_max,hist_weekofyear_p_mean_min,hist_weekofyear_p_mean_std,hist_dayofweek_p_mean_mean,hist_dayofweek_p_mean_median,hist_dayofweek_p_mean_max,hist_dayofweek_p_mean_min,hist_dayofweek_p_mean_std,hist_day_p_mean_mean,hist_day_p_mean_median,hist_day_p_mean_max,hist_day_p_mean_min,hist_day_p_mean_std,hist_hour_p_mean_mean,hist_hour_p_mean_median,hist_hour_p_mean_max,hist_hour_p_mean_min,hist_hour_p_mean_std,hist_first_year,hist_first_quarter,hist_first_month,hist_re_year,hist_re_quarter,hist_re_month,hist_now_year,hist_now_quarter,hist_now_month,hist_a2r,hist_r2now,hist_a2now,hist_p2p,hist_sleep,hist_p2p_vs_count,hist_sleep_vs_count,hist_count_vs_p2p,hist_sleep_vs_p2p,hist_p_vs_p2p,hist_i_vs_p2p,new_transactions_count,new_is_month_start_mean,new_weekend_mean,new_category_1_mean,new_category_2_nunique,new_category_3_nunique,new_state_id_nunique,new_city_id_nunique,new_subsector_id_nunique,new_merchant_category_id_nunique,new_merchant_id_nunique,new_quarter_nunique,new_month_nunique,new_weekofyear_nunique,new_dayofweek_nunique,new_day_nunique,new_hour_nunique,new_a2p_mean,new_a2p_median,new_a2p_max,new_a2p_min,new_a2p_std,new_p2r_mean,new_p2r_median,new_p2r_max,new_p2r_min,new_p2r_std,new_p2now_mean,new_p2now_median,new_p2now_max,new_p2now_min,new_p2now_std,new_month_lag_mean,new_month_lag_median,new_month_lag_max,new_month_lag_min,new_month_lag_std,new_purch
ase_amount_sum,new_purchase_amount_mean,new_purchase_amount_median,new_purchase_amount_max,new_purchase_amount_min,new_purchase_amount_std,new_installments_sum,new_installments_mean,new_installments_median,new_installments_max,new_installments_min,new_installments_std,new_p_vs_m_mean,new_p_vs_m_median,new_p_vs_m_max,new_p_vs_m_min,new_p_vs_m_std,new_p_vs_i_mean,new_p_vs_i_median,new_p_vs_i_max,new_p_vs_i_min,new_p_vs_i_std,new_purchase_date_max,new_purchase_date_min,new_category_2_p_mean_mean,new_category_2_p_mean_median,new_category_2_p_mean_max,new_category_2_p_mean_min,new_category_2_p_mean_std,new_category_3_p_mean_mean,new_category_3_p_mean_median,new_category_3_p_mean_max,new_category_3_p_mean_min,new_category_3_p_mean_std,new_state_id_p_mean_mean,new_state_id_p_mean_median,new_state_id_p_mean_max,new_state_id_p_mean_min,new_state_id_p_mean_std,new_city_id_p_mean_mean,new_city_id_p_mean_median,new_city_id_p_mean_max,new_city_id_p_mean_min,new_city_id_p_mean_std,new_subsector_id_p_mean_mean,new_subsector_id_p_mean_median,new_subsector_id_p_mean_max,new_subsector_id_p_mean_min,new_subsector_id_p_mean_std,new_merchant_category_id_p_mean_mean,new_merchant_category_id_p_mean_median,new_merchant_category_id_p_mean_max,new_merchant_category_id_p_mean_min,new_merchant_category_id_p_mean_std,new_merchant_id_p_mean_mean,new_merchant_id_p_mean_median,new_merchant_id_p_mean_max,new_merchant_id_p_mean_min,new_merchant_id_p_mean_std,new_quarter_p_mean_mean,new_quarter_p_mean_median,new_quarter_p_mean_max,new_quarter_p_mean_min,new_quarter_p_mean_std,new_month_p_mean_mean,new_month_p_mean_median,new_month_p_mean_max,new_month_p_mean_min,new_month_p_mean_std,new_weekofyear_p_mean_mean,new_weekofyear_p_mean_median,new_weekofyear_p_mean_max,new_weekofyear_p_mean_min,new_weekofyear_p_mean_std,new_dayofweek_p_mean_mean,new_dayofweek_p_mean_median,new_dayofweek_p_mean_max,new_dayofweek_p_mean_min,new_dayofweek_p_mean_std,new_day_p_mean_mean,new_day_p_mean_median,new_day_p_mean_max
,new_day_p_mean_min,new_day_p_mean_std,new_hour_p_mean_mean,new_hour_p_mean_median,new_hour_p_mean_max,new_hour_p_mean_min,new_hour_p_mean_std,new_p2p,new_sleep,new_p2p_vs_count,new_sleep_vs_count,new_count_vs_p2p,new_sleep_vs_p2p,new_p_vs_p2p,new_i_vs_p2p,c_p2p_diff,c_sleep_diff,c_p_diff,c_i_diff,p2p_count_diff,p2p_sleep_diff,p2p_p_diff,p2p_i_diff,c_p2p_diff_vs,c_sleep_diff_vs,c_p_diff_vs,c_i_diff_vs,p2p_count_diff_vs,p2p_sleep_diff_vs,p2p_p_diff_vs,p2p_i_diff_vs
0,2017-04-01,C_ID_0ab67a22ab,3,3,1,0.647059,68,0.044118,0.176471,0.338235,2,2,3,7,12,16,24,3,9,24,7,24,17,-148.602941,-155.0,-3,-272,74.718005,-125.397059,-119.0,-2,-271,74.718005,-245.397059,-239.0,-122,-391,74.718005,-3.632353,-3.5,0,-8,2.454994,-40.733733,-0.599025,-0.689206,0.235676,-0.743902,0.192268,141.0,2.073529,1.0,12.0,1.0,2.061127,-0.212196,-0.144573,0.026186,-0.739395,0.194647,-0.265675,-0.344603,0.078559,-0.371951,0.127292,1514510000.0,1491330000.0,-0.020932,0.14957,0.14957,-0.354523,0.240264,-0.239369,-0.404556,0.106023,-0.404556,0.240637,-0.297235,-0.309739,0.317017,-0.354523,0.134607,-0.427323,-0.545416,0.87204,-0.609648,0.317664,-0.102223,-0.140582,2.180532,-0.317189,0.328519,-0.17167,-0.278704,7.143294,-0.553336,0.973841,-0.518461,-0.605221,-0.1521,-0.709401,0.165475,0.248072,-0.118619,1.13676,-0.175738,0.598925,0.101336,-0.132023,3.939885,-0.273128,0.968122,-0.090937,-0.131721,0.340314,-0.384181,0.19359,-0.136097,-0.156621,1.755822,-0.31751,0.248438,-0.142654,-0.146322,0.177136,-0.377722,0.110874,-0.209355,-0.307431,0.455703,-0.445395,0.251965,2017,2,4,2017,4,12,2018,2,4,-274,-120,-394,268,244,3.941176,3.588235,0.252788,0.907063,-0.151427,0.524164,3.0,0.0,0.333333,0.0,1.0,2.0,1.0,3.0,3.0,3.0,3.0,1.0,1.0,3.0,3.0,3.0,3.0,-320.666667,-321.0,-308.0,-333.0,12.503333,46.666667,47.0,59.0,34.0,12.503333,-73.333333,-73.0,-61.0,-86.0,12.503333,2.0,2.0,2.0,2.0,0.0,-1.777156,-0.592385,-0.671775,-0.383266,-0.722114,0.182843,5.0,1.666667,1.0,3.0,1.0,1.154701,-0.197462,-0.223925,-0.127755,-0.240705,0.060948,-0.264254,-0.335888,-0.095817,-0.361057,0.146413,1519845000.0,1517651000.0,-0.569242,-0.569242,-0.569242,-0.569242,0.0,-0.391755,-0.606486,0.037708,-0.606486,0.371926,-0.572879,-0.572879,-0.572879,-0.572879,0.0,-0.595325,-0.596728,-0.575749,-0.613498,0.018914,-0.546388,-0.615356,-0.365697,-0.658111,0.157936,-0.568895,-0.616204,-0.397498,-0.692982,0.153318,-0.637906,-0.641898,-0.566239,-0.70558,0.069756,-0.54775,-0.54775,-0.54775,-0.54775,0.0,-0.546561,-0.5465
61,-0.546561,-0.546561,0.0,-0.551696,-0.552157,-0.542877,-0.560054,0.008598,-0.550065,-0.54742,-0.533353,-0.569421,0.018179,-0.547958,-0.54934,-0.54123,-0.553305,0.006155,-0.560239,-0.551282,-0.548746,-0.58069,0.017756,25.0,22.0,8.333333,7.333333,0.115385,0.846154,-0.068352,0.192308,-4.392157,-3.745098,-0.00664,0.406863,0.137403,0.060909,-0.083074,0.331856,-1.114428,-1.043716,0.011085,0.196217,0.543552,0.06715,0.548612,0.633115
1,2017-01-01,C_ID_130fd0cbdd,2,3,0,0.987179,78,0.0,0.217949,0.025641,3,2,3,4,12,16,27,2,5,20,7,27,18,-93.320513,-97.5,-12,-413,64.35383,-329.679487,-325.5,-10,-411,64.35383,-451.679487,-447.5,-132,-533,64.35383,-10.410256,-10.0,0,-13,2.164866,-49.136513,-0.629955,-0.679288,0.318817,-0.731881,0.154999,83.0,1.064103,1.0,4.0,1.0,0.405794,-0.069541,-0.057727,0.028983,-0.676283,0.094028,-0.311694,-0.339644,0.159409,-0.365941,0.083459,1518989000.0,1484321000.0,-0.159615,-0.158951,0.180375,-0.354523,0.049884,-0.391464,-0.404556,0.106023,-0.404556,0.081225,0.060412,0.078031,0.078031,-0.431205,0.088956,0.00592,0.023578,0.023578,-0.425863,0.080361,-0.13853,-0.245685,2.180532,-0.678829,0.580141,-0.175647,-0.436136,4.532535,-0.698283,0.961917,-0.497874,-0.604438,3.271515,-0.715133,0.462792,0.480713,1.13676,1.13676,-0.209863,0.677447,1.049399,-0.035313,3.939885,-0.246819,1.882838,2.70536,-0.152121,17.115643,-0.468301,6.489265,-0.065119,-0.156621,1.755822,-0.31751,0.435203,0.07223,-0.13868,5.85962,-0.377722,1.172931,-0.217898,-0.296543,0.455703,-0.49118,0.190537,2017,1,1,2018,1,2,2018,2,6,-423,-122,-545,401,374,5.141026,4.794872,0.19403,0.930348,-0.12223,0.206468,9.0,0.0,0.333333,0.222222,2.0,2.0,2.0,2.0,6.0,8.0,9.0,2.0,2.0,6.0,6.0,7.0,8.0,-448.111111,-435.0,-426.0,-474.0,20.781268,25.111111,12.0,51.0,3.0,20.781268,-96.888889,-110.0,-71.0,-119.0,20.781268,1.444444,1.0,2.0,1.0,0.527046,-5.944698,-0.660522,-0.656749,-0.506484,-0.740897,0.071147,11.0,1.222222,1.0,3.0,1.0,0.666667,-0.282974,-0.30591,-0.168828,-0.358435,0.071383,-0.316192,-0.328374,-0.126621,-0.370449,0.074061,1524247000.0,1520080000.0,-0.515773,-0.557578,-0.369453,-0.557578,0.082955,-0.534909,-0.606486,0.037708,-0.606486,0.214731,-0.519941,-0.562938,-0.369453,-0.562938,0.085319,-0.505944,-0.551593,-0.346173,-0.551593,0.090581,-0.549508,-0.549586,-0.365697,-0.658111,0.095627,-0.590809,-0.611491,-0.506288,-0.656303,0.061433,-0.66007,-0.660355,-0.552479,-0.741099,0.050751,-0.547722,-0.54775,-0.547688,-0.54775,3.3e-05,-0
.546773,-0.548054,-0.545173,-0.548054,0.001518,-0.545191,-0.544412,-0.5431,-0.552157,0.002966,-0.551899,-0.54742,-0.532678,-0.600648,0.023537,-0.546716,-0.541269,-0.539271,-0.563288,0.008651,-0.553473,-0.548746,-0.527109,-0.58415,0.019471,48.0,41.0,5.333333,4.555556,0.183673,0.836735,-0.12132,0.22449,-0.192308,0.239316,0.030567,-0.15812,0.010356,0.093614,-0.00091,-0.018022,-0.037406,0.049911,-0.048522,-0.148594,0.053375,0.100622,0.007443,-0.087288
2,2017-08-01,C_ID_b709037bc5,5,1,1,0.692308,13,0.0,0.0,0.076923,3,3,4,4,6,8,9,3,6,7,4,7,7,-127.615385,-126.0,-24,-185,49.157126,-83.384615,-85.0,-26,-187,49.157126,-205.384615,-207.0,-148,-309,49.157126,-2.076923,-2.0,0,-6,1.754116,4.52884,0.348372,0.214624,2.525866,-0.536537,0.906547,47.0,4.7,4.0,10.0,1.0,3.12872,0.273168,0.091151,1.262933,-0.134134,0.487622,-0.015571,-0.018447,0.229624,-0.268268,0.150249,1517598000.0,1503673000.0,-0.104152,-0.104457,0.14957,-0.354523,0.1029,0.055769,0.106023,0.106023,-0.404556,0.139836,-0.130998,-0.139312,0.317017,-0.354523,0.150376,-0.144413,-0.162348,0.207556,-0.359418,0.120804,0.184905,0.151362,0.94426,-0.150459,0.267255,0.056388,-0.249279,1.215995,-0.553336,0.611124,2.370789,-0.143339,30.707943,-0.611614,8.531681,-0.184469,-0.175738,-0.118619,-0.209863,0.025971,-0.185603,-0.155056,-0.121147,-0.273128,0.064902,-0.222028,-0.23409,0.009732,-0.384181,0.138012,-0.172971,-0.156621,-0.059974,-0.273504,0.078112,0.367995,-0.140629,5.85962,-0.290368,1.657381,-0.285885,-0.325941,-0.032755,-0.49118,0.178758,2017,3,8,2018,1,2,2018,2,6,-211,-122,-333,161,154,12.384615,11.846154,0.080247,0.950617,0.027956,0.290123,2.0,0.5,0.0,0.5,2.0,2.0,2.0,2.0,2.0,2.0,2.0,1.0,1.0,2.0,2.0,2.0,2.0,-218.0,-218.0,-212.0,-224.0,8.485281,7.0,7.0,13.0,1.0,8.485281,-115.0,-115.0,-109.0,-121.0,8.485281,1.0,1.0,1.0,1.0,0.0,0.180138,0.090069,0.090069,0.904506,-0.724368,1.151788,11.0,5.5,5.5,10.0,1.0,6.363961,0.045034,0.045034,0.452253,-0.362184,0.575894,-0.139978,-0.139978,0.082228,-0.362184,0.314247,1520947000.0,1519916000.0,-0.459234,-0.459234,-0.369453,-0.549015,0.126969,-0.284389,-0.284389,0.037708,-0.606486,0.455514,-0.453044,-0.453044,-0.369453,-0.536635,0.118215,-0.448022,-0.448022,-0.346173,-0.54987,0.144035,-0.470837,-0.470837,-0.276988,-0.664686,0.274144,-0.378499,-0.378499,-0.058744,-0.698255,0.452203,-0.441391,-0.441391,-0.181781,-0.701001,0.367144,-0.54775,-0.54775,-0.54775,-0.54775,0.0,-0.548054,-0.548054,-0.548054,-0.548054,0.0,-0.547687,-0.547687,-0.5
43217,-0.552157,0.006321,-0.536159,-0.536159,-0.534049,-0.53827,0.002985,-0.553614,-0.553614,-0.543053,-0.564175,0.014936,-0.542785,-0.542785,-0.534288,-0.551282,0.012017,11.0,9.0,5.5,4.5,0.166667,0.75,0.015011,0.916667,6.884615,7.346154,0.258303,-0.8,-0.08642,0.200617,0.012944,-0.626543,0.555901,0.62013,0.741458,-0.170213,-1.076923,0.211039,0.463028,-2.159574
3,2017-12-01,C_ID_d27d835a9f,2,1,0,1.0,26,0.0,0.269231,0.0,1,3,1,1,11,18,23,2,3,6,7,11,9,-34.269231,-12.5,-3,-81,33.270477,-54.730769,-76.5,-8,-86,33.270477,-176.730769,-198.5,-130,-208,33.270477,-1.230769,-2.0,0,-2,0.951113,-13.690715,-0.526566,-0.587492,0.087965,-0.731881,0.219162,40.0,1.666667,1.0,6.0,1.0,1.34056,-0.328406,-0.21947,0.029322,-0.731881,0.251145,-0.251455,-0.293746,-0.020331,-0.365941,0.105024,1519127000.0,1512392000.0,0.14957,0.14957,0.14957,0.14957,0.0,-0.231477,-0.404556,0.106023,-0.404556,0.242876,0.317017,0.317017,0.317017,0.317017,0.0,0.87204,0.87204,0.87204,0.87204,0.0,0.091971,-0.140582,2.180532,-0.489419,0.651535,0.402851,-0.278704,7.143294,-0.608565,1.76355,0.088851,-0.529965,14.162992,-0.716833,2.880835,-0.190175,-0.175738,-0.175738,-0.209863,0.017193,-0.146021,-0.12716,-0.12716,-0.246819,0.032491,-0.20869,-0.29518,0.102304,-0.41125,0.17007,0.147854,-0.240371,1.755822,-0.31751,0.803369,-0.131512,-0.13512,0.177136,-0.36744,0.156669,-0.300105,-0.31832,0.25547,-0.49118,0.188545,2017,4,12,2018,1,2,2018,2,6,-89,-122,-211,77,66,2.961538,2.538462,0.333333,0.846154,-0.175522,0.512821,10.0,0.0,0.3,0.1,3.0,2.0,3.0,3.0,8.0,10.0,10.0,2.0,2.0,7.0,5.0,8.0,7.0,-111.9,-111.0,-93.0,-137.0,15.828596,22.9,22.0,48.0,4.0,15.828596,-99.1,-100.0,-74.0,-118.0,15.828596,1.3,1.0,2.0,1.0,0.483046,-5.743674,-0.574367,-0.581391,-0.44788,-0.671775,0.073166,29.0,2.9,2.0,12.0,1.0,3.3483,-0.25779,-0.2645,-0.149293,-0.317104,0.051744,-0.210945,-0.209654,-0.034452,-0.335888,0.102593,1524000000.0,1520162000.0,-0.54724,-0.569242,-0.369453,-0.569242,0.06279,-0.284389,-0.284389,0.037708,-0.606486,0.33952,-0.543401,-0.564584,-0.369453,-0.564584,0.061344,-0.526443,-0.546606,-0.346173,-0.546606,0.063341,-0.51991,-0.549586,-0.297452,-0.674678,0.10937,-0.50466,-0.51014,-0.29962,-0.661512,0.117167,-0.518888,-0.536101,-0.364665,-0.643995,0.107855,-0.547731,-0.54775,-0.547688,-0.54775,3e-05,-0.54719,-0.548054,-0.545173,-0.548054,0.001392,-0.547408,-0.546489,-0.5431,-0.558738,0.0047,-0
.550386,-0.53827,-0.534049,-0.600648,0.022363,-0.556852,-0.559103,-0.539271,-0.576709,0.011465,-0.564639,-0.580763,-0.527109,-0.581573,0.023186,44.0,36.0,4.4,3.6,0.222222,0.8,-0.127637,0.644444,-1.438462,-1.061538,0.047801,-1.233333,0.111111,0.046154,-0.047885,-0.131624,-0.485714,-0.418182,-0.09078,-0.74,0.333333,0.054545,0.272814,-0.256667
4,2015-12-01,C_ID_2b5e3df5c2,5,1,1,0.790909,110,0.018182,0.190909,0.0,3,2,4,5,15,31,47,4,12,34,7,27,21,-614.609091,-615.5,-399,-819,141.926011,-205.390909,-204.5,-1,-421,141.926011,-327.390909,-326.5,-123,-543,141.926011,-6.227273,-6.5,0,-13,4.530547,25.139384,0.22854,-0.671775,15.782255,-0.746758,2.777764,120.0,1.090909,1.0,4.0,1.0,0.43988,0.009947,-0.06296,4.759868,-0.726622,0.76548,0.107407,-0.335888,7.891128,-0.373379,1.384675,1519728000.0,1483444000.0,-0.132308,-0.158951,0.180375,-0.158951,0.089769,-0.376706,-0.404556,0.106023,-0.404556,0.116479,0.108701,0.078031,0.551151,0.078031,0.106956,-0.336999,-0.425863,1.563969,-0.425863,0.322818,-0.071897,-0.245685,2.180532,-0.489419,0.572357,0.051363,-0.278704,7.143294,-0.553336,1.136305,-0.367042,-0.584207,0.661174,-0.710944,0.366086,0.032047,-0.209863,1.13676,-0.209863,0.492037,0.134613,-0.155056,3.939885,-0.273128,1.072386,-0.160691,-0.23409,0.340314,-0.468301,0.183891,-0.036406,-0.207237,1.755822,-0.31751,0.575449,0.22965,-0.143475,5.85962,-0.377722,1.480705,-0.225689,-0.271137,0.455703,-0.49118,0.216682,2015,4,12,2018,1,2,2018,2,6,-820,-122,-942,419,392,3.809091,3.563636,0.261905,0.933333,0.059856,0.285714,6.0,0.0,0.333333,0.0,2.0,3.0,2.0,2.0,4.0,5.0,6.0,2.0,2.0,3.0,4.0,5.0,5.0,-832.5,-827.5,-824.0,-863.0,15.083103,12.5,7.5,43.0,4.0,15.083103,-109.5,-114.5,-79.0,-118.0,15.083103,1.166667,1.0,2.0,1.0,0.408248,12.064997,2.010833,-0.654495,14.279604,-0.704082,6.028671,6.0,1.2,1.0,2.0,1.0,0.447214,0.991728,-0.327247,7.139802,-0.352041,3.018655,0.681235,-0.335888,4.759868,-0.352041,2.280061,1523535000.0,1520132000.0,-0.556457,-0.557578,-0.550852,-0.557578,0.002746,-0.392367,-0.606486,0.037708,-0.606486,0.331713,-0.559791,-0.562938,-0.544055,-0.562938,0.007709,-0.566257,-0.568039,-0.557349,-0.568039,0.004364,-0.554788,-0.658111,-0.297452,-0.674678,0.174399,-0.545649,-0.631455,-0.325557,-0.642706,0.143083,1.928088,-0.523168,14.279604,-0.710562,6.054033,-0.54774,-0.54775,-0.547688,-0.54775,2.5e-05,-0.547574,-0.548054,-0.5
45173,-0.548054,0.001176,-0.547424,-0.545704,-0.544412,-0.552157,0.0038,-0.559768,-0.542845,-0.533353,-0.600648,0.03199,-0.554256,-0.558385,-0.541269,-0.563288,0.009613,-0.566381,-0.560187,-0.534396,-0.607469,0.026687,39.0,34.0,6.5,5.666667,0.15,0.85,0.301625,0.15,-2.690909,-2.10303,-1.782293,-0.109091,0.111905,0.083333,-0.241769,0.135714,-0.706444,-0.590136,-7.798609,-0.1,0.427273,0.089286,-4.039203,0.475


In [67]:
# Average of the 'authorized_flag_mean' column over all rows of the train frame
# (shown as the cell result).
train['authorized_flag_mean'].mean()

0.8933534742623435

In [68]:
# Average of the 'authorized_flag_mean' column over all rows of the test frame
# (shown as the cell result).
test['authorized_flag_mean'].mean()

0.8930119505693119

In [69]:
# Report the (rows, columns) shape of each feature frame.
print("Train Shape:", train.shape)
print("Test Shape:", test.shape)
# Force a garbage-collection pass. Because this is the cell's last expression,
# its return value (the number of unreachable objects found) is echoed as output.
gc.collect()

Train Shape: (201917, 304)
Test Shape: (123623, 302)


90

In [70]:
def summary(df):
    """Profile every column of ``df`` for a quick data-quality overview.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to profile. It is only read, never modified.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``df`` (in column order) with the fields:

        * ``feature`` - column label
        * ``unique``  - number of distinct non-null values
        * ``missing`` - percentage of null entries
        * ``mode``    - percentage of rows occupied by the single most frequent
          value, with NaN counted as a value (this is a share, not the modal
          value itself)
        * ``type``    - column dtype

        For a frame with zero rows both percentages are undefined and are
        reported as NaN instead of raising.
    """
    n_rows = df.shape[0]
    stats = []
    for col in df.columns:
        series = df[col]
        if n_rows:
            missing_pct = series.isnull().sum() * 100 / n_rows
            # value_counts sorts by frequency in descending order, so the first
            # entry is the share of the dominant value.
            top_pct = series.value_counts(normalize=True, dropna=False).values[0] * 100
        else:
            # No rows: value_counts() is empty and the row count is zero, so
            # neither percentage can be computed.
            missing_pct = top_pct = float('nan')
        stats.append((col, series.nunique(), missing_pct, top_pct, series.dtype))

    return pd.DataFrame(stats, columns=['feature', 'unique', 'missing', 'mode', 'type'])

In [71]:
# Build the per-column profile for the train frame first, then the test frame.
train_summary, test_summary = (summary(frame) for frame in (train, test))

In [72]:
# Order each profile by its 'mode' column, largest value first.
tmp_1, tmp_2 = (
    profile.sort_values(by='mode', ascending=False)
    for profile in (train_summary, test_summary)
)

In [73]:
# Display the train profile, sorted by the 'mode' column in descending order.
tmp_1

Unnamed: 0,feature,unique,missing,mode,type
287,outliers,2,0.0,98.906977,int64
121,hist_dayofweek_p_mean_min,7,0.0,93.436907,float64
139,hist_now_year,2,0.0,92.246319,int64
106,hist_quarter_p_mean_min,4,0.0,91.651025,float64
41,hist_month_lag_max,12,0.0,89.991927,int64
120,hist_dayofweek_p_mean_max,7,0.0,86.64897,float64
140,hist_now_quarter,4,0.0,83.931516,int64
137,hist_re_quarter,4,0.0,79.116667,int64
131,hist_hour_p_mean_min,23,0.0,78.895289,float64
136,hist_re_year,2,0.0,78.289594,int64


In [74]:
# Display the test profile, sorted by the 'mode' column in descending order.
tmp_2

Unnamed: 0,feature,unique,missing,mode,type
120,hist_dayofweek_p_mean_min,6,0.0,93.430834,float64
138,hist_now_year,2,0.0,92.364689,int64
105,hist_quarter_p_mean_min,4,0.0,91.699765,float64
40,hist_month_lag_max,12,0.0,90.163643,int64
119,hist_dayofweek_p_mean_max,7,0.0,86.591492,float64
139,hist_now_quarter,4,0.0,84.1866,int64
136,hist_re_quarter,4,0.0,79.254669,int64
130,hist_hour_p_mean_min,22,0.0,78.753145,float64
135,hist_re_year,2,0.0,78.478115,int64
153,new_is_month_start_mean,209,11.00766,76.184852,float64


In [75]:
# Persist the train feature frame as CSV under ./data.
# NOTE(review): to_csv writes the row index by default, so the file gains an
# unnamed leading column; readers must use index_col=0 (or this call should pass
# index=False) — confirm which is intended.
train.to_csv("./data/pre_train.csv")

In [76]:
# Persist the test feature frame as CSV under ./data.
# NOTE(review): to_csv writes the row index by default, so the file gains an
# unnamed leading column; readers must use index_col=0 (or this call should pass
# index=False) — confirm which is intended.
test.to_csv("./data/pre_test.csv")

In [77]:
# Keep a handle on the 'target' column of the train frame for the
# distribution plots in the following cells.
tmp = train['target']

In [78]:
# Histogram of the target values using 50 bins.
tmp.plot(kind='hist', bins=50)

<matplotlib.axes._subplots.AxesSubplot at 0x7f32064cd630>

In [79]:
# Histogram (50 bins) of the target multiplied by the constant log10(2).
# NOTE(review): if the target is a base-2 logarithm this rescales it to base 10 — confirm.
tmp.mul(np.log10(2)).plot.hist(bins=50)

<matplotlib.axes._subplots.AxesSubplot at 0x7f32064cd630>

In [80]:
# Histogram (50 bins) of log2(2**target - 1e-10 + 1).
# NOTE(review): the purpose of the 1e-10 offset is not evident from this cell — confirm.
np.log2(np.exp2(tmp) - 1e-10 + 1).plot.hist(bins=50)

<matplotlib.axes._subplots.AxesSubplot at 0x7f32064cd630>