In [2]:
import pandas as pd
import numpy as np
import os
os.chdir('..')

from tqdm import tqdm
from sklearn.linear_model import LinearRegression

In [3]:
transactions = pd.read_csv('transactions.csv')
transactions.head()

Unnamed: 0,customer_id,tr_datetime,mcc_code,tr_type,amount,term_id
0,7391977,0 10:23:26,4814,1030,-2245.92,
1,7391977,1 10:19:29,6011,7010,56147.89,
2,7391977,1 10:20:56,4829,2330,-56147.89,
3,7391977,1 10:39:54,5499,1010,-1392.47,
4,7391977,2 15:33:42,5499,1010,-920.83,


Для обучения будем использовать только траты клиентов

Добавим признак обозначающий день транзакции

In [3]:
train_transactions = transactions[transactions.amount < 0].copy()
train_transactions['day'] = train_transactions.tr_datetime.apply(lambda dt: dt.split()[0]).astype(int)
train_transactions.head()

Unnamed: 0,customer_id,tr_datetime,mcc_code,tr_type,amount,term_id,day
0,7391977,0 10:23:26,4814,1030,-2245.92,,0
2,7391977,1 10:20:56,4829,2330,-56147.89,,1
3,7391977,1 10:39:54,5499,1010,-1392.47,,1
4,7391977,2 15:33:42,5499,1010,-920.83,,2
5,7391977,2 15:53:49,5541,1010,-14643.37,,2


Так как нам тестовая выборка не дана, то мы сгенерируем её сами. Для этого возьмём все mcc_code представленные в обучающей выборке и предскажем их на 30 дней вперед.

In [4]:
test_transactions = pd.DataFrame(columns=train_transactions.mcc_code.unique(), 
                                 index=np.arange(1, 31) + train_transactions.day.max())
test_transactions = test_transactions.unstack().reset_index().dropna(axis=1)
test_transactions.columns = ['mcc_code', 'day']
test_transactions.head()

Unnamed: 0,mcc_code,day
0,4814,457
1,4814,458
2,4814,459
3,4814,460
4,4814,461


In [5]:
train_grid = pd.DataFrame(columns=train_transactions.mcc_code.unique(), 
                          index=train_transactions.day.unique())
train_grid = train_grid.unstack().reset_index().dropna(axis=1)
train_grid.columns = ['mcc_code', 'day']
train_grid.head()

Unnamed: 0,mcc_code,day
0,4814,0
1,4814,1
2,4814,2
3,4814,3
4,4814,4


Сделаем дополнительные признаки, касающиеся даты

In [6]:
for tr_table in tqdm([train_transactions, test_transactions, train_grid]):
    tr_table['week_num'] = tr_table['day'] // 7
    tr_table['week_day'] = tr_table['day'] % 7
    tr_table['month_num'] = tr_table['day'] // 28
    tr_table['month_day'] = tr_table['day'] % 28
    
train_transactions.head()

100%|██████████| 3/3 [00:00<00:00,  2.54it/s]


Unnamed: 0,customer_id,tr_datetime,mcc_code,tr_type,amount,term_id,day,week_num,week_day,month_num,month_day
0,7391977,0 10:23:26,4814,1030,-2245.92,,0,0,0,0,0
2,7391977,1 10:20:56,4829,2330,-56147.89,,1,0,1,0,1
3,7391977,1 10:39:54,5499,1010,-1392.47,,1,0,1,0,1
4,7391977,2 15:33:42,5499,1010,-920.83,,2,0,2,0,2
5,7391977,2 15:53:49,5541,1010,-14643.37,,2,0,2,0,2


In [7]:
train_transactions = \
    pd.merge(train_grid,
             train_transactions.groupby([
                 'day', 'week_num', 'week_day', 'month_num', 'month_day', 'mcc_code'])[['amount']]\
                 .sum().reset_index(),
             how='left').fillna(0)
train_transactions.head()

Unnamed: 0,mcc_code,day,week_num,week_day,month_num,month_day,amount
0,4814,0,0,0,0,0,-11098744.26
1,4814,1,0,1,0,1,-7881825.53
2,4814,2,0,2,0,2,-6777480.45
3,4814,3,0,3,0,3,-9277943.73
4,4814,4,0,4,0,4,-9999757.21


Добавим признаки, которые показывают какой объём транзакций был месяц назад плюс минус один день.

In [8]:
for day_shift in tqdm([-1, 0, 1]):
    for month_shift in train_transactions.month_num.unique()[1:]:
        train_shift = train_transactions.copy()
        train_shift['month_num'] += month_shift
        train_shift['month_day'] += day_shift
        train_shift['amount_day_{}_{}'.format(day_shift, month_shift)] = np.log(-train_shift['amount'] + 1)
        train_shift = train_shift[['month_num', 'month_day', 'mcc_code', 'amount_day_{}_{}'.format(day_shift, month_shift)]]

        train_transactions = pd.merge(train_transactions, train_shift, 
                                      on=['month_num', 'month_day', 'mcc_code'], how='left').fillna(0)
        test_transactions = pd.merge(test_transactions, train_shift, 
                                     on=['month_num', 'month_day', 'mcc_code'], how='left').fillna(0)
        
train_transactions.head()

100%|██████████| 3/3 [00:05<00:00,  1.69s/it]


Unnamed: 0,mcc_code,day,week_num,week_day,month_num,month_day,amount,amount_day_-1_1,amount_day_-1_2,amount_day_-1_3,...,amount_day_1_7,amount_day_1_8,amount_day_1_9,amount_day_1_10,amount_day_1_11,amount_day_1_12,amount_day_1_13,amount_day_1_14,amount_day_1_15,amount_day_1_16
0,4814,0,0,0,0,0,-11098744.26,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,4814,1,0,1,0,1,-7881825.53,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,4814,2,0,2,0,2,-6777480.45,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4814,3,0,3,0,3,-9277943.73,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4814,4,0,4,0,4,-9999757.21,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Также добавим OHE кодирование для категорильного признака mcc_code

In [9]:
train = pd.get_dummies(train_transactions, columns=['mcc_code'])
test = pd.get_dummies(test_transactions, columns=['mcc_code'])
train.head()

Unnamed: 0,day,week_num,week_day,month_num,month_day,amount,amount_day_-1_1,amount_day_-1_2,amount_day_-1_3,amount_day_-1_4,...,mcc_code_8299,mcc_code_8398,mcc_code_8641,mcc_code_8699,mcc_code_8999,mcc_code_9211,mcc_code_9222,mcc_code_9311,mcc_code_9399,mcc_code_9402
0,0,0,0,0,0,-11098744.26,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,1,0,1,-7881825.53,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
2,2,0,2,0,2,-6777480.45,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
3,3,0,3,0,3,-9277943.73,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
4,4,0,4,0,4,-9999757.21,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0


In [10]:
test.head()

Unnamed: 0,day,week_num,week_day,month_num,month_day,amount_day_-2_1,amount_day_-2_2,amount_day_-2_3,amount_day_-2_4,amount_day_-2_5,...,mcc_code_8299,mcc_code_8398,mcc_code_8641,mcc_code_8699,mcc_code_8999,mcc_code_9211,mcc_code_9222,mcc_code_9311,mcc_code_9399,mcc_code_9402
0,457,65,2,15,7,15.909834,16.309198,16.124614,16.217592,16.18825,...,0,0,0,0,0,0,0,0,0,0
1,458,65,3,15,8,16.217647,15.970313,16.174195,16.118953,15.884278,...,0,0,0,0,0,0,0,0,0,0
2,459,65,4,15,9,16.244498,15.857968,16.237173,16.331961,16.19614,...,0,0,0,0,0,0,0,0,0,0
3,460,65,5,15,10,16.301091,16.236765,16.172419,16.342454,16.202559,...,0,0,0,0,0,0,0,0,0,0
4,461,65,6,15,11,16.257208,16.208697,15.931393,16.369126,16.277556,...,0,0,0,0,0,0,0,0,0,0


In [11]:
c = train.columns.difference(['amount'])

clf = LinearRegression(n_jobs=-1)
clf.fit(train[c], np.log1p(-train['amount']))

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=-1, normalize=False)

In [23]:
test_transactions['volume'] = np.expm1(clf.predict(test[c]))
test_transactions.head()

Unnamed: 0,mcc_code,day,week_num,week_day,month_num,month_day,amount_day_-1_1,amount_day_-1_2,amount_day_-1_3,amount_day_-1_4,...,amount_day_1_7,amount_day_1_8,amount_day_1_9,amount_day_1_10,amount_day_1_11,amount_day_1_12,amount_day_1_13,amount_day_1_14,amount_day_1_15,volume
0,4814,457,65,2,15,7,16.000073,16.239146,16.396831,15.931877,...,15.972526,16.170612,16.114276,15.794399,16.125836,16.102922,16.16779,15.776396,16.072431,11421630.0
1,4814,458,65,3,15,8,15.909834,16.309198,16.124614,16.217592,...,15.847769,16.263352,16.10872,15.775248,16.039853,16.026005,16.161532,15.68514,16.022031,11906370.0
2,4814,459,65,4,15,9,16.217647,15.970313,16.174195,16.118953,...,16.121188,15.975065,16.122443,15.935854,15.728413,16.105708,16.155763,16.057572,15.849065,12147140.0
3,4814,460,65,5,15,10,16.244498,15.857968,16.237173,16.331961,...,16.068822,16.297282,16.120781,15.916387,16.185141,15.929218,16.113109,16.048617,15.718409,12110120.0
4,4814,461,65,6,15,11,16.301091,16.236765,16.172419,16.342454,...,16.17033,15.910726,15.95624,15.844803,16.16477,15.779407,16.18501,16.087859,16.157151,12758090.0


In [25]:
test_transactions['id'] = test_transactions[['mcc_code', 'day']].apply(lambda x: '-'.join(map(str, x)), axis=1)
test_transactions[['id', 'volume']].to_csv('baseline.csv', index=False)
test_transactions[['id', 'volume']].head()

Unnamed: 0,id,volume
0,4814-457,11421630.0
1,4814-458,11906370.0
2,4814-459,12147140.0
3,4814-460,12110120.0
4,4814-461,12758090.0


### Just change clf and days radius

In [49]:
from xgboost import XGBRegressor
clf = XGBRegressor(learning_rate=0.05, max_depth=8, n_estimators=250, nthread=-1, reg_alpha=1, reg_lambda=1)
%time clf.fit(train[c], np.log1p(-train['amount']))
xgb_test = test_transactions.drop(['id', 'volume'], 1).copy()
xgb_test['volume'] = np.expm1(clf.predict(test[c]))
xgb_test['id'] = xgb_test[['mcc_code', 'day']].apply(lambda x: '-'.join(map(str, x)), axis=1)
xgb_test[['id', 'volume']].to_csv('xgb_baseline2.csv', index=False)
xgb_test[['id', 'volume']].head()

CPU times: user 29min 36s, sys: 39.1 s, total: 30min 15s
Wall time: 1min 54s


Unnamed: 0,id,volume
0,4814-457,8405468.0
1,4814-458,10635910.0
2,4814-459,10114323.0
3,4814-460,9813165.0
4,4814-461,10446837.0


In [13]:
from xgboost import XGBRegressor
clf = XGBRegressor(learning_rate=0.05, max_depth=8, n_estimators=450, nthread=-1, reg_alpha=1, reg_lambda=1)
%time clf.fit(train[c], np.log1p(-train['amount']))
try:
    xgb_test = test_transactions.drop(['id', 'volume'], 1).copy()
except:
    xgb_test = test_transactions.copy()
xgb_test['volume'] = np.expm1(clf.predict(test[c]))
xgb_test['id'] = xgb_test[['mcc_code', 'day']].apply(lambda x: '-'.join(map(str, x)), axis=1)
xgb_test[['id', 'volume']].to_csv('xgb_baseline2_with_5days.csv', index=False)
xgb_test[['id', 'volume']].head()

CPU times: user 44min 59s, sys: 52.1 s, total: 45min 52s
Wall time: 2min 53s


Unnamed: 0,id,volume
0,4814-457,8554625.0
1,4814-458,11198162.0
2,4814-459,10376439.0
3,4814-460,10233899.0
4,4814-461,10938486.0


In [47]:
from xgboost import XGBRegressor
clf = XGBRegressor(learning_rate=0.05, max_depth=8, n_estimators=250, nthread=-1, reg_alpha=1, reg_lambda=1)
%time clf.fit(train[train.day>=30][c], np.log1p(-train[train.day>=30]['amount']))
xgb_test = test_transactions.drop(['id', 'volume'], 1).copy()
xgb_test['volume'] = np.expm1(clf.predict(test[c]))
xgb_test['id'] = xgb_test[['mcc_code', 'day']].apply(lambda x: '-'.join(map(str, x)), axis=1)
xgb_test[['id', 'volume']].to_csv('xgb_baseline2-1.csv', index=False)
xgb_test[['id', 'volume']].head()

CPU times: user 22min 36s, sys: 26 s, total: 23min 2s
Wall time: 1min 27s


Unnamed: 0,id,volume
0,4814-457,9544903.0
1,4814-458,10848891.0
2,4814-459,10718572.0
3,4814-460,10467421.0
4,4814-461,10623178.0
