In [2]:
#!/usr/bin/env python
import os

import lightgbm as lgb
import pandas as pd
from sklearn.model_selection import train_test_split

from utils import FileReader

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [4]:
# Read every competition table through the project's FileReader helper.
# The returned container is indexed by table name (e.g. data['train']).
reader = FileReader()
files = [
    './data/train.csv',
    './data/merchants.csv',
    './data/historical_transactions.csv',
    './data/new_merchant_transactions.csv',
]
data = reader.load_file(files)

# Derive calendar features from the first active month.
# NOTE: these columns are added in place on the loaded train table.
train = data['train']
first_active = pd.to_datetime(train['first_active_month'])
train['first_active_month'] = first_active
train['year'] = first_active.dt.year
train['month'] = first_active.dt.month

# Model inputs (two calendar features + three provided features) and the label.
feature_columns = ['year', 'month', 'feature_1', 'feature_2', 'feature_3']
train_x = train[feature_columns]
train_y = train['target']

Loading file: train
Loading file: merchants
Loading file: historical_transactions
Loading file: new_merchant_transactions


In [5]:
# Pull the merchants table out of the loaded tables and preview its first 20 rows.
merchants = data['merchants']
merchants.head(n=20)

Unnamed: 0,merchant_id,merchant_group_id,merchant_category_id,subsector_id,numerical_1,numerical_2,category_1,most_recent_sales_range,most_recent_purchases_range,avg_sales_lag3,...,avg_sales_lag6,avg_purchases_lag6,active_months_lag6,avg_sales_lag12,avg_purchases_lag12,active_months_lag12,category_4,city_id,state_id,category_2
0,M_ID_838061e48c,8353,792,9,-0.057471,-0.057471,N,E,E,-0.4,...,-2.25,18.666667,6,-2.32,13.916667,12,N,242,9,1.0
1,M_ID_9339d880ad,3184,840,20,-0.057471,-0.057471,N,E,E,-0.72,...,-0.74,1.291667,6,-0.57,1.6875,12,N,22,16,1.0
2,M_ID_e726bbae1e,447,690,1,-0.057471,-0.057471,N,E,E,-82.13,...,-82.13,260.0,2,-82.13,260.0,2,N,-1,5,5.0
3,M_ID_a70e9c5f81,5026,792,9,-0.057471,-0.057471,Y,E,E,,...,,4.666667,6,,3.833333,12,Y,-1,-1,
4,M_ID_64456c37ce,2228,222,21,-0.057471,-0.057471,Y,E,E,,...,,0.361111,6,,0.347222,12,Y,-1,-1,
5,M_ID_a0915f62b5,20201,87,27,-0.057471,-0.057471,N,E,E,,...,,3.666667,6,,3.833333,12,Y,160,21,5.0
6,M_ID_bfd41933db,33861,792,9,-0.057471,-0.057471,N,E,E,,...,,4.833333,6,,6.333333,12,N,60,16,1.0
7,M_ID_d8ff08219e,16430,529,20,-0.057471,-0.057471,Y,E,E,,...,,1.666667,6,,1.5,11,Y,-1,-1,
8,M_ID_c5b389236d,37179,813,29,-0.057471,-0.057471,N,E,E,,...,,189.916667,6,,197.0,7,N,248,15,1.0
9,M_ID_d2162ed113,112122,81,29,-0.057471,-0.057471,Y,E,E,,...,,1.0,2,,1.0,2,Y,-1,-1,


In [90]:
# Pull the historical transactions table and preview its first 50 rows.
historical_transactions = data['historical_transactions']
historical_transactions.head(n=50)

Unnamed: 0,authorized_flag,card_id,city_id,category_1,installments,category_3,merchant_category_id,merchant_id,month_lag,purchase_amount,purchase_date,category_2,state_id,subsector_id
0,Y,C_ID_4e6213e9bc,88,N,0,A,80,M_ID_e020e9b302,-8,-0.703331,2017-06-25 15:33:07,1.0,16,37
1,Y,C_ID_4e6213e9bc,88,N,0,A,367,M_ID_86ec983688,-7,-0.733128,2017-07-15 12:10:45,1.0,16,16
2,Y,C_ID_4e6213e9bc,88,N,0,A,80,M_ID_979ed661fc,-6,-0.720386,2017-08-09 22:04:29,1.0,16,37
3,Y,C_ID_4e6213e9bc,88,N,0,A,560,M_ID_e6d5ae8ea6,-5,-0.735352,2017-09-02 10:06:26,1.0,16,34
4,Y,C_ID_4e6213e9bc,88,N,0,A,80,M_ID_e020e9b302,-11,-0.722865,2017-03-10 01:14:19,1.0,16,37
5,Y,C_ID_4e6213e9bc,333,N,0,A,80,M_ID_50af771f8d,0,-0.734887,2018-02-24 08:45:05,1.0,9,37
6,Y,C_ID_4e6213e9bc,88,N,0,A,278,M_ID_5e8220e564,-11,-0.716855,2017-03-21 00:10:51,1.0,16,37
7,Y,C_ID_4e6213e9bc,3,N,0,A,80,M_ID_9d41786a50,-3,-0.657049,2017-11-18 20:05:55,1.0,16,37
8,Y,C_ID_4e6213e9bc,88,N,0,A,80,M_ID_979ed661fc,-8,-0.737967,2017-06-01 22:02:56,1.0,16,37
9,Y,C_ID_4e6213e9bc,88,N,0,A,80,M_ID_74ba14b5fc,-11,-0.715352,2017-03-16 15:41:22,1.0,16,37


In [87]:
# Pull the new-merchant transactions table and preview it (head() defaults to 5 rows).
new_merchant_transactions = data['new_merchant_transactions']
new_merchant_transactions.head(n=5)

Unnamed: 0,authorized_flag,card_id,city_id,category_1,installments,category_3,merchant_category_id,merchant_id,month_lag,purchase_amount,purchase_date,category_2,state_id,subsector_id
0,Y,C_ID_415bb3a509,107,N,1,B,307,M_ID_b0c793002c,1,-0.557574,2018-03-11 14:57:36,1.0,9,19
1,Y,C_ID_415bb3a509,140,N,1,B,307,M_ID_88920c89e8,1,-0.56958,2018-03-19 18:53:37,1.0,9,19
2,Y,C_ID_415bb3a509,330,N,1,B,507,M_ID_ad5237ef6b,2,-0.551037,2018-04-26 14:08:44,1.0,9,14
3,Y,C_ID_415bb3a509,-1,Y,1,B,661,M_ID_9e84cda3b1,1,-0.671925,2018-03-07 09:43:21,,-1,8
4,Y,C_ID_ef55cf8d4b,-1,Y,1,B,166,M_ID_3c86fa3831,1,-0.659904,2018-03-22 21:07:53,,-1,29


In [None]:
# Hold out 25% of the training rows for validation (fixed seed for repeatability).
# The split is written to new names (fit_x / fit_y) instead of overwriting
# train_x / train_y, so re-running this cell always splits the full training
# set rather than an already-shrunk one.
fit_x, valid_x, fit_y, valid_y = train_test_split(
    train_x, train_y, test_size=0.25, random_state=1234)

# LightGBM datasets consumed by the training cell; the validation set is
# created with the training set passed as its reference.
train_data = lgb.Dataset(fit_x, label=fit_y)
valid_data = lgb.Dataset(valid_x, label=valid_y, reference=train_data)

In [80]:
# Booster hyper-parameters: RMSE regression, up to 3000 boosting iterations.
param = {
    'num_leaves': 63,
    'num_iterations': 3000,
    'learning_rate': 0.005,
    'lambda_l2': 0.02,
    'objective': 'regression',
    'metric': 'rmse',
    'verbosity': -1}

MODEL_PATH = './model/model.txt'

# Early stopping is requested through the callback API: the
# `early_stopping_rounds=` keyword of lgb.train() was removed in LightGBM 4.0,
# whereas the lgb.early_stopping callback is accepted by old and new releases.
rt = lgb.train(param,
               train_data,
               valid_sets=[train_data, valid_data],
               callbacks=[lgb.early_stopping(stopping_rounds=200)])

# Create the output directory first so a long training run is not lost to a
# missing folder when the model is written.
os.makedirs(os.path.dirname(MODEL_PATH), exist_ok=True)
rt.save_model(MODEL_PATH)

[1]	training's rmse: 3.86352	valid_1's rmse: 3.81067
Training until validation scores don't improve for 200 rounds.
[2]	training's rmse: 3.86339	valid_1's rmse: 3.81057
[3]	training's rmse: 3.86326	valid_1's rmse: 3.81047
[4]	training's rmse: 3.86314	valid_1's rmse: 3.81038
[5]	training's rmse: 3.86301	valid_1's rmse: 3.81028
[6]	training's rmse: 3.86289	valid_1's rmse: 3.81019
[7]	training's rmse: 3.86277	valid_1's rmse: 3.8101
[8]	training's rmse: 3.86265	valid_1's rmse: 3.81001
[9]	training's rmse: 3.86253	valid_1's rmse: 3.80992
[10]	training's rmse: 3.86241	valid_1's rmse: 3.80983
[11]	training's rmse: 3.8623	valid_1's rmse: 3.80975
[12]	training's rmse: 3.86218	valid_1's rmse: 3.80966
[13]	training's rmse: 3.86207	valid_1's rmse: 3.80957
[14]	training's rmse: 3.86195	valid_1's rmse: 3.80949
[15]	training's rmse: 3.86184	valid_1's rmse: 3.80941
[16]	training's rmse: 3.86173	valid_1's rmse: 3.80932
[17]	training's rmse: 3.86162	valid_1's rmse: 3.80924
[18]	training's rmse: 3.86151	

[152]	training's rmse: 3.85315	valid_1's rmse: 3.80394
[153]	training's rmse: 3.85312	valid_1's rmse: 3.80393
[154]	training's rmse: 3.85309	valid_1's rmse: 3.80391
[155]	training's rmse: 3.85306	valid_1's rmse: 3.8039
[156]	training's rmse: 3.85302	valid_1's rmse: 3.80388
[157]	training's rmse: 3.85299	valid_1's rmse: 3.80387
[158]	training's rmse: 3.85296	valid_1's rmse: 3.80385
[159]	training's rmse: 3.85293	valid_1's rmse: 3.80384
[160]	training's rmse: 3.8529	valid_1's rmse: 3.80382
[161]	training's rmse: 3.85287	valid_1's rmse: 3.80381
[162]	training's rmse: 3.85284	valid_1's rmse: 3.8038
[163]	training's rmse: 3.85281	valid_1's rmse: 3.80379
[164]	training's rmse: 3.85278	valid_1's rmse: 3.80377
[165]	training's rmse: 3.85275	valid_1's rmse: 3.80376
[166]	training's rmse: 3.85272	valid_1's rmse: 3.80375
[167]	training's rmse: 3.85269	valid_1's rmse: 3.80374
[168]	training's rmse: 3.85266	valid_1's rmse: 3.80373
[169]	training's rmse: 3.85263	valid_1's rmse: 3.80372
[170]	trainin

[305]	training's rmse: 3.85012	valid_1's rmse: 3.80335
[306]	training's rmse: 3.85011	valid_1's rmse: 3.80336
[307]	training's rmse: 3.8501	valid_1's rmse: 3.80336
[308]	training's rmse: 3.85009	valid_1's rmse: 3.80336
[309]	training's rmse: 3.85008	valid_1's rmse: 3.80336
[310]	training's rmse: 3.85007	valid_1's rmse: 3.80337
[311]	training's rmse: 3.85006	valid_1's rmse: 3.80337
[312]	training's rmse: 3.85005	valid_1's rmse: 3.80337
[313]	training's rmse: 3.85003	valid_1's rmse: 3.80338
[314]	training's rmse: 3.85002	valid_1's rmse: 3.80339
[315]	training's rmse: 3.85001	valid_1's rmse: 3.80338
[316]	training's rmse: 3.85	valid_1's rmse: 3.80339
[317]	training's rmse: 3.84999	valid_1's rmse: 3.80339
[318]	training's rmse: 3.84998	valid_1's rmse: 3.80339
[319]	training's rmse: 3.84997	valid_1's rmse: 3.8034
[320]	training's rmse: 3.84996	valid_1's rmse: 3.80339
[321]	training's rmse: 3.84995	valid_1's rmse: 3.8034
[322]	training's rmse: 3.84994	valid_1's rmse: 3.8034
[323]	training's 

[461]	training's rmse: 3.84883	valid_1's rmse: 3.8039
[462]	training's rmse: 3.84883	valid_1's rmse: 3.8039
[463]	training's rmse: 3.84882	valid_1's rmse: 3.80391
[464]	training's rmse: 3.84882	valid_1's rmse: 3.80391
[465]	training's rmse: 3.84881	valid_1's rmse: 3.80391
[466]	training's rmse: 3.8488	valid_1's rmse: 3.80391
[467]	training's rmse: 3.84879	valid_1's rmse: 3.80392
[468]	training's rmse: 3.84879	valid_1's rmse: 3.80392
[469]	training's rmse: 3.84878	valid_1's rmse: 3.80392
[470]	training's rmse: 3.84877	valid_1's rmse: 3.80393
[471]	training's rmse: 3.84877	valid_1's rmse: 3.80393
[472]	training's rmse: 3.84876	valid_1's rmse: 3.80393
[473]	training's rmse: 3.84876	valid_1's rmse: 3.80394
[474]	training's rmse: 3.84875	valid_1's rmse: 3.80394
[475]	training's rmse: 3.84875	valid_1's rmse: 3.80394
[476]	training's rmse: 3.84874	valid_1's rmse: 3.80394
[477]	training's rmse: 3.84873	valid_1's rmse: 3.80395
[478]	training's rmse: 3.84873	valid_1's rmse: 3.80395
[479]	trainin

<lightgbm.basic.Booster at 0x119f3f358>

In [15]:
# Load the test table with the same reader. is_batch=True is forwarded to
# FileReader.load_file; its exact effect is defined in utils (not visible here).
test_files = ['./data/test.csv']
data_test = reader.load_file(test_files, is_batch=True)

Loading file: test


In [20]:
# Unwrap the test table from the loader's result. The guard makes this cell
# safe to re-run: once data_test has been rebound to the DataFrame, indexing
# it with 'test' again would raise KeyError.
if not isinstance(data_test, pd.DataFrame):
    data_test = data_test['test']

# Same calendar features as the training table: year and month of first activity.
data_test['first_active_month'] = pd.to_datetime(data_test['first_active_month'])
data_test['year'] = data_test['first_active_month'].dt.year
data_test['month'] = data_test['first_active_month'].dt.month

# Feature columns in the same order used for training.
test_x = data_test[['year', 'month', 'feature_1', 'feature_2', 'feature_3']]

In [63]:
# One row per test card: its id next to the model's predicted target.
# The frame is built in one step from the id Series and the prediction array,
# instead of filling an empty frame and reshaping the prediction to (n, 1).
predicted_target = rt.predict(test_x)
submission_table = pd.DataFrame({
    'card_id': data_test['card_id'],
    'target': predicted_target,
})

In [69]:
# Version tag embedded in the submission file name.
SUBMISSION_VERSION = 1
# NOTE(review): the directory name is spelled 'prediection' in the original
# path; it is kept unchanged so existing outputs stay in the same place.
submission_path = './prediection/submission_version_%s.csv' % SUBMISSION_VERSION

# Make sure the target directory exists, then write the file without the index column.
os.makedirs(os.path.dirname(submission_path), exist_ok=True)
submission_table.to_csv(submission_path, index=False)

# Preview goes last: only the final expression of a cell is displayed.
submission_table.head(10)