In [1]:
import pandas as pd
import os
import numpy as np
from sklearn.ensemble import RandomForestClassifier
import joblib
from time import time
from time import sleep

# Show every column when a DataFrame is displayed (None removes the column limit).
pd.set_option('display.max_columns', None)

# Load Data

In [2]:
# Directory of the competition dataset; sample_submission.csv is read from here.
dsdir = 'dataset/coupon-purchase-prediction'

In [3]:
# NOTE: joblib.load unpickles its input, so only load files from a trusted source.

# Training table: load, then shuffle all rows with a fixed seed and renumber
# the index from zero so the shuffled order is reproducible.
train = joblib.load('CPP_REPRO_cl_train.pkl')
train = train.sample(frac=1, random_state=0).reset_index(drop=True)

# Test table is kept in its stored row order.
test = joblib.load('CPP_REPRO_cl_test.pkl')

# sample_submission.csv from the dataset directory; it is filled in later
# with the predicted coupons.
sample_submission_path = os.path.join(dsdir, 'sample_submission.csv')
submission = pd.read_csv(sample_submission_path)

# Preprocess Some Columns

In [4]:
# Columns removed from both frames (presumably raw dates, going by their names).
date_columns = ['DISPFROM', 'DISPEND', 'VALIDFROM', 'VALIDEND', 'REG_DATE', 'WITHDRAW_DATE']

for frame in (train, test):
    # Dropped in place so `train` and `test` keep referring to the trimmed frames.
    frame.drop(columns=date_columns, inplace=True)

In [5]:
# Disabled: would additionally drop the nine USABLE_DATE_* columns from both frames.
#train.drop(['USABLE_DATE_MON', 'USABLE_DATE_TUE', 'USABLE_DATE_WED', 'USABLE_DATE_THU', 'USABLE_DATE_FRI', 'USABLE_DATE_SAT', 'USABLE_DATE_SUN', 'USABLE_DATE_HOLIDAY', 'USABLE_DATE_BEFORE_HOLIDAY'], axis=1, inplace=True)
#test.drop(['USABLE_DATE_MON', 'USABLE_DATE_TUE', 'USABLE_DATE_WED', 'USABLE_DATE_THU', 'USABLE_DATE_FRI', 'USABLE_DATE_SAT', 'USABLE_DATE_SUN', 'USABLE_DATE_HOLIDAY', 'USABLE_DATE_BEFORE_HOLIDAY'], axis=1, inplace=True)

In [6]:
# Identifier columns are excluded from the feature matrices.
id_columns = ['USER_ID_hash', 'COUPON_ID_hash']

# Training features (identifiers and label removed) and the label as a flat array.
x_train = train.drop(columns=id_columns + ['TARGET'])
y_train = train['TARGET'].values.reshape(-1)

# Test features: only the identifier columns are removed.
x_test = test.drop(columns=id_columns)

In [7]:
# Names of the training feature columns stored with pandas' 'category' dtype,
# in their original column order.
categoricals = x_train.select_dtypes(include='category').columns.tolist()

In [8]:
categoricals

['CAPSULE_TEXT',
 'GENRE_NAME',
 'LARGE_AREA_NAME',
 'PREF_NAME_COUPON',
 'SMALL_AREA_NAME',
 'SEX_ID',
 'PREF_NAME_USER']

In [9]:
# One-hot encode the categorical columns of each feature frame separately.
x_train_d, x_test_d = (
    pd.get_dummies(features, columns=categoricals)
    for features in (x_train, x_test)
)

# join='left' keeps exactly the training columns: columns missing from the
# test frame are added there as NaN, and test-only columns are discarded.
x_train_d, x_test_d = x_train_d.align(x_test_d, join='left', axis=1)

# Replace every NaN with 0 -- this covers the columns added by the alignment
# as well as any missing values already present in the features.
x_train_d = x_train_d.fillna(0)
x_test_d = x_test_d.fillna(0)

# Train

In [10]:
# Random forest configuration: 240 trees, fixed seed, all available cores,
# per-tree progress logging, and class weights recomputed on each bootstrap sample.
rfc = RandomForestClassifier(
    n_estimators=240,
    class_weight="balanced_subsample",
    random_state=0,
    n_jobs=-1,
    verbose=1,
)

In [11]:
# Fit the forest on the one-hot encoded training features and the TARGET labels.
rfc.fit(x_train_d, y_train)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:   55.0s
[Parallel(n_jobs=-1)]: Done 176 tasks      | elapsed:  4.3min
[Parallel(n_jobs=-1)]: Done 240 out of 240 | elapsed:  5.6min finished


RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                       class_weight='balanced_subsample', criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       max_samples=None, min_impurity_decrease=0.0,
                       min_impurity_split=None, min_samples_leaf=1,
                       min_samples_split=2, min_weight_fraction_leaf=0.0,
                       n_estimators=240, n_jobs=-1, oob_score=False,
                       random_state=0, verbose=1, warm_start=False)

In [12]:
rfc.classes_

array([0, 1], dtype=int64)

In [13]:
# Persist the fitted model to disk.
# NOTE(review): this file name says 'REPO' while the other artifacts in this
# notebook use 'REPRO' -- confirm whether that is intentional before renaming.
joblib.dump(rfc,'CPP_REPO_RF.mdl')

['CPP_REPO_RF.mdl']

# Predict

In [14]:
%%time
# Class-probability matrix for the test rows: one column per entry of
# rfc.classes_, in that order.
y_pred = rfc.predict_proba(x_test_d)

[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:    5.4s
[Parallel(n_jobs=12)]: Done 176 tasks      | elapsed:   29.2s
[Parallel(n_jobs=12)]: Done 240 out of 240 | elapsed:   39.8s finished


Wall time: 4min 18s


# Convert to Submission Format

In [15]:
# Scoring table: one row per (user, coupon) pair of the test frame plus the
# predicted probability of the second class in rfc.classes_ (column 1 of y_pred).
# An explicit .copy() makes `sub` an independent frame, so adding the column
# neither raises SettingWithCopyWarning nor risks writing through to `test`.
sub = test[['USER_ID_hash', 'COUPON_ID_hash']].copy()
sub['TARGET'] = y_pred[:, 1]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [16]:
# Group the scored rows by user once, so each user's rows can be fetched with get_group().
grouped = sub.groupby('USER_ID_hash')

In [17]:
def get_top10(row, groups=None, k=10):
    """Return a user's highest-scoring coupon ids as one space-separated string.

    Parameters
    ----------
    row : pandas.Series
        A row exposing the user id as ``row.USER_ID_hash`` (this is what
        ``DataFrame.apply(..., axis=1)`` passes in).
    groups : pandas DataFrameGroupBy, optional
        Scored rows grouped by ``USER_ID_hash``; each group needs ``TARGET``
        and ``COUPON_ID_hash`` columns. Defaults to the notebook-level
        ``grouped`` object, which keeps the one-argument call working.
    k : int, optional
        Maximum number of coupon ids to return (default 10).

    Returns
    -------
    str
        Up to ``k`` coupon ids ordered by descending ``TARGET``, joined by
        single spaces.

    Raises
    ------
    KeyError
        If the user id has no group in ``groups``.
    """
    # Fall back to the notebook-level grouping when none is supplied.
    user_groups = grouped if groups is None else groups

    # Highest score first, then keep only the leading k rows.
    ranked = user_groups.get_group(row.USER_ID_hash).sort_values(by=['TARGET'], ascending=False)
    return ' '.join(map(str, ranked.head(k).COUPON_ID_hash.values))

In [18]:
# Fill the submission column row by row: each user gets their top-ranked coupon ids.
submission['PURCHASED_COUPONS'] = submission.apply(get_top10, axis=1)

In [19]:
# Output file name, made unique by the current Unix time in whole seconds.
sub_name = f'sub_CPP_REPRO_RF_{int(time())}.csv'

In [20]:
# Write the submission without the DataFrame index column.
submission.to_csv(sub_name, index=False)

In [21]:
submission

Unnamed: 0,USER_ID_hash,PURCHASED_COUPONS
0,0000b53e182165208887ba65c079fc21,281326ffac6d5dd2eec24f7bde0078d7 98dc8da9b3c7c...
1,00035b86e6884589ec8d28fbf2fe7757,fc5f052a1bd97696fbcab35d8d974b73 f5a77f2907876...
2,0005b1068d5f2b8f2a7c978fcfe1ca06,96d275611439e0029db651a914965ea3 9bfec8781a9c9...
3,000cc06982785a19e2a2fdb40b1c9d59,79de77aa8c36fdf17cb3366e2084e353 0e917a0e87224...
4,0013518e41c416cd6a181d277dd8ca0b,c988d799bc7db9254fe865ee6cf2d4ff 3b4635dc97035...
...,...,...
22868,fff1a623187cefd7a594e338709b0f40,51da52d5516033bea13972588b671184 42cc500acba3c...
22869,fff4a076cfda6ff9dbe85e1cb678791b,79de77aa8c36fdf17cb3366e2084e353 8c470d8651dbc...
22870,fff970d2014c3e10a77e38d540239017,0acc89ba7593ed787e3439b7f05884c2 f0685cf6de3c1...
22871,fffafc024e264d5d539813444cf61199,8e14f145efa75ff816ff9543c063eeb1 27741884a086e...


In [22]:
# Upload the CSV via the Kaggle CLI; the file name is reused as the submission message (-m).
!kaggle competitions submit -c coupon-purchase-prediction -f $sub_name -m $sub_name

Successfully submitted to Coupon Purchase Prediction



  0%|          | 0.00/7.94M [00:00<?, ?B/s]
  1%|1         | 88.0k/7.94M [00:00<00:09, 866kB/s]
  2%|1         | 152k/7.94M [00:00<00:10, 789kB/s] 
  3%|2         | 224k/7.94M [00:00<00:10, 766kB/s]
  3%|3         | 272k/7.94M [00:01<00:45, 178kB/s]
  4%|3         | 312k/7.94M [00:01<01:06, 120kB/s]
  4%|4         | 352k/7.94M [00:02<01:10, 113kB/s]
  5%|4         | 392k/7.94M [00:02<01:16, 104kB/s]
  5%|5         | 432k/7.94M [00:02<01:17, 102kB/s]
  6%|5         | 464k/7.94M [00:03<01:21, 96.0kB/s]
  6%|5         | 480k/7.94M [00:03<01:12, 108kB/s] 
  6%|6         | 504k/7.94M [00:03<01:31, 85.5kB/s]
  7%|6         | 536k/7.94M [00:04<01:11, 109kB/s] 
  7%|6         | 560k/7.94M [00:04<01:28, 88.0kB/s]
  7%|7         | 584k/7.94M [00:04<01:42, 75.5kB/s]
  8%|7         | 616k/7.94M [00:04<01:20, 95.6kB/s]
  8%|7         | 632k/7.94M [00:05<02:03, 62.2kB/s]
  8%|8         | 672k/7.94M [00:05<01:31, 83.1kB/s]
  9%|8         | 696k/7.94M [00:06<01:46, 71.5kB/s]
  9%|8         | 720k/7.9

In [24]:
found = False
while 1:
    submission_list = !kaggle competitions submissions -c coupon-purchase-prediction
    for sub_row in submission_list:
        if sub_name in sub_row and 'complete' in sub_row:
            scores = sub_row[sub_row.find('complete')+10:].split()
            print('Private :',scores[1],'\t|\tPublic :',scores[0])
            found = True
            break
    if found:
        break
    sleep(10)

Private : 0.00584 	|	Public : 0.00585
