In [1]:
import os
import pandas as pd

# Project directories, resolved relative to the parent of the current
# working directory.
PROJECT_ROOT = os.path.join(os.getcwd(), os.pardir)
DATA_DIR = os.path.join(PROJECT_ROOT, 'data')
MODEL_PATH = os.path.join(PROJECT_ROOT, 'model')

In [10]:
# Load the test set, parsing click_time into datetimes on read.
test = pd.read_csv(
    os.path.join(DATA_DIR, 'test.csv'),
    parse_dates=['click_time'],
)

In [11]:
# Preview the first rows of the test set to check the parsed columns.
test.head()

Unnamed: 0,click_id,ip,app,device,os,channel,click_time
0,0,5744,9,1,3,107,2017-11-10 04:00:00
1,1,119901,9,1,3,466,2017-11-10 04:00:00
2,2,72287,21,1,19,128,2017-11-10 04:00:00
3,3,78477,15,1,13,111,2017-11-10 04:00:00
4,4,123080,12,1,13,328,2017-11-10 04:00:00


# read train data (sampled)

In [2]:
# Load the training data straight from the zip archive, parsing
# click_time into datetimes on read.
train_sampled = pd.read_csv(
    os.path.join(DATA_DIR, 'train.csv.zip'),
    compression='zip',
    parse_dates=['click_time'],
)

# basic analysis

In [4]:
# Preview the first rows of the training data.
train_sampled.head()

Unnamed: 0,ip,app,device,os,channel,click_time,attributed_time,is_attributed
0,83230,3,1,13,379,2017-11-06 14:32:21,,0
1,17357,3,1,19,379,2017-11-06 14:33:34,,0
2,35810,3,1,13,379,2017-11-06 14:34:12,,0
3,45745,14,1,13,478,2017-11-06 14:34:52,,0
4,161007,3,1,13,379,2017-11-06 14:35:08,,0


In [5]:
# Class balance of the target column.
train_sampled['is_attributed'].value_counts()

0    998307
1      1693
Name: is_attributed, dtype: int64

In [6]:
# Number of clicks per IP, most frequent first.
train_sampled['ip'].value_counts()

73487     6437
73516     6317
105560    3474
5314      3367
5348      3057
53454     3038
105475    2872
100275    2815
201182    2807
114276    2782
95766     2242
45745     1662
188387    1660
111025    1573
86767     1544
137052    1471
26995     1433
209663    1423
114220    1399
93054     1368
93021     1357
43793     1304
92735     1291
17149     1260
114314    1158
92766     1086
114235    1059
123994    1024
79857     1020
147957     942
          ... 
127103       1
89436        1
166416       1
212752       1
15829        1
150671       1
152381       1
195368       1
103026       1
88056        1
163992       1
128761       1
196393       1
155453       1
195615       1
173715       1
212007       1
82427        1
142978       1
173203       1
175250       1
25291        1
192809       1
133430       1
205239       1
176785       1
160393       1
147775       1
211728       1
142133       1
Name: ip, Length: 39611, dtype: int64

In [19]:
# Number of clicks per app id, most frequent first.
train_sampled['app'].value_counts()

12     141851
3      136810
15     114425
2      114340
9      112038
18      82498
8       48917
14      35943
1       32070
6       20373
21      20321
20      17199
25      16643
24      16022
13      14694
64      12822
11      10413
23      10099
26       8337
27       5147
17       5140
28       5128
10       2852
19       2551
32       1669
22       1393
29       1213
5        1010
150       970
151       593
        ...  
326         1
305         1
302         1
299         1
610         1
104         1
286         1
645         1
126         1
90          1
322         1
153         1
278         1
268         1
203         1
549         1
561         1
222         1
223         1
226         1
54          1
184         1
61          1
173         1
69          1
240         1
73          1
257         1
206         1
563         1
Name: app, Length: 214, dtype: int64

# preprocess

In [3]:
def preprocess(df):
    """Add calendar components of ``click_time`` as new columns.

    The columns ``day``, ``hour``, ``minute`` and ``second`` are written
    onto ``df`` itself (the frame is modified in place) and the same
    object is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a datetime-like ``click_time`` column; the ``.dt``
        accessor is used on it.

    Returns
    -------
    pandas.DataFrame
        The input ``df`` with the four columns added.
    """
    click_parts = df.click_time.dt
    for part in ('day', 'hour', 'minute', 'second'):
        df[part] = getattr(click_parts, part)

    return df

In [4]:
# Add the day/hour/minute/second columns derived from click_time.
train_sampled = preprocess(train_sampled)

In [9]:
# Check that the time-component columns were added.
train_sampled.head()

Unnamed: 0,ip,app,device,os,channel,click_time,attributed_time,is_attributed,day,hour,minute,second
0,83230,3,1,13,379,2017-11-06 14:32:21,,0,6,14,32,21
1,17357,3,1,19,379,2017-11-06 14:33:34,,0,6,14,33,34
2,35810,3,1,13,379,2017-11-06 14:34:12,,0,6,14,34,12
3,45745,14,1,13,478,2017-11-06 14:34:52,,0,6,14,34,52
4,161007,3,1,13,379,2017-11-06 14:35:08,,0,6,14,35,8


# create model

In [5]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from imblearn.ensemble import BalancedBaggingClassifier

In [6]:
# Feature matrix: every column except the raw IP, the raw timestamps and
# the label itself.
X = train_sampled.drop(['ip', 'click_time', 'attributed_time', 'is_attributed'], axis=1)
y = train_sampled['is_attributed']

# Stratify so both splits keep the same share of the rare positive class.
# A fixed random_state makes the split, and every metric computed from it,
# repeatable across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42
)

In [None]:
# Random forest used as the base learner inside each bag.
rf_model = RandomForestClassifier(n_estimators=100, n_jobs=-1)

# Seed the ensemble so the resampling and the fitted model are repeatable;
# per the scikit-learn bagging docs, base estimators that accept
# random_state receive seeds derived from this one.
# NOTE(review): newer imbalanced-learn releases rename `ratio` to
# `sampling_strategy` and `base_estimator` to `estimator` - adjust these
# keywords if the installed version rejects them.
bbc = BalancedBaggingClassifier(
    base_estimator=rf_model,
    n_estimators=100,
    ratio='not minority',
    random_state=42,
)
bbc.fit(X_train, y_train)

# evaluate model

In [None]:
# Score of the fitted ensemble on the held-out split.
test_score = bbc.score(X_test, y_test)
print(f'test score: {test_score}')
#print(f'train score: {bbc.score(X_train, y_train)}')

In [14]:
# Hard class predictions for the held-out split.
y_pred = bbc.predict(X_test)
# Rows are true labels, columns are predicted labels.
confusion_matrix(y_true=y_test, y_pred=y_pred)

array([[293279,   6213],
       [   133,    375]])

# Refit the model on the full training data

In [None]:
# Same architecture as the evaluated model, refit on all labelled rows.
rf_model = RandomForestClassifier(n_estimators=100, n_jobs=-1)

# Seed the ensemble so the refit is repeatable; per the scikit-learn
# bagging docs, base estimators that accept random_state receive seeds
# derived from this one.
# NOTE(review): newer imbalanced-learn releases rename `ratio` to
# `sampling_strategy` and `base_estimator` to `estimator` - adjust these
# keywords if the installed version rejects them.
bbc = BalancedBaggingClassifier(
    base_estimator=rf_model,
    n_estimators=100,
    ratio='not minority',
    n_jobs=-1,
    random_state=42,
)
bbc.fit(X, y)

# Save Model as pickle

In [None]:
import pickle

# Create the target directory first: open(..., 'wb') raises
# FileNotFoundError when the parent directory does not exist.
os.makedirs(MODEL_PATH, exist_ok=True)

# Serialize the fitted ensemble. Only load this file back from a trusted
# location, since unpickling can execute arbitrary code.
with open(os.path.join(MODEL_PATH, 'model.pickle'), mode='wb') as f:
    pickle.dump(bbc, f)

# predict test.csv

In [33]:
# Derive the same time-component columns that the training data has.
test = preprocess(test)
# Keep only the model's feature columns (drop ids and the raw timestamp).
test_X = test.drop(['click_id', 'ip', 'click_time'], axis='columns')
# Class probabilities for every test row.
y_submission = bbc.predict_proba(test_X)

KeyboardInterrupt: 

In [None]:
# predict_proba returns one column per class, shape (n_samples, n_classes).
# A DataFrame column must be 1-D, so keep only the probability of the
# positive class (is_attributed == 1), which is column 1 for 0/1 labels.
y_submission = pd.DataFrame({
    'click_id': test['click_id'],
    'is_attributed': y_submission[:, 1],
})

In [None]:
# Write the submission file without the DataFrame index column.
y_submission.to_csv(
    os.path.join(DATA_DIR, 'submission.csv'),
    index=False,
)

# Predict test data 
## Read - Preprocess - Output with Chunksize

In [None]:
def predict(model):
    