<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"></ul></div>

# Predict test.csv

In [2]:
import os
import pandas as pd
import datetime
import pickle
from sklearn.ensemble import RandomForestClassifier
from imblearn.ensemble import BalancedBaggingClassifier

In [2]:
# All paths are resolved relative to the parent of the current working
# directory (the directory the kernel was started from).
PROJECT_ROOT = os.path.join(os.getcwd(), os.pardir)
# Input/output CSV files and the pickled model live in sibling sub-directories.
DATA_DIR, MODEL_PATH = (
    os.path.join(PROJECT_ROOT, sub_dir) for sub_dir in ('data', 'model')
)

In [3]:
def preprocess(df):
    """Add calendar features derived from ``click_time`` to ``df``.

    The columns ``day``, ``hour``, ``minute`` and ``second`` are written
    onto the frame that was passed in (the input is modified in place) and
    that same frame is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Must have a datetime-typed ``click_time`` column (the ``.dt``
        accessor is used on it).

    Returns
    -------
    pandas.DataFrame
        The same object as ``df``, now carrying the four extra columns.
    """
    click_dt = df.click_time.dt
    # Column names match the datetime accessor attribute names one-to-one.
    for part in ('day', 'hour', 'minute', 'second'):
        df[part] = getattr(click_dt, part)
    return df

In [4]:
def get_now():
    """Return the current local time as a ``YYYY-MM-DD HH:MM:SS`` string.

    Used as the timestamp prefix for the progress messages printed while
    predicting.
    """
    return datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')

In [31]:
def predict(model, chunksize=100000):
    """Score ``test.csv`` chunk by chunk and write ``submission.csv``.

    ``test.csv`` is read from ``DATA_DIR`` in chunks, each chunk is passed
    through ``preprocess``, scored with ``model.predict_proba`` and appended
    to ``submission.csv`` in ``DATA_DIR``. Any existing ``submission.csv`` is
    overwritten when the first chunk is written. Progress is printed with a
    timestamp before and after every stage.

    Parameters
    ----------
    model
        A fitted classifier exposing ``predict_proba``. The result is sliced
        as a 2-D array and column 1 is written as ``is_attributed``.
        NOTE(review): this assumes ``model.classes_`` is ``[0, 1]`` so that
        column 1 is the probability of the positive class -- confirm for any
        new model.
    chunksize : int, optional
        Number of rows read from ``test.csv`` per iteration
        (default 100000, the value previously hard-coded).

    Returns
    -------
    None
    """
    # Loop-invariant: compute the destination once instead of per chunk.
    output = os.path.join(DATA_DIR, 'submission.csv')

    reader = pd.read_csv(os.path.join(DATA_DIR,'test.csv'), parse_dates=['click_time'], chunksize=chunksize)
    for chunk_no, test in enumerate(reader):
        print('[{}]Start:Preprocessing Data:Size:{}'.format(get_now(), len(test)))
        test = preprocess(test)
        print('[{}]Finish:Preprocessing Data:Size:{}'.format(get_now(), len(test)))

        print('[{}]Start:Predicting Data'.format(get_now()))
        # Identifier and raw timestamp columns are not passed to the model.
        X = test.drop(['click_id', 'ip','click_time'], axis=1)
        y_prob = model.predict_proba(X)
        # Vectorised column slice; avoids a per-row Python loop and no longer
        # reuses the chunk counter's name as a comprehension variable.
        y_class_one = y_prob[:, 1]
        print('[{}]Finish:Predicting Data'.format(get_now()))

        print('[{}]Start:output Data'.format(get_now()))
        y = pd.DataFrame({
            'click_id' : test['click_id'],
            'is_attributed' : y_class_one
        })

        # The first chunk truncates any stale file and writes the header;
        # every later chunk appends rows only.
        first_chunk = chunk_no == 0
        y.to_csv(output, index=False, header=first_chunk,
                 mode='w' if first_chunk else 'a')
        print('[{}]Finish:output Data'.format(get_now()))

    print('[{}]Finish:All Process'.format(get_now()))

In [32]:
# Load the previously trained classifier from disk.
# NOTE: pickle.load can execute arbitrary code contained in the file; only
# load model files that you produced yourself or otherwise trust.
model_file_path = os.path.join(MODEL_PATH, 'model.pickle')
with open(model_file_path, mode='rb') as model_file:
    model = pickle.load(model_file)

In [33]:
# Score the whole of test.csv and write submission.csv into DATA_DIR.
# Long-running: in the captured log below each 100000-row chunk spends
# roughly a minute in the prediction step.
predict(model)

[2018-04-22 03:18:03]Start:Preprocessing Data:Size:100000
[2018-04-22 03:18:03]Finish:Preprocessing Data:Size:100000
[2018-04-22 03:18:03]Start:Predicting Data
[2018-04-22 03:19:13]Finish:Predicting Data
[2018-04-22 03:19:13]Start:output Data
[2018-04-22 03:19:14]Finish:output Data
[2018-04-22 03:19:14]Start:Preprocessing Data:Size:100000
[2018-04-22 03:19:14]Finish:Preprocessing Data:Size:100000
[2018-04-22 03:19:14]Start:Predicting Data
[2018-04-22 03:20:24]Finish:Predicting Data
[2018-04-22 03:20:24]Start:output Data
[2018-04-22 03:20:25]Finish:output Data
[2018-04-22 03:20:25]Start:Preprocessing Data:Size:100000
[2018-04-22 03:20:25]Finish:Preprocessing Data:Size:100000
[2018-04-22 03:20:25]Start:Predicting Data
[2018-04-22 03:21:38]Finish:Predicting Data
[2018-04-22 03:21:38]Start:output Data
[2018-04-22 03:21:39]Finish:output Data
[2018-04-22 03:21:39]Start:Preprocessing Data:Size:100000
[2018-04-22 03:21:39]Finish:Preprocessing Data:Size:100000
[2018-04-22 03:21:39]Start:Predict

[2018-04-22 03:49:55]Start:Preprocessing Data:Size:100000
[2018-04-22 03:49:55]Finish:Preprocessing Data:Size:100000
[2018-04-22 03:49:55]Start:Predicting Data
[2018-04-22 03:50:57]Finish:Predicting Data
[2018-04-22 03:50:57]Start:output Data
[2018-04-22 03:50:58]Finish:output Data
[2018-04-22 03:50:58]Start:Preprocessing Data:Size:100000
[2018-04-22 03:50:58]Finish:Preprocessing Data:Size:100000
[2018-04-22 03:50:58]Start:Predicting Data
[2018-04-22 03:52:01]Finish:Predicting Data
[2018-04-22 03:52:01]Start:output Data
[2018-04-22 03:52:01]Finish:output Data
[2018-04-22 03:52:02]Start:Preprocessing Data:Size:100000
[2018-04-22 03:52:02]Finish:Preprocessing Data:Size:100000
[2018-04-22 03:52:02]Start:Predicting Data
[2018-04-22 03:53:05]Finish:Predicting Data
[2018-04-22 03:53:05]Start:output Data
[2018-04-22 03:53:06]Finish:output Data
[2018-04-22 03:53:06]Start:Preprocessing Data:Size:100000
[2018-04-22 03:53:06]Finish:Preprocessing Data:Size:100000
[2018-04-22 03:53:06]Start:Predict

[2018-04-22 04:20:57]Start:Preprocessing Data:Size:100000
[2018-04-22 04:20:57]Finish:Preprocessing Data:Size:100000
[2018-04-22 04:20:57]Start:Predicting Data
[2018-04-22 04:21:57]Finish:Predicting Data
[2018-04-22 04:21:57]Start:output Data
[2018-04-22 04:21:57]Finish:output Data
[2018-04-22 04:21:57]Start:Preprocessing Data:Size:100000
[2018-04-22 04:21:57]Finish:Preprocessing Data:Size:100000
[2018-04-22 04:21:57]Start:Predicting Data
[2018-04-22 04:22:58]Finish:Predicting Data
[2018-04-22 04:22:58]Start:output Data
[2018-04-22 04:22:58]Finish:output Data
[2018-04-22 04:22:58]Start:Preprocessing Data:Size:100000
[2018-04-22 04:22:59]Finish:Preprocessing Data:Size:100000
[2018-04-22 04:22:59]Start:Predicting Data
[2018-04-22 04:23:59]Finish:Predicting Data
[2018-04-22 04:23:59]Start:output Data
[2018-04-22 04:24:00]Finish:output Data
[2018-04-22 04:24:00]Start:Preprocessing Data:Size:100000
[2018-04-22 04:24:00]Finish:Preprocessing Data:Size:100000
[2018-04-22 04:24:00]Start:Predict

[2018-04-22 04:52:44]Start:Preprocessing Data:Size:100000
[2018-04-22 04:52:44]Finish:Preprocessing Data:Size:100000
[2018-04-22 04:52:44]Start:Predicting Data
[2018-04-22 04:53:47]Finish:Predicting Data
[2018-04-22 04:53:47]Start:output Data
[2018-04-22 04:53:47]Finish:output Data
[2018-04-22 04:53:47]Start:Preprocessing Data:Size:100000
[2018-04-22 04:53:47]Finish:Preprocessing Data:Size:100000
[2018-04-22 04:53:47]Start:Predicting Data
[2018-04-22 04:54:47]Finish:Predicting Data
[2018-04-22 04:54:47]Start:output Data
[2018-04-22 04:54:48]Finish:output Data
[2018-04-22 04:54:48]Start:Preprocessing Data:Size:100000
[2018-04-22 04:54:48]Finish:Preprocessing Data:Size:100000
[2018-04-22 04:54:48]Start:Predicting Data
[2018-04-22 04:55:53]Finish:Predicting Data
[2018-04-22 04:55:53]Start:output Data
[2018-04-22 04:55:54]Finish:output Data
[2018-04-22 04:55:54]Start:Preprocessing Data:Size:100000
[2018-04-22 04:55:54]Finish:Preprocessing Data:Size:100000
[2018-04-22 04:55:54]Start:Predict

[2018-04-22 05:24:22]Start:Preprocessing Data:Size:100000
[2018-04-22 05:24:22]Finish:Preprocessing Data:Size:100000
[2018-04-22 05:24:22]Start:Predicting Data
[2018-04-22 05:25:26]Finish:Predicting Data
[2018-04-22 05:25:26]Start:output Data
[2018-04-22 05:25:27]Finish:output Data
[2018-04-22 05:25:27]Start:Preprocessing Data:Size:100000
[2018-04-22 05:25:27]Finish:Preprocessing Data:Size:100000
[2018-04-22 05:25:27]Start:Predicting Data
[2018-04-22 05:26:27]Finish:Predicting Data
[2018-04-22 05:26:27]Start:output Data
[2018-04-22 05:26:28]Finish:output Data
[2018-04-22 05:26:28]Start:Preprocessing Data:Size:100000
[2018-04-22 05:26:28]Finish:Preprocessing Data:Size:100000
[2018-04-22 05:26:28]Start:Predicting Data
[2018-04-22 05:27:35]Finish:Predicting Data
[2018-04-22 05:27:35]Start:output Data
[2018-04-22 05:27:35]Finish:output Data
[2018-04-22 05:27:36]Start:Preprocessing Data:Size:100000
[2018-04-22 05:27:36]Finish:Preprocessing Data:Size:100000
[2018-04-22 05:27:36]Start:Predict

[2018-04-22 05:55:46]Start:Preprocessing Data:Size:100000
[2018-04-22 05:55:46]Finish:Preprocessing Data:Size:100000
[2018-04-22 05:55:46]Start:Predicting Data
[2018-04-22 05:56:48]Finish:Predicting Data
[2018-04-22 05:56:48]Start:output Data
[2018-04-22 05:56:49]Finish:output Data
[2018-04-22 05:56:49]Start:Preprocessing Data:Size:100000
[2018-04-22 05:56:49]Finish:Preprocessing Data:Size:100000
[2018-04-22 05:56:49]Start:Predicting Data
[2018-04-22 05:57:55]Finish:Predicting Data
[2018-04-22 05:57:55]Start:output Data
[2018-04-22 05:57:56]Finish:output Data
[2018-04-22 05:57:56]Start:Preprocessing Data:Size:100000
[2018-04-22 05:57:56]Finish:Preprocessing Data:Size:100000
[2018-04-22 05:57:56]Start:Predicting Data
[2018-04-22 05:58:59]Finish:Predicting Data
[2018-04-22 05:58:59]Start:output Data
[2018-04-22 05:58:59]Finish:output Data
[2018-04-22 05:58:59]Start:Preprocessing Data:Size:100000
[2018-04-22 05:58:59]Finish:Preprocessing Data:Size:100000
[2018-04-22 05:58:59]Start:Predict

[2018-04-22 06:27:03]Start:Preprocessing Data:Size:100000
[2018-04-22 06:27:03]Finish:Preprocessing Data:Size:100000
[2018-04-22 06:27:03]Start:Predicting Data
[2018-04-22 06:28:10]Finish:Predicting Data
[2018-04-22 06:28:10]Start:output Data
[2018-04-22 06:28:10]Finish:output Data
[2018-04-22 06:28:10]Start:Preprocessing Data:Size:100000
[2018-04-22 06:28:10]Finish:Preprocessing Data:Size:100000
[2018-04-22 06:28:10]Start:Predicting Data
[2018-04-22 06:29:15]Finish:Predicting Data
[2018-04-22 06:29:15]Start:output Data
[2018-04-22 06:29:16]Finish:output Data
[2018-04-22 06:29:16]Start:Preprocessing Data:Size:100000
[2018-04-22 06:29:16]Finish:Preprocessing Data:Size:100000
[2018-04-22 06:29:16]Start:Predicting Data
[2018-04-22 06:30:18]Finish:Predicting Data
[2018-04-22 06:30:18]Start:output Data
[2018-04-22 06:30:18]Finish:output Data
[2018-04-22 06:30:18]Start:Preprocessing Data:Size:100000
[2018-04-22 06:30:19]Finish:Preprocessing Data:Size:100000
[2018-04-22 06:30:19]Start:Predict

In [11]:
# Inspect the class label order of the loaded model. predict() writes
# column 1 of predict_proba as `is_attributed`, so label 1 is expected at
# position 1 here.
model.classes_

array([0, 1])

In [13]:
import numpy as np
# Small 5x2 stand-in for a predict_proba result: every row is [1, 2], so the
# second-column extraction can be sanity-checked by eye.
checking = np.array([[1, 2]] * 5)

In [21]:
# Sanity check: taking element 1 of every row yields the second column.
[row[1] for row in checking]

[2, 2, 2, 2, 2]