In [16]:
import pandas as pd
import numpy as np
import gc

path = "./"

dtypes = {
        'ip'            : 'uint32',
        'app'           : 'uint16',
        'device'        : 'uint16',
        'os'            : 'uint16',
        'channel'       : 'uint16',
        'is_attributed' : 'uint8',
        'click_id'      : 'uint32'
        }

In [9]:
from keras.layers import Input, Embedding, SimpleRNN, Dense, concatenate, SpatialDropout1D, LSTM
from keras.models import Model
from keras.callbacks import TensorBoard

embedding_n = 50
in_ip = Input(shape=[1], name = 'ip')                           #embed each data point in 50 dimensional space
emb_ip = Embedding(364779, embedding_n)(in_ip)                  #otherwise turn each data like 1232(ip) to dense 
in_app = Input(shape=[1], name = 'app')                         #vector of 50 dimensions
emb_app = Embedding(769, embedding_n)(in_app)
in_device = Input(shape=[1], name = 'device')
emb_device = Embedding(4228, embedding_n)(in_device)
in_os = Input(shape=[1], name = 'os')
emb_os = Embedding(957, embedding_n)(in_os)
in_channel = Input(shape=[1], name = 'channel')
emb_channel = Embedding(501, embedding_n)(in_channel) 
in_hour = Input(shape=[1], name = 'hour')
emb_hour = Embedding(24, embedding_n)(in_hour)
in_day = Input(shape=[1], name = 'day')
emb_day = Embedding(10, embedding_n)(in_day)
in_wday = Input(shape=[1], name = 'wday')
emb_wday = Embedding(4, embedding_n)(in_wday)

In [13]:
embedded_n = concatenate([(emb_ip),(emb_app),(emb_device),  #concatenate dense vector outputs of each feature
                        (emb_os),(emb_channel),
                        (emb_hour),(emb_day),(emb_wday)])

s_dout = SpatialDropout1D(0.2)(embedded_n)                #partial spartial dropout upon concatenation layer
lstm1 = LSTM(128,return_sequences=True)(s_dout)                                 #flatten data array 
lstm2 = LSTM(128)(lstm1)
out = Dense(1, activation='sigmoid')(lstm2)
model = Model(inputs=[in_ip,in_app,in_device,in_os,in_channel,in_hour,in_day,in_wday], outputs=out)
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
ip (InputLayer)                 (None, 1)            0                                            
__________________________________________________________________________________________________
app (InputLayer)                (None, 1)            0                                            
__________________________________________________________________________________________________
device (InputLayer)             (None, 1)            0                                            
__________________________________________________________________________________________________
os (InputLayer)                 (None, 1)            0                                            
__________________________________________________________________________________________________
channel (I

In [14]:
model.load_weights('RNN_alpha_LSTM.h5')

In [17]:
data = pd.read_csv(path+"train.csv", dtype=dtypes, usecols=['ip','app','device','os', 'channel', 'click_time', 'is_attributed'])

In [19]:
test = data[131886954:]

In [23]:
data.size

1294327230

In [24]:
test.size

371118552

In [25]:
del data

In [27]:
gc.collect()

166

In [28]:
test.head()

Unnamed: 0,ip,app,device,os,channel,click_time,is_attributed
131886954,34684,2,1,13,469,2017-11-09 00:00:00,0
131886955,207368,26,1,19,477,2017-11-09 00:00:00,0
131886956,110176,18,1,8,121,2017-11-09 00:00:00,0
131886957,109644,12,1,19,265,2017-11-09 00:00:00,0
131886958,7517,2,1,19,477,2017-11-09 00:00:00,0


In [29]:
test['hour'] = pd.to_datetime(test.click_time).dt.hour.astype('uint8')
test['day'] = pd.to_datetime(test.click_time).dt.day.astype('uint8')
test['wday']  = pd.to_datetime(test.click_time).dt.dayofweek.astype('uint8')

In [30]:
test.head()

Unnamed: 0,ip,app,device,os,channel,click_time,is_attributed,hour,day,wday
131886954,34684,2,1,13,469,2017-11-09 00:00:00,0,0,9,3
131886955,207368,26,1,19,477,2017-11-09 00:00:00,0,0,9,3
131886956,110176,18,1,8,121,2017-11-09 00:00:00,0,0,9,3
131886957,109644,12,1,19,265,2017-11-09 00:00:00,0,0,9,3
131886958,7517,2,1,19,477,2017-11-09 00:00:00,0,0,9,3


In [31]:
test.drop([ 'click_time'],1,inplace=True)

In [32]:
test.head()

Unnamed: 0,ip,app,device,os,channel,is_attributed,hour,day,wday
131886954,34684,2,1,13,469,0,0,9,3
131886955,207368,26,1,19,477,0,0,9,3
131886956,110176,18,1,8,121,0,0,9,3
131886957,109644,12,1,19,265,0,0,9,3
131886958,7517,2,1,19,477,0,0,9,3


In [33]:
test_x = pd.DataFrame()

In [34]:
test_x['is_attributed'] = test['is_attributed']

In [35]:
test_x.head()

Unnamed: 0,is_attributed
131886954,0
131886955,0
131886956,0
131886957,0
131886958,0


In [36]:
test.drop([ 'is_attributed'],1,inplace=True)

In [37]:
test.head()

Unnamed: 0,ip,app,device,os,channel,hour,day,wday
131886954,34684,2,1,13,469,0,9,3
131886955,207368,26,1,19,477,0,9,3
131886956,110176,18,1,8,121,0,9,3
131886957,109644,12,1,19,265,0,9,3
131886958,7517,2,1,19,477,0,9,3


In [38]:
def data_stream_molder(dataset):
    X = {
		'ip': np.array(dataset.ip),
        'app': np.array(dataset.app),
        'channel': np.array(dataset.channel),
        'device': np.array(dataset.device),
        'os': np.array(dataset.os),
        'hour': np.array(dataset.hour),
        'day': np.array(dataset.day),
        'wday': np.array(dataset.wday),
    }
    return X
test = data_stream_molder(test)

In [40]:
test_y = pd.DataFrame()
test_y.head()

In [41]:
test_y['is_attributed'] = model.predict(test, batch_size=20000,verbose=2)

ValueError: Cannot set a frame with no defined index and a value that cannot be converted to a Series