#### Workflow in this notebook
- For each of the variables **ip**, **os**, **channel**, **device** and **app**, save, for every distinct value, the time-ordered sequence of *is_attributed* flags and the time differences between consecutive clicks as NumPy arrays

In [1]:
import pandas as pd
import numpy as np

import os

In [2]:
# --- Paths -----------------------------------------------------------------
data_path = "../input/"
train_path = data_path + "train.csv"
processed_data_path = "../input/processed/"
# test = "train_sample.csv"

# --- Column dtypes (handed to pd.read_csv via the `dtype` argument) --------
dtypes = {
    'ip': 'uint32',
    'app': 'uint16',
    'device': 'uint16',
    'os': 'uint16',
    'channel': 'uint16',
    'is_attributed': 'uint8',
    'click_id': 'uint32',
    'hourofday': 'uint8',
    'dayofweek': 'uint8',
    'ip_device_os': 'uint32',
    'ip_device_os_app': 'uint32',
    'ip_device_os_app_channel': 'uint32',
}

# Width of every padded history array produced by create_data.
time_series_length = 1000

In [3]:
def pad_1d(array, max_len):
    """Left-pad (or left-truncate) a sequence to exactly ``max_len`` items.

    Only the last ``max_len`` elements of ``array`` are kept; inputs shorter
    than ``max_len`` are padded on the left with ``-1``.

    Parameters
    ----------
    array : sequence
        Input values (list, tuple or 1-d array).
    max_len : int
        Desired output length (non-negative).

    Returns
    -------
    padded : list
        List of length ``max_len``.
    length : int
        Number of real (non-padding) elements kept, i.e.
        ``min(len(array), max_len)``.
    """
    # Clamp the start index at 0. A negative start is interpreted relative to
    # the end of the sequence, so e.g. a 7-item input with max_len=10 would
    # become array[-3:] and silently lose its first four items.
    start = max(len(array) - max_len, 0)
    kept = list(array[start:])
    padding = [-1] * (max_len - len(kept))
    return padding + kept, len(kept)

def time_details(df, origin="2017-11-06 14:00:00"):
    """Replace ``click_time`` with ``epoch_time``: whole seconds since ``origin``.

    The frame is modified in place (``epoch_time`` is added and ``click_time``
    is deleted) and the same object is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``click_time`` column parseable by ``pd.to_datetime``.
    origin : str or datetime-like, optional
        Reference instant that maps to ``epoch_time == 0``.

    Returns
    -------
    pandas.DataFrame
        ``df`` itself, with an int64 ``epoch_time`` column and without
        ``click_time``.
    """
    elapsed = pd.to_datetime(df['click_time']) - pd.to_datetime(origin)

    # Floor-divide by a one-second Timedelta instead of casting to int64 and
    # dividing by 10**9: the raw integer cast counts in the timedelta's
    # storage unit, which is not guaranteed to be nanoseconds on every pandas
    # version, whereas this form is unit-independent.
    df['epoch_time'] = (elapsed // pd.Timedelta(seconds=1)).astype(np.int64)

    del df['click_time']
    return df

In [4]:
def preprocess_data(path,dtype = dtypes,column_name = "ip", output_dir = '../input/processed'):
    """Build one row of click history per distinct ``column_name`` value.

    Reads ``column_name``, ``is_attributed`` and ``click_time`` from the CSV
    at ``path``, converts ``click_time`` to ``epoch_time`` with
    ``time_details``, orders the clicks by time and, for each distinct value
    of ``column_name``, stores:

    * ``attributed_series`` - the ``is_attributed`` values joined with ``_``
    * ``time_delta``        - differences between consecutive ``epoch_time``
      values joined with ``_`` (empty for a single click)

    The result is written to ``<output_dir>/<column_name>_data.csv``.

    Parameters
    ----------
    path : str
        Location of the raw CSV.
    dtype : dict
        Column dtypes forwarded to ``pd.read_csv``.
    column_name : str
        Column to group by.
    output_dir : str, optional
        Directory for the output CSV; created if it does not exist.

    Returns
    -------
    None
    """
    print("Processing feat", column_name)

    def parse_col(x):
        """Encode one group's history as two ``_``-joined strings."""
        click_times = x['epoch_time'].values.astype(int)
        # Build the Series from a dict: an empty ``pd.Series()`` without an
        # explicit dtype is deprecated.
        return pd.Series({
            'attributed_series': "_".join(x['is_attributed'].values.astype(str).tolist()),
            'time_delta': "_".join(str(gap) for gap in np.diff(click_times).tolist()),
        })

    # load data
    df = pd.read_csv(path, dtype=dtype, usecols=[column_name, 'is_attributed', 'click_time'])
    df = time_details(df)

    # Stable sort: clicks sharing the same second keep their file order, so
    # the encoded sequence (and which click ends up last) is deterministic.
    df = df.sort_values(by='epoch_time', kind='mergesort')

    # Select only the value columns so the grouping column is not handed to
    # parse_col; reset_index() restores it as a regular column.
    df = (
        df.groupby(column_name, sort=False)[['is_attributed', 'epoch_time']]
        .apply(parse_col)
        .reset_index()
    )

    # exist_ok avoids the check-then-create race of isdir() + makedirs().
    os.makedirs(output_dir, exist_ok=True)
    df.to_csv(os.path.join(output_dir, column_name + '_data.csv'), index=False)

In [14]:
def create_data(column_name, processed_data_path = processed_data_path, time_series_length = time_series_length):
    """Turn ``<column_name>_data.csv`` into fixed-length NumPy history arrays.

    Each CSV row holds the ``_``-joined ``attributed_series`` and
    ``time_delta`` strings for one ``column_name`` value. The last entry of
    ``attributed_series`` becomes the target; the preceding entries become the
    history, which ``pad_1d`` left-pads with -1 to ``time_series_length``.

    Files saved in ``processed_data_path`` (all prefixed ``<column_name>_``):

    * ``id.npy``                 - int32, the ``column_name`` value per row
    * ``attributed_history.npy`` - int8,  shape (rows, time_series_length)
    * ``time_delta_history.npy`` - int32, shape (rows, time_series_length)
    * ``next_download.npy``      - int8,  last ``is_attributed`` value per row
    * ``history_length.npy``     - int32, length reported by ``pad_1d``

    Parameters
    ----------
    column_name : str
        Feature whose processed CSV should be converted.
    processed_data_path : str
        Directory holding the CSV; the ``.npy`` files are written there too.
    time_series_length : int
        Number of history steps kept per row.

    Raises
    ------
    ValueError
        If a row has no ``attributed_series`` value.

    Returns
    -------
    None
    """
    def out_path(suffix):
        return os.path.join(processed_data_path, column_name + suffix)

    # Force both encoded columns to be read as strings. Left to type
    # inference, a single-click cell such as "0" may be parsed as an integer,
    # which previously skipped the parsing branch and reused the previous
    # row's history (or failed on the first row).
    df = pd.read_csv(out_path('_data.csv'),
                     dtype={'attributed_series': str, 'time_delta': str})

    num_rows = len(df)
    col_id = np.zeros(shape=(num_rows), dtype=np.int32)
    history_length = np.zeros(shape=(num_rows), dtype=np.int32)
    attributed_history = np.zeros(shape=(num_rows, time_series_length), dtype=np.int8)
    time_delta_history = np.zeros(shape=(num_rows, time_series_length), dtype=np.int32)
    is_next_downloaded = np.zeros(shape=(num_rows), dtype=np.int8)

    id_values = df[column_name].values
    encoded_rows = zip(df['attributed_series'], df['time_delta'])
    for i, (raw_attributed, raw_delta) in enumerate(encoded_rows):
        if i % 10000 == 0:
            print(i)
        col_id[i] = id_values[i]

        if not isinstance(raw_attributed, str):
            raise ValueError(
                "row %d of %s_data.csv has no attributed_series value" % (i, column_name))
        flags = [int(tok) for tok in raw_attributed.split('_')]
        if len(flags) > 1:
            attributed_series = flags[:-1]
            is_next_downloaded[i] = flags[-1]
        else:
            # Single click: no history, so a lone -1 placeholder is used
            # (history_length is therefore recorded as 1 for such rows).
            attributed_series = [-1]
            is_next_downloaded[i] = flags[0]
        attributed_history[i, :], history_length[i] = pad_1d(attributed_series, time_series_length)

        if isinstance(raw_delta, str):
            # NOTE(review): the final gap (the one leading up to the target
            # click) is dropped here - confirm this alignment is intended.
            time_delta_series = [int(tok) for tok in raw_delta.split('_')][:-1]
        else:
            # Empty CSV cell (read back as NaN): the group had a single click.
            time_delta_series = [-1]
        time_delta_history[i, :], _ = pad_1d(time_delta_series, time_series_length)

    np.save(out_path('_id.npy'), col_id)
    np.save(out_path('_attributed_history.npy'), attributed_history)
    np.save(out_path('_time_delta_history.npy'), time_delta_history)
    np.save(out_path('_next_download.npy'), is_next_downloaded)
    np.save(out_path('_history_length.npy'), history_length)

In [6]:
%%time
feat_cols = ['ip']
for feat in feat_cols:
    preprocess_data(path = train_path,dtype = dtypes,column_name = feat)
    create_data(column_name=feat, processed_data_path = processed_data_path, 
                time_series_length = time_series_length)

Processing feat ip
0
10000
20000
30000
40000
50000
60000
70000
80000
90000
100000
110000
120000
130000
140000
150000
160000
170000
180000
190000
200000
210000
220000
230000
240000
250000
260000
270000
CPU times: user 11min 39s, sys: 18.9 s, total: 11min 58s
Wall time: 12min 5s


In [15]:
# Rebuild the arrays from the already-written CSVs without re-running
# preprocess_data. Iterating over feat_cols (instead of reusing whatever value
# the loop variable `feat` was left with by an earlier cell) keeps this cell
# correct when more features are added to the list.
for feat in feat_cols:
    create_data(column_name=feat, processed_data_path=processed_data_path,
                time_series_length=time_series_length)

0
10000
20000
30000
40000
50000
60000
70000
80000
90000
100000
110000
120000
130000
140000
150000
160000
170000
180000
190000
200000
210000
220000
230000
240000
250000
260000
270000


In [16]:
# Load the saved target array back from disk; as the cell's last expression,
# the notebook displays it.
np.load("../input/processed/ip_next_download.npy")

array([0, 0, 0, ..., 1, 1, 1], dtype=int8)