In [1]:
import os
from collections import deque

import numpy as np
import pandas as pd

In [2]:
# Directory that holds the raw CSV input, relative to this notebook.
data_path = "../../input/"
# Full path of the training CSV inside data_path.
train_path = "{}train.csv".format(data_path)
#test = "train_sample.csv"

## column dtypes
# Explicit unsigned-integer dtypes, keyed by column name; used as the default
# `dtype` argument of preprocess_data, which forwards it to pd.read_csv.
dtypes = dict([
    ("ip", "uint32"),
    ("app", "uint16"),
    ("device", "uint16"),
    ("os", "uint16"),
    ("channel", "uint16"),
    ("is_attributed", "uint8"),
    ("click_id", "uint32"),
    ("hourofday", "uint8"),
    ("dayofweek", "uint8"),
    ("ip_device_os", "uint32"),
    ("ip_device_os_app", "uint32"),
    ("ip_device_os_app_channel", "uint32"),
])

# NOTE(review): in the visible part of this notebook this constant is only
# referenced by the commented-out create_data cell - confirm it is still needed.
time_series_length = 1000

In [3]:
def pad_1d(array, max_len):
    """Fit a sequence into exactly ``max_len`` slots, keeping its tail.

    The right-most ``max_len`` elements are kept; if the input is shorter
    than ``max_len`` the missing leading positions are filled with ``-1``.

    Parameters
    ----------
    array : list (or any sliceable sequence)
        Values ordered oldest-to-newest.
    max_len : int
        Number of slots in the result.

    Returns
    -------
    padded : list
        ``max_len`` items: ``-1`` padding followed by the kept elements.
    length : int
        How many real (non-padding) elements were kept.
    """
    # Clamp the start index at 0. A negative start would be interpreted by
    # Python as "count from the end", which silently dropped valid leading
    # elements whenever len(array) < max_len < 2 * len(array).
    start = max(len(array) - max_len, 0)
    kept = list(array[start:])
    length = len(kept)
    padded = [-1] * (max_len - length) + kept
    return padded, length

def time_details(df):
    """Replace ``click_time`` with integer seconds since a fixed reference.

    Adds an ``epoch_time`` column holding the whole number of seconds
    (floored) between each ``click_time`` and 2017-11-06 14:00:00, then
    removes the ``click_time`` column.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``click_time`` column parseable by ``pd.to_datetime``.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in; it is modified in place.
    """
    reference = pd.to_datetime("2017-11-06 14:00:00")
    elapsed = pd.to_datetime(df['click_time']) - reference

    # Floor-divide by a one-second Timedelta instead of casting to int64 and
    # dividing by 10**9: the cast only yields seconds when the timedeltas are
    # stored with nanosecond resolution, whereas this form is unit-independent.
    df['epoch_time'] = elapsed // pd.Timedelta(seconds=1)

    del df['click_time']
    return df

In [6]:
def preprocess_data(path, dtype = dtypes, history_len = 20,
                    memmap_dir = '../../input/processed',
                    output_dir = '../input/processed'):
    """Build fixed-length per-IP click histories for every row of a click log.

    The CSV at ``path`` is loaded, ``click_time`` is converted to
    ``epoch_time`` by ``time_details`` and the rows are sorted by that time.
    For each row, the earlier rows with the same ``ip`` form its history.
    The most recent ``history_len`` values of ``app``, ``os``, ``device`` and
    ``channel``, and of the gaps (in ``epoch_time`` units) between consecutive
    earlier clicks, are padded by ``pad_1d`` and written to int32 arrays.

    Parameters
    ----------
    path : str
        CSV file to read. The columns ip, app, device, os, channel,
        is_attributed and click_time are loaded (is_attributed is read but
        not written to any output).
    dtype : dict
        Column dtypes forwarded to ``pd.read_csv``.
    history_len : int
        Width of every history array (previously a hard-coded 20).
    memmap_dir : str
        Directory for the memory-mapped working arrays. These are raw
        ``np.memmap`` files, so despite the ``.npy`` suffix they carry no
        ``.npy`` header.
    output_dir : str
        Directory that receives the ``np.save`` copies of the arrays.

    Returns
    -------
    None
        Results are only written to disk.

    Raises
    ------
    ValueError
        If ``history_len`` is smaller than 1, or if ``memmap_dir`` and
        ``output_dir`` are the same directory.

    Notes
    -----
    NOTE(review): the two default directories differ ('../../input/processed'
    vs '../input/processed'). They are kept exactly as before; confirm that
    writing to two different locations is intended.
    """
    print("Processing data")

    if history_len < 1:
        raise ValueError("history_len must be a positive integer")
    if os.path.abspath(memmap_dir) == os.path.abspath(output_dir):
        # Both steps use the same file names; np.save would truncate a file
        # that is still memory-mapped.
        raise ValueError("memmap_dir and output_dir must be different directories")

    #load data
    df = pd.read_csv(path, dtype=dtype, usecols=['ip','app','device', 'os', 'channel','is_attributed', 'click_time'])
    df = time_details(df)
    # Stable sort: clicks sharing the same epoch_time keep their file order,
    # so the generated histories are deterministic.
    df = df.sort_values(by = 'epoch_time', kind = 'mergesort').reset_index(drop = True)
    num_rows = len(df)

    if not os.path.isdir(memmap_dir):
        os.makedirs(memmap_dir)

    def _scratch(file_name, width = None):
        """Open a writable int32 memmap in memmap_dir with one row per click."""
        shape = (num_rows,) if width is None else (num_rows, width)
        return np.memmap(os.path.join(memmap_dir, file_name), dtype=np.int32, mode='w+', shape=shape)

    ip_id = _scratch('ip.npy')
    history_length = _scratch('history_length.npy')
    app_id_history = _scratch('app_history.npy', history_len)
    os_id_history = _scratch('os_history.npy', history_len)
    channel_id_history = _scratch('channel_history.npy', history_len)
    device_id_history = _scratch('device_history.npy', history_len)
    time_delta_history = _scratch('time_history.npy', history_len)

    # Running state per ip: the last `history_len` values of each feature, the
    # last `history_len` gaps between consecutive clicks, and the time of the
    # most recent click. Updating this after each row replaces the former
    # per-row re-filtering of the whole frame (quadratic in the row count).
    past_by_ip = {}
    click_columns = df[['ip', 'app', 'os', 'device', 'channel', 'epoch_time']]
    rows = click_columns.itertuples(index=False, name=None)
    for i, (ip, app, os_value, device, channel, epoch_time) in enumerate(rows):
        if i % 10000 == 0:
            print(i)

        past = past_by_ip.get(ip)
        if past is None:
            past = {
                'app': deque(maxlen=history_len),
                'os': deque(maxlen=history_len),
                'device': deque(maxlen=history_len),
                'channel': deque(maxlen=history_len),
                'delta': deque(maxlen=history_len),
                'last_time': None,
            }
            past_by_ip[ip] = past

        # Emit the history as it was *before* the current click.
        ip_id[i] = ip
        app_id_history[i, :], _ = pad_1d(list(past['app']), history_len)
        os_id_history[i, :], _ = pad_1d(list(past['os']), history_len)
        device_id_history[i, :], _ = pad_1d(list(past['device']), history_len)
        channel_id_history[i, :], _ = pad_1d(list(past['channel']), history_len)
        # history_length counts the gaps kept, i.e. one fewer than the number
        # of earlier clicks (capped at history_len), as in the original code.
        time_delta_history[i, :], history_length[i] = pad_1d(list(past['delta']), history_len)

        # Fold the current click into this ip's history for later rows.
        if past['last_time'] is not None:
            past['delta'].append(epoch_time - past['last_time'])
        past['last_time'] = epoch_time
        past['app'].append(app)
        past['os'].append(os_value)
        past['device'].append(device)
        past['channel'].append(channel)

    if not os.path.isdir(output_dir):
        os.makedirs(output_dir)

    outputs = [
        ('ip.npy', ip_id),
        ('app_history.npy', app_id_history),
        ('os_history.npy', os_id_history),
        ('device_history.npy', device_id_history),
        ('channel_history.npy', channel_id_history),
        ('time_history.npy', time_delta_history),
        ('history_length.npy', history_length),
    ]
    for file_name, array in outputs:
        # Push pending memmap writes to disk before exporting the copy.
        array.flush()
        np.save(os.path.join(output_dir, file_name), array)

In [7]:
%%timeit
preprocess_data(train_path)

Processing data
0




KeyboardInterrupt: 

In [None]:
# def create_data(column_name, processed_data_path = processed_data_path, time_series_length = time_series_length):
#     df = pd.read_csv(processed_data_path+column_name+'_data.csv')
#     num_rows = len(df)
#     col_id = np.zeros(shape =(num_rows), dtype = np.int32)
#     history_length = np.zeros(shape = (num_rows), dtype = np.int32)
#     attributed_history = np.zeros(shape = (num_rows, time_series_length), dtype = np.int8)
#     time_delta_history = np.zeros(shape = (num_rows, time_series_length), dtype = np.int32)
#     is_next_downloaded = np.zeros(shape = (num_rows), dtype = np.int8)
#     for i, row in df.iterrows():
#         if i % 10000 == 0:
#             print(i)
#         col_id[i] = row[column_name]
#         if  type(row['attributed_series']) is str:
#             temp_list = [int(j) for j in row['attributed_series'].split('_')]
#             if len(temp_list) > 1:
#                 attributed_series = temp_list[:-1]
#                 is_next_downloaded[i] = temp_list[-1]
#             else:
#                 attributed_series = [-1]
#                 is_next_downloaded[i] = temp_list[0]
#         attributed_history[i, :], history_length[i] = pad_1d(attributed_series, time_series_length)
#         if type(row['time_delta']) is str:
#             time_delta_series = [int(j) for j in row['time_delta'].split('_')][:-1]
#         else:
#             time_delta_series = [-1]
#         time_delta_history[i, :], _ = pad_1d(time_delta_series, time_series_length)

#     np.save(processed_data_path+column_name+'_id.npy', col_id)
#     np.save(processed_data_path+column_name+'_attributed_history.npy', attributed_history)
#     np.save(processed_data_path+column_name+'_time_delta_history.npy', time_delta_history)
#     np.save(processed_data_path+column_name+'_next_download.npy', is_next_downloaded)
#     np.save(processed_data_path+column_name+'_history_length.npy', history_length)