In [2]:
import pandas as pd
import numpy as np

from keras.layers import Input, Dense
from keras.models import Model
from keras.optimizers import Adam
from keras.activations import relu

import csv
from csv import DictReader

import os

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [3]:
# ---- file locations --------------------------------------------------
data_path = "../input/"
train = "{}train_sampled.csv".format(data_path)
test = "{}test_sampled.csv".format(data_path)
submission = 'sub_proba.csv'

# merged train + test click log (the file combine() is asked to write)
true_output = "{}output.csv".format(data_path)

# ---- processing knobs ------------------------------------------------
# Fraction of rows to corrupt per column; the right value is still an open question.
corrupt_percentage = 0.10
chunk_size = 5000

# dtypes of the raw (un-hashed) click-log columns
column_dtypes = dict(
    ip='uint32',
    app='uint16',
    device='uint16',
    os='uint16',
    channel='uint16',
    is_attributed='uint8',
    click_id='uint32',
)

# ---- dataset sizes ---------------------------------------------------
train_dim = 50000
test_dim = 50000
D = 1 << 26  # number of hash buckets (2 ** 26)

In [None]:
## combining train and test data into csv
def combine(train, test, chunksize, output, dtypes=None):
    """Merge the feature columns of the train and test CSVs into one CSV.

    Both inputs are streamed in chunks of ``chunksize`` rows and written to
    ``output`` alternately (one train chunk, then one test chunk). When one
    input runs out, the remaining chunks of the other are still written;
    a plain ``zip`` would silently drop them.

    Parameters
    ----------
    train, test : str
        Paths of the input CSV files. Each must contain the columns
        ip, app, device, os, channel and click_time; other columns are ignored.
    chunksize : int
        Number of rows read (and written) per chunk.
    output : str
        Path of the CSV to create; an existing file is overwritten.
    dtypes : dict, optional
        Column -> dtype mapping handed to ``pd.read_csv``. Defaults to the
        module-level ``column_dtypes``.
    """
    from itertools import zip_longest

    columns = ["ip", "app", "device", "os", "channel", "click_time"]
    if dtypes is None:
        dtypes = column_dtypes

    def _chunks(path):
        # Lazy chunk iterator restricted to the feature columns.
        return pd.read_csv(path, chunksize=chunksize, usecols=columns, dtype=dtypes)

    with open(output, 'w') as out:
        out.write(','.join(columns) + '\n')
        for train_chunk, test_chunk in zip_longest(_chunks(train), _chunks(test)):
            for chunk in (train_chunk, test_chunk):
                if chunk is None:
                    # this input is exhausted; keep draining the other one
                    continue
                # Select the columns explicitly so the rows always line up
                # with the header, whatever the column order in the source.
                chunk[columns].to_csv(out, chunksize=chunksize, index=False, header=False)
# Write the merged train/test click log to `true_output`.
combine(train=train, test=test, chunksize=chunk_size, output=true_output)

In [26]:
def data(path, D, output_path):
    ''' Apply the hashing trick to every row of a click-log csv and write
        the hashed features to a new csv.

        For each input row the click_time is split into a date and an hour,
        then each of the seven features (ip, app, device, os, channel, date,
        hour) is turned into the string "<name>_<value>" and hashed into one
        of D buckets. One output line is written per input row.

        The hash is CRC-32, not the built-in hash(): Python 3 salts the
        built-in string hash per process, so its values change from one
        kernel to the next and previously written files stop matching.

        INPUT:
            path: path to the csv to hash; must contain the columns
                  ip, app, device, os, channel and click_time. Any other
                  column (is_attributed, attributed_time, click_id, ...)
                  is ignored.
            D: number of hash buckets; every written value is in [0, D)
            output_path: path of the csv to create (overwritten if present)

        RETURNS:
            None. The result is the file written at output_path, with the
            header ip,app,device,os,channel,date,hour.
    '''
    import zlib

    feature_cols = ('ip', 'app', 'device', 'os', 'channel', 'date', 'hour')

    def _bucket(name, value):
        # Process-independent hash of "<name>_<value>" folded into [0, D).
        token = ('%s_%s' % (name, value)).encode('utf-8')
        return (zlib.crc32(token) & 0xffffffff) % D

    with open(path) as infile, open(output_path, 'w') as outfile:
        outfile.write(','.join(feature_cols) + '\n')
        for row in DictReader(infile):
            # Parse hour and date
            date, time = row['click_time'].split(' ')
            row['date'] = date
            row['hour'] = time.split(':')[0]

            # Look features up by name so that extra input columns can never
            # shift a value under the wrong header.
            hashed = [str(_bucket(name, row[name])) for name in feature_cols]
            outfile.write(','.join(hashed) + '\n')

In [27]:
# Hash the merged click log and store the result as train_hashed.csv.
data(path=true_output, D=D, output_path=data_path + "train_hashed.csv")

In [1]:
## prepare corrupt data
def prepare_corrupt_data(path, corrupt_percentage, output_path,
                         corrupt_cols = ("ip","app", "device", 'os','channel', 'date', "hour"),
                         seed = None):
    """Write a copy of a hashed csv in which values are swapped between rows.

    Each column is processed independently: ``int(n_rows * corrupt_percentage)``
    random row pairs (i, j) are drawn with replacement and the corrupted column
    receives the original value of row i at row j and the original value of
    row j at row i. Values always come from the untouched original column.

    Parameters
    ----------
    path : str
        Csv to corrupt; must contain every column listed in ``corrupt_cols``.
    corrupt_percentage : float
        Fraction of the row count used as the number of swapped pairs per column.
    output_path : str
        Where the corrupted csv is written (columns in ``corrupt_cols`` order).
    corrupt_cols : sequence of str
        Columns to read, corrupt and write.
    seed : int, optional
        Seed for a private ``RandomState`` so a run can be repeated. With the
        default ``None`` the global ``np.random`` state is used.
    """
    corrupt_cols = list(corrupt_cols)
    rng = np.random if seed is None else np.random.RandomState(seed)

    corrupted = {}
    for col_name in corrupt_cols:
        print("reading ", col_name, "...........")
        # Read one column at a time and let pandas infer the dtype: the file
        # holds hashed bucket ids, which do not fit the narrow dtypes declared
        # for the raw columns in column_dtypes (e.g. uint16).
        col_data = pd.read_csv(path, usecols=[col_name], skipinitialspace=True)
        original = col_data[col_name].values
        values = original.copy()

        n_swaps = int(len(original) * corrupt_percentage)
        if n_swaps > 0:
            first = rng.choice(len(original), size=n_swaps)
            second = rng.choice(len(original), size=n_swaps)
            # Interleave the pairs so the writes happen in the order
            # (second[0], first[0], second[1], first[1], ...).
            targets = np.column_stack((second, first)).ravel()
            sources = np.column_stack((first, second)).ravel()
            values[targets] = original[sources]
        corrupted[col_name] = values

    pd.DataFrame(corrupted, columns=corrupt_cols).to_csv(output_path, index= False)

In [87]:
%%time
## corrupted copy of the hashed training data
prepare_corrupt_data(
    data_path + "train_hashed.csv",
    corrupt_percentage=0.15,
    output_path=data_path + "corrupt_hashed.csv",
)

## corrupted test data (currently disabled)
#prepare_corrupt_data(data_path+"test_hashed.csv", corrupt_percentage=0.15, output_path=corrupt_test)

reading  ip ...........
reading  app ...........
reading  device ...........
reading  os ...........
reading  channel ...........
reading  date ...........
reading  hour ...........
CPU times: user 1.67 s, sys: 156 ms, total: 1.83 s
Wall time: 1.84 s


In [82]:
from Encoding_in_chunks import OneHotEncoder

In [83]:
# Fit the chunk-wise OneHotEncoder on the hashed clicks, streaming the csv
# chunk_size rows at a time.
encoder2 = OneHotEncoder(
    categorical_columns=["ip", "app", "device", "os", "channel", "date", "hour"]
)
chunked_data = pd.read_csv(data_path + "train_hashed.csv", chunksize=chunk_size)
encoder2.fit(chunked_data)

In [104]:
%%time
def _save_encoded_rows(csv_path, prefix):
    """Encode csv_path one row at a time with encoder2 and save each encoded
    row as <data_path><prefix><row index>.npy."""
    for i, d in enumerate(pd.read_csv(csv_path, chunksize= 1)):
        X = encoder2.transform(d)
        np.save(data_path + prefix + str(i) + ".npy", X)


# Clean rows: the reconstruction targets.
_save_encoded_rows(data_path + "train_hashed.csv", "target_")

# Corrupted rows get their own prefix. Both loops used to write target_<i>.npy,
# so the second pass overwrote every clean encoding with the corrupted one.
# NOTE(review): DataGenerator is not visible here - confirm that it loads the
# corrupted inputs from "corrupt_<i>.npy" (or adjust the prefix to match it).
_save_encoded_rows(data_path + "corrupt_hashed.csv", "corrupt_")

CPU times: user 6min 18s, sys: 1min 6s, total: 7min 24s
Wall time: 7min 47s


In [91]:
from DataGenerator import DataGenerator
# sample ids handed to the training DataGenerator
partition = {"train": list(np.arange(50000))}

In [108]:
# Width of one one-hot encoded row. It has to be defined before the layers
# below are built; previously it was only assigned in the later training cell,
# so this cell raised NameError on a fresh kernel. 17391 matches the batch
# shape (64, 17391) printed while training.
input_dim = 17391
# size of the bottleneck (encoded) representation
encoding_dim = 10000

# this is our input placeholder
input_layer = Input(shape=(input_dim,))

# "encoded" is the encoded representation of the input
encoded = Dense(encoding_dim, activation='relu')(input_layer)

# "decoded" is the lossy reconstruction of the input
decoded = Dense(input_dim, activation='relu')(encoded)

# this model maps an input to its reconstruction
autoencoder = Model(input_layer, decoded)

# this model maps an input to its encoded representation
encoder = Model(input_layer, encoded)

# create a placeholder for an encoded (encoding_dim-dimensional) input
encoded_input = Input(shape=(encoding_dim,))
# retrieve the last layer of the autoencoder model
decoder_layer = autoencoder.layers[-1]
# create the decoder model
decoder = Model(encoded_input, decoder_layer(encoded_input))

autoencoder.compile(optimizer='adam', loss='mse')

In [None]:
# Width of one one-hot encoded row. Assigned before `params`, which reads it;
# the assignment used to come after the dict and failed on a fresh kernel.
input_dim = 17391
# NOTE(review): the autoencoder is constructed in an earlier cell from the
# value encoding_dim had at that point, so this reassignment does not change
# the model that is trained here.
encoding_dim = 1000

# keyword arguments forwarded to DataGenerator
params = {'dim': input_dim,
          'batch_size': 64,
          'n_classes': 0,
          'n_channels': 1,
          'shuffle': True}

training_generator = DataGenerator(partition['train'], labels = None, **params)

# NOTE(review): steps_per_epoch is derived from train_dim + test_dim, while the
# generator is built from partition['train'] - confirm the two sizes agree.
autoencoder.fit_generator(generator=training_generator,
                          steps_per_epoch = (train_dim + test_dim) // params['batch_size'],
                          epochs = 1)

Epoch 1/1
(64, 17391) #######
(64, 17391) #######
(64, 17391) #######
(64, 17391) #######
(64, 17391) #######
(64, 17391) #######
(64, 17391) #######
(64, 17391) #######
(64, 17391) #######
(64, 17391) #######
(64, 17391) #######
(64, 17391) #######
   1/1562 [..............................] - ETA: 40:47:13 - loss: 4.9083e-04(64, 17391) #######
   2/1562 [..............................] - ETA: 26:57:57 - loss: 4.7931e-04(64, 17391) #######
   3/1562 [..............................] - ETA: 20:54:05 - loss: 4.6877e-04(64, 17391) #######
   4/1562 [..............................] - ETA: 16:43:20 - loss: 4.5699e-04(64, 17391) #######
   5/1562 [..............................] - ETA: 14:44:53 - loss: 4.4533e-04(64, 17391) #######
   6/1562 [..............................] - ETA: 12:55:21 - loss: 4.3372e-04(64, 17391) #######
   7/1562 [..............................] - ETA: 11:35:46 - loss: 4.2236e-04(64, 17391) #######
   8/1562 [..............................] - ETA: 10:40:33 - loss: 4.12