In [2]:
import pandas as pd
import numpy as np
import time
import multiprocessing as mp
import keras

Using TensorFlow backend.


# Methods

In [19]:
# Map
## User ID
def map_user_id( x ):
    return( user_id_dict[x] )
## Notif ID
def map_notif_id( x ):
    return( notif_id_dict[x] )

# Parallelize
def parallelize( _func, _iterable, n_proc = 4 ):
    mp_pool = mp.Pool( n_proc )
    result = mp_pool.map( _func, _iterable )
    mp_pool.close()
    mp_pool.terminate()
    return( result )

# Load Data

In [3]:
_start = time.time()
user_notif_id = pd.read_csv(
    'train_interactions.csv',
    usecols=[0,1],
    dtype={
        'user_id':np.uint32,
        'notif_id':np.uint32,
    })
_end = time.time()
duration = _end - _start
print('Train Interactions: {} s'.format( duration ))

Train Interactions: 40.24423098564148 s


In [4]:
_start = time.time()
targets = pd.read_csv('train_interactions.csv',usecols=[2],dtype={'interaction':np.bool})
_end = time.time()
duration = _end - _start
print('Targets: {} s'.format( duration ))

Targets: 25.84952712059021 s


# Map IDs

## User IDs

In [5]:
_start = time.time()
user_ids = pd.read_csv( 'autoencode_user_id.csv', header=None, dtype={0:np.uint32} )
_end   = time.time()
duration = _end - _start
print('Auto Shit fuck! {} s'.format( duration ))

Auto Shit fuck! 0.6128528118133545 s


In [6]:
user_id_dict = dict(zip( user_ids[0], user_ids.index ))

In [7]:
_start = time.time()
user_notif_id['user_id_row'] = parallelize( map_user_id, user_notif_id.user_id )
_end   = time.time()
duration = _end - _start
print('Map User IDs: {} s'.format( duration ))

Map User IDs: 443.5064949989319 s


## Notif IDs

In [17]:
notif_ids = pd.read_csv(
    'notifs_corrected.csv',
    usecols=[0,1,2,4],
    dtype={
        'notif_id': np.uint32,
        'day_of_week': np.uint8,
        'hour': np.uint8,
        'category': np.uint8
    }
)
notif_id_dict = dict(zip( notif_ids.notif_id, notif_ids.index ))

In [20]:
_start = time.time()
user_notif_id['notif_id_row'] = parallelize( map_notif_id, user_notif_id.notif_id )
_end   = time.time()
duration = _end - _start
print('Map Notif IDs: {} s'.format( duration ))

Map Notif IDs: 370.8269546031952 s


# Store

In [22]:
user_notif_id[['user_id_row','notif_id_row']].to_csv('user_notif_row_train.csv',index=False)

In [26]:
np.savez_compressed( 'targets_train.npz', y = targets.values[:,0] )

In [27]:
np.savez_compressed( 'train.npz',
                    user_id_rows = user_notif_id.user_id_row.values,
                    notif_id_rows = user_notif_id.notif_id_row.values,
                    targets = targets.values[:,0]
                   )

# Data Generator

# OnehotEncoding the notif features

In [14]:
notifs_df = pd.read_csv("notifs_corrected.csv",usecols=[0,1,2,4])

In [37]:
notifs_cat = notifs_df.drop(['notif_id'],axis = 1)

In [39]:
from sklearn.preprocessing import OneHotEncoder
enc = OneHotEncoder()

In [41]:
notif_encodede_features = enc.fit_transform(notifs_cat)

In [59]:
notif_encodede_features.todense().shape

(6347, 37)

In [56]:
notif_encoded = notif_encodede_features.todense()

In [60]:
notif_encoded.shape

(6347, 37)

In [61]:
np.savez_compressed( "notifs_encoded.npz", X = notif_encoded )

# Builiding a Model

# Load Data

In [None]:
X_user_sparse = np.load('users_cat_autoencoded.npz')['X']
X_user_dense = np.load('users_numeric_encoded.npz')['X']
X_notif = notif_encodede_features.todense()
# Indices
_indices_file = np.load('train.npz')
indices_train_user = _indices_file['user_id_rows']
indices_train_notif = _indices_file['notif_id_rows']
targets = _indices_file['targets']
# Parameters
n_features = X_user_sparse.shape[1] + X_user_dense.shape[1] + X_notif.shape[1]
n_observations = len(indices_train_user)
weights_alpha = np.mean( targets )

In [13]:
# Encoder Data Generator
class TrainDataGenerator(keras.utils.Sequence):
    'Generates data for Keras'
    def __init__(
            self,
            indices_train_user,
            indices_train_notif,
            targets,
            X_user_dense, X_user_sparse, X_notif,
            batch_size = 32,
            shuffle = True,
            weights_alpha = 0.5
        ):
        """Initialization
        """
        # Train Parameters
        self.indices_train_user, self.indices_train_notif = indices_train_user, indices_train_notif
        self.X_user_dense, self.X_user_sparse, self.X_notif = X_user_dense, X_user_sparse, X_notif
        self.targets = targets
        # Generation Parameters
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.n = len(indices_train_user)
        self.indices = np.arange( len(self.indices_train_user) )
        # Shuffle
        self.on_epoch_end()
    def __len__(self):
        'Denotes the number of batches per epoch'
        return self.n // self.batch_size

    def __getitem__(self, index):
        'Generate one batch of data'
        # Generate indices of the batch
        indices = self.indices[ index*self.batch_size : (index+1)*self.batch_size ]
        # X
        ## Indices
        indices_user = self.indices_train_user[ indices ]
        indices_notif = self.indices_train_notif[ indices ]
        ## Horizontal Stack
        _X = np.hstack([
            self.X_user_dense[indices_user],
            self.X_user_sparse[indices_user].todense(),
            self.X_notif[ indices_notif ]
        ], dtype = np.float)
        # _y
        _y = self.targets[ indices ]
        # weights
        weights = _y * ( 1 - 2*weights_alpha ) + weights_alpha
        return _X, _y, weights

    def on_epoch_end(self):
        'Updates indices after each epoch'
        if self.shuffle :
            np.random.shuffle(self.indices)

In [10]:
from sklearn.model_selection import train_test_split

indices_train, indices_validation = train_test_split(
    np.arange(len(indices_train_user)),
    test_size = 0.2,
    stratify = targets
)

In [None]:
# Train Generator
generator_train = TrainDataGenerator(
    indices_train_user[indices_train],
    indices_train_notif[indices_train],
    targets[indices_train],
    X_user_dense,
    X_user_sparse,
    X_notif,
    batch_size = 2**10
)
# Validation Generator
generator_validation = TrainDataGenerator(
    indices_train_user[indices_validation],
    indices_train_notif[indices_validation],
    targets[indices_validation],
    X_user_dense,
    X_user_sparse,
    X_notif,
    batch_size = 2**10
)

# Model and Training

In [48]:
from keras.models import Sequential,Model
from keras.layers import Dense, Activation, Dropout

In [50]:
model = Sequential()
# first layer
model.add(Dense(64,input_dim = n_features))
model.add(Activation('relu'))
model.add(Dropout(0.5))
# second layer
model.add(Dense(32))
model.add(Activation('relu'))
model.add(Dropout(0.5))
# the final layer
model.add(Dense(1))
model.add(Activation('softmax'))

In [None]:
model.compile(loss='binary_crossentropy',
              optimizer='rmsprop',
              metrics=['accuracy'])