In [None]:
# libraries
#!pip install rpy2
#!pip install pandas
#!pip install keras

Load the necessary packages, especially the keras layers using hte layer names directly

In [28]:
# libraries
#import rpy2.robjects as robjects
import pandas as pd
import keras
import numpy as np
from keras.layers import Input, Dense, Embedding, concatenate, Flatten, Dropout
from keras.layers.normalization import BatchNormalization
from keras.models import Model
from keras.regularizers import l2
from keras.optimizers import Adam

In [29]:
# load the data
#path = "/Users/Kozodoi/Documents/Competitions/DSG_2017/"
path = "/Users/maj/Dropbox/DSG17/DSG_2017/"
data   = pd.read_csv(path + "data/data_full.csv")

In [30]:
print(data.columns)

Index(['user_id', 'media_id', 'artist_id', 'genre_id', 'album_id',
       'context_type', 'media_duration', 'listen_type', 'user_gender',
       'user_age', 'is_listened', 'sample_id', 'dataset', 'time_lag',
       'session_id', 'song_session_position', 'first_flow', 'time_diff',
       'hours', 'genre_plays', 'genre_skips', 'artist_plays', 'artist_skips',
       'album_plays', 'album_skips', 'song_plays', 'song_skips',
       'user_ratio_flow', 'user_ratio_full', 'genre_ratio', 'artist_ratio',
       'song_ratio', 'platform_name1', 'platform_name2', 'platform_family1',
       'platform_family2'],
      dtype='object')


Separate the combined data into the training, test and unknown set

In [31]:
tr = data[data.dataset == 'train']
ts = data[data.dataset == 'test']
unknown = data[data.dataset == 'unknown']

In [32]:
# keep only Flow songs in the data
# this proves to predict better, but some information is lost
# !!! This currently messes up the ID assignment and needs to be corrected there.
#tr = tr.query("listen_type == 1")
print(tr.shape)
print(ts.shape)
print(tr.columns)

(7365595, 36)
(193239, 36)
Index(['user_id', 'media_id', 'artist_id', 'genre_id', 'album_id',
       'context_type', 'media_duration', 'listen_type', 'user_gender',
       'user_age', 'is_listened', 'sample_id', 'dataset', 'time_lag',
       'session_id', 'song_session_position', 'first_flow', 'time_diff',
       'hours', 'genre_plays', 'genre_skips', 'artist_plays', 'artist_skips',
       'album_plays', 'album_skips', 'song_plays', 'song_skips',
       'user_ratio_flow', 'user_ratio_full', 'genre_ratio', 'artist_ratio',
       'song_ratio', 'platform_name1', 'platform_name2', 'platform_family1',
       'platform_family2'],
      dtype='object')


In [33]:
tsu = ts.user_id.unique()
tru = tr.user_id.unique()
print([x for x in tsu if x not in tru])

[]


From the full feature set, create a subset of the data with the features that can be passed to the model directly. That is everything excluding the IDs and large factor variables.

In [34]:
# Create the data input matrix that can be passed to the keras model
# i.e. only numeric and without IDs and target variable
dropVars = ['dataset','user_id', 'artist_id', 'media_id', "genre_id", "album_id", "session_id", "is_listened", "sample_id"]
tr_data = tr[[column for column in tr.columns if column not in dropVars]].as_matrix()
ts_data = ts[[column for column in tr.columns if column not in dropVars]].as_matrix()
unknown_data = unknown[[column for column in tr.columns if column not in dropVars]].as_matrix()

The structure of the keras model is as follows:
- Input layers ot specify the size of the data that goes in the model
- (Embedding layer, i.e. lookup table layer that assigns 50 values to each level. These values are then trained to somehow capture the essence of this level.)
- Dense layer, i.e. fully connected neural net layers
- Output layer, i.e. Dense layer with only one result and sigmoid activation (for a result between 0 and 1)

In [46]:
# create an input layer with one row of IDs
user_in   = Input(shape = (1,), dtype='int64', name = "user_in")
#song_in   = Input(shape = (1,), dtype='int64', name = "song_in")
artist_in = Input(shape = (1,), dtype='int64', name = "artist_in")
genre_in  = Input(shape = (1,), dtype='int64', name = "genre_in")
context_in = Input(shape = (1,), dtype='int64',   name = "context_in")

# Create an embedding assigning k latent factors to each ID
# These will be optimized
# A regulariztaion is added to avoid very large weights
n_users   = tr.user_id.nunique()
#n_songs   = tr.media_id.nunique()
n_artists = tr.artist_id.nunique()
n_genres  = tr.genre_id.nunique()
n_context = tr.context_type.nunique()

# The layer needs the number of the input levels and the number of values for each level
user_embedding = Embedding(n_users,   50, input_length=1, embeddings_regularizer=l2(1e-5))(user_in)
#song_embedding = Embedding(n_songs,   50, input_length=1, embeddings_regularizer=l2(1e-5))(song_in)
artist_embedding = Embedding(n_artists, 20, input_length=1, embeddings_regularizer=l2(1e-5))(artist_in)
genre_embedding = Embedding(n_genres,  20, input_length=1, embeddings_regularizer=l2(1e-5))(genre_in)
context_embedding = Embedding(n_context, 50, input_length=1, embeddings_regularizer=l2(1e-5))(context_in)


In [47]:
# Data input doesn't need any embedding and can directly be passed to a Dense layer
data_in = Input(shape = (tr_data.shape[1],), name = "data_in")

In [49]:

# Specify what to do with the layers
# We want to multiply them into a 'rating' matrix
#song_embedding,
embedding_input = concatenate([user_embedding,  artist_embedding, genre_embedding])
embedding_input = Flatten()(embedding_input)
embedding_dense = Dense(128, activation = "relu")(embedding_input)
#embedding_dense = Flatten()(embedding_dense)

#data_input = Flatten()(data_in)
data_dense = Dense(128, activation = "relu")(data_in)
data_dense = BatchNormalization()(data_dense)
# Make into a vector, i.e. drop 2D structure
# The 2D structure is important for e.g. CNN filters,
# but not necessary in a dense layer, I think
x = concatenate([embedding_dense, data_dense])
x = Dropout(0.5)(x)
x = Dense(128, activation='relu')(x)
# Correct the standard devitation calculated from a batch
# to better fit the 'true' sd
x = BatchNormalization()(x)
# "Drop" each node at a training stage with a certain probability 
# then reinsert it after the training run
# Avoids overfitting and increases speed
x = Dropout(0.5)(x)
#x = Dropout(0.5)(Dense(128, activation='relu')(x))
#x = BatchNormalization()(x)
#x = Dense(64, activation='relu')(x) 
#x = Dropout(0.5)(x)
output = Dense(1, activation = "sigmoid")(x)
#x = merge([x, ub], mode = 'sum')
#x = merge([x, sb], mode = 'sum') # Can this be included in the line above?

# Then we specify the model that we want to use
#song_in,
model = Model([user_in,  artist_in, genre_in, data_in], output) # 
model.compile(Adam(0.001), loss="binary_crossentropy", metrics = ['accuracy'])

In [51]:
# run the estimations
#tr.media_id,
#ts.media_id,
model.fit([tr.user_id,  tr.artist_id, tr.genre_id, tr_data], tr.is_listened,  #
validation_data = ([ts.user_id,  ts.artist_id, ts.genre_id, ts_data], ts.is_listened),
batch_size = 22814, epochs = 1)

Train on 7365595 samples, validate on 193239 samples
Epoch 1/1
  45628/7365595 [..............................] - ETA: 558s - loss: 0.9659 - acc: 0.5096

IndexError: index 19588 is out of bounds for size 19588
Apply node that caused the error: AdvancedSubtensor1(flatten_4/embedding_9/embeddings, Elemwise{Cast{int32}}.0)
Toposort index: 76
Inputs types: [TensorType(float32, matrix), TensorType(int32, vector)]
Inputs shapes: [(19588, 20), (22814,)]
Inputs strides: [(80, 4), (4,)]
Inputs values: ['not shown', 'not shown']
Outputs clients: [[Reshape{3}(AdvancedSubtensor1.0, MakeVector{dtype='int64'}.0)]]

Backtrace when the node is created(use Theano flag traceback.limit=N to make it longer):
  File "/Users/maj/anaconda/envs/deeplearning/lib/python3.6/site-packages/ipykernel/zmqshell.py", line 533, in run_cell
    return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
  File "/Users/maj/anaconda/envs/deeplearning/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 2683, in run_cell
    interactivity=interactivity, compiler=compiler, result=result)
  File "/Users/maj/anaconda/envs/deeplearning/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 2787, in run_ast_nodes
    if self.run_code(code, result):
  File "/Users/maj/anaconda/envs/deeplearning/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 2847, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-46-6983e20fc25f>", line 20, in <module>
    artist_embedding = Embedding(n_artists, 20, input_length=1, embeddings_regularizer=l2(1e-5))(artist_in)
  File "/Users/maj/anaconda/envs/deeplearning/lib/python3.6/site-packages/keras/engine/topology.py", line 578, in __call__
    output = self.call(inputs, **kwargs)
  File "/Users/maj/anaconda/envs/deeplearning/lib/python3.6/site-packages/keras/layers/embeddings.py", line 120, in call
    out = K.gather(self.embeddings, inputs)
  File "/Users/maj/anaconda/envs/deeplearning/lib/python3.6/site-packages/keras/backend/theano_backend.py", line 404, in gather
    y = reference[indices]

HINT: Use the Theano flag 'exception_verbosity=high' for a debugprint and storage map footprint of this apply node.

In [None]:
# predict on unlabelled set
pred = pd.DataFrame()
pred["sample_id"] = unknown.sample_id
pred["is_listened"] = model.predict([unknown.userIdx, unknown.songIdx, unknown.artistIdx, unknown.genreIdx])
pred.head(5)

In [None]:
# adding naive submission
naive = pd.read_csv(path + "submissions/naive_ratio_user.csv")
pred_mean = pred
pred_mean["is_listened"] = (pred["is_listened"] + naive["is_listened"])/2
pred_mean.head(5)

In [None]:
# saving submissions
pred.to_csv(path + "submissions/deep_128_64_flow.csv", index = False)
pred.to_csv(path + "submissions/deep_128_64_flow_plus_ratio_user.csv", index = False)