In [1]:
# libraries
#!pip install rpy2
#!pip install pandas
#!pip install keras

Load the necessary packages. The keras layers are imported by name so that they can be used directly.

In [2]:
# libraries
#import rpy2.robjects as robjects

# Data
import pandas as pd
import numpy as np
# ML
from sklearn import preprocessing, metrics
# Keras
import keras
from keras.layers import Input, Dense, Embedding, concatenate, Flatten, Dropout
from keras.layers.normalization import BatchNormalization
from keras.models import Model
from keras.regularizers import l2
from keras.optimizers import Adam

Using Theano backend.


In [3]:
# Load the combined data set from the project folder.
# `path` must keep its trailing slash: file names are appended to it by plain
# string concatenation, both here and where predictions are written to disk.
# Alternative project location: "/Users/Kozodoi/Documents/Competitions/DSG_2017/"
path = "/Users/hauptjoh/Dropbox/DSG17/DSG_2017/"
data_file = path + "data/data_full.csv"
data = pd.read_csv(data_file)

In [4]:
# List every column of the combined data to see which features are available
print(data.columns)

Index(['user_id', 'media_id', 'artist_id', 'genre_id', 'album_id',
       'context_type', 'media_duration', 'listen_type', 'user_gender',
       'user_age', 'is_listened', 'sample_id', 'dataset', 'time_lag',
       'session_id', 'song_session_position', 'first_flow', 'time_diff',
       'release_year', 'genre_plays', 'genre_skips', 'artist_plays',
       'artist_skips', 'album_plays', 'album_skips', 'song_plays',
       'song_skips', 'user_ratio_flow', 'user_ratio_full', 'genre_ratio',
       'artist_ratio', 'song_ratio', 'platform_name0', 'platform_name1',
       'platform_name2', 'platform_family1', 'platform_family2',
       'hour_of_day2', 'hour_of_day3', 'hour_of_day4', 'hour_of_day5',
       'hour_of_day6', 'hour_of_day7', 'hour_of_day8', 'weekdayMon',
       'weekdaySat', 'weekdaySun', 'weekdayThu', 'weekdayTue', 'weekdayWed'],
      dtype='object')


Separate the combined data into the training, test and unknown sets.

In [5]:
# Row masks for the partitions encoded in the `dataset` column
is_train = data.dataset == 'train'
is_test = data.dataset == 'test'
is_unknown = data.dataset == 'unknown'

# Training rows, restricted to listen_type == 1 (Flow songs).
# This proves to predict better, but some information is lost.
tr = data[is_train & (data.listen_type == 1)]

ts = data[is_test]
# Train + test rows together, without the listen_type restriction
known = data[is_train | is_test]
unknown = data[is_unknown]

In [6]:
# Sanity check: size of the training and test partitions, then the training columns
for summary in (tr.shape, ts.shape, tr.columns):
    print(summary)

(2179592, 50)
(140019, 50)
Index(['user_id', 'media_id', 'artist_id', 'genre_id', 'album_id',
       'context_type', 'media_duration', 'listen_type', 'user_gender',
       'user_age', 'is_listened', 'sample_id', 'dataset', 'time_lag',
       'session_id', 'song_session_position', 'first_flow', 'time_diff',
       'release_year', 'genre_plays', 'genre_skips', 'artist_plays',
       'artist_skips', 'album_plays', 'album_skips', 'song_plays',
       'song_skips', 'user_ratio_flow', 'user_ratio_full', 'genre_ratio',
       'artist_ratio', 'song_ratio', 'platform_name0', 'platform_name1',
       'platform_name2', 'platform_family1', 'platform_family2',
       'hour_of_day2', 'hour_of_day3', 'hour_of_day4', 'hour_of_day5',
       'hour_of_day6', 'hour_of_day7', 'hour_of_day8', 'weekdayMon',
       'weekdaySat', 'weekdaySun', 'weekdayThu', 'weekdayTue', 'weekdayWed'],
      dtype='object')


In [7]:
# Check that the IDs are valid: for each ID column, print the number of distinct
# values in the full data followed by the number in the training set.
# The second value of each pair should not be smaller than the first.
for id_column in ["user_id", "artist_id", "genre_id", "media_id", "context_type"]:
    print(data[id_column].nunique())
    print(tr[id_column].nunique())

13811
13811
20190
20190
1493
1493
76267
76267
34
34


The largest ID should be equal to the number of unique values minus 1, because the IDs are zero-based (Python indexing).

In [8]:
# Largest song ID in the training set (compare with the number of unique song IDs)
tr.media_id.max()

76266

From the full feature set, create a subset of the data with the features that can be passed to the model directly. That is everything excluding the IDs and large factor variables.

In [9]:
# Create the data input matrices that can be passed to the keras model,
# i.e. only the numeric columns, without the IDs and the target variable.
dropVars = ['dataset','user_id', 'artist_id', 'media_id', "genre_id", "album_id", "session_id", "is_listened", "sample_id"]

# Work out the feature columns once. All four frames are slices of the same
# `data` frame, so one shared list guarantees an identical column order in every
# matrix (the scaler and the model depend on matching column positions).
feature_columns = [column for column in tr.columns if column not in dropVars]

# `.values` replaces `DataFrame.as_matrix()`, which is deprecated and was
# removed in pandas 1.0; it returns the same numpy array.
tr_data = tr[feature_columns].values
ts_data = ts[feature_columns].values
known_data = known[feature_columns].values
unknown_data = unknown[feature_columns].values

The numeric data should be normalized to bring it to a similar range. This helps the network by making sure that the weights can also be in a similar, small range and do not need to take into account the different scales.

In [10]:
# Standardize the features.
# The scaler is fitted on the training matrix only, so the other partitions are
# later transformed with the training statistics.
scaler = preprocessing.StandardScaler()
scaler = scaler.fit(tr_data)

In [11]:
# Apply the scaler (fitted on the training matrix) to every feature matrix.
# NOTE: each matrix is overwritten by its scaled version, so running this cell
# a second time would scale already-scaled data. Re-run the cell that builds
# the matrices before re-running this one.
tr_data = scaler.transform(tr_data)
ts_data = scaler.transform(ts_data)
known_data = scaler.transform(known_data)
unknown_data = scaler.transform(unknown_data)

The structure of the keras model is as follows:
- Input layers to specify the size of the data that goes into the model
- (Embedding layer, i.e. lookup table layer that assigns 50 values to each level. These values are then trained to somehow capture the essence of this level.)
- Dense layer, i.e. fully connected neural net layers
- Output layer, i.e. Dense layer with only one result and sigmoid activation (for a result between 0 and 1)

In [12]:
# Input layers: each sample carries exactly one integer ID per categorical variable
user_in   = Input(shape = (1,), dtype='int64', name = "user_in")
song_in   = Input(shape = (1,), dtype='int64', name = "song_in")
artist_in = Input(shape = (1,), dtype='int64', name = "artist_in")
genre_in  = Input(shape = (1,), dtype='int64', name = "genre_in")
context_in = Input(shape = (1,), dtype='int64',   name = "context_in")

# Number of rows needed in each embedding table.
# An Embedding layer is a lookup table indexed by the raw ID, so it needs
# (largest ID + 1) rows. The sizes are taken from the full data rather than from
# the number of distinct training IDs, because test and unknown rows are fed
# through the same layers: an ID missing from the training subset, or a gap in
# the ID range, would otherwise produce an out-of-range lookup.
# For contiguous zero-based IDs this equals the number of unique values.
n_users   = int(data.user_id.max()) + 1
n_songs   = int(data.media_id.max()) + 1
n_artists = int(data.artist_id.max()) + 1
n_genres  = int(data.genre_id.max()) + 1
n_context = int(data.context_type.max()) + 1

# Create an embedding assigning 50 latent factors to each ID.
# These factors are optimized during training; the L2 regularization on the
# embedding weights discourages very large values.
user_embedding = Embedding(n_users,   50, input_length=1, embeddings_regularizer=l2(1e-5))(user_in)
song_embedding = Embedding(n_songs,   50, input_length=1, embeddings_regularizer=l2(1e-5))(song_in)
artist_embedding = Embedding(n_artists, 50, input_length=1, embeddings_regularizer=l2(1e-5))(artist_in)
genre_embedding = Embedding(n_genres,  50, input_length=1, embeddings_regularizer=l2(1e-5))(genre_in)
# NOTE(review): context_in / context_embedding are created here, but the model
# assembled below neither concatenates this embedding nor lists context_in as an
# input, so the context embedding is currently unused.
context_embedding = Embedding(n_context, 50, input_length=1, embeddings_regularizer=l2(1e-5))(context_in)


In [13]:
# The numeric features need no embedding: they enter the network as one vector
# per sample and go straight into a Dense layer.
n_features = tr_data.shape[1]
data_in = Input(shape=(n_features,), name="data_in")

In [14]:
# --- ID branch -------------------------------------------------------------
# Join the user, song, artist and genre embeddings. Each embedding has a
# length-1 sequence axis; Flatten drops it so the Dense layer receives one flat
# vector per sample.
id_features = concatenate([user_embedding, song_embedding, artist_embedding, genre_embedding])
id_features = Flatten()(id_features)
embedding_dense = Dense(256, activation="relu")(id_features)
embedding_dense = BatchNormalization()(embedding_dense)

# --- Numeric branch --------------------------------------------------------
# The scaled feature matrix is already flat and feeds a Dense layer directly.
data_dense = Dense(256, activation="relu")(data_in)
data_dense = BatchNormalization()(data_dense)

# --- Joint head ------------------------------------------------------------
# Merge both branches. Dropout randomly switches off units during training to
# reduce overfitting; BatchNormalization re-normalizes the activations of each
# mini-batch.
hidden = concatenate([embedding_dense, data_dense])
hidden = Dropout(0.5)(hidden)
hidden = Dense(256, activation='relu')(hidden)
hidden = BatchNormalization()(hidden)
hidden = Dropout(0.5)(hidden)
hidden = Dense(128, activation='relu')(hidden)
hidden = Dropout(0.5)(hidden)
# A single sigmoid unit yields a value between 0 and 1
output = Dense(1, activation="sigmoid")(hidden)

# Assemble and compile the model. The order of the inputs here fixes the order
# in which the arrays must be passed to fit() and predict().
model_inputs = [user_in, song_in, artist_in, genre_in, data_in]
model = Model(model_inputs, output)
model.compile(optimizer="Adagrad", loss="binary_crossentropy", metrics=['accuracy'])

In [None]:
# Fit the model on the training rows and monitor the loss on the test partition.
# The input lists follow the order of the model inputs: user, song, artist,
# genre, numeric features.
train_inputs = [tr.user_id, tr.media_id, tr.artist_id, tr.genre_id, tr_data]
valid_inputs = [ts.user_id, ts.media_id, ts.artist_id, ts.genre_id, ts_data]
model.fit(
    train_inputs, tr.is_listened,
    validation_data=(valid_inputs, ts.is_listened),
    batch_size=22814, epochs=10
)

Train on 2179592 samples, validate on 140019 samples
Epoch 1/10
 365024/2179592 [====>.........................] - ETA: 341s - loss: 0.4455 - acc: 0.7868

In [16]:
# Predict the validation (test) partition, save the predictions and score them.
valid_scores = model.predict([ts.user_id, ts.media_id, ts.artist_id, ts.genre_id, ts_data])

pred = pd.DataFrame()
pred["user_id"]  = ts.user_id
pred["media_id"] = ts.media_id
# The model has a single output unit, so predict() returns an (n, 1) array;
# flatten it so that the column holds a plain 1-D vector.
pred["is_listened"] = valid_scores.ravel()
pred.to_csv(path + "data/keras_ts_emd+data_epoch10+5.csv", index = False)

# A bare `pred.head(5)` in the middle of a cell is silently discarded (only the
# last expression of a cell is displayed), so print the preview explicitly.
print(pred.head(5))

# Area under the ROC curve on the validation partition (shown as the cell output)
metrics.roc_auc_score(ts.is_listened, pred.is_listened)

0.69867476601425826

In [17]:
# Continue training the already fitted model on all known rows (train + test).
# The batch size is one hundredth of the number of known rows.
known_inputs = [known.user_id, known.media_id, known.artist_id, known.genre_id, known_data]
model.fit(known_inputs, known.is_listened,
          batch_size=len(known) // 100, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x1a4bc4160>

In [23]:
# Score the unknown rows and write the submission file.
unknown_inputs = [unknown.user_id, unknown.media_id, unknown.artist_id, unknown.genre_id, unknown_data]
pred = pd.DataFrame(
    {
        "sample_id": unknown.sample_id.astype('int'),
        # predict() returns one column per output unit; flatten to a 1-D vector
        "is_listened": model.predict(unknown_inputs).ravel(),
    },
    columns=["sample_id", "is_listened"]
)
pred.to_csv(path + "submissions/keras_uk_emb+data_epoch10+5_formatting.csv", index = False)
pred.head(5)

Unnamed: 0,sample_id,is_listened
6317,14561,0.968824
12558,6026,0.848141
18770,9627,0.989767
24352,6064,0.838861
29779,8065,0.17218
