In [2]:
# libraries
#!pip install rpy2
#!pip install pandas
#!pip install keras

Collecting rpy2
  Downloading rpy2-2.8.5.tar.gz (184kB)
[K    100% |████████████████████████████████| 184kB 518kB/s 
Building wheels for collected packages: rpy2
  Running setup.py bdist_wheel for rpy2 ... [?25l- \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | done
[?25h  Stored in directory: /Users/Kozodoi/Library/Caches/pip/wheels/23/9e/ee/0e5f6a00aafef9935d40ebf7657278220139f0101321e30d07
Successfully built rpy2
Installing collected packages: rpy2
Successfully installed rpy2-2.8.5


In [2]:
# libraries
#import rpy2.robjects as robjects
import pandas as pd
import keras
from keras.layers import Input, Dense, Embedding, concatenate, Flatten, Dropout
from keras.layers.normalization import BatchNormalization
from keras.models import Model
from keras.regularizers import l2
from keras.optimizers import Adam

Using Theano backend.


In [12]:
# load the data
#path = "/Users/Kozodoi/Documents/Competitions/DSG_2017/"
path = "/Users/maj/Dropbox/DSG17/DSG_2017/"
data   = pd.read_csv(path + "data/data_full.csv")

In [14]:
data.columns

Index(['user_id', 'media_id', 'artist_id', 'genre_id', 'album_id',
       'media_duration', 'listen_type', 'user_gender', 'user_age',
       'is_listened', 'sample_id', 'dataset', 'time_lag', 'session_id',
       'song_session_position', 'first_flow', 'time_diff', 'hours',
       'genre_plays', 'genre_skips', 'artist_plays', 'artist_skips',
       'album_plays', 'album_skips', 'song_plays', 'song_skips',
       'user_ratio_flow', 'user_ratio_full', 'genre_ratio', 'artist_ratio',
       'song_ratio', 'platform_name1', 'platform_name2', 'platform_family1',
       'platform_family2'],
      dtype='object')

In [15]:
tr = data.query("dataset == 'train'")
ts = data.query("dataset == 'test'")
unknown = data.query("dataset == 'unknown'")

In [16]:
# keep only Flow songs in the data
# this proves to predict better, but some information is lost
tr = tr.query("listen_type == 1")
print(tr.shape)
print(tr.columns)

(2275140, 35)
Index(['user_id', 'media_id', 'artist_id', 'genre_id', 'album_id',
       'media_duration', 'listen_type', 'user_gender', 'user_age',
       'is_listened', 'sample_id', 'dataset', 'time_lag', 'session_id',
       'song_session_position', 'first_flow', 'time_diff', 'hours',
       'genre_plays', 'genre_skips', 'artist_plays', 'artist_skips',
       'album_plays', 'album_skips', 'song_plays', 'song_skips',
       'user_ratio_flow', 'user_ratio_full', 'genre_ratio', 'artist_ratio',
       'song_ratio', 'platform_name1', 'platform_name2', 'platform_family1',
       'platform_family2'],
      dtype='object')


In [17]:
# Create the data input matrix that can be passed to the keras model
# i.e. only numeric and without IDs and target variable
dropVars = ['dataset','user_id', 'artist_id', 'media_id', "genre_id", "album_id", "session_id", "is_listened", "sample_id"]
tr_data = tr[[column for column in tr.columns if column not in dropVars]]
ts_data = ts[[column for column in tr.columns if column not in dropVars]]
unknown_data = unknown[[column for column in tr.columns if column not in dropVars]]

In [18]:
print(tr_data.shape)
print(tr_data.columns)
#tr_data = pd.get_dummies(tr_data)
tr_data.head()

(2275140, 26)
Index(['media_duration', 'listen_type', 'user_gender', 'user_age', 'time_lag',
       'song_session_position', 'first_flow', 'time_diff', 'hours',
       'genre_plays', 'genre_skips', 'artist_plays', 'artist_skips',
       'album_plays', 'album_skips', 'song_plays', 'song_skips',
       'user_ratio_flow', 'user_ratio_full', 'genre_ratio', 'artist_ratio',
       'song_ratio', 'platform_name1', 'platform_name2', 'platform_family1',
       'platform_family2'],
      dtype='object')


Unnamed: 0,media_duration,listen_type,user_gender,user_age,time_lag,song_session_position,first_flow,time_diff,hours,genre_plays,...,song_skips,user_ratio_flow,user_ratio_full,genre_ratio,artist_ratio,song_ratio,platform_name1,platform_name2,platform_family1,platform_family2
102,205,1,0,26,0.466667,34,1,4.0,15,2494456,...,379,0.983193,0.981766,0.978588,0.981766,0.981766,1,0,0,0
103,244,1,0,26,0.233333,35,0,2722.0,15,254,...,19,0.983193,0.981766,0.981766,0.981766,0.981766,1,0,0,0
104,237,1,0,26,0.166667,36,0,11.0,15,2494456,...,12,0.983193,0.981766,0.978588,0.981766,0.981766,1,0,0,0
105,244,1,0,26,0.4,37,0,1863.0,15,12213,...,415,0.983193,0.981766,0.981766,0.981766,0.981766,1,0,0,0
106,246,1,0,26,0.333333,38,0,11.0,15,2494456,...,649,0.983193,0.981766,0.978588,0.973684,0.981766,1,0,0,0


In [19]:
# create an input layer with one row of IDs
user_in   = Input(shape = (1,), dtype='int64', name = "user_in")
song_in   = Input(shape = (1,), dtype='int64', name = "song_in")
artist_in = Input(shape = (1,), dtype='int64', name = "artist_in")
genre_in  = Input(shape = (1,), dtype='int64', name = "genre_in")
data_in = Input(shape = (tr_data.shape[1],), name = "data_in")

# Create an embedding assigning k latent factors to each ID
# These will be optimized
# A regulariztaion is added to avoid very large weights
n_users   = tr.user_id.nunique()
n_songs   = tr.media_id.nunique()
n_artists = tr.artist_id.nunique()
n_genres  = tr.genre_id.nunique()
user_embedding = Embedding(n_users,   100, input_length=1, embeddings_regularizer=l2(1e-5))(user_in)
song_embedding = Embedding(n_songs,   50, input_length=1, embeddings_regularizer=l2(1e-5))(song_in)
artist_embedding = Embedding(n_artists, 20, input_length=1, embeddings_regularizer=l2(1e-5))(artist_in)
genre_embedding = Embedding(n_genres,  20, input_length=1, embeddings_regularizer=l2(1e-5))(genre_in)

# Specify what to do with the layers
# We want to multiply them into a 'rating' matrix
#x = concatenate([u, s])
embedding_input = concatenate([user_embedding, song_embedding, artist_embedding, genre_embedding])
#embedding_input = Flatten()(embedding_input)
embedding_dense = Dense(128, activation = "relu")(embedding_input)
embedding_dense = Flatten()(embedding_dense)

#data_input = Flatten()(data_in)
data_dense = Dense(128, activation = "relu")(data_in)
# Make into a vector, i.e. drop 2D structure
# The 2D structure is important for e.g. CNN filters,
# but not necessary in a dense layer, I think
x = concatenate([embedding_dense, data_dense])
x = Dense(128, activation='relu')(x)
# Correct the standard devitation calculated from a batch
# to better fit the 'true' sd
x = BatchNormalization()(x)
# "Drop" each node at a training stage with a certain probability 
# then reinsert it after the training run
# Avoids overfitting and increases speed
x = Dropout(0.5)(x)
#x = Dropout(0.5)(Dense(128, activation='relu')(x))
#x = BatchNormalization()(x)
#x = Dense(64, activation='relu')(x) 
#x = Dropout(0.5)(x)
output = Dense(1, activation = "sigmoid")(x)
#x = merge([x, ub], mode = 'sum')
#x = merge([x, sb], mode = 'sum') # Can this be included in the line above?

# Then we specify the model that we want to use
model = Model([user_in, song_in, artist_in, genre_in, data_in], output) # 
model.compile(Adam(0.001), loss="binary_crossentropy", metrics = ['accuracy'])

In [20]:
# run the estimations
model.fit([tr.user_id, tr.media_id, tr.artist_id, tr.genre_id, tr_data], tr.is_listened,  #
validation_data = ([ts.user_id, ts.media_id, ts.artist_id, ts.genre_id, ts_data], ts.is_listened),
batch_size = 22814, epochs = 1)

Train on 2275140 samples, validate on 193239 samples
Epoch 1/1


IndexError: indices are out-of-bounds

In [13]:
# predict on unlabelled set
pred = pd.DataFrame()
pred["sample_id"] = unknown.sample_id
pred["is_listened"] = model.predict([unknown.userIdx, unknown.songIdx, unknown.artistIdx, unknown.genreIdx])
pred.head(5)

Unnamed: 0,sample_id,is_listened
0,0,0.981744
1,1,0.783598
2,2,0.759315
3,3,0.388151
4,4,0.930634


In [14]:
# adding naive submission
naive = pd.read_csv(path + "submissions/naive_ratio_user.csv")
pred_mean = pred
pred_mean["is_listened"] = (pred["is_listened"] + naive["is_listened"])/2
pred_mean.head(5)

Unnamed: 0,sample_id,is_listened
0,0,0.990872
1,1,0.778805
2,2,0.819416
3,3,0.496504
4,4,0.911219


In [15]:
# saving submissions
pred.to_csv(path + "submissions/deep_128_64_flow.csv", index = False)
pred.to_csv(path + "submissions/deep_128_64_flow_plus_ratio_user.csv", index = False)