In [2]:
# install dependencies (uncomment and run once if a package is missing)
#!pip install rpy2
#!pip install pandas
#!pip install keras

Collecting rpy2
  Downloading rpy2-2.8.5.tar.gz (184kB)
[K    100% |████████████████████████████████| 184kB 518kB/s 
Building wheels for collected packages: rpy2
  Running setup.py bdist_wheel for rpy2 ... [?25l- \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | done
[?25h  Stored in directory: /Users/Kozodoi/Library/Caches/pip/wheels/23/9e/ee/0e5f6a00aafef9935d40ebf7657278220139f0101321e30d07
Successfully built rpy2
Installing collected packages: rpy2
Successfully installed rpy2-2.8.5


In [1]:
# libraries
import os

import rpy2.robjects as robjects
import pandas as pd
import keras
from keras.layers import Input, Dense, Embedding, concatenate, Flatten, Dropout
from keras.layers.normalization import BatchNormalization
from keras.models import Model
from keras.regularizers import l2
from keras.optimizers import Adam

Using TensorFlow backend.


In [2]:
# load the data
# The project root can be overridden with the DSG_2017_DIR environment
# variable; the default is the original local checkout.
# os.path.join(..., "") guarantees a trailing separator, because file names
# are built from `path` with plain string concatenation (path + "data/...").
path = os.path.join(
    os.environ.get("DSG_2017_DIR", "/Users/Kozodoi/Documents/Competitions/DSG_2017/"),
    "",
)
known = pd.read_csv(path + "data/train.csv")    # labelled rows (contain is_listened)
unknown = pd.read_csv(path + "data/test.csv")   # rows to score for the submission

In [3]:
# keep only Flow songs (listen_type == 1)
# .copy() detaches the filtered frame from the frame it was sliced from, so
# adding columns to `known` later does not raise SettingWithCopyWarning.
known = known.query("listen_type == 1").copy()
known.shape

(2319611, 15)

In [6]:
# Prepare the data
# Every raw ID column is re-indexed to consecutive integers 0..K-1 so it can
# feed an Embedding layer. IDs that the model cannot learn anything about
# share one extra "placeholder" index K:
#   * IDs that occur in the test data but not in the training data, and
#   * rare IDs that occur exactly once in the training data
#     (all of them for users, at most the last 1000 for the other columns).


def build_id_index(known_ids, unknown_ids, max_rare=None):
    """Map raw IDs to consecutive integer indices with a shared placeholder.

    Parameters
    ----------
    known_ids : pandas.Series
        Raw IDs of the training rows.
    unknown_ids : pandas.Series
        Raw IDs of the rows to predict.
    max_rare : int or None
        If given, only the last `max_rare` IDs (in value_counts order) that
        occur exactly once in `known_ids` are folded into the placeholder;
        None folds all of them.

    Returns
    -------
    dict
        raw ID -> index. Regular IDs get 0..K-1 in order of first appearance
        in `known_ids`; every unseen or rare ID gets K.
    """
    counts = known_ids.value_counts()
    rare_ids = list(counts.index[(counts == 1).values])
    if max_rare is not None:
        # guard max_rare == 0: a plain [-0:] slice would keep the whole list
        rare_ids = rare_ids[-max_rare:] if max_rare > 0 else []

    unseen_ids = unknown_ids[~unknown_ids.isin(known_ids)]

    # a set makes the membership test below O(1) instead of a list scan
    placeholder_ids = set(unseen_ids) | set(rare_ids)

    regular_ids = [raw for raw in known_ids.unique() if raw not in placeholder_ids]
    id2idx = {raw: idx for idx, raw in enumerate(regular_ids)}

    # regular indices are 0..K-1, so the next free index is len(id2idx)
    placeholder_idx = len(id2idx)
    id2idx.update(dict.fromkeys(placeholder_ids, placeholder_idx))
    return id2idx


# Create a dictionary with old IDs to new IDs (incl. the placeholder entries)
userid2idx   = build_id_index(known.user_id,   unknown.user_id)
songid2idx   = build_id_index(known.media_id,  unknown.media_id,  max_rare=1000)
albumid2idx  = build_id_index(known.album_id,  unknown.album_id,  max_rare=1000)
artistid2idx = build_id_index(known.artist_id, unknown.artist_id, max_rare=1000)
genreid2idx  = build_id_index(known.genre_id,  unknown.genre_id,  max_rare=1000)

# Create id variable with the new IDs
id_maps = [
    ('userIdx',   'user_id',   userid2idx),
    ('songIdx',   'media_id',  songid2idx),
    ('albumIdx',  'album_id',  albumid2idx),
    ('artistIdx', 'artist_id', artistid2idx),
    ('genreIdx',  'genre_id',  genreid2idx),
]
for frame in (known, unknown):
    for idx_col, raw_col, id2idx in id_maps:
        mapped = frame[raw_col].map(id2idx)
        # .map() yields NaN instead of raising on a missing key, so fail loudly
        assert mapped.notna().all(), "unmapped IDs in column " + raw_col
        frame[idx_col] = mapped.astype('int64')

In [11]:
# partition train/test data
# The last 3 observations of every user form the validation set (ts),
# everything before them the training set (tr).
ts = known.groupby(["userIdx"]).tail(3) # last 3 observations by user
tr = known.groupby(["userIdx"], group_keys=False).apply(lambda x: x[:-3])

# A validation row is only usable if every one of its indices (song, user,
# album, artist, genre) also occurs in tr. Rows with any index that appears
# only in ts are moved to tr instead.
seenInTrain = (
    ts.songIdx.isin(tr.songIdx)
    & ts.userIdx.isin(tr.userIdx)
    & ts.albumIdx.isin(tr.albumIdx)
    & ts.artistIdx.isin(tr.artistIdx)
    & ts.genreIdx.isin(tr.genreIdx)
)
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0
tr = pd.concat([tr, ts[~seenInTrain]])
ts = ts[seenInTrain]

In [12]:
# create an input layer with one row of IDs
user_in   = Input(shape = (1,), dtype='int64', name = "user_in")
song_in   = Input(shape = (1,), dtype='int64', name = "song_in")
album_in  = Input(shape = (1,), dtype='int64', name = "album_in")
artist_in = Input(shape = (1,), dtype='int64', name = "artist_in")
genre_in  = Input(shape = (1,), dtype='int64', name = "genre_in")

# Embedding vocabulary sizes.
# An Embedding needs input_dim = largest index + 1. The sizes are taken from
# the ID maps rather than from tr.*.nunique(), so the placeholder index used
# for IDs that only occur in the test data is always inside the table.
n_users   = max(userid2idx.values()) + 1
n_songs   = max(songid2idx.values()) + 1
n_albums  = max(albumid2idx.values()) + 1
n_artists = max(artistid2idx.values()) + 1
n_genres  = max(genreid2idx.values()) + 1


def embed_ids(id_input, n_ids, n_factors=50, weight_decay=1e-5):
    """Embed an integer ID input into `n_factors` trainable latent factors.

    An L2 regularization (`weight_decay`) is added to avoid very large weights.
    """
    return Embedding(n_ids, n_factors, input_length=1,
                     embeddings_regularizer=l2(weight_decay))(id_input)


# Create an embedding assigning k latent factors to each ID
# These will be optimized
u = embed_ids(user_in,   n_users)
s = embed_ids(song_in,   n_songs)
l = embed_ids(album_in,  n_albums)   # was wired to artist_in, leaving album_in unused
a = embed_ids(artist_in, n_artists)
g = embed_ids(genre_in,  n_genres)

# Specify what to do with the layers
# The five embeddings are concatenated and passed through a fully connected
# network whose sigmoid output is trained against is_listened.
x = concatenate([u, s, l, a, g])
x = Flatten()(x)
x = Dense(128, activation='relu')(x)
x = Dropout(0.5)(x)
x = BatchNormalization()(x)
x = Dropout(0.5)(Dense(128, activation='relu')(x))
x = BatchNormalization()(x)
x = Dense(64, activation='relu')(x)
x = Dropout(0.5)(x)
x = Dense(1, activation = "sigmoid")(x)

# Then we specify the model that we want to use
model = Model([user_in, song_in, album_in, artist_in, genre_in], x)
model.compile(Adam(0.001), loss="binary_crossentropy", metrics = ['accuracy'])

In [None]:
# run the estimations
# Inputs are passed in the same order as the model's Input layers:
# user, song, album, artist, genre.
train_inputs = [tr.userIdx, tr.songIdx, tr.albumIdx, tr.artistIdx, tr.genreIdx]
valid_inputs = [ts.userIdx, ts.songIdx, ts.albumIdx, ts.artistIdx, ts.genreIdx]
model.fit(
    train_inputs,
    tr.is_listened,
    validation_data=(valid_inputs, ts.is_listened),
    batch_size=22813,
    epochs=300,
)

Train on 2281342 samples, validate on 38269 samples
Epoch 1/300

In [None]:
# predict on the unlabelled (test) set
# Inputs follow the order of the model's Input layers:
# user, song, album, artist, genre.
test_inputs = [unknown.userIdx, unknown.songIdx, unknown.albumIdx, unknown.artistIdx, unknown.genreIdx]

pred = pd.DataFrame()
pred["sample_id"] = unknown.sample_id
# the model has a single output unit, so predict() returns shape (n, 1);
# flatten it to one score per row before storing it as a column
pred["is_listened"] = model.predict(test_inputs).ravel()
pred.head(5)

In [None]:
# write the predictions to the submission file (without the row index)
submission_file = path + "submissions/deep_nikita_3.csv"
pred.to_csv(submission_file, index=False)