In [33]:
# libraries
#!pip install rpy2
#!pip install pandas
#!pip install keras
#!pip install imblearn

Collecting imblearn
  Downloading imblearn-0.0-py2.py3-none-any.whl
Collecting imbalanced-learn (from imblearn)
  Downloading imbalanced-learn-0.2.1.tar.gz (90kB)
[K    100% |████████████████████████████████| 92kB 660kB/s 
Building wheels for collected packages: imbalanced-learn
  Running setup.py bdist_wheel for imbalanced-learn ... [?25l- \ | / done
[?25h  Stored in directory: /Users/Kozodoi/Library/Caches/pip/wheels/b8/20/bd/0b775f7e5d413ac72562b1a5126598bcb6e0eae10da659be9f
Successfully built imbalanced-learn
Installing collected packages: imbalanced-learn, imblearn
Successfully installed imbalanced-learn-0.2.1 imblearn-0.0


In [65]:
# libraries
import rpy2.robjects as robjects
import pandas as pd
import keras
from keras.layers import Input, Dense, Embedding, concatenate, Flatten, Dropout, Reshape
from keras.layers.normalization import BatchNormalization
from keras.models import Model
from keras.regularizers import l2
from keras.optimizers import Adam
from imblearn.over_sampling import SMOTE

## 1. DATA PREPARATION

In [66]:
# load the data
# The project root defaults to the original author's directory; set the
# DSG_2017_PATH environment variable to run the notebook from another location.
import os

# os.path.join(..., "") guarantees the trailing separator that the
# `path + "data/..."` concatenations throughout the notebook rely on
path = os.path.join(os.environ.get("DSG_2017_PATH", "/Users/Kozodoi/Documents/Competitions/DSG_2017/"), "")
known   = pd.read_csv(path + "data/train.csv")
unknown = pd.read_csv(path + "data/test.csv")

In [67]:
# keep only Flow songs in the data
# this proves to predict better, but some information is lost
# .copy() turns the filtered result into an independent frame, so the index
# columns assigned to `known` further down do not raise SettingWithCopyWarning
known = known.query("listen_type == 1").copy()
known.shape

(2319611, 15)

In [68]:
# limiting each user to their last 100 songs - performs worse, so it is disabled
#known = known.groupby(["user_id"]).tail(100)
#known.shape

(805378, 15)

In [69]:
# Prepare the data
def build_id_index(known_ids, unknown_ids, rare_cap=None):
    """Map the raw IDs of one categorical column to consecutive integer indices.

    IDs that occur in ``unknown_ids`` but not in ``known_ids``, together with
    IDs that occur exactly once in ``known_ids``, are pooled into one shared
    placeholder index. All remaining IDs get their own index 0..N-1 in order
    of first appearance in ``known_ids``; the placeholder index is N.

    Parameters
    ----------
    known_ids : pandas.Series
        ID column of the labelled data.
    unknown_ids : pandas.Series
        The same ID column of the unlabelled data.
    rare_cap : int or None
        When given (must be positive), only the last ``rare_cap`` single-occurrence
        IDs (in ``value_counts()`` order) are pooled; ``None`` pools all of them.

    Returns
    -------
    pooled : list
        Raw IDs sharing the placeholder index (may contain duplicates).
    kept : list of (index, raw_id) tuples
        The individually indexed IDs.
    mapping : dict
        Raw ID -> integer index, covering both ``kept`` and ``pooled``.
    """
    unseen = list(unknown_ids[~unknown_ids.isin(known_ids)])
    counts = known_ids.value_counts()
    rare = list(counts.index[(counts == 1).values])
    if rare_cap is not None:
        rare = rare[-rare_cap:]
    pooled = unseen + rare

    # membership is tested against a set: scanning the list for every unique ID
    # would be quadratic in the number of IDs
    pooled_lookup = set(pooled)
    kept = list(enumerate([i for i in known_ids.unique() if i not in pooled_lookup]))

    mapping = {o: i for i, o in kept}
    # kept indices are 0..len(kept)-1, so len(kept) is the first free index
    # (equal to max(mapping.values()) + 1, but also defined when nothing is kept)
    placeholder = len(kept)
    mapping.update({o: placeholder for o in pooled})
    return pooled, kept, mapping


# Create a placeholder for the IDs new in the test data or seen only once,
# assign IDs from 0 to N to everything else, and build old-ID -> new-ID dictionaries
# NOTE(review): single-occurrence users are all pooled, while every other column
# pools at most 1000 of them - confirm that the missing cap for users is intended
newUsers,   users,   userid2idx    = build_id_index(known.user_id,      unknown.user_id)
newSongs,   songs,   songid2idx    = build_id_index(known.media_id,     unknown.media_id,     rare_cap = 1000)
newArtists, artists, artistid2idx  = build_id_index(known.artist_id,    unknown.artist_id,    rare_cap = 1000)
newGenres,  genres,  genreid2idx   = build_id_index(known.genre_id,     unknown.genre_id,     rare_cap = 1000)
newAlbums,  albums,  albumid2idx   = build_id_index(known.album_id,     unknown.album_id,     rare_cap = 1000)
newContext, context, contextid2idx = build_id_index(known.context_type, unknown.context_type, rare_cap = 1000)

# Create id variables with the new IDs (known and unknown)
id_columns = [
    ("userIdx",    "user_id",      userid2idx),
    ("songIdx",    "media_id",     songid2idx),
    ("artistIdx",  "artist_id",    artistid2idx),
    ("genreIdx",   "genre_id",     genreid2idx),
    ("albumIdx",   "album_id",     albumid2idx),
    ("contextIdx", "context_type", contextid2idx),
]
for frame in (known, unknown):
    for idx_col, id_col, mapping in id_columns:
        # astype("int64") fails loudly if an ID is missing from the mapping,
        # instead of silently carrying NaN into the model inputs
        frame[idx_col] = frame[id_col].map(mapping).astype("int64")

In [70]:
# partition train/test data: last 3 songs per user go to validation (stage 1)
# model predicts better if trained on full known sample without validation (stage 2)
ts = known.groupby(["userIdx"]).tail(3)
tr = known.groupby(["userIdx"], group_keys=False).apply(lambda x: x[:-3])

# keep in ts only the rows whose user, song, artist, genre, album and context
# all occur in tr; every other row is moved to tr so that its IDs are seen in training
# (despite its name, strayObs is True for the rows that STAY in ts)
strayObs = (
    ts.songIdx.isin(tr.songIdx)
    & ts.userIdx.isin(tr.userIdx)
    & ts.artistIdx.isin(tr.artistIdx)
    & ts.genreIdx.isin(tr.genreIdx)
    & ts.albumIdx.isin(tr.albumIdx)
    & ts.contextIdx.isin(tr.contextIdx)
)
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0
tr = pd.concat([tr, ts[~strayObs]])
ts = ts[strayObs]

In [53]:
# write the training and validation samples to disk (without the row index)
for sample, target in ((tr, "data/tr_100.csv"), (ts, "data/ts_100.csv")):
    sample.to_csv(path + target, index = False)

## 2. MODELING

### 2.1. INITIALIZING

In [71]:
# create an input layer with one row of IDs
user_in    = Input(shape = (1,), dtype = 'int64',   name = "user_in")
song_in    = Input(shape = (1,), dtype = 'int64',   name = "song_in")
artist_in  = Input(shape = (1,), dtype = 'int64',   name = "artist_in")
genre_in   = Input(shape = (1,), dtype = 'int64',   name = "genre_in")
album_in   = Input(shape = (1,), dtype = 'int64',   name = "album_in")
context_in = Input(shape = (1,), dtype = 'int64',   name = "context_in")
num_in     = Input(shape = (1,), dtype = 'float32', name = "num_in")

# Reshaping numeric features
# (only used by the commented-out variant with one numeric feature)
n = Reshape((1,1))(num_in)

# Create an embedding assigning k latent factors to each ID
# These will be optimized
# A regularization is added to avoid very large weights
#
# An embedding needs one row per index value, i.e. (largest index + 1) rows.
# The sizes are taken from the indices present in known AND unknown instead of
# tr.<col>.nunique(): the placeholder index for new IDs may occur only in the
# test data, and tr may stem from a different filtering of known.
n_users   = int(max(known.userIdx.max(),    unknown.userIdx.max()))    + 1
n_songs   = int(max(known.songIdx.max(),    unknown.songIdx.max()))    + 1
n_artists = int(max(known.artistIdx.max(),  unknown.artistIdx.max()))  + 1
n_genres  = int(max(known.genreIdx.max(),   unknown.genreIdx.max()))   + 1
n_albums  = int(max(known.albumIdx.max(),   unknown.albumIdx.max()))   + 1
n_context = int(max(known.contextIdx.max(), unknown.contextIdx.max())) + 1

# Embeddings creation (50 latent factors per ID, L2 penalty on the weights)
u = Embedding(n_users,   50, input_length=1, embeddings_regularizer=l2(1e-5))(user_in)
s = Embedding(n_songs,   50, input_length=1, embeddings_regularizer=l2(1e-5))(song_in)
a = Embedding(n_artists, 50, input_length=1, embeddings_regularizer=l2(1e-5))(artist_in)
g = Embedding(n_genres,  50, input_length=1, embeddings_regularizer=l2(1e-5))(genre_in)
l = Embedding(n_albums,  50, input_length=1, embeddings_regularizer=l2(1e-5))(album_in)
c = Embedding(n_context, 50, input_length=1, embeddings_regularizer=l2(1e-5))(context_in)

# Specify what to do with the layers
# NOTE(review): the active variant leaves out the album embedding `l` as well as
# the numeric input - confirm that dropping the album is intended
#x = concatenate([u, s, a, g, l, c, n]) # with one numeric feature
x = concatenate([u, s, a, g, c])        # without numeric features
x = Flatten()(x)
x = Dense(128, activation='relu')(x)
x = Dropout(0.5)(x)
x = BatchNormalization()(x)
x = Dense(64, activation='relu')(x) 
x = Dropout(0.5)(x)
x = Dense(1, activation = "sigmoid")(x)

# Then we specify the model that we want to use
# (the order of the inputs here fixes the order of the arrays passed to fit/predict)
#model = Model([user_in, song_in, artist_in, genre_in, album_in, context_in, num_in], x) # with one numeric feature
model = Model([user_in, song_in, artist_in, genre_in, context_in], x)                    # without numeric features
model.compile(Adam(0.001), loss = "binary_crossentropy", metrics = ['accuracy'])

### 2.2. FIRST STAGE

In [61]:
# run the estimations on training data, monitoring the held-out validation rows
# batch size is 1/100 of the training sample; max(1, ...) keeps it valid
# when the sample has fewer than 100 rows (e.g. a small debugging subset)
model.fit([tr.userIdx, tr.songIdx, tr.artistIdx, tr.genreIdx, tr.contextIdx], tr.is_listened, 
validation_data = ([ts.userIdx, ts.songIdx, ts.artistIdx, ts.genreIdx, ts.contextIdx], ts.is_listened),
batch_size = max(1, len(tr) // 100), epochs = 10)

Train on 767272 samples, validate on 38106 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x122e580f0>

In [64]:
# score the validation sample and store the predictions next to their raw IDs
pred = ts[["user_id", "media_id"]].copy()
val_inputs = [ts[col] for col in ("userIdx", "songIdx", "artistIdx", "genreIdx", "contextIdx")]
pred["is_listened"] = model.predict(val_inputs)
pred.to_csv(path + "data/deep_128_64_flow_cont.csv", index = False)
pred.head(5)

### 2.3. SECOND STAGE

In [72]:
# run the estimations on full known data (no validation split)
# NOTE: fit() continues from the current weights of `model`; re-run the model
# definition cell first if this stage should start from a fresh network
# batch size is 1/100 of the sample; max(1, ...) keeps it valid for tiny samples
model.fit([known.userIdx, known.songIdx, known.artistIdx, known.genreIdx, known.contextIdx], known.is_listened,
batch_size = max(1, known.shape[0] // 100), epochs = 10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x13ebb1f98>

In [73]:
# score the unlabelled sample and write the submission file
pred = unknown[["sample_id"]].copy()
test_inputs = [unknown[col] for col in ("userIdx", "songIdx", "artistIdx", "genreIdx", "contextIdx")]
pred["is_listened"] = model.predict(test_inputs)
pred.to_csv(path + "submissions/deep_128_64_flow_cont.csv", index = False)
pred.head(5)

Unnamed: 0,sample_id,is_listened
0,0,0.999752
1,1,0.295713
2,2,0.49112
3,3,0.6316
4,4,0.814451


## 3. ENSEMBLING

In [74]:
# read the previously saved naive submission that will be blended with the network
naive_file = path + "submissions/naive_ratio_user.csv"
naive = pd.read_csv(naive_file)

In [75]:
# blend the network and the naive predictions with equal weights
# NOTE(review): the two series are added by index label, which presumably assumes
# that `naive` lists the samples in the same row order as the test file - confirm
blend_weight = 0.5
pred_mean = unknown[["sample_id"]].copy()
pred_mean["is_listened"] = blend_weight*pred["is_listened"] + blend_weight*naive["is_listened"]
pred_mean.to_csv(path + "submissions/deep_128_64_flow_cont_plus_ratio_user_0_5.csv", index = False)
pred_mean.head(5)

Unnamed: 0,sample_id,is_listened
0,0,0.999876
1,1,0.534862
2,2,0.685319
3,3,0.618228
4,4,0.853127
