In [33]:
# one-time installation of the libraries used below (uncomment to run)
# NOTE(review): inside a notebook `%pip install pkg==<version>` is usually preferable to
# `!pip`, since it targets the running kernel's environment and pins the version -- confirm
#!pip install rpy2
#!pip install pandas
#!pip install keras
#!pip install imblearn

Collecting imblearn
  Downloading imblearn-0.0-py2.py3-none-any.whl
Collecting imbalanced-learn (from imblearn)
  Downloading imbalanced-learn-0.2.1.tar.gz (90kB)
[K    100% |████████████████████████████████| 92kB 660kB/s 
Building wheels for collected packages: imbalanced-learn
  Running setup.py bdist_wheel for imbalanced-learn ... [?25l- \ | / done
[?25h  Stored in directory: /Users/Kozodoi/Library/Caches/pip/wheels/b8/20/bd/0b775f7e5d413ac72562b1a5126598bcb6e0eae10da659be9f
Successfully built imbalanced-learn
Installing collected packages: imbalanced-learn, imblearn
Successfully installed imbalanced-learn-0.2.1 imblearn-0.0


In [1]:
# libraries
import os

import rpy2.robjects as robjects
import pandas as pd
import keras
from keras.layers import Input, Dense, Embedding, concatenate, Flatten, Dropout, Reshape
from keras.layers.normalization import BatchNormalization
from keras.models import Model
from keras.regularizers import l2
from keras.optimizers import Adam

Using TensorFlow backend.


## 1. DATA PREPARATION

In [9]:
# load the data
# The project root can be overridden with the DSG_2017_PATH environment variable so the
# notebook also runs outside the original author's machine; the default is unchanged.
# os.path.join(..., "") guarantees a trailing separator, because every other cell builds
# file names with `path + "subdir/file.csv"`.
path = os.path.join(
    os.environ.get("DSG_2017_PATH", "/Users/Kozodoi/Documents/Competitions/DSG_2017/"),
    "",
)
known   = pd.read_csv(path + "data/train.csv")
unknown = pd.read_csv(path + "data/test.csv")

In [10]:
# keep only Flow songs in the data
# this proves to predict better, but some information is lost
# .copy() makes the filtered frame an independent object: later cells add columns to
# `known`, which would otherwise trigger pandas' SettingWithCopyWarning.
known = known.query("listen_type == 1").copy()
known.shape

(2319611, 15)

In [11]:
# adding observation index
# keep the original row label as a regular column so predictions can be matched back
# to the source rows; assign() builds a new frame instead of writing into the filtered
# result, which avoids SettingWithCopyWarning and keeps the cell idempotent
known = known.assign(row_index=known.index)

In [12]:
# Prepare the data
# Raw user / song / artist / context IDs are re-encoded as dense integer indices
# 0..N-1. IDs that are new in the test data, or that occur exactly once in the known
# data, all share one extra "placeholder" index N.


def _placeholder_ids(known_ids, unknown_ids, rare_cap=None):
    """Return the list of IDs that will share the placeholder index.

    The list holds every value of `unknown_ids` that is absent from `known_ids`
    (duplicates kept), followed by the IDs occurring exactly once in `known_ids`.
    If `rare_cap` is given, only the last `rare_cap` of those singleton IDs
    (in value_counts() order) are used.
    """
    counts = known_ids.value_counts()                  # computed once, reused below
    rare = list(counts.index[counts.values == 1])
    if rare_cap is not None:
        rare = rare[-rare_cap:]
    return list(unknown_ids[~unknown_ids.isin(known_ids)]) + rare


def _regular_ids(known_ids, placeholder_ids):
    """Enumerate the known IDs that keep an index of their own: [(idx, raw_id), ...]."""
    skip = set(placeholder_ids)                        # O(1) membership instead of a list scan
    return list(enumerate([i for i in known_ids.unique() if i not in skip]))


def _id2idx(regular, placeholder_ids):
    """Build the raw-ID -> index dictionary, sending all placeholder IDs to one index."""
    mapping = {o: i for i, o in regular}
    # computed once: the first index after the regular ones (0 if there are none)
    placeholder_idx = max(mapping.values(), default=-1) + 1
    mapping.update({o: placeholder_idx for o in placeholder_ids})
    return mapping


def _to_idx(ids, id2idx):
    """Translate a Series of raw IDs into indices; raise KeyError on an unmapped ID."""
    idx = ids.map(id2idx)
    missing = idx.isna()
    if missing.any():
        raise KeyError(list(ids[missing].unique()[:5]))
    return idx.astype("int64")


# Create a placeholder for the IDs new in the test data
# (for songs, artists and context only the last 1000 singleton IDs are included)
newUsers   = _placeholder_ids(known.user_id,      unknown.user_id)
newSongs   = _placeholder_ids(known.media_id,     unknown.media_id,     rare_cap=1000)
newArtists = _placeholder_ids(known.artist_id,    unknown.artist_id,    rare_cap=1000)
newContext = _placeholder_ids(known.context_type, unknown.context_type, rare_cap=1000)

# In particular, assign IDs from 0 to N for users and songs
# Use enumerate() to create a list of the new and original IDs
users   = _regular_ids(known.user_id,      newUsers)
songs   = _regular_ids(known.media_id,     newSongs)
artists = _regular_ids(known.artist_id,    newArtists)
context = _regular_ids(known.context_type, newContext)

# Create a dictionary with old IDs to new IDs, including the new/rare entries
userid2idx    = _id2idx(users,   newUsers)
songid2idx    = _id2idx(songs,   newSongs)
artistid2idx  = _id2idx(artists, newArtists)
contextid2idx = _id2idx(context, newContext)

# Create id variables with the new IDs (known)
known['userIdx']    = _to_idx(known.user_id,      userid2idx)
known['songIdx']    = _to_idx(known.media_id,     songid2idx)
known['artistIdx']  = _to_idx(known.artist_id,    artistid2idx)
known['contextIdx'] = _to_idx(known.context_type, contextid2idx)

# Create id variables with the new IDs (unknown)
unknown['userIdx']    = _to_idx(unknown.user_id,      userid2idx)
unknown['songIdx']    = _to_idx(unknown.media_id,     songid2idx)
unknown['artistIdx']  = _to_idx(unknown.artist_id,    artistid2idx)
unknown['contextIdx'] = _to_idx(unknown.context_type, contextid2idx)

In [13]:
# partition train/validation data: last 3 songs per user go to validation (stage 1)
# model predicts better if trained on full known sample without validation (stage 2)
ts = known.groupby(["userIdx"]).tail(3)
tr = known.groupby(["userIdx"], group_keys=False).apply(lambda x: x[:-3])

# A validation row is only usable if its song, user, artist and context all occur in
# the training part. NOTE: despite its name, strayObs is True for the rows that STAY in
# validation; the rows where it is False are the ones moved to training.
strayObs = ts.songIdx.isin(tr.songIdx) & ts.userIdx.isin(tr.userIdx) & ts.artistIdx.isin(tr.artistIdx) & ts.contextIdx.isin(tr.contextIdx)
# pd.concat replaces DataFrame.append, which was deprecated and then removed in pandas 2.0
tr = pd.concat([tr, ts[~strayObs]])
ts = ts[strayObs]

In [7]:
# saving the data samples
#tr.to_csv(path + "data/tr.csv", index = False)
#ts.to_csv(path + "data/ts.csv", index = False)

## 2. MODELING

### 2.1. INITIALIZING

In [14]:
# Create an input layer with one row of IDs
# NOTE: only user_in and song_in are wired into the model below; artist_in, context_in
# and num_in (and the layers built on them) are defined but currently unused.
user_in    = Input(shape = (1,), dtype = 'int64',   name = "user_in")
song_in    = Input(shape = (1,), dtype = 'int64',   name = "song_in")
artist_in  = Input(shape = (1,), dtype = 'int64',   name = "artist_in")
context_in = Input(shape = (1,), dtype = 'int64',   name = "context_in")
num_in     = Input(shape = (1,), dtype = 'float32', name = "num_in")

# Reshaping numeric features
n = Reshape((1,1))(num_in)

# Create an embedding assigning k latent factors to each ID
# These will be optimized
# A regularization is added to avoid very large weights
#
# The embedding size must be (largest index + 1). It is derived from the id -> index
# dictionaries rather than from tr.*.nunique(): the placeholder index used for new IDs
# may occur only in the test data, in which case nunique() on the training data is one
# too small and predicting on `unknown` would look up an out-of-range row.
n_users   = max(userid2idx.values())    + 1
n_songs   = max(songid2idx.values())    + 1
n_artists = max(artistid2idx.values())  + 1
n_context = max(contextid2idx.values()) + 1

# Embeddings creation
u = Embedding(n_users,   100, input_length=1, embeddings_regularizer=l2(1e-5))(user_in)
s = Embedding(n_songs,   100, input_length=1, embeddings_regularizer=l2(1e-5))(song_in)
a = Embedding(n_artists, 100, input_length=1, embeddings_regularizer=l2(1e-5))(artist_in)
c = Embedding(n_context, 100, input_length=1, embeddings_regularizer=l2(1e-5))(context_in)

# Specify what to do with the layers:
# user and song embeddings are concatenated and passed through two dense layers
x = concatenate([u, s])
x = Flatten()(x)
x = Dense(128, activation='relu')(x)
x = Dropout(0.5)(x)
x = BatchNormalization()(x)
x = Dense(64, activation='relu')(x)
x = Dropout(0.5)(x)
# single sigmoid unit: output is the predicted probability of is_listened
x = Dense(1, activation = "sigmoid")(x)

# Then we specify the model that we want to use
model = Model([user_in, song_in], x)
model.compile(optimizer="Adagrad", loss="binary_crossentropy", metrics = ['accuracy'])

### 2.2. FIRST STAGE

In [9]:
# stage 1: train on the training part and monitor the held-out validation rows
# (each batch holds 1/100 of the training rows)
model.fit(
    x=[tr["userIdx"], tr["songIdx"]],
    y=tr["is_listened"],
    validation_data=([ts["userIdx"], ts["songIdx"]], ts["is_listened"]),
    batch_size=len(tr) // 100,
    epochs=20,
)

Train on 2281332 samples, validate on 38279 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x123f7ad68>

In [10]:
# score the validation rows and store them together with their original row index
pred = pd.DataFrame({"row_index": ts["row_index"]})
pred["is_listened"] = model.predict([ts["userIdx"], ts["songIdx"]])
pred.to_csv(path + "pred_valid/keras_song_adagrad_20_100.csv", index=False)
pred.head()

Unnamed: 0,row_index,is_listened
41876,41876,0.71786
41877,41877,0.767622
85967,85967,0.913623
85968,85968,0.330301
126942,126942,0.974924


### 2.3. SECOND STAGE

In [15]:
# stage 2: continue training the same model on the full known data (no validation split)
# (each batch holds 1/100 of the known rows)
model.fit(
    x=[known["userIdx"], known["songIdx"]],
    y=known["is_listened"],
    batch_size=len(known) // 100,
    epochs=20,
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x10579f7f0>

In [16]:
# score the test rows and store them together with their sample_id
pred = pd.DataFrame({"sample_id": unknown["sample_id"]})
pred["is_listened"] = model.predict([unknown["userIdx"], unknown["songIdx"]])
pred.to_csv(path + "pred_unknown/keras_song_adagrad_20_100.csv", index=False)
pred.head()

Unnamed: 0,sample_id,is_listened
0,0,0.992023
1,1,0.527533
2,2,0.845849
3,3,0.52561
4,4,0.989955
