In [1]:
# libraries
#!pip install rpy2
#!pip install pandas
#!pip install keras
#!pip install imblearn

In [2]:
# libraries
#import rpy2.robjects as robjects
import pandas as pd
import keras
from keras.layers import Input, Dense, Embedding, concatenate, Flatten, Dropout, Reshape, dot, add
from keras.layers.normalization import BatchNormalization
from keras.models import Model
from keras.regularizers import l2
from keras.optimizers import Adam
from sklearn import metrics

Using Theano backend.


## 1. DATA PREPARATION

In [3]:
# load the data
path = "/Users/hauptjoh/Dropbox/DSG17/DSG_2017/"
data = pd.read_csv(path + "data/data_full.csv")
data.shape

(7578752, 57)

In [4]:
# add observation index
data["row_index"] = data.index

In [5]:
# data partitioning
tr = data.query("dataset == 'train'")
ts = data.query("dataset == 'test'")
kn = data.query("dataset != 'unknown'")
un = data.query("dataset == 'unknown'")

# print data sizes
print("train: "   + str(tr.shape))
print("test: "    + str(ts.shape))
print("known: "   + str(kn.shape))
print("unknown: " + str(un.shape))

train: (7389336, 58)
test: (169498, 58)
known: (7558834, 58)
unknown: (19918, 58)


In [6]:
# List numeric features used as predictors
print(data.columns)
numVars = ["user_ratio_flow", "user_ratio_full", "listen_type", "first_flow",
           "song_plays", "artist_plays", "platform_name1", "platform_name2",
           "song_skips", "artist_skips", "song_session_position", "time_diff"] 

# Create the data input matrix that can be passed to the keras model
tr_data = tr[[column for column in tr.columns if column in numVars]].as_matrix()
ts_data = ts[[column for column in ts.columns if column in numVars]].as_matrix()
kn_data = kn[[column for column in kn.columns if column in numVars]].as_matrix()
un_data = un[[column for column in un.columns if column in numVars]].as_matrix()

Index(['user_id', 'media_id', 'artist_id', 'genre_id', 'album_id',
       'context_type', 'media_duration', 'listen_type', 'user_gender',
       'user_age', 'is_listened', 'sample_id', 'dataset', 'time_lag',
       'session_id', 'song_session_position', 'first_flow',
       'time_diff_release_listen', 'release_year', 'is_listened_lag1',
       'is_listened_lag2', 'user_skip_ratio_last5', 'genre_equal_last_song',
       'artist_equal_last_song', 'album_equal_last_song', 'genre_plays',
       'genre_skips', 'artist_plays', 'artist_skips', 'album_plays',
       'album_skips', 'song_plays', 'song_skips', 'user_ratio_flow',
       'user_ratio_othr', 'genre_ratio', 'artist_ratio', 'song_ratio',
       'user_genre_ratio', 'user_artist_ratio', 'user_song_ratio',
       'platform_name', 'platform_family', 'hour_of_day1', 'hour_of_day2',
       'hour_of_day3', 'hour_of_day4', 'hour_of_day5', 'hour_of_day6',
       'hour_of_day7', 'hour_of_day8', 'weekdayMon', 'weekdaySat',
       'weekdaySun', '

## 2. MODELING

### 2.1. INITIALIZING

In [7]:
# Create an input layer with embeddings
user_in    = Input(shape = (1,), dtype = 'int64',   name = "user_in")
song_in    = Input(shape = (1,), dtype = 'int64',   name = "song_in")
artist_in  = Input(shape = (1,), dtype = 'int64',   name = "artist_in")
context_in = Input(shape = (1,), dtype = 'int64',   name = "context_in")

# Create an input layer with numeric features
data_in = Input(shape = (tr_data.shape[1],), name = "data_in")

# Counting number of unique ID values
n_users   = tr.user_id.nunique()
n_songs   = tr.media_id.nunique()
n_artists = tr.artist_id.nunique()
n_context = tr.context_type.nunique()

# Create an embedding assigning k latent factors to each ID
u = Embedding(n_users,   50, input_length = 1, embeddings_regularizer = l2(1e-5))(user_in)
s = Embedding(n_songs,   50, input_length = 1, embeddings_regularizer = l2(1e-5))(song_in)
a = Embedding(n_artists, 50, input_length = 1, embeddings_regularizer = l2(1e-5))(artist_in)
c = Embedding(n_context, 50, input_length = 1, embeddings_regularizer = l2(1e-5))(context_in)

# Also create 'biases', i.e. a user and song specific value that is added
ub = Flatten()(Embedding(n_users, 1, input_length = 1)(user_in))
sb = Flatten()(Embedding(n_songs, 1, input_length = 1)(song_in))

# Layer with embeddings
x = dot([u, s], axes = 2)
x = Flatten()(x)
x = add([x, ub])
x = add([x, sb])
output = Dense(1, activation = "sigmoid")(x)

# Specify the model that we want to use
model = Model([user_in, song_in], output)
model.compile(optimizer = "Adagrad", loss = "binary_crossentropy", metrics = ['accuracy'])

### 2.2. FIRST STAGE

In [8]:
# run the estimations on training data
model.fit([tr.user_id, tr.media_id], tr.is_listened, 
validation_data = ([ts.user_id, ts.media_id], ts.is_listened),
 batch_size = int(len(tr)/100), epochs = 30) # 

Train on 7389336 samples, validate on 169498 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.callbacks.History at 0x10fc2eb00>

In [9]:
# predict validation data
pred = pd.DataFrame()
pred["row_index"] = ts.row_index
pred["is_listened"] = model.predict([ts.user_id, ts.context_type])
pred.to_csv(path + "pred_valid/keras_newdata_full_simpleDotRecommender50_adagrad_30ep.csv", index = False)
pred.head(5)

Unnamed: 0,row_index,is_listened
102,102,0.92996
108,108,0.923464
207,207,0.966134
415,415,0.923464
513,513,0.923464


In [10]:
# computing AUC
metrics.roc_auc_score(ts.is_listened, pred.is_listened)

0.66190419896941077

In [11]:
model.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
user_in (InputLayer)             (None, 1)             0                                            
____________________________________________________________________________________________________
song_in (InputLayer)             (None, 1)             0                                            
____________________________________________________________________________________________________
embedding_1 (Embedding)          (None, 1, 50)         681500                                       
____________________________________________________________________________________________________
embedding_2 (Embedding)          (None, 1, 50)         3770900                                      
___________________________________________________________________________________________

In [12]:
user_bias = pd.DataFrame(data = model.layers[5].get_weights()[0], columns = ["user_bias"])
song_bias = pd.DataFrame(data = model.layers[8].get_weights()[0], columns = ["song_bias"])
user_embeddings = pd.DataFrame(data = model.layers[2].get_weights()[0])
song_embeddings = pd.DataFrame(data = model.layers[3].get_weights()[0])

In [13]:
user_bias.to_csv(path + "data/user_bias_recommender0519.csv", index = True)
song_bias.to_csv(path + "data/song_bias_recommender0519.csv", index = True)
user_embeddings.to_csv(path + "data/user_embeddings_recommender0519.csv", index = True)
song_embeddings.to_csv(path + "data/song_embeddings_recommender0519.csv", index = True)

### 2.3. SECOND STAGE

In [14]:
# run the estimations on full known data
model.fit([kn.user_id, kn.media_id], kn.is_listened,
batch_size = int(kn.shape[0]/100), epochs = 10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x1b0e3f710>

In [15]:
# predict unknown data
pred = pd.DataFrame()
pred["sample_id"] = un.sample_id.astype(int)
pred["is_listened"] = model.predict([un.user_id, un.context_type])
pred = pred.sort_values("sample_id")
pred.to_csv(path + "pred_unknown/keras_newdata_full_simpleDotRecommender50_adagrad_30ep.csv", index = False)
pred.head(5)

Unnamed: 0,sample_id,is_listened
7551764,0,0.889594
6913498,1,0.838145
6529338,2,0.876226
5409308,3,0.730301
6218645,4,0.905385
