In [4]:
# libraries
#!pip install rpy2
#!pip install pandas
#!pip install keras
#!pip install imblearn

In [5]:
# libraries
#import rpy2.robjects as robjects
import pandas as pd
import keras
from keras.layers import Input, Dense, Embedding, concatenate, Flatten, Dropout, Reshape, dot, add
from keras.layers.normalization import BatchNormalization
from keras.models import Model
from keras.regularizers import l2
from keras.optimizers import Adam
from sklearn import metrics

## 1. DATA PREPARATION

In [6]:
# load the data
path = "/Users/maj/Dropbox/DSG17/DSG_2017/"
data = pd.read_csv(path + "data/data_full.csv")
data.shape

(7578752, 75)

In [7]:
# add observation index
data["row_index"] = data.index

In [8]:
# data partitioning
tr = data.query("dataset == 'train'")
ts = data.query("dataset == 'test'")
kn = data.query("dataset != 'unknown'")
un = data.query("dataset == 'unknown'")

# print data sizes
print("train: "   + str(tr.shape))
print("test: "    + str(ts.shape))
print("known: "   + str(kn.shape))
print("unknown: " + str(un.shape))

train: (7519013, 75)
test: (39821, 75)
known: (7558834, 75)
unknown: (19918, 75)


In [9]:
# List numeric features used as predictors
print(data.columns)
numVars = ["user_ratio_flow", "user_ratio_full", "listen_type", "first_flow",
           "song_plays", "artist_plays", "platform_name1", "platform_name2",
           "song_skips", "artist_skips", "song_session_position", "time_diff"] 

# Create the data input matrix that can be passed to the keras model
tr_data = tr[[column for column in tr.columns if column in numVars]].as_matrix()
ts_data = ts[[column for column in ts.columns if column in numVars]].as_matrix()
kn_data = kn[[column for column in kn.columns if column in numVars]].as_matrix()
un_data = un[[column for column in un.columns if column in numVars]].as_matrix()

Index(['media_id', 'user_id', 'context_type', 'artist_id', 'genre_id',
       'album_id', 'media_duration', 'listen_type', 'user_gender', 'user_age',
       'is_listened', 'sample_id', 'dataset', 'row_index', 'song_rank',
       'song_bpm', 'song_position', 'song_lyrics_explicit', 'song_gain',
       'album_fans', 'favorite_artist', 'favorite_album', 'radio_selecter',
       'session_id', 'time_lag_lag1', 'time_lag_lag2', 'song_session_position',
       'first_flow', 'time_diff_release_listen', 'release_year',
       'is_listened_lag1', 'is_listened_lag2', 'user_skip_ratio_last3',
       'user_skip_ratio_last5', 'user_skip_ratio_last10',
       'context_type_same_as_lag', 'genre_equal_last_song',
       'artist_equal_last_song', 'album_equal_last_song', 'genre_plays',
       'genre_skips', 'artist_plays', 'artist_skips', 'album_plays',
       'album_skips', 'song_plays', 'song_skips', 'user_ratio_flow',
       'user_ratio_full', 'genre_ratio', 'artist_ratio', 'song_ratio',
       'cont

## 2. MODELING

### 2.1. INITIALIZING

In [10]:
# Create an input layer with embeddings
user_in    = Input(shape = (1,), dtype = 'int64',   name = "user_in")
song_in    = Input(shape = (1,), dtype = 'int64',   name = "song_in")
artist_in  = Input(shape = (1,), dtype = 'int64',   name = "artist_in")
context_in = Input(shape = (1,), dtype = 'int64',   name = "context_in")

# Create an input layer with numeric features
data_in = Input(shape = (tr_data.shape[1],), name = "data_in")

# Counting number of unique ID values
n_users   = tr.user_id.nunique()
n_songs   = tr.media_id.nunique()
n_artists = tr.artist_id.nunique()
n_context = tr.context_type.nunique()

# Create an embedding assigning k latent factors to each ID
u = Embedding(n_users,   50, input_length = 1, embeddings_regularizer = l2(1e-5))(user_in)
s = Embedding(n_songs,   50, input_length = 1, embeddings_regularizer = l2(1e-5))(song_in)
a = Embedding(n_artists, 50, input_length = 1, embeddings_regularizer = l2(1e-5))(artist_in)
c = Embedding(n_context, 50, input_length = 1, embeddings_regularizer = l2(1e-5))(context_in)

# Also create 'biases', i.e. a user and song specific value that is added
ub = Flatten()(Embedding(n_users, 1, input_length = 1)(user_in))
sb = Flatten()(Embedding(n_songs, 1, input_length = 1)(song_in))

# Layer with embeddings
x = dot([u, s], axes = 2)
x = Flatten()(x)
x = add([x, ub])
x = add([x, sb])
output = Dense(1, activation = "sigmoid")(x)

# Specify the model that we want to use
model = Model([user_in, song_in], output)
model.compile(optimizer = "Adagrad", loss = "binary_crossentropy", metrics = ['accuracy'])

In [14]:
print(n_users)

19505


### 2.2. FIRST STAGE

In [23]:
# run the estimations on training data
model.fit([tr.user_id, tr.media_id], tr.is_listened, 
validation_data = ([ts.user_id, ts.media_id], ts.is_listened),
 batch_size = int(len(tr)/100), epochs = 5) # 

Train on 7519013 samples, validate on 39821 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x1cd3712e8>

In [16]:
# predict validation data
pred = pd.DataFrame()
pred["row_index"] = ts.row_index
pred["is_listened"] = model.predict([ts.user_id, ts.context_type])
pred.to_csv(path + "pred_valid/keras_newdata_full_simpleDotRecommender50_adagrad_25ep.csv", index = False)
pred.head(5)

Unnamed: 0,row_index,is_listened
241,241,0.921215
1559,1559,0.681611
1657,1657,0.819947
1665,1665,0.76109
1666,1666,0.76109


In [17]:
# computing AUC
metrics.roc_auc_score(ts.is_listened, pred.is_listened)

0.65473497808416758

In [18]:
model.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
user_in (InputLayer)             (None, 1)             0                                            
____________________________________________________________________________________________________
song_in (InputLayer)             (None, 1)             0                                            
____________________________________________________________________________________________________
embedding_1 (Embedding)          (None, 1, 50)         975250      user_in[0][0]                    
____________________________________________________________________________________________________
embedding_2 (Embedding)          (None, 1, 50)         12351900    song_in[0][0]                    
___________________________________________________________________________________________

In [24]:
user_bias = pd.DataFrame(data = model.layers[5].get_weights()[0], columns = ["user_bias"])
song_bias = pd.DataFrame(data = model.layers[8].get_weights()[0], columns = ["song_bias"])
user_embeddings = pd.DataFrame(data = model.layers[2].get_weights()[0])
song_embeddings = pd.DataFrame(data = model.layers[3].get_weights()[0])

In [25]:
user_bias.to_csv(path + "data/user_bias_recommender0519.csv", index = True)
song_bias.to_csv(path + "data/song_bias_recommender0519.csv", index = True)
user_embeddings.to_csv(path + "data/user_embeddings_recommender0519.csv", index = True)
song_embeddings.to_csv(path + "data/song_embeddings_recommender0519.csv", index = True)

### 2.3. SECOND STAGE

In [21]:
# run the estimations on full known data
model.fit([kn.user_id, kn.media_id], kn.is_listened,
batch_size = int(kn.shape[0]/100), epochs = 5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x1cd372ef0>

In [22]:
# predict unknown data
pred = pd.DataFrame()
pred["sample_id"] = un.sample_id.astype(int)
pred["is_listened"] = model.predict([un.user_id, un.context_type])
pred = pred.sort_values("sample_id")
pred.to_csv(path + "pred_unknown/keras_newdata_full_simpleDotRecommender50_adagrad_25ep.csv", index = False)
pred.head(5)

Unnamed: 0,sample_id,is_listened
7022297,0,0.861538
3513324,1,0.827026
3513295,2,0.876625
4452282,3,0.742794
6338648,4,0.88828
