In [1]:
# libraries
#!pip install rpy2
#!pip install pandas
#!pip install keras
#!pip install imblearn

In [16]:
# libraries
#import rpy2.robjects as robjects
import pandas as pd
import keras
from keras.layers import Input, Dense, Embedding, concatenate, Flatten, Dropout, Reshape, dot, add
from keras.layers.normalization import BatchNormalization
from keras.models import Model
from keras.regularizers import l2
from keras.optimizers import Adam
from sklearn import metrics

## 1. DATA PREPARATION

In [3]:
# load the data
path = "/Users/maj/Dropbox/DSG17/DSG_2017/"
data = pd.read_csv(path + "data/data_flow.csv")
data.shape

(2339529, 70)

In [4]:
# add observation index
data["row_index"] = data.index

In [5]:
# data partitioning
tr = data.query("dataset == 'train'")
ts = data.query("dataset == 'test'")
kn = data.query("dataset != 'unknown'")
un = data.query("dataset == 'unknown'")

# print data sizes
print("train: "   + str(tr.shape))
print("test: "    + str(ts.shape))
print("known: "   + str(kn.shape))
print("unknown: " + str(un.shape))

train: (2279790, 70)
test: (39821, 70)
known: (2319611, 70)
unknown: (19918, 70)


In [6]:
# List numeric features used as predictors
print(data.columns)
numVars = ["user_ratio_flow", "user_ratio_full", "listen_type", "first_flow",
           "song_plays", "artist_plays", "platform_name1", "platform_name2",
           "song_skips", "artist_skips", "song_session_position", "time_diff"] 

# Create the data input matrix that can be passed to the keras model
tr_data = tr[[column for column in tr.columns if column in numVars]].as_matrix()
ts_data = ts[[column for column in ts.columns if column in numVars]].as_matrix()
kn_data = kn[[column for column in kn.columns if column in numVars]].as_matrix()
un_data = un[[column for column in un.columns if column in numVars]].as_matrix()

Index(['user_id', 'context_type', 'media_id', 'artist_id', 'genre_id',
       'album_id', 'media_duration', 'listen_type', 'user_gender', 'user_age',
       'is_listened', 'sample_id', 'dataset', 'row_index', 'song_rank',
       'song_bpm', 'song_position', 'song_lyrics_explicit', 'song_gain',
       'album_fans', 'session_id', 'time_lag_lag1', 'time_lag_lag2',
       'song_session_position', 'first_flow', 'time_diff_release_listen',
       'release_year', 'is_listened_lag1', 'is_listened_lag2',
       'user_skip_ratio_last3', 'user_skip_ratio_last5',
       'user_skip_ratio_last10', 'context_type_same_as_lag',
       'genre_equal_last_song', 'artist_equal_last_song',
       'album_equal_last_song', 'genre_plays', 'genre_skips', 'artist_plays',
       'artist_skips', 'album_plays', 'album_skips', 'song_plays',
       'song_skips', 'user_ratio_flow', 'user_ratio_full', 'genre_ratio',
       'artist_ratio', 'song_ratio', 'context_ratio', 'user_genre_ratio',
       'user_artist_ratio', 'u

## 2. MODELING

### 2.1. INITIALIZING

In [17]:
# Create an input layer with embeddings
user_in    = Input(shape = (1,), dtype = 'int64',   name = "user_in")
song_in    = Input(shape = (1,), dtype = 'int64',   name = "song_in")
artist_in  = Input(shape = (1,), dtype = 'int64',   name = "artist_in")
context_in = Input(shape = (1,), dtype = 'int64',   name = "context_in")

# Create an input layer with numeric features
data_in = Input(shape = (tr_data.shape[1],), name = "data_in")

# Counting number of unique ID values
n_users   = tr.user_id.nunique()
n_songs   = tr.media_id.nunique()
n_artists = tr.artist_id.nunique()
n_context = tr.context_type.nunique()

# Create an embedding assigning k latent factors to each ID
u = Embedding(n_users,   150, input_length = 1, embeddings_regularizer = l2(1e-5))(user_in)
s = Embedding(n_songs,   150, input_length = 1, embeddings_regularizer = l2(1e-5))(song_in)
a = Embedding(n_artists, 150, input_length = 1, embeddings_regularizer = l2(1e-5))(artist_in)
c = Embedding(n_context, 150, input_length = 1, embeddings_regularizer = l2(1e-5))(context_in)

# Also create 'biases', i.e. a user and song specific value that is added
ub = Flatten()(Embedding(n_users, 1, input_length = 1)(user_in))
sb = Flatten()(Embedding(n_songs, 1, input_length = 1)(song_in))

# Layer with embeddings
x = dot([u, s], axes = 2)
x = Flatten()(x)
x = add([x, ub])
x = add([x, sb])
output = Dense(1, activation = "sigmoid")(x)

# Specify the model that we want to use
model = Model([user_in, song_in], output)
model.compile(optimizer = "Adagrad", loss = "binary_crossentropy", metrics = ['accuracy'])

### 2.2. FIRST STAGE

In [25]:
# run the estimations on training data
model.fit([tr.user_id, tr.media_id], tr.is_listened, 
validation_data = ([ts.user_id, ts.media_id], ts.is_listened),
 batch_size = int(len(tr)/100), epochs = 3) # 

Train on 2279790 samples, validate on 39821 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x111e4f898>

In [26]:
# predict validation data
pred = pd.DataFrame()
pred["row_index"] = ts.row_index
pred["is_listened"] = model.predict([ts.user_id, ts.context_type])
pred.to_csv(path + "pred_valid/keras_newdata_flow_simpleDotRecommender150_adagrad_40ep.csv", index = False)
pred.head(5)

Unnamed: 0,row_index,is_listened
2166,2166,0.968515
2167,2167,0.903207
2239,2239,0.903207
4313,4313,0.901708
4320,4320,0.901708


In [27]:
# computing AUC
metrics.roc_auc_score(ts.is_listened, pred.is_listened)

0.70753776351214137

In [70]:
user_bias = pd.DataFrame(data = model.layers[5].get_weights()[0], columns = ["user_bias"])
song_bias = pd.DataFrame(data = model.layers[8].get_weights()[0], columns = ["song_bias"])

In [71]:
user_bias.to_csv(path + "data/user_bias_recommender0514.csv", index = True)
user_bias.to_csv(path + "data/song_bias_recommender0514.csv", index = True)

0            0
1            0
2            0
3            0
4            0
5            0
6            0
7            0
8            0
9            0
10           0
11           0
12           0
13           0
14           0
15           0
16           0
17           0
18           0
19           0
20           0
21           0
22           0
23           0
24           0
25           0
26           0
27           0
28           0
29           0
          ... 
2338997    815
2338998    815
2339001      7
2339006    815
2339007    815
2339010    815
2339011    815
2339016    815
2339017    815
2339025      7
2339033      7
2339039      7
2339043    815
2339044    815
2339049    815
2339050    815
2339082      7
2339085      7
2339093      7
2339102      7
2339116      7
2339125      7
2339142      7
2339145      7
2339164      7
2339171      7
2339176      7
2339180      7
2339186      7
2339214      7
Name: user_id, dtype: int64

### 2.3. SECOND STAGE

In [28]:
# run the estimations on full known data
model.fit([kn.user_id, kn.media_id], kn.is_listened,
batch_size = int(kn.shape[0]/100), epochs = 10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
 371136/2319611 [===>..........................] - ETA: 21s - loss: 0.4747 - acc: 0.7807

KeyboardInterrupt: 

In [None]:
# predict unknown data
pred = pd.DataFrame()
pred["sample_id"] = un.sample_id.astype(int)
pred["is_listened"] = model.predict([un.user_id, un.context_type])
pred = pred.sort_values("sample_id")
pred.to_csv(path + "pred_unknown/keras_newdata_flow_simpleDotRecommender150_adagrad_40ep.csv", index = False)
pred.head(5)