In [33]:
# Third-party libraries: optional installation commands.
# Every line below is commented out, so running this cell does nothing.
# Uncomment a line to install the corresponding package via pip.
#!pip install rpy2
#!pip install pandas
#!pip install keras
#!pip install imblearn

Collecting imblearn
  Downloading imblearn-0.0-py2.py3-none-any.whl
Collecting imbalanced-learn (from imblearn)
  Downloading imbalanced-learn-0.2.1.tar.gz (90kB)
[K    100% |████████████████████████████████| 92kB 660kB/s 
Building wheels for collected packages: imbalanced-learn
  Running setup.py bdist_wheel for imbalanced-learn ... [?25l- \ | / done
[?25h  Stored in directory: /Users/Kozodoi/Library/Caches/pip/wheels/b8/20/bd/0b775f7e5d413ac72562b1a5126598bcb6e0eae10da659be9f
Successfully built imbalanced-learn
Installing collected packages: imbalanced-learn, imblearn
Successfully installed imbalanced-learn-0.2.1 imblearn-0.0


In [1]:
# libraries
import rpy2.robjects as robjects
import pandas as pd
import keras
from keras.layers import Input, Dense, Embedding, concatenate, Flatten, Dropout, Reshape
from keras.layers.normalization import BatchNormalization
from keras.models import Model
from keras.regularizers import l2
from keras.optimizers import Adam
from sklearn import metrics

Using TensorFlow backend.


## 1. DATA PREPARATION

In [2]:
# Read the full data set from the project directory and show its dimensions.
# NOTE(review): `path` is an absolute, machine-specific location; it is also
# reused by later cells as the root directory for the prediction CSV files.
path = "/Users/Kozodoi/Documents/Competitions/DSG_2017/"
data_file = path + "data/data_flow.csv"
data = pd.read_csv(data_file)
data.shape

(2339529, 59)

In [3]:
# add observation index
# Copy the frame's index into a regular column so that every subset derived
# from `data` keeps a reference to the original row; the validation
# predictions are later written out keyed by this column.
data["row_index"] = data.index

In [4]:
# Split the observations by the value of the `dataset` column.
# `kn` ("known") is every row that is not labelled 'unknown'.
dataset_label = data["dataset"]
tr = data[dataset_label == "train"]
ts = data[dataset_label == "test"]
kn = data[dataset_label != "unknown"]
un = data[dataset_label == "unknown"]

# Report the size of each partition.
for prefix, part in (("train: ", tr), ("test: ", ts), ("known: ", kn), ("unknown: ", un)):
    print(prefix + str(part.shape))

train: (2279790, 60)
test: (39821, 60)
known: (2319611, 60)
unknown: (19918, 60)


In [5]:
# List numeric features used as predictors
print(data.columns)
numVars = ["user_ratio_flow", "user_ratio_full", "listen_type", "first_flow",
           "song_plays", "artist_plays", "platform_name1", "platform_name2",
           "song_skips", "artist_skips", "song_session_position", "time_diff"]


def _numeric_matrix(frame):
    """Return the `numVars` columns of `frame` as a 2-D NumPy array.

    Columns are kept in the order in which they appear in `frame`; names
    listed in `numVars` that are not present in `frame` are silently skipped.
    `.values` is used instead of `.as_matrix()`, which is deprecated and has
    been removed from recent pandas releases; both return the same array.
    """
    return frame.loc[:, frame.columns.isin(numVars)].values


# Create the data input matrices that can be passed to the keras model
tr_data = _numeric_matrix(tr)
ts_data = _numeric_matrix(ts)
kn_data = _numeric_matrix(kn)
un_data = _numeric_matrix(un)

Index(['user_id', 'context_type', 'media_id', 'artist_id', 'genre_id',
       'media_duration', 'listen_type', 'user_gender', 'user_age',
       'is_listened', 'sample_id', 'dataset', 'session_id',
       'song_session_position', 'first_flow', 'time_diff_release_listen',
       'release_year', 'is_listened_lag1', 'is_listened_lag2',
       'user_skip_ratio_last3', 'user_skip_ratio_last5',
       'user_skip_ratio_last10', 'genre_equal_last_song',
       'artist_equal_last_song', 'album_equal_last_song', 'genre_plays',
       'genre_skips', 'artist_plays', 'artist_skips', 'album_plays',
       'album_skips', 'song_plays', 'song_skips', 'user_ratio_flow',
       'user_ratio_full', 'genre_ratio', 'artist_ratio', 'song_ratio',
       'context_ratio', 'user_genre_ratio', 'user_artist_ratio',
       'user_song_ratio', 'user_context_ratio', 'platform_name',
       'platform_family', 'hour_of_day1', 'hour_of_day2', 'hour_of_day3',
       'hour_of_day4', 'hour_of_day5', 'hour_of_day6', 'hour_of_

## 2. MODELING

### 2.1. INITIALIZING

In [None]:
# Create one integer input per ID feature; each feeds an embedding layer
user_in    = Input(shape = (1,), dtype = 'int64',   name = "user_in")
song_in    = Input(shape = (1,), dtype = 'int64',   name = "song_in")
artist_in  = Input(shape = (1,), dtype = 'int64',   name = "artist_in")
context_in = Input(shape = (1,), dtype = 'int64',   name = "context_in")

# Create an input layer with numeric features (one unit per column of tr_data)
data_in = Input(shape = (tr_data.shape[1],), name = "data_in")

# Counting number of unique ID values
# NOTE(review): the embedding sizes are taken from the training rows only.
# This is valid only if the IDs are encoded as 0..n-1 and every ID in the
# test/unknown rows also occurs in train - TODO confirm against the data.
n_users   = tr.user_id.nunique()
n_songs   = tr.media_id.nunique()
n_artists = tr.artist_id.nunique()
n_context = tr.context_type.nunique()

# Create an embedding assigning k = 50 latent factors to each ID
u = Embedding(n_users,   50, input_length = 1, embeddings_regularizer = l2(1e-5))(user_in)
s = Embedding(n_songs,   50, input_length = 1, embeddings_regularizer = l2(1e-5))(song_in)
a = Embedding(n_artists, 50, input_length = 1, embeddings_regularizer = l2(1e-5))(artist_in)
c = Embedding(n_context, 50, input_length = 1, embeddings_regularizer = l2(1e-5))(context_in)


# Layer with embeddings
# Only the user, artist and context embeddings are combined; the song
# embedding `s` is created above but is not connected to the model.
embedding_input = concatenate([u, a, c])
embedding_input = Flatten()(embedding_input)
embedding_dense = Dense(128, activation = "relu")(embedding_input)
embedding_dense = BatchNormalization()(embedding_dense)

# Layer with numeric features
data_dense = Dense(16, activation = "relu")(data_in)
data_dense = BatchNormalization()(data_dense)

# Constructing the further layers: merge both branches and map them to a
# single sigmoid unit
x = concatenate([embedding_dense, data_dense])
x = Dropout(0.5)(x)
x = BatchNormalization()(x)
x = Dense(64, activation='relu')(x) 
x = Dropout(0.5)(x)
output = Dense(1, activation = "sigmoid")(x)

# Specify the model that we want to use
# `data_in` must be listed as a model input: `output` depends on it through
# `data_dense`, and the fit/predict calls pass four input arrays in this order.
model = Model([user_in, artist_in, context_in, data_in], output)
model.compile(optimizer = "Nadam", loss = "binary_crossentropy", metrics = ['accuracy'])

### 2.2. FIRST STAGE

In [None]:
# run the estimations on training data, validating on the test partition
# The numeric features are passed as the matrices `tr_data` / `ts_data`;
# `data_in` is the name of the Keras input layer, not a column of `tr` / `ts`.
# The batch size is 1/100 of the training rows, i.e. about 100 updates per epoch.
model.fit(
    [tr.user_id, tr.artist_id, tr.context_type, tr_data],
    tr.is_listened,
    validation_data = ([ts.user_id, ts.artist_id, ts.context_type, ts_data], ts.is_listened),
    batch_size = int(len(tr)/100),
    epochs = 10,
)

Train on 2279790 samples, validate on 39821 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10

In [None]:
# predict validation data
# The numeric features come from the matrix `ts_data` (there is no `data_in`
# column on `ts`). The model has a single output unit, so the predictions are
# flattened to one value per row before being stored as a column.
valid_scores = model.predict([ts.user_id, ts.artist_id, ts.context_type, ts_data])
pred = pd.DataFrame()
pred["row_index"] = ts.row_index
pred["is_listened"] = valid_scores.ravel()
pred.to_csv(path + "pred_valid/keras_newdata_flow_artist_context_50_nadam_10ep.csv", index = False)
pred.head(5)

In [None]:
# Area under the ROC curve: observed labels of the test partition against
# the predicted values stored in `pred`.
y_true = ts.is_listened
y_score = pred.is_listened
metrics.roc_auc_score(y_true, y_score)

### 2.3. SECOND STAGE

In [None]:
# run the estimations on full known data (all rows not labelled 'unknown')
# This continues training the same `model` object. The numeric features are
# passed as the matrix `kn_data`; `data_in` is not a column of `kn`.
model.fit(
    [kn.user_id, kn.artist_id, kn.context_type, kn_data],
    kn.is_listened,
    batch_size = int(kn.shape[0]/100),
    epochs = 10,
)

Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x120daa668>

In [None]:
# predict unknown data
# The numeric features come from the matrix `un_data` (there is no `data_in`
# column on `un`). The model has a single output unit, so the predictions are
# flattened to one value per row before being stored as a column.
unknown_scores = model.predict([un.user_id, un.artist_id, un.context_type, un_data])
pred = pd.DataFrame()
pred["sample_id"] = un.sample_id.astype(int)
pred["is_listened"] = unknown_scores.ravel()
# Order the rows by sample_id before writing the file
pred = pred.sort_values("sample_id")
pred.to_csv(path + "pred_unknown/keras_newdata_flow_artist_context_50_nadam_10ep.csv", index = False)
pred.head(5)

Unnamed: 0,sample_id,is_listened
2332218,0,0.983333
2182792,1,0.488397
2092769,2,0.747803
1806368,3,0.454087
2018166,4,0.872301
