# 1. LOADING

In [1]:
# import packages
import os

import pandas as pd
import graphlab as gl
from sklearn import metrics
from scipy import stats

In [2]:
# load the data
# The project root defaults to the original author's location but can be
# overridden through the DSG_2017_PATH environment variable, so the notebook
# is not tied to a single machine. os.path.join(..., "") guarantees a trailing
# separator, which the `path + "..."` concatenations in this notebook rely on.
path = os.path.join(
    os.environ.get("DSG_2017_PATH", "/Users/Kozodoi/Documents/Competitions/DSG_2017/"),
    "")
data = pd.read_csv(path + "data/data_full.csv")
data.shape

(7578752, 64)

In [3]:
# print variable names (the column labels of the loaded DataFrame)
data.columns

Index([u'user_id', u'context_type', u'media_id', u'artist_id', u'genre_id',
       u'album_id', u'ts_listen', u'release_date', u'platform_name',
       u'platform_family', u'media_duration', u'listen_type', u'user_gender',
       u'user_age', u'is_listened', u'sample_id', u'dataset', u'row_index',
       u'song_rank', u'song_bpm', u'song_position', u'song_lyrics_explicit',
       u'song_gain', u'album_fans', u'favorite_artist', u'favorite_album',
       u'radio_selecter', u'time_lag', u'session_id', u'time_lag_lag1',
       u'time_lag_lag2', u'song_session_position', u'first_flow',
       u'time_diff_release_listen', u'hour_of_day', u'weekday',
       u'release_year', u'is_listened_lag1', u'is_listened_lag2',
       u'user_skip_ratio_last3', u'user_skip_ratio_last5',
       u'user_skip_ratio_last10', u'context_type_same_as_lag',
       u'genre_equal_last_song', u'artist_equal_last_song',
       u'album_equal_last_song', u'genre_plays', u'genre_skips',
       u'artist_plays', u'artist_s

# 2. DATA PREPARATION

In [4]:
# converting factors to strings
# every identifier / categorical column is cast to str, one column at a time
factor_columns = [
    "user_id", "media_id", "album_id", "artist_id", "genre_id",
    "context_type", "listen_type", "first_flow",
    "platform_name", "platform_family",
    "user_gender", "release_year", "hour_of_day", "weekday",
    "favorite_artist", "favorite_album", "song_lyrics_explicit",
]
for column in factor_columns:
    data[column] = data[column].astype(str)

In [5]:
# data partitioning: rows are split by the value of the "dataset" column
tr = data[data["dataset"] == "train"]
ts = data[data["dataset"] == "test"]
un = data[data["dataset"] == "unknown"]
kn = data[data["dataset"] != "unknown"]   # "known" = every row that is not "unknown"

# print data sizes
for label, part in [("train: ", tr), ("test: ", ts), ("known: ", kn), ("unknown: ", un)]:
    print(label + str(part.shape))

train: (7519013, 64)
test: (39821, 64)
known: (7558834, 64)
unknown: (19918, 64)


In [6]:
# data sets with user-media interactions
# each partition is reduced to the same set of columns and converted to an SFrame
listen_columns = ["media_id", "user_id", "is_listened", "listen_type",
                  "context_type", "platform_name", "favorite_artist", "favorite_album"]

tr_listen = gl.SFrame(tr[listen_columns])
ts_listen = gl.SFrame(ts[listen_columns])
kn_listen = gl.SFrame(kn[listen_columns])
un_listen = gl.SFrame(un[listen_columns])

This non-commercial license of GraphLab Create for academic use is assigned to kozodoin@hu-berlin.de and will expire on May 12, 2018.


[INFO] graphlab.cython.cy_server: GraphLab Create v2.1 started. Logging: /tmp/graphlab_server_1495317684.log


In [7]:
# additional user-level and media-level data
# groupby(...).head(1) keeps the first row encountered for each user / each song
user_data = data.groupby(["user_id"]).head(1)
song_data = data.groupby(["media_id"]).head(1)

# clearing the memory
# `del` removes the names outright instead of rebinding them to the string
# "Null": the large frames are released just the same, and an accidental later
# use fails with a clear NameError rather than silently operating on a str.
del data, tr, kn

# converting data sets
# only the side-feature columns are kept before conversion to SFrames
user_data = gl.SFrame(user_data[["user_id", "user_gender", "user_age", 
                                 "user_ratio_flow", "user_ratio_full"]])
song_data = gl.SFrame(song_data[["media_id", "genre_id", 
                                 "release_year", "media_duration", 
                                 "song_bpm", "song_rank", "album_fans"]])

# 3. MODELING

In [None]:
# model parameters (passed to both factorization models trained below):
#   n_factors -> num_factors, m_epochs -> max_iterations, regular -> regularization
n_factors, m_epochs, regular = 50, 300, 1e-5

## 3.1. TRAINING-VALIDATION

In [None]:
# training the model
# keyword options are collected first, then the recommender is fitted on the
# training interactions
fm_settings = {
    "user_id": "user_id",
    "item_id": "media_id",
    "target": "is_listened",
    "binary_target": True,
    "num_factors": n_factors,
    "max_iterations": m_epochs,
    "regularization": regular,
    "user_data": user_data,
    "item_data": song_data,
}
model = gl.recommender.factorization_recommender.create(tr_listen, **fm_settings)

In [None]:
# forecasting
# start from the row_index column of the test rows (keeping the index of `ts`),
# then attach the model scores for the same rows
pred = ts[["row_index"]].copy()
pred["is_listened"] = model.predict(ts_listen)
pred.head(3)

Unnamed: 0,row_index,is_listened
5988,6060544,0.992
6052,3968455,0.979357
6295,5584470,0.988625


In [None]:
# computing AUC
# observed labels of the test rows against the predicted scores
metrics.roc_auc_score(y_true = ts["is_listened"], y_score = pred["is_listened"])

0.7168625357179379

In [None]:
# saving prediction vector
# the "<factors>f_<iterations>i" part of the file name is derived from the model
# parameters instead of being typed by hand, so the name always matches the
# model that produced the predictions (50f_300i for the current settings)
pred.to_csv(path + "pred_valid/factorization_full_api_{}f_{}i.csv".format(n_factors, m_epochs),
            index = False)

In [None]:
# clearing the memory
# `del` drops the names instead of rebinding them to the string "Null": the
# objects are released just the same, and an accidental later use fails with
# a clear NameError rather than silently operating on a str.
del tr_listen, ts_listen, ts

## 3.2. KNOWN-UNKNOWN

In [None]:
# training the model
# keyword options are collected first, then the recommender is fitted on the
# known (train + test) interactions
fm_settings = {
    "user_id": "user_id",
    "item_id": "media_id",
    "target": "is_listened",
    "binary_target": True,
    "num_factors": n_factors,
    "max_iterations": m_epochs,
    "regularization": regular,
    "user_data": user_data,
    "item_data": song_data,
}
model = gl.recommender.factorization_recommender.create(kn_listen, **fm_settings)

In [None]:
# forecasting
# integer sample ids of the unknown rows, the model scores for the same rows,
# then ordered by sample id
pred = un[["sample_id"]].astype(int)
pred["is_listened"] = model.predict(un_listen)
pred = pred.sort_values(by = "sample_id")
pred.head(3)

Unnamed: 0,sample_id,is_listened
7551764,0,0.985816
6913498,1,0.750996
6529338,2,0.800969


In [None]:
# saving prediction vector
# the "<factors>f_<iterations>i" part of the file name is derived from the model
# parameters instead of being typed by hand, so the name always matches the
# model that produced the predictions (50f_300i for the current settings)
pred.to_csv(path + "pred_unknown/factorization_full_api_{}f_{}i.csv".format(n_factors, m_epochs),
            index = False)