# LOADING

In [102]:
# import packages
import os

import graphlab as gl
import pandas as pd
from sklearn import metrics

In [49]:
# load the data
# The project directory can be overridden through the DSG_2017_DIR environment
# variable; when it is not set, the original hard-coded location is used.
# Joining with "" guarantees a trailing separator, so `path + "..."` keeps working.
path = os.path.join(
    os.environ.get("DSG_2017_DIR", "/Users/Kozodoi/Documents/Competitions/DSG_2017/"),
    "",
)
data = pd.read_csv(os.path.join(path, "data", "data_flow.csv"))
# displayed as the cell output: (rows, columns) of the loaded frame
data.shape

(2339529, 59)

# DATA PREPARATION

In [141]:
# per-user and per-media aggregates
# Each frame holds the column means of `data` within one group; the grouping
# key lives in the index after the aggregation, so it is copied back into a
# regular column of the same name.
user_data = (
    data.groupby("user_id")
        .mean()
        .assign(user_id = lambda agg: agg.index)
)
media_data = (
    data.groupby("media_id")
        .mean()
        .assign(media_id = lambda agg: agg.index)
)

In [142]:
# data partitioning
# Rows are assigned to a partition by the value of the "dataset" column.
is_train = data["dataset"] == "train"
is_test  = data["dataset"] == "test"
tr = data[is_train]
ts = data[is_test]

# Skipped songs (is_listened != 1) are kept in the training partition;
# filtering tr on is_listened == 1 would drop them.

# print data sizes
for label, part in (("train: ", tr), ("test: ", ts)):
    print(label + str(part.shape))

train: (2279790, 59)
test: (39821, 59)


In [143]:
# interaction tables handed to the recommender
# NOTE(review): the column that accompanies user_id here is context_type, not
# media_id - confirm this is the intended "item" for the model.
listen_cols = ["context_type", "user_id", "is_listened"]
tr_listen = gl.SFrame(tr[listen_cols])
ts_listen = gl.SFrame(ts[listen_cols])

# side tables with user-level and media-level attributes
user_cols  = ["user_id", "user_gender", "user_age"]
media_cols = ["media_id", "media_duration", "release_year"]
user_data  = gl.SFrame(user_data[user_cols])
media_data = gl.SFrame(media_data[media_cols])

# MODELING

In [163]:
# training the model
# All settings of the factorization recommender are gathered in one mapping and
# unpacked into the call. Side features are currently not passed; adding
# user_data = user_data and item_data = media_data here would enable them.
fm_params = dict(
    user_id = "user_id",
    item_id = "context_type",
    target = "is_listened",
    binary_target = True,
    num_factors = 50,
    max_iterations = 50,
    regularization = 1e-5,
)
model = gl.recommender.factorization_recommender.create(tr_listen, **fm_params)

# FORECASTING

In [164]:
# forecasting
# Score the test interactions (the context_type, user_id and is_listened columns
# held in ts_listen) with the fitted model and keep the result for evaluation.
ts_prediction = model.predict(ts_listen)

In [165]:
# computing AUC
# Area under the ROC curve of the model scores against the observed
# is_listened labels of the test partition; shown as the cell output.
metrics.roc_auc_score(y_true = ts["is_listened"], y_score = ts_prediction)

0.72277040611216414