# 1. LOADING

In [1]:
# import packages
import os

import pandas as pd
import graphlab as gl
from sklearn import metrics
from scipy import stats

In [2]:
# load the data
# The project directory can be overridden with the DSG_2017_DIR environment
# variable; when it is unset (or empty) the original location is used.
# os.path.join(..., "") guarantees the trailing separator that the later
# `path + "..."` concatenations in this notebook rely on.
path = os.path.join(os.environ.get("DSG_2017_DIR") or "/Users/Kozodoi/Documents/Competitions/DSG_2017/", "")
data = pd.read_csv(path + "data/data_full.csv")
data.shape

(7578752, 54)

In [3]:
# print variable names
#data.columns

# 2. DATA PREPARATION

In [4]:
# store every row's index label as an explicit column, so the label is still
# available as data after the frame is split into partitions
data["row_index"] = data.index.values

In [5]:
# cast the identifier and categorical columns to strings, one column at a time
factor_columns = [
    "user_id", "media_id", "album_id", "artist_id", "genre_id",
    "context_type", "platform_name", "user_gender", "release_year",
    "listen_type", "hour_of_day", "weekday", "first_flow",
]
for column in factor_columns:
    data[column] = data[column].astype(str)

In [6]:
# split the rows by the value of the "dataset" column
origin = data["dataset"]
tr = data[origin == "train"]
ts = data[origin == "test"]
kn = data[origin != "unknown"]    # everything that is not "unknown"
un = data[origin == "unknown"]

# optional filter (disabled): train on listened songs only
#tr = tr.query("is_listened == 1")

# report the shape of every partition
for label, part in [("train", tr), ("test", ts), ("known", kn), ("unknown", un)]:
    print(label + ": " + str(part.shape))

train: (7519013, 55)
test: (39821, 55)
known: (7558834, 55)
unknown: (19918, 55)


In [7]:
# columns that describe one user-media interaction: item id, user id, the
# target, and the context columns recorded with the interaction
listen_columns = ["media_id", "user_id", "is_listened",
                  "context_type", "platform_name", "listen_type", "first_flow", "hour_of_day", "weekday"]

# one SFrame per partition, all restricted to the same columns
tr_listen = gl.SFrame(tr[listen_columns])
ts_listen = gl.SFrame(ts[listen_columns])
kn_listen = gl.SFrame(kn[listen_columns])
un_listen = gl.SFrame(un[listen_columns])

This non-commercial license of GraphLab Create for academic use is assigned to kozodoin@hu-berlin.de and will expire on May 12, 2018.


[INFO] graphlab.cython.cy_server: GraphLab Create v2.1 started. Logging: /tmp/graphlab_server_1494931179.log


In [8]:
# additional user-level and media-level data:
# the first row of every user and of every song
user_data = data.groupby(["user_id"]).head(1)
song_data = data.groupby(["media_id"]).head(1)

# clearing the memory
# The large frames are unbound with `del` rather than rebound to the string
# "Null": the references are released either way, but an accidental later use
# now fails with a clear NameError instead of an error about a str object.
del data, tr, kn

# converting data sets: keep only the side-information columns
user_data = gl.SFrame(user_data[["user_id", "user_gender", "user_age", "user_ratio_flow", "user_ratio_full"]])
song_data = gl.SFrame(song_data[["media_id", "genre_id", "artist_id", "release_year", "media_duration"]])

# 3. MODELING

In [None]:
# model parameters, handed to the recommender as num_factors,
# max_iterations and regularization respectively
n_factors, m_epochs, regular = 50, 400, 1e-5

## 3.1. TRAINING-VALIDATION

In [None]:
# fit the factorization recommender on the training interactions,
# with the user-level and song-level frames passed as side data
fm = gl.recommender.factorization_recommender
model = fm.create(
    tr_listen,
    user_id="user_id",
    item_id="media_id",
    target="is_listened",
    binary_target=True,
    user_data=user_data,
    item_data=song_data,
    num_factors=n_factors,
    max_iterations=m_epochs,
    regularization=regular
)

In [None]:
# score the test partition and store the scores next to the row_index values
scores = model.predict(ts_listen)
pred = pd.DataFrame()
pred["row_index"] = ts["row_index"]
pred["is_listened"] = scores
pred.head(3)

In [None]:
# AUC of the predicted scores against the observed is_listened labels
y_true = ts["is_listened"]
y_score = pred["is_listened"]
metrics.roc_auc_score(y_true, y_score)

In [None]:
# saving prediction vector
# The "<factors>f_<iterations>i" part of the file name is built from the model
# parameters, so the name cannot drift from the settings that produced the
# predictions (for n_factors = 50, m_epochs = 400 it is still "50f_400i").
pred_file = "factorization_full_type_moredata_{0}f_{1}i.csv".format(n_factors, m_epochs)
pred.to_csv(path + "pred_valid/" + pred_file, index = False)

In [None]:
# clearing the memory
# The validation-stage objects are unbound with `del` rather than rebound to
# the string "Null": the references are released either way, but an accidental
# later use now fails with a clear NameError instead of an error about a str.
del tr_listen, ts_listen, ts

## 3.2. KNOWN-UNKNOWN

In [None]:
# refit the factorization recommender, this time on the known interactions
# (every row whose dataset is not "unknown"), with the same side data
fm = gl.recommender.factorization_recommender
model = fm.create(
    kn_listen,
    user_id="user_id",
    item_id="media_id",
    target="is_listened",
    binary_target=True,
    user_data=user_data,
    item_data=song_data,
    num_factors=n_factors,
    max_iterations=m_epochs,
    regularization=regular
)

In [None]:
# score the unknown partition, key the scores by integer sample_id,
# and order the rows by that id
scores = model.predict(un_listen)
pred = pd.DataFrame()
pred["sample_id"] = un["sample_id"].astype(int)
pred["is_listened"] = scores
pred = pred.sort_values("sample_id")
pred.head(3)

In [None]:
# saving prediction vector
# The "<factors>f_<iterations>i" part of the file name is built from the model
# parameters, so the name cannot drift from the settings that produced the
# predictions (for n_factors = 50, m_epochs = 400 it is still "50f_400i").
pred_file = "factorization_full_type_moredata_{0}f_{1}i.csv".format(n_factors, m_epochs)
pred.to_csv(path + "pred_unknown/" + pred_file, index = False)