# 1. LOADING

In [1]:
# import packages
import os

import graphlab as gl
import pandas as pd
from scipy import stats
from sklearn import metrics

In [2]:
# Load the full data set.
# The project folder defaults to the original location but can be overridden
# with the DSG_2017_DIR environment variable, so the notebook can run on
# another machine without editing this cell.
# os.path.join(..., "") guarantees a trailing separator, because the file
# names are built with plain `path + "..."` concatenation.
path = os.path.join(os.environ.get("DSG_2017_DIR",
                                   "/Users/Kozodoi/Documents/Competitions/DSG_2017/"), "")
data = pd.read_csv(path + "data/data_flow.csv")
data.shape

(2339529, 50)

In [3]:
# print variable names
#data.columns

# 2. DATA PREPARATION

In [4]:
# cast both identifier columns to strings
# (presumably so they are handled as labels rather than numbers -- TODO confirm)
for id_col in ["user_id", "media_id"]:
    data[id_col] = data[id_col].astype(str)

In [5]:
# split the rows according to the value of the "dataset" column
is_unknown = data["dataset"] == "unknown"
tr = data[data["dataset"] == "train"]
ts = data[data["dataset"] == "test"]
kn = data[~is_unknown]   # every row that is not labelled 'unknown'
un = data[is_unknown]

# report the shape of each partition
print("train: {0}".format(tr.shape))
print("test: {0}".format(ts.shape))
print("known: {0}".format(kn.shape))
print("unknown: {0}".format(un.shape))

train: (2279790, 50)
test: (39821, 50)
known: (2319611, 50)
unknown: (19918, 50)


In [6]:
# SFrames holding only the user-media interaction columns of each partition
interaction_cols = ["media_id", "user_id", "is_listened"]
tr_listen = gl.SFrame(tr[interaction_cols])
ts_listen = gl.SFrame(ts[interaction_cols])
kn_listen = gl.SFrame(kn[interaction_cols])
un_listen = gl.SFrame(un[interaction_cols])

[INFO] graphlab.cython.cy_server: GraphLab Create v2.1 started. Logging: /tmp/graphlab_server_1495913947.log


This non-commercial license of GraphLab Create for academic use is assigned to kozodoin@hu-berlin.de and will expire on May 12, 2018.


In [7]:
# Release the large pandas frames that are no longer needed.
# Rebinding the names drops the references so the memory can be reclaimed.
# None is used as the placeholder instead of the string "Null": a str is
# truthy and could be mistaken for real data, whereas None is the
# conventional "no value" marker.
# ts and un are deliberately kept: later cells still read columns from them.
data = None
tr   = None
kn   = None

# 3. MODELING

## 3.1. TRAINING-VALIDATION

In [None]:
# hyper-parameters shared by the models trained below:
# k is passed as only_top_k, sim_metric as similarity_type
k, sim_metric = 25500, "cosine"

In [None]:
# fit the item-similarity recommender on the training interactions
model = gl.recommender.item_similarity_recommender.create(
    tr_listen,
    user_id         = "user_id",
    item_id         = "media_id",
    target          = "is_listened",
    similarity_type = sim_metric,
    only_top_k      = k,
    training_method = "nn:sparse",
)

In [None]:
# score the test interactions; the frame keeps row_index next to each score
pred = pd.DataFrame({"row_index": ts["row_index"]})
pred["is_listened"] = model.predict(ts_listen)
pred.head(3)

In [None]:
# area under the ROC curve of the predicted scores against the test labels
metrics.roc_auc_score(y_true = ts["is_listened"], y_score = pred["is_listened"])

In [None]:
# Save the validation predictions (without the pandas index).
# The file name embeds the current value of k instead of a hard-coded number,
# so the name cannot drift out of sync when the parameter is changed.
pred.to_csv(path + "pred_valid/similarity_scores_flow_" + str(k) + "k.csv", index = False)

In [None]:
# Release the training SFrame and the test frame, which are not read again.
# Rebinding the names drops the references so the memory can be reclaimed.
# None is used as the placeholder instead of the string "Null": a str is
# truthy and could be mistaken for real data.
tr_listen = None
ts = None

## 3.2. KNOWN-UNKNOWN

In [None]:
# Train the final model on all known interactions (train + test).
# training_method is set to "nn:sparse" to match the validation model, so the
# configuration that was validated is the one used for the final predictions.
# NOTE(review): the GraphLab docs describe training_method as affecting only
# how the similarities are computed, not the result -- confirm that run time
# and memory on the larger known set are acceptable.
model = gl.recommender.item_similarity_recommender.create(kn_listen, 
                                                          only_top_k = k,
                                                          similarity_type = sim_metric, 
                                                          user_id = "user_id", 
                                                          item_id = "media_id", 
                                                          target  = "is_listened",
                                                          training_method = "nn:sparse")

In [None]:
# score the unknown interactions and order the result by sample_id
pred = pd.DataFrame({"sample_id": un["sample_id"].astype(int)})
pred["is_listened"] = model.predict(un_listen)
pred = pred.sort_values(by = "sample_id")
pred.head(3)

In [None]:
# Save the predictions for the unknown set (without the pandas index).
# The file name is derived from k: the previous hard-coded "25000k" did not
# match the k = 25500 the model is trained with, nor the "25500k" name used
# for the validation predictions.
# NOTE(review): anything downstream that reads the old "..._25000k.csv" name
# must be pointed at the new file.
pred.to_csv(path + "pred_unknown/similarity_scores_flow_" + str(k) + "k.csv", index = False)