# **IMPORT LIBS**

## Import

In [88]:
# These imports are needed for Python 2.7
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

# To read from file
import os
import sys

# For data manipulation
import numpy as np
import pandas as pd
from scipy import sparse

# **GENERATE THE DATASET**

## Init

In [89]:
# Set the seed for reproducibility
seed = 2384795
np.random.seed(seed=seed)

# Prepare the folder for the output data. exist_ok=True makes the call a no-op
# when the folder is already there, so no separate existence check is needed.
output_name = "./generated_data/"
os.makedirs(output_name, exist_ok=True)

# Directory with the original files that we used as a guide
DATA_DIR = './original_files/'

## Load the dataset

In [91]:
def _load_ratings(path):
    """Read a space-separated dense rating matrix and return it in long format.

    The file is parsed without a header row and converted to a SciPy COO
    matrix once; the COO row indices become ``userId``, the column indices
    become ``songId`` and the stored values become ``rating``.
    """
    dense = pd.read_csv(path, sep=" ", header=None, engine="python")
    # Build the sparse view a single time and reuse its three arrays
    coo = sparse.coo_matrix(dense)
    return pd.DataFrame({"userId": coo.row,
                         "songId": coo.col,
                         "rating": coo.data})


# Load training and validation set
tr_vd_data = _load_ratings(os.path.join(DATA_DIR, 'train.ascii'))

# Load test set
test_data = _load_ratings(os.path.join(DATA_DIR, 'test.ascii'))

## Define auxiliary functions

In this notebook we are using the following function, which was included in the original notebook, to split the dataset. It should not make any difference.

In [94]:
# To split training set into training and validation set
def split_train_test_proportion(data, uid, test_prop=0.5, random_seed=0, min_items=5):
    """Split every user's rows into a kept part and a held-out part.

    Parameters
    ----------
    data : pandas.DataFrame
        Interaction table; must contain the column named by ``uid``.
    uid : str
        Name of the column used to group rows by user.
    test_prop : float, default 0.5
        Fraction of each eligible user's rows moved to the held-out part
        (``int(test_prop * n_rows)`` rows, i.e. rounded down).
    random_seed : int, default 0
        Seed passed to ``np.random.seed`` before sampling.
    min_items : int, default 5
        Users with fewer rows than this keep all their rows in the first part.

    Returns
    -------
    (data_tr, data_te) : tuple of pandas.DataFrame
        Kept rows and held-out rows. Either frame is empty (with the columns
        of ``data``) when no rows fall into it.
    """
    # NOTE: this reseeds NumPy's *global* generator, exactly as before, so that
    # the sampled rows are unchanged with respect to earlier runs.
    np.random.seed(random_seed)

    tr_list, te_list = [], []

    for u, (_, group) in enumerate(data.groupby(uid)):
        n_items_u = len(group)

        if n_items_u >= min_items:
            # Boolean mask over the user's rows: True = held out
            held_out = np.zeros(n_items_u, dtype='bool')
            chosen = np.random.choice(n_items_u, size=int(test_prop * n_items_u), replace=False)
            held_out[chosen] = True

            tr_list.append(group[~held_out])
            te_list.append(group[held_out])
        else:
            # Too few rows to split: everything stays in the first part
            tr_list.append(group)

        if u % 5000 == 0:
            print("%d users sampled" % u)
            sys.stdout.flush()

    # pd.concat raises ValueError on an empty list (e.g. when no user reaches
    # min_items), so fall back to an empty frame with the same columns.
    empty = data.iloc[0:0]
    data_tr = pd.concat(tr_list) if tr_list else empty
    data_te = pd.concat(te_list) if te_list else empty

    return data_tr, data_te

The following function, instead, counts the rows for each id (a COUNT after a GROUP BY on the id column).

In [95]:
# Returns the number of rows for each id in id_label
def get_count(data, id_label):
    """Count how many rows of ``data`` share each value of column ``id_label``.

    Parameters
    ----------
    data : pandas.DataFrame
        Table containing the column ``id_label``.
    id_label : str
        Name of the column to group by.

    Returns
    -------
    The result of ``groupby(id_label, as_index=False).size()``, with one entry
    per distinct id. (In recent pandas versions this is a DataFrame with the
    columns ``id_label`` and ``size`` -- confirm for the installed version.)
    """
    # Select the column named by the argument; indexing with the builtin `id`
    # (as the previous version did) raises a KeyError.
    playcount_groupbyid = data[[id_label]].groupby(id_label, as_index=False)
    count = playcount_groupbyid.size()
    return count

## Convert to implicit

We follow the same procedure that we used for the Yahoo dataset.

In [98]:
# Ratings greater than or equal to this value count as positive feedback
# (threshold suggested in the original Yahoo paper)
POSITIVE_THRESHOLD = 4

# Training/validation set: replace the explicit rating with a 0/1 implicit one
tr_vd_data = (
    tr_vd_data
    .assign(ImplicitRating=np.where(tr_vd_data['rating'] >= POSITIVE_THRESHOLD, 1, 0))
    .drop(columns=['rating'])
    .rename(columns={"ImplicitRating": "rating"})
)

# Test set: same conversion
test_data = (
    test_data
    .assign(ImplicitRating=np.where(test_data['rating'] >= POSITIVE_THRESHOLD, 1, 0))
    .drop(columns=['rating'])
    .rename(columns={"ImplicitRating": "rating"})
)

# Show the first rows of both frames
tr_vd_data.head(10), test_data.head(10)

## Some stats

Print the dimension of the two datasets.

In [None]:
# (rows, columns) of the training/validation frame and of the test frame
tr_vd_data.shape, test_data.shape

Compute the number of observed items for each user and the total number of observations for each item.

In [111]:
# Row counts per user and per song in the training/validation data
user_activity = get_count(tr_vd_data, 'userId')
item_popularity = get_count(tr_vd_data, 'songId')

# Number of entries in each count table, i.e. distinct users and distinct songs
n_users, n_items = len(user_activity.index), len(item_popularity.index)

n_users, n_items

(290, 300)

# **Review note: from here onward the notebook has not been reviewed yet; to be checked later.**

## Removing any songs and users from the test set that are not present in the training set

In [112]:
# Distinct ids seen in the training/validation data, in ascending order.
# They are defined here because no earlier cell creates them: on a fresh
# kernel ("Restart & Run All") this cell would otherwise stop with a NameError.
unique_uid = np.sort(pd.unique(tr_vd_data['userId']))
unique_sid = np.sort(pd.unique(tr_vd_data['songId']))

# Map every original id to its 0-based position in the sorted id arrays
song2id = {sid: i for i, sid in enumerate(unique_sid)}
user2id = {uid: i for i, uid in enumerate(unique_uid)}

In [113]:
# Keep only the test rows whose user and song both appear in the training ids
known_user = test_data['userId'].isin(unique_uid)
known_song = test_data['songId'].isin(unique_sid)
test_data = test_data.loc[known_user & known_song]

## Turn userId and songId to 0-based index

In [114]:
def numerize(tp):
    """Return the rows of ``tp`` re-keyed with 0-based user and item indices.

    Each ``userId`` is looked up in the module-level ``user2id`` dict and each
    ``songId`` in ``song2id``.

    Parameters
    ----------
    tp : pandas.DataFrame
        Must contain the columns ``userId``, ``songId`` and ``rating``.

    Returns
    -------
    pandas.DataFrame
        New frame with the columns ``user_id``, ``item_id`` and ``rating``.

    Raises
    ------
    KeyError
        If an id in ``tp`` is missing from the corresponding mapping.
    """
    uid = [user2id[x] for x in tp['userId']]
    sid = [song2id[x] for x in tp['songId']]
    # assign() works on a copy: the caller's frame is left untouched and no
    # SettingWithCopyWarning is raised when ``tp`` is a filtered slice.
    return tp.assign(user_id=uid, item_id=sid)[['user_id', 'item_id', 'rating']]

In [115]:
# Re-key both frames with the 0-based user_id / item_id columns
tr_vd_data, test_data = numerize(tr_vd_data), numerize(test_data)

## Do we need the validation for our purpose?

In [127]:
# Alternative three-way split (train / validation / observed test), currently disabled:
#train_data, vad_data = split_train_test_proportion(tr_vd_data, 'user_id', test_prop=0.7, random_seed=12345)
#obs_test_data, vad_data = split_train_test_proportion(vad_data, 'user_id', test_prop=0.5, random_seed=12345)

# Two-way split without a validation set: for every user with enough rows,
# 70% of them (rounded down) go to obs_test_data and the rest stay in train_data
train_data, obs_test_data = split_train_test_proportion(
    tr_vd_data,
    'user_id',
    test_prop=0.7,
    random_seed=12345,
)


0 users sampled


In [128]:
# Distinct users left in the training split vs. all known user ids
n_train_users = len(pd.unique(train_data['user_id']))
print("There are total of %d unique users in the training set and %d unique users in the entire dataset" % (n_train_users, len(unique_uid)))

There are total of 290 unique users in the training set and 290 unique users in the entire dataset


In [129]:
# Distinct items left in the training split vs. all known item ids
n_train_items = len(pd.unique(train_data['item_id']))
print("There are total of %d unique items in the training set and %d unique items in the entire dataset" % (n_train_items, len(unique_sid)))

There are total of 297 unique items in the training set and 300 unique items in the entire dataset


In [120]:
def move_to_fill(part_data_1, part_data_2, unique_id, key):
    """Move rows from ``part_data_2`` to ``part_data_1`` so that the latter
    covers every id in ``unique_id``.

    Parameters
    ----------
    part_data_1 : pandas.DataFrame
        Frame that should end up containing every id of ``unique_id`` in
        column ``key``.
    part_data_2 : pandas.DataFrame
        Frame from which the missing rows are taken.
    unique_id : iterable
        All ids that ``part_data_1[key]`` should cover.
    key : str
        Name of the id column present in both frames.

    Returns
    -------
    (part_data_1, part_data_2) : tuple of pandas.DataFrame
        The first frame extended with the moved rows (original index labels
        are kept) and the second frame without them. Ids that appear in
        neither frame cannot be filled and stay missing.
    """
    part_id = set(pd.unique(part_data_1[key]))

    # Ids requested in unique_id that part_data_1 does not contain yet
    left_id = [_id for _id in unique_id if _id not in part_id]

    move_idx = part_data_2[key].isin(left_id)
    if not move_idx.any():
        # Nothing to move: both frames are returned unchanged
        return part_data_1, part_data_2

    # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement
    part_data_1 = pd.concat([part_data_1, part_data_2[move_idx]])
    part_data_2 = part_data_2[~move_idx]
    return part_data_1, part_data_2

The move_to_fill function is used to ensure that train_data ends up with a complete set of unique IDs as specified by unique_id, by "moving" the necessary rows from another dataset (part_data_2 like vad_data or obs_test_data) and updating both DataFrames accordingly.

In [130]:
# Disabled variant that used the validation split as the donor frame:
#train_data, vad_data = move_to_fill(train_data, vad_data, np.arange(n_items), 'item_id')

# Move rows from obs_test_data into train_data for every item id in
# 0..n_items-1 that train_data is still missing
all_item_ids = np.arange(n_items)
train_data, obs_test_data = move_to_fill(train_data, obs_test_data, all_item_ids, 'item_id')

In [131]:
# Re-check item coverage of the training split after move_to_fill
n_train_items_filled = len(pd.unique(train_data['item_id']))
print("There are total of %d unique items in the training set and %d unique items in the entire dataset" % (n_train_items_filled, len(unique_sid)))

There are total of 300 unique items in the training set and 300 unique items in the entire dataset


## Store datasets in csv files

In [132]:
# Write the training split and the full training/validation frame (no index column)
for frame, filename in ((train_data, 'train.csv'), (tr_vd_data, 'train_full.csv')):
    frame.to_csv(os.path.join(output_name, filename), index=False)
# Validation split is not produced at the moment:
#vad_data.to_csv(os.path.join(output_name, 'validation.csv'), index=False)

In [124]:
# Write the observed test split and the separate test set (no index column)
for frame, filename in ((obs_test_data, 'obs_test_full.csv'), (test_data, 'test_full.csv')):
    frame.to_csv(os.path.join(output_name, filename), index=False)

Now *obs_test_data* is our biased test set, extracted from the original dataset, while *test_data* is our unbiased test set.

In [125]:
# Display the observed test split (columns user_id, item_id, rating)
obs_test_data

Unnamed: 0,user_id,item_id,rating
3,0,171,0
7,0,228,1
10,0,236,1
11,0,246,0
15,0,251,1
...,...,...,...
6943,289,50,0
6945,289,64,0
6952,289,119,0
6954,289,125,0
