In [1]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import os
import sys

# import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import numpy.random as npr
from scipy import sparse, stats

In [2]:
# Set the seed for reproducibility
seed = 2384795
np.random.seed(seed=seed)

# Prepare the folder for the generated data. `exist_ok=True` makes the cell
# safe to re-run and avoids the check-then-create gap of an explicit
# `os.path.exists` test.
output_name = "./generated_data/"
os.makedirs(output_name, exist_ok=True)

In [3]:
# Folder holding the raw rating files ('train.ascii' and 'test.ascii').
DATA_DIR = './original_files/'

## Loading Datasets

In [4]:
# Both files are parsed the same way: space-separated, no header row.
raw_data, test_data = (
    pd.read_csv(os.path.join(DATA_DIR, file_name), sep=" ", header=None, engine="python")
    for file_name in ('train.ascii', 'test.ascii')
)

In [5]:
# Turn the wide table (one row per user, one column per song) into long
# (userId, songId, rating) triples. The COO matrix is built once and reused
# for all three columns instead of being rebuilt for each of them.
train_coo = sparse.coo_matrix(raw_data)
tr_vd_data = pd.DataFrame({"userId": train_coo.row,
                           "songId": train_coo.col,
                           "rating": train_coo.data})

In [6]:
# Same wide-to-long conversion for the test file. The COO matrix is built
# once and reused for all three columns. Note that `test_data` is rebound
# here from the wide table to the long (userId, songId, rating) frame.
test_coo = sparse.coo_matrix(test_data)
test_data = pd.DataFrame({"userId": test_coo.row,
                          "songId": test_coo.col,
                          "rating": test_coo.data})

## Defining functions

In [7]:
def split_train_test_proportion(data, uid, test_prop=0.5, random_seed=0):
    """Split every user's rows into a kept part and a held-out part.

    Rows are grouped by the ``uid`` column. For each group with at least five
    rows, ``int(test_prop * n_rows)`` rows are drawn without replacement and
    placed in the held-out frame; the remaining rows go to the kept frame.
    Groups with fewer than five rows go entirely to the kept frame.

    Parameters
    ----------
    data : pandas.DataFrame
        Table containing the ``uid`` column.
    uid : str
        Name of the column to group by.
    test_prop : float, default 0.5
        Fraction of each eligible group's rows to hold out.
    random_seed : int, default 0
        Seed of the sampler; the same seed gives the same split.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(data_tr, data_te)``. If nothing was assigned to a side, that side
        is a zero-row frame with the columns of ``data``.
    """
    # A private RandomState produces the same draws as seeding the global
    # generator with `random_seed`, without resetting np.random for the
    # rest of the notebook.
    rng = np.random.RandomState(random_seed)

    tr_list, te_list = [], []

    for u, (_, group) in enumerate(data.groupby(uid)):
        n_items_u = len(group)

        if n_items_u >= 5:
            # Boolean mask over the group's rows: True marks a held-out row.
            idx = np.zeros(n_items_u, dtype='bool')
            idx[rng.choice(n_items_u, size=int(test_prop * n_items_u), replace=False)] = True

            tr_list.append(group[~idx])
            te_list.append(group[idx])
        else:
            tr_list.append(group)

        if u % 5000 == 0:
            print("%d users sampled" % u)
            sys.stdout.flush()

    # pd.concat raises on an empty list (e.g. when no group has five rows),
    # so fall back to a zero-row frame that keeps the input's columns.
    empty = data.iloc[:0]
    data_tr = pd.concat(tr_list) if tr_list else empty
    data_te = pd.concat(te_list) if te_list else empty

    return data_tr, data_te

In [8]:
def get_count(tp, id):
    """Count the rows of ``tp`` for each distinct value of column ``id``.

    Parameters
    ----------
    tp : pandas.DataFrame
        Table containing the column ``id``.
    id : str
        Name of the column to count by.

    Returns
    -------
    pandas.Series
        Row counts indexed by the distinct values of ``id`` (sorted, as
        groupby sorts its keys by default).
    """
    # Grouping with as_index=False makes size() return a DataFrame with a
    # positional RangeIndex on current pandas, so `.index` on the result
    # would yield 0..n-1 instead of the ids. Grouping with the key as the
    # index keeps `.index` meaningful for callers.
    return tp.groupby(id).size()

## Make dataset implicit

In [9]:
# Preview the train/validation ratings in long format (explicit ratings, before binarisation).
tr_vd_data.head(10)

Unnamed: 0,userId,songId,rating
0,0,72,2
1,0,136,2
2,0,150,3
3,0,171,3
4,0,188,3
5,0,220,3
6,0,227,5
7,0,228,4
8,0,234,3
9,0,235,4


In [10]:
# Preview the test ratings in long format (explicit ratings, before binarisation).
test_data.head(10)

Unnamed: 0,userId,songId,rating
0,0,12,4
1,0,17,3
2,0,74,4
3,0,78,2
4,0,92,2
5,0,104,4
6,0,127,4
7,0,128,3
8,0,133,3
9,0,145,2


In [11]:
# Ratings at or above this value are treated as positive feedback
# (threshold suggested in the original Yahoo! paper).
POSITIVE_THRESHOLD = 4

# Attach the binary 0/1 signal to both frames as a new column
tr_vd_data['ImplicitRating'] = tr_vd_data['rating'].ge(POSITIVE_THRESHOLD).astype(int)
test_data['ImplicitRating'] = test_data['rating'].ge(POSITIVE_THRESHOLD).astype(int)

In [12]:
# Check the new ImplicitRating column next to the explicit rating it was derived from.
tr_vd_data.head(10)

Unnamed: 0,userId,songId,rating,ImplicitRating
0,0,72,2,0
1,0,136,2,0
2,0,150,3,0
3,0,171,3,0
4,0,188,3,0
5,0,220,3,0
6,0,227,5,1
7,0,228,4,1
8,0,234,3,0
9,0,235,4,1


In [13]:
# Keep only the binary signal: discard the explicit ratings and let the
# implicit column take over the name "rating".
tr_vd_data = (
    tr_vd_data
    .drop(columns=['rating'])
    .rename(columns={"ImplicitRating": "rating"})
)

In [14]:
# After the rename, "rating" holds the binary 0/1 value.
tr_vd_data.head(10)

Unnamed: 0,userId,songId,rating
0,0,72,0
1,0,136,0
2,0,150,0
3,0,171,0
4,0,188,0
5,0,220,0
6,0,227,1
7,0,228,1
8,0,234,0
9,0,235,1


In [15]:
# test_data has not been renamed yet: both rating and ImplicitRating are present.
test_data.head(10)

Unnamed: 0,userId,songId,rating,ImplicitRating
0,0,12,4,1
1,0,17,3,0
2,0,74,4,1
3,0,78,2,0
4,0,92,2,0
5,0,104,4,1
6,0,127,4,1
7,0,128,3,0
8,0,133,3,0
9,0,145,2,0


In [16]:
# Same clean-up for the test frame: discard the explicit ratings and let the
# implicit column take over the name "rating".
test_data = (
    test_data
    .drop(columns=['rating'])
    .rename(columns={"ImplicitRating": "rating"})
)

In [17]:
# Expected columns after the rename cell: userId, songId, rating.
# NOTE(review): the saved output still lists "ImplicitRating", so it appears
# to predate the current version of the rename cell; re-run to refresh.
test_data.head(10)

Unnamed: 0,userId,songId,ImplicitRating
0,0,12,1
1,0,17,0
2,0,74,1
3,0,78,0
4,0,92,0
5,0,104,1
6,0,127,1
7,0,128,0
8,0,133,0
9,0,145,0


## Some stats

In [18]:
# First rows and (n_rows, n_columns) of the train/validation frame.
tr_vd_data.head(), tr_vd_data.shape

(   userId  songId  rating
 0       0      72       0
 1       0     136       0
 2       0     150       0
 3       0     171       0
 4       0     188       0,
 (6960, 3))

In [19]:
# First rows and (n_rows, n_columns) of the test frame.
test_data.head(), test_data.shape

(   userId  songId  ImplicitRating
 0       0      12               1
 1       0      17               0
 2       0      74               1
 3       0      78               0
 4       0      92               0,
 (4640, 3))

In [20]:
# Column dtypes and non-null counts of the test frame.
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4640 entries, 0 to 4639
Data columns (total 3 columns):
 #   Column          Non-Null Count  Dtype
---  ------          --------------  -----
 0   userId          4640 non-null   int32
 1   songId          4640 non-null   int32
 2   ImplicitRating  4640 non-null   int64
dtypes: int32(2), int64(1)
memory usage: 72.6 KB


In [21]:
# Rows per user and rows per song in the train/validation data
user_activity, item_popularity = (
    get_count(tr_vd_data, column) for column in ('userId', 'songId')
)

In [22]:
# Take the distinct user/song ids from the index of the count objects.
# NOTE(review): this is only correct if get_count() returns an object indexed
# by the ids. If it returns a frame with a positional RangeIndex, these are
# positions 0..n-1 and match the ids only when the ids are contiguous from 0.
# Confirm against the installed pandas version.
unique_uid = user_activity.index
unique_sid = item_popularity.index

In [23]:
# Number of distinct users and songs
n_users, n_items = len(unique_uid), len(unique_sid)

In [24]:
# Number of distinct users and songs in the train/validation data.
n_users, n_items

(290, 300)

## Removing any songs and users from the test set that are not present in the training set

In [25]:
# Map every original id to its position in the corresponding id sequence
song2id = {sid: position for position, sid in enumerate(unique_sid)}
user2id = {uid: position for position, uid in enumerate(unique_uid)}

In [26]:
# Restrict the test set to users and songs that also occur in the training data
test_data = test_data.loc[
    test_data['userId'].isin(unique_uid) & test_data['songId'].isin(unique_sid)
]

## Turn userId and songId into 0-based indices

In [27]:
def numerize(tp):
    """Replace raw user/song ids with their indices from the lookup tables.

    Each value of ``tp['userId']`` is looked up in the module-level
    ``user2id`` dictionary and each value of ``tp['songId']`` in ``song2id``.

    Parameters
    ----------
    tp : pandas.DataFrame
        Frame with ``userId``, ``songId`` and ``rating`` columns.

    Returns
    -------
    pandas.DataFrame
        New frame with columns ``uid``, ``sid`` and ``rating`` and the same
        row index as ``tp``.

    Raises
    ------
    KeyError
        If an id is missing from the lookup tables, or ``tp`` has no
        ``rating`` column.
    """
    uid = [user2id[x] for x in tp['userId']]
    sid = [song2id[x] for x in tp['songId']]
    # assign() builds a new frame: the caller's frame is left untouched and
    # no SettingWithCopyWarning is raised when `tp` is a filtered slice.
    return tp.assign(uid=uid, sid=sid)[['uid', 'sid', 'rating']]

In [28]:
# Replace the raw ids in both frames with their indices.
# numerize() selects a 'rating' column, so both frames must already have had
# ImplicitRating renamed to rating.
# NOTE(review): the KeyError recorded below appears to come from a run in
# which test_data still carried the ImplicitRating name; re-run from a fresh
# kernel to confirm.
tr_vd_data = numerize(tr_vd_data)
test_data = numerize(test_data)

KeyError: "['rating'] not in index"

## Do we really need the validation set for our purpose?

In [None]:
# First split: for each user with at least 5 rows, int(0.6 * n) rows are held
# out into vad_data; all remaining rows form train_data.
train_data, vad_data = split_train_test_proportion(tr_vd_data, 'uid', test_prop=0.6, random_seed=12345)
# Second split of the held-out rows: the sampled half (second return value)
# stays as vad_data and the remaining rows become obs_test_data.
obs_test_data, vad_data = split_train_test_proportion(vad_data, 'uid', test_prop=0.5, random_seed=12345)

0 users sampled
0 users sampled


In [None]:
# Compare the number of distinct users in train_data with the full user list.
print("There are total of %d unique users in the training set and %d unique users in the entire dataset" % (len(pd.unique(train_data['uid'])), len(unique_uid)))

There are total of 290 unique users in the training set and 290 unique users in the entire dataset


In [None]:
# Compare the number of distinct songs in train_data with the full song list.
print("There are total of %d unique items in the training set and %d unique items in the entire dataset" % (len(pd.unique(train_data['sid'])), len(unique_sid)))

There are total of 300 unique items in the training set and 300 unique items in the entire dataset


In [None]:
def move_to_fill(part_data_1, part_data_2, unique_id, key):
    """Move rows from ``part_data_2`` to ``part_data_1`` to cover missing ids.

    Every value of ``unique_id`` that does not occur in ``part_data_1[key]``
    is looked up in ``part_data_2[key]``; all matching rows are appended to
    ``part_data_1`` and removed from ``part_data_2``. Ids that occur in
    neither frame stay missing.

    Parameters
    ----------
    part_data_1 : pandas.DataFrame
        Frame that should end up containing the ids.
    part_data_2 : pandas.DataFrame
        Frame the missing rows are taken from.
    unique_id : iterable
        Ids that ``part_data_1[key]`` should contain.
    key : str
        Name of the id column present in both frames.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        The updated ``(part_data_1, part_data_2)``; the inputs themselves are
        not modified.
    """
    present_ids = set(pd.unique(part_data_1[key]))
    missing_ids = [_id for _id in unique_id if _id not in present_ids]

    move_idx = part_data_2[key].isin(missing_ids)
    # DataFrame.append was removed in pandas 2.0; pd.concat is the supported
    # replacement and, like append, keeps the original row labels.
    part_data_1 = pd.concat([part_data_1, part_data_2[move_idx]])
    part_data_2 = part_data_2[~move_idx]
    return part_data_1, part_data_2

The move_to_fill function is used to ensure that train_data ends up with a complete set of unique IDs as specified by unique_id, by "moving" the necessary rows from another dataset (part_data_2 like vad_data or obs_test_data) and updating both DataFrames accordingly.

In [None]:
# Make train_data cover every song index 0..n_items-1 where possible: rows for
# songs missing from train_data are moved in from vad_data first, then from
# obs_test_data.
train_data, vad_data = move_to_fill(train_data, vad_data, np.arange(n_items), 'sid')
train_data, obs_test_data = move_to_fill(train_data, obs_test_data, np.arange(n_items), 'sid')

In [None]:
# Re-check song coverage of train_data after moving rows in.
print("There are total of %d unique items in the training set and %d unique items in the entire dataset" % (len(pd.unique(train_data['sid'])), len(unique_sid)))

There are total of 300 unique items in the training set and 300 unique items in the entire dataset


## Store datasets in csv files

In [None]:
# Write the splits to the output folder without the row index:
#   train.csv       - train_data
#   validation.csv  - vad_data
#   train_full.csv  - tr_vd_data (the full numerized frame the splits were drawn from)
train_data.to_csv(os.path.join(output_name, 'train.csv'), index=False)
vad_data.to_csv(os.path.join(output_name, 'validation.csv'), index=False)
tr_vd_data.to_csv(os.path.join(output_name, 'train_full.csv'), index=False)

In [None]:
# Write the two test sets without the row index:
#   obs_test_full.csv - obs_test_data (split off from the training file)
#   test_full.csv     - test_data (loaded from the separate test file)
obs_test_data.to_csv(os.path.join(output_name, 'obs_test_full.csv'), index=False)
test_data.to_csv(os.path.join(output_name, 'test_full.csv'), index=False)

Now *obs_test_data* is our biased test set, extracted from the original dataset, while *test_data* is our unbiased test set.