In [1]:
import pickle
import numpy as np
import pandas as pd
import scipy.sparse as spr
from collections import Counter
from scipy.sparse import hstack, vstack
from tqdm.auto import tqdm

song_meta = pd.read_json("./../data/song_meta.json")
train = pd.read_json("./../data/train.json")
test = pd.read_json("./../data/val.json")

In [2]:
train['istrain'] = 1
test['istrain'] = 0

n_train = len(train)
n_test = len(test)

# train + test
plylst = pd.concat([train, test], ignore_index=True)

# playlist id
plylst["nid"] = range(n_train + n_test)

# id <-> nid
plylst_id_nid = dict(zip(plylst["id"],plylst["nid"]))
plylst_nid_id = dict(zip(plylst["nid"],plylst["id"]))

In [3]:
plylst_tag = plylst['tags']
tag_counter = Counter([tg for tgs in plylst_tag for tg in tgs])
tag_dict = {x: tag_counter[x] for x in tag_counter}

tag_id_tid = dict()
tag_tid_id = dict()
for i, t in enumerate(tag_dict):
    tag_id_tid[t] = i
    tag_tid_id[i] = t

n_tags = len(tag_dict)

plylst_song = plylst['songs']
song_counter = Counter([sg for sgs in plylst_song for sg in sgs])
song_dict = {x: song_counter[x] for x in song_counter}

song_id_sid = dict()
song_sid_id = dict()
for i, t in enumerate(song_dict):
    song_id_sid[t] = i
    song_sid_id[i] = t

n_songs = len(song_dict)

In [4]:
plylst['songs_id'] = plylst['songs'].map(lambda x: [song_id_sid.get(s) for s in x if song_id_sid.get(s) != None])
plylst['tags_id'] = plylst['tags'].map(lambda x: [tag_id_tid.get(t) for t in x if tag_id_tid.get(t) != None])

plylst_use = plylst[['istrain','nid','updt_date','songs_id','tags_id']]

# 곡의 개수와 태그의 개수를 할당
plylst_use.loc[:,'num_songs'] = plylst_use['songs_id'].map(len)
plylst_use.loc[:,'num_tags'] = plylst_use['tags_id'].map(len)
plylst_use = plylst_use.set_index('nid')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


In [6]:
plylst_use.shape

(138086, 6)

In [16]:
user_ids   = np.array(())
item_ids   = np.array(())
ratings    = np.array(())
timestamps = np.array(())

max_num_songs = max(plylst_use.num_songs)

# for i in range(5):
#     line = plylst_train.iloc[i, :]
#     print(line.num_songs)

for i, line in tqdm(enumerate(plylst_use.iterrows()), total=plylst_use.shape[0]):
    data = line[1]
    temp_user_ids   = np.repeat(i, data.num_songs)
    # 나중에 1 빼주면 됩니다 / 0 패딩을 하니까 이거랑 아이템 아이디랑 구분이 되지 않아서 에러 발생 방지
    temp_item_ids   = np.array(data.songs_id) + 1
    temp_ratings    = np.repeat(1, data.num_songs)
    temp_timestamps = np.arange(data.num_songs)
    
    user_ids = np.hstack([user_ids, temp_user_ids])
    item_ids = np.hstack([item_ids, temp_item_ids])
    ratings  = np.hstack([ratings, temp_ratings])
    timestamps = np.hstack([timestamps, temp_timestamps])

print(max_num_songs, '\n')
print(f"user_ids: {user_ids.shape}\n")
print(f"item_ids: {item_ids.shape}\n")
print(f"ratings: {ratings.shape}\n")
print(f"timestamps: {timestamps.shape}\n")

HBox(children=(FloatProgress(value=0.0, max=138086.0), HTML(value='')))


200 

user_ids: (5707070,)

item_ids: (5707070,)

ratings: (5707070,)

timestamps: (5707070,)



In [17]:
# save spotligth seq data
np.save('./../data/user_ids_for_spotlight.npy', user_ids)
np.save('./../data/item_ids_for_spotlight.npy', item_ids)
np.save('./../data/ratings_for_spotlight.npy', ratings)
np.save('./../data/timestamps_for_spotlight.npy', timestamps)

In [34]:
def _index_or_none(array, shuffle_index):

    if array is None:
        return None
    else:
        return array[shuffle_index]

In [35]:
from spotlight.interactions import Interactions

train_index = plylst_use[plylst_use['istrain'] == 1].index.to_numpy()
test_index  = plylst_use[plylst_use['istrain'] == 0].index.to_numpy()

dataset = Interactions(user_ids=user_ids, item_ids=item_ids, ratings=ratings, timestamps=timestamps)

train = Interactions(dataset.user_ids[train_index],
                     dataset.item_ids[train_index],
                     ratings=_index_or_none(dataset.ratings, train_index),
                     timestamps=_index_or_none(dataset.timestamps, train_index),
                     num_users=dataset.num_users,
                     num_items=dataset.num_items)

test  = Interactions(dataset.user_ids[test_index],
                     dataset.item_ids[test_index],
                     ratings=_index_or_none(dataset.ratings, test_index),
                     timestamps=_index_or_none(dataset.timestamps, test_index),
                     num_users=dataset.num_users,
                     num_items=dataset.num_items)

In [36]:
train

<Interactions dataset (138086 users x 638337 items x 115071 interactions)>

In [37]:
test

<Interactions dataset (138086 users x 638337 items x 23015 interactions)>