# 파일 입출력 설정

In [None]:
import io
import os
import json
import distutils.dir_util
from collections import Counter

import numpy as np

def write_json(data, fname):
    """Serialize *data* as UTF-8 JSON and write it to *fname*.

    Missing parent directories of *fname* are created. Non-ASCII characters
    are written verbatim (``ensure_ascii=False``).

    Args:
        data: Any JSON-serializable object. NumPy integer and floating
            scalars are converted to Python numbers and NumPy arrays to
            (nested) lists.
        fname: Target path. Relative paths are resolved against the current
            working directory; absolute paths are honored as given.

    Raises:
        TypeError: If *data* contains an object that cannot be serialized.
    """
    def _conv(o):
        # json.dumps only calls this for objects it cannot encode natively.
        if isinstance(o, (np.integer, np.floating)):
            return o.item()
        if isinstance(o, np.ndarray):
            return o.tolist()
        raise TypeError(
            "Object of type {} is not JSON serializable".format(type(o).__name__)
        )

    # Serialize before touching the filesystem so a failed dump does not
    # leave an empty or truncated file behind.
    json_str = json.dumps(data, ensure_ascii=False, default=_conv)

    parent = os.path.dirname(fname)
    if parent:
        # os.makedirs replaces the deprecated distutils.dir_util.mkpath, which
        # also caches created paths and therefore skips directories that were
        # deleted since an earlier call.
        os.makedirs(parent, exist_ok=True)
    with open(fname, "w", encoding="utf8") as f:
        f.write(json_str)


def load_json(fname):
    """Read the UTF-8 encoded JSON file *fname* and return the parsed object."""
    with open(fname, encoding='utf8') as handle:
        return json.load(handle)


def debug_json(r):
    """Print *r* to stdout as 4-space-indented JSON, keeping non-ASCII text readable."""
    rendered = json.dumps(r, indent=4, ensure_ascii=False)
    print(rendered)

In [None]:
import pickle
import numpy as np
import pandas as pd
import scipy.sparse as spr
from collections import Counter

# Playlist data. NOTE(review): the frame named `test` is read from val.json,
# i.e. it appears to be the validation split -- confirm this is intended.
train = pd.read_json("./../data/train.json")
test = pd.read_json("./../data/val.json")

# Song metadata (not referenced by the cells visible in this part of the notebook).
song_meta = pd.read_json("./../data/song_meta.json")

# 데이터셋 생성

In [None]:
# Tag every playlist with its origin so the two sets stay distinguishable
# after they are merged into one frame.
train['istrain'] = 1
test['istrain'] = 0

n_train, n_test = len(train), len(test)

# Stack train on top of test; rows [0, n_train) are train, the rest are test.
plylst = pd.concat((train, test), ignore_index=True)

# "nid": a consecutive integer id following the row order of the merged frame.
plylst["nid"] = range(n_train + n_test)

# Lookup tables between the original playlist id and the consecutive nid.
plylst_id_nid = dict(zip(plylst["id"], plylst["nid"]))
plylst_nid_id = dict(zip(plylst["nid"], plylst["id"]))

# index 부여

In [None]:
# --- tags -------------------------------------------------------------------
# Occurrence count of every tag across all playlists (train + test).
plylst_tag = plylst['tags']
tag_counter = Counter(tg for tgs in plylst_tag for tg in tgs)
tag_dict = dict(tag_counter)

# Dense index ("tid") per tag, in first-seen order, plus the reverse mapping.
tag_id_tid = {tag: idx for idx, tag in enumerate(tag_dict)}
tag_tid_id = dict(enumerate(tag_dict))

n_tags = len(tag_dict)

# --- songs ------------------------------------------------------------------
# Same procedure for songs: count, then assign a dense index ("sid").
plylst_song = plylst['songs']
song_counter = Counter(sg for sgs in plylst_song for sg in sgs)
song_dict = dict(song_counter)

song_id_sid = {song: idx for idx, song in enumerate(song_dict)}
song_sid_id = dict(enumerate(song_dict))

n_songs = len(song_dict)

In [None]:
# Translate the raw song / tag ids of every playlist into their dense indices.
# Ids without an index are dropped; a membership test avoids the double
# `.get()` lookup and the `!= None` comparison.
plylst['songs_id'] = plylst['songs'].map(lambda x: [song_id_sid[s] for s in x if s in song_id_sid])
plylst['tags_id'] = plylst['tags'].map(lambda x: [tag_id_tid[t] for t in x if t in tag_id_tid])

# Explicit copy: adding columns to a column subset of `plylst` otherwise
# triggers pandas' SettingWithCopyWarning.
plylst_use = plylst[['istrain','nid','updt_date','songs_id','tags_id']].copy()

# Number of songs and number of tags per playlist.
plylst_use['num_songs'] = plylst_use['songs_id'].map(len)
plylst_use['num_tags'] = plylst_use['tags_id'].map(len)
plylst_use = plylst_use.set_index('nid')

In [None]:
# Undo the earlier concatenation: the first n_train rows are the training
# playlists, everything after them belongs to the test set.
plylst_train = plylst_use.iloc[:n_train]
plylst_test = plylst_use.iloc[n_train:]

In [None]:
def _build_indicator_csr(frame, ids_col, count_col, n_rows, n_cols):
    """Build an ``n_rows x n_cols`` CSR matrix from per-row lists of column ids.

    Args:
        frame: DataFrame with one row per playlist, in matrix row order.
        ids_col: Name of the column holding the list of column indices per row.
        count_col: Name of the column holding the length of each of those lists.
        n_rows: Number of matrix rows (playlists in *frame*).
        n_cols: Number of matrix columns (size of the item vocabulary).

    Returns:
        scipy.sparse.csr_matrix with a 1 stored for every (row, id) pair;
        repeated pairs are summed by the csr_matrix constructor.
    """
    # Row index of every stored entry: row i repeated once per id it holds.
    row = np.repeat(range(n_rows), frame[count_col])
    col = [item for items in frame[ids_col] for item in items]
    dat = np.repeat(1, frame[count_col].sum())
    return spr.csr_matrix((dat, (row, col)), shape=(n_rows, n_cols))


# Playlist x song and playlist x tag matrices for the training playlists.
train_songs_A = _build_indicator_csr(plylst_train, 'songs_id', 'num_songs', n_train, n_songs)
train_tags_A = _build_indicator_csr(plylst_train, 'tags_id', 'num_tags', n_train, n_tags)

# Same matrices for the test playlists; row u corresponds to the u-th test playlist.
test_songs_A = _build_indicator_csr(plylst_test, 'songs_id', 'num_songs', n_test, n_songs)
test_tags_A = _build_indicator_csr(plylst_test, 'tags_id', 'num_tags', n_test, n_tags)

In [None]:
# Use the already-imported `spr` namespace: the bare `hstack` / `vstack` names
# are only imported in a later cell, so running the notebook top to bottom
# would raise NameError here. `format='csr'` makes the results match their names.

# Per playlist: song columns followed by tag columns.
train_merged_feature_csr = spr.hstack([train_songs_A, train_tags_A], format='csr')
test_merged_feature_csr  = spr.hstack([test_songs_A, test_tags_A], format='csr')

# All playlists, train rows first and test rows after them.
whole_datasets_csr       = spr.vstack([train_merged_feature_csr, test_merged_feature_csr], format='csr')
# Transposed view: (songs + tags) x playlists.
whole_datasets_csr_T     = whole_datasets_csr.T.tocsr()

In [None]:
# Read the shape straight from the sparse matrix; calling .toarray() first
# would allocate the full dense array just to inspect its dimensions.
whole_datasets_csr_T.shape

In [None]:
from implicit.evaluation import  *
from implicit.als import AlternatingLeastSquares as ALS
from scipy.sparse import hstack, vstack

# Number of latent factors for the jointly trained ALS model.
n_factor = 128
# NOTE(review): despite its name this is not an epoch count -- it is only used
# below as a scalar multiplier on the interaction matrix (presumably a
# confidence weight; confirm and consider renaming).
n_epoch  = 15.0

# Fit a single ALS model on the (songs + tags) x playlists matrix, scaled by n_epoch.
# NOTE(review): passing the transposed matrix assumes a version of `implicit`
# whose fit() expects items as rows -- confirm against the installed version.
als_model = ALS(factors=n_factor, regularization=0.08)
als_model.fit(whole_datasets_csr_T * n_epoch)

# Two untrained models that receive slices of the trained factors below.
song_model = ALS(use_gpu=False)
tag_model = ALS(use_gpu=False)
# Both share the factors learned for the "user" side of the joint model.
song_model.user_factors = als_model.user_factors
tag_model.user_factors = als_model.user_factors

# The first n_songs item rows go to the song model, the remaining rows to the
# tag model (songs were stacked before tags when the matrix was built).
song_model.item_factors = als_model.item_factors[:n_songs]
tag_model.item_factors  = als_model.item_factors[n_songs:]

# Sanity check of the resulting factor matrix shapes.
print(song_model.item_factors.shape)
print(tag_model.item_factors.shape)

In [None]:
res = []

from tqdm.auto import tqdm

# The factor model was fit on train and test playlists stacked in that order,
# so the u-th test playlist sits at row u + n_train of the user factors. The
# interaction matrices passed to recommend() must use the same row numbering.
# NOTE(review): assumes the `implicit` API where recommend(userid, user_items, N)
# returns (item index, score) pairs -- confirm against the installed version.
all_songs_A = spr.vstack([train_songs_A, test_songs_A], format='csr')
all_tags_A = spr.vstack([train_tags_A, test_tags_A], format='csr')

for u in tqdm(range(n_test)):
    nid = u + n_train

    # Map the recommended dense indices back to the original song ids.
    song_rec = song_model.recommend(nid, all_songs_A, N=100)
    song_rec = [song_sid_id[x[0]] for x in song_rec]

    # Same for tags.
    tag_rec = tag_model.recommend(nid, all_tags_A, N=10)
    tag_rec = [tag_tid_id[x[0]] for x in tag_rec]

    res.append({
            # nid indexes the concatenated train + test frame, so it maps
            # directly back to the original id of this test playlist.
            "id": plylst_nid_id[nid],
            "songs": song_rec,
            "tags": tag_rec
        })

In [None]:
# Save the recommendations as JSON.
write_json(data=res, fname="results.json")