# 파일 입출력 설정
- json 파일을 읽고 저장하도록 세팅

In [25]:
import io
import os
import json
import distutils.dir_util
from collections import Counter

import numpy as np

def write_json(data, fname):
    def _conv(o):
        if isinstance(o, np.int64) or isinstance(o, np.int32):
            return int(o)
        raise TypeError

    parent = os.path.dirname(fname)
    distutils.dir_util.mkpath("./" + parent)
    with io.open("./" + fname, "w", encoding="utf8") as f:
        json_str = json.dumps(data, ensure_ascii=False, default=_conv)
        f.write(json_str)


def load_json(fname):
    with open(fname, encoding='utf8') as f:
        json_obj = json.load(f)

    return json_obj


def debug_json(r):
    print(json.dumps(r, ensure_ascii=False, indent=4))

# 채점 모듈 세팅
- ndcg 를 구현합니다
- 향후 다른 방식을 쓰더라도 적용이 가능할 것으로 보입니다.

In [103]:
import numpy as np

# from arena_util import load_json


class CustomEvaluator:
    def _idcg(self, l):
        return sum((1.0 / np.log(i + 2) for i in range(l)))

    def __init__(self):
        self._idcgs = [self._idcg(i) for i in range(101)]
        
    def _ndcg(self, gt, rec):
        print(len(gt))
        dcg = 0.0
        for i, r in enumerate(rec):
            if r in gt:
                dcg += 1.0 / np.log(i + 2)

        return dcg / self._idcgs[len(gt)]

    def _eval(self, gt_fname, rec_fname):
        gt_playlists = load_json(gt_fname)
        gt_dict = {g["id"]: g for g in gt_playlists}
        rec_playlists = load_json(rec_fname)
        
        music_ndcg = 0.0
        tag_ndcg = 0.0

        for rec in rec_playlists:
            gt = gt_dict[rec["id"]]
            music_ndcg += self._ndcg(gt["songs"], rec["songs"][:100])
            tag_ndcg += self._ndcg(gt["tags"], rec["tags"][:10])
        
        print(f"music_ndcg {music_ndcg}")
        print(f"tag_ndcg {tag_ndcg}")
        
        music_ndcg = music_ndcg / len(rec_playlists)
        tag_ndcg = tag_ndcg / len(rec_playlists)
        score = music_ndcg * 0.85 + tag_ndcg * 0.15

        return music_ndcg, tag_ndcg, score

    def evaluate(self, gt_fname, rec_fname):
        music_ndcg, tag_ndcg, score = self._eval(gt_fname, rec_fname)
        print(f"Music nDCG: {music_ndcg:.6}")
        print(f"Tag nDCG: {tag_ndcg:.6}")
        print(f"Score: {score:.6}")
#         try:
#             music_ndcg, tag_ndcg, score = self._eval(gt_fname, rec_fname)
#             print(f"Music nDCG: {music_ndcg:.6}")
#             print(f"Tag nDCG: {tag_ndcg:.6}")
#             print(f"Score: {score:.6}")
#         except Exception as e:
#             print(e)

In [35]:
import pickle
import numpy as np
import pandas as pd
import scipy.sparse as spr
from collections import Counter

song_meta = pd.read_json("./../data/song_meta.json")
train = pd.read_json("./../data/train.json")
test = pd.read_json("./../data/val.json")

# 데이터셋 생성
- train data, test data를 concat 하여 하나의 큰 데이터를 만들고 간단한 전처리 진행합니다.
- 밑에 train test split 하고 학습은 train 만 씁니다.

In [37]:
train['istrain'] = 1
test['istrain'] = 0

n_train = len(train)
n_test = len(test)

# train + test
plylst = pd.concat([train, test], ignore_index=True)

# playlist id
plylst["nid"] = range(n_train + n_test)

# id <-> nid
plylst_id_nid = dict(zip(plylst["id"],plylst["nid"]))
plylst_nid_id = dict(zip(plylst["nid"],plylst["id"]))

# index 부여
- 기존에 부여된 ID를 관리하기 쉽도록 인덱스로 바꿔줍니다.
- tag와 song 각각 따로 관리합니다.

In [47]:
plylst_tag = plylst['tags']
tag_counter = Counter([tg for tgs in plylst_tag for tg in tgs])
tag_dict = {x: tag_counter[x] for x in tag_counter}

tag_id_tid = dict()
tag_tid_id = dict()
for i, t in enumerate(tag_dict):
    tag_id_tid[t] = i
    tag_tid_id[i] = t

n_tags = len(tag_dict)

plylst_song = plylst['songs']
song_counter = Counter([sg for sgs in plylst_song for sg in sgs])
song_dict = {x: song_counter[x] for x in song_counter}

song_id_sid = dict()
song_sid_id = dict()
for i, t in enumerate(song_dict):
    song_id_sid[t] = i
    song_sid_id[i] = t

n_songs = len(song_dict)

In [48]:
plylst['songs_id'] = plylst['songs'].map(lambda x: [song_id_sid.get(s) for s in x if song_id_sid.get(s) != None])
plylst['tags_id'] = plylst['tags'].map(lambda x: [tag_id_tid.get(t) for t in x if tag_id_tid.get(t) != None])

In [49]:
plylst_use = plylst[['istrain','nid','updt_date','songs_id','tags_id']]

# 곡의 개수와 태그의 개수를 할당
plylst_use.loc[:,'num_songs'] = plylst_use['songs_id'].map(len)
plylst_use.loc[:,'num_tags'] = plylst_use['tags_id'].map(len)
plylst_use = plylst_use.set_index('nid')

In [50]:
# train test split
plylst_train = plylst_use.iloc[:n_train,:]
plylst_test = plylst_use.iloc[n_train:,:]

In [51]:
# csr (compressed sparse row matrix 생성)
row = np.repeat(range(n_train), plylst_train['num_songs'])
col = [song for songs in plylst_train['songs_id'] for song in songs]
dat = np.repeat(1, plylst_train['num_songs'].sum())
train_songs_A = spr.csr_matrix((dat, (row, col)), shape=(n_train, n_songs))

row = np.repeat(range(n_train), plylst_train['num_tags'])
col = [tag for tags in plylst_train['tags_id'] for tag in tags]
dat = np.repeat(1, plylst_train['num_tags'].sum())
train_tags_A = spr.csr_matrix((dat, (row, col)), shape=(n_train, n_tags))

train_songs_A_T = train_songs_A.T.tocsr()
train_tags_A_T = train_tags_A.T.tocsr()

In [53]:
train_songs_A_T

<638336x115071 sparse matrix of type '<class 'numpy.int64'>'
	with 5285871 stored elements in Compressed Sparse Row format>

In [81]:
# 몇 개만 테스트 해볼때

# np.random.seed(33)
# n_sample = 1000

# test = plylst_test.iloc[np.random.choice(range(n_test), n_sample, replace=False),:]
# test.index

Int64Index([124803, 122133, 126073, 134537, 117942, 133307, 134839, 124076,
            120998, 116400,
            ...
            131962, 137797, 121740, 131261, 133269, 115962, 137793, 121440,
            116366, 121572],
           dtype='int64', name='nid', length=1000)

In [120]:
# 실제적으로 학습 돌리고 추천해주는 부분
# 오래 걸립니다... ALS 등으로 대체가 가능할 것 같습니다. 이 함수는 이 과정을 직접 구현해서 조금 느리지 않나 생각이 드네용!
# 이 방법은 추후 테스트 해서 보고드리겠습니다.
# 방법 자체는 지난번 미팅 때 말씀드린 것처럼 collective matrix factorizaion 방식입니다. (tag, song 정보 함꼐 이용)

# from tqdm import tqdm

def rec(pids):
    tt = 1
    res = []

    for pid in pids:
        p = np.zeros((n_songs,1))
        p[plylst_test.loc[pid,'songs_id']] = 1

        val = train_songs_A.dot(p).reshape(-1)
        
        # 예측에 기반이 되는 데이터는 채점에 들어가지 않습니다. (test 데이터로 주어진 값들)
        songs_already = plylst_test.loc[pid, "songs_id"]
        tags_already = plylst_test.loc[pid, "tags_id"]

        cand_song = train_songs_A_T.dot(val)
        cand_song_idx = cand_song.reshape(-1).argsort()[-150:][::-1]

        cand_song_idx = cand_song_idx[np.isin(cand_song_idx, songs_already) == False][:100]
        rec_song_idx = [song_sid_id[i] for i in cand_song_idx]

        cand_tag = train_tags_A_T.dot(val)
        cand_tag_idx = cand_tag.reshape(-1).argsort()[-15:][::-1]

        cand_tag_idx = cand_tag_idx[np.isin(cand_tag_idx, tags_already) == False][:10]
        rec_tag_idx = [tag_tid_id[i] for i in cand_tag_idx]

        res.append({
                    "id": plylst_nid_id[pid],
                    "songs": rec_song_idx,
                    "tags": rec_tag_idx
                })

        if tt % 1000 == 0:
            print(tt)

        tt += 1

    return res

In [121]:
answers = rec(plylst_test.index)

1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
17000
18000
19000
20000
21000
22000
23000


In [122]:
len(answers)

23015

In [123]:
write_json(answers, "results.json")

In [124]:
evaluator = CustomEvaluator()
evaluator.evaluate("./../data/val.json", "results.json")

27
0


ZeroDivisionError: float division by zero