# 파일 입출력 설정

In [1]:
import io
import os
import json
import distutils.dir_util
from collections import Counter

import numpy as np

def write_json(data, fname):
    def _conv(o):
        if isinstance(o, np.int64) or isinstance(o, np.int32):
            return int(o)
        raise TypeError

    parent = os.path.dirname(fname)
    distutils.dir_util.mkpath("./" + parent)
    with io.open("./" + fname, "w", encoding="utf8") as f:
        json_str = json.dumps(data, ensure_ascii=False, default=_conv)
        f.write(json_str)


def load_json(fname):
    with open(fname, encoding='utf8') as f:
        json_obj = json.load(f)

    return json_obj


def debug_json(r):
    print(json.dumps(r, ensure_ascii=False, indent=4))

In [17]:
# 채점

import numpy as np

class CustomEvaluator:
    def _idcg(self, l):
        return sum((1.0 / np.log(i + 2) for i in range(l)))
    
    def __init__(self):
        self._idcgs = [self._idcg(i) for i in range(101)]

    def _ndcg(self, gt, rec):
        dcg = 0.0
        for i, r in enumerate(rec):
            if r in gt:
                dcg += 1.0 / np.log(i + 2)

        return dcg / self._idcgs[len(gt)]

    def _eval(self, gt_fname, rec_fname):
        gt_playlists = load_json(gt_fname)
        gt_dict = {g["id"]: g for g in gt_playlists}
        rec_playlists = load_json(rec_fname)
        
        music_ndcg = 0.0
        tag_ndcg = 0.0
        
        for rec in rec_playlists:
            gt = gt_dict[rec["id"]]
            music_ndcg += self._ndcg(gt["songs"], rec["songs"][:100])
            tag_ndcg += self._ndcg(gt["tags"], rec["tags"][:10])
            
        music_ndcg = music_ndcg / len(rec_playlists)
        tag_ndcg = tag_ndcg / len(rec_playlists)
        score = music_ndcg * 0.85 + tag_ndcg * 0.15

        return music_ndcg, tag_ndcg, score

    def evaluate(self, gt_fname, rec_fname):
        try:
            music_ndcg, tag_ndcg, score = self._eval(gt_fname, rec_fname)
            print(f"Music nDCG: {music_ndcg:.6}")
            print(f"Tag nDCG: {tag_ndcg:.6}")
            print(f"Score: {score:.6}")
        except Exception as e:
            print(e)

In [2]:
import pickle
import numpy as np
import pandas as pd
import scipy.sparse as spr
from collections import Counter
from implicit.evaluation import  *
from implicit.als import AlternatingLeastSquares as ALS
from scipy.sparse import hstack, vstack
from tqdm.auto import tqdm

song_meta = pd.read_json("./../data/song_meta.json")
train = pd.read_json("./../data/train.json")
test = pd.read_json("./../data/val.json")

# 데이터셋 생성

In [3]:
train['istrain'] = 1
test['istrain'] = 0

n_train = len(train)
n_test = len(test)

# train + test
plylst = pd.concat([train, test], ignore_index=True)

# playlist id
plylst["nid"] = range(n_train + n_test)

# id <-> nid
plylst_id_nid = dict(zip(plylst["id"],plylst["nid"]))
plylst_nid_id = dict(zip(plylst["nid"],plylst["id"]))

# index 부여

In [4]:
plylst_tag = plylst['tags']
tag_counter = Counter([tg for tgs in plylst_tag for tg in tgs])
tag_dict = {x: tag_counter[x] for x in tag_counter}

tag_id_tid = dict()
tag_tid_id = dict()
for i, t in enumerate(tag_dict):
    tag_id_tid[t] = i
    tag_tid_id[i] = t

n_tags = len(tag_dict)

plylst_song = plylst['songs']
song_counter = Counter([sg for sgs in plylst_song for sg in sgs])
song_dict = {x: song_counter[x] for x in song_counter}

song_id_sid = dict()
song_sid_id = dict()
for i, t in enumerate(song_dict):
    song_id_sid[t] = i
    song_sid_id[i] = t

n_songs = len(song_dict)

In [5]:
plylst['songs_id'] = plylst['songs'].map(lambda x: [song_id_sid.get(s) for s in x if song_id_sid.get(s) != None])
plylst['tags_id'] = plylst['tags'].map(lambda x: [tag_id_tid.get(t) for t in x if tag_id_tid.get(t) != None])

plylst_use = plylst[['istrain','nid','updt_date','songs_id','tags_id']]

# 곡의 개수와 태그의 개수를 할당
plylst_use.loc[:,'num_songs'] = plylst_use['songs_id'].map(len)
plylst_use.loc[:,'num_tags'] = plylst_use['tags_id'].map(len)
plylst_use = plylst_use.set_index('nid')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


In [6]:
# train test split
plylst_train = plylst_use.iloc[:n_train,:]
plylst_test = plylst_use.iloc[n_train:,:]

In [7]:
# csr (compressed sparse row matrix 생성)
row = np.repeat(range(n_train), plylst_train['num_songs'])
col = [song for songs in plylst_train['songs_id'] for song in songs]
dat = np.repeat(1, plylst_train['num_songs'].sum())
train_songs_A = spr.csr_matrix((dat, (row, col)), shape=(n_train, n_songs))

row = np.repeat(range(n_train), plylst_train['num_tags'])
col = [tag for tags in plylst_train['tags_id'] for tag in tags]
dat = np.repeat(1, plylst_train['num_tags'].sum())
train_tags_A = spr.csr_matrix((dat, (row, col)), shape=(n_train, n_tags))

# csr (compressed sparse row matrix 생성)
row = np.repeat(range(n_test), plylst_test['num_songs'])
col = [song for songs in plylst_test['songs_id'] for song in songs]
dat = np.repeat(1, plylst_test['num_songs'].sum())
test_songs_A = spr.csr_matrix((dat, (row, col)), shape=(n_test, n_songs))

row = np.repeat(range(n_test), plylst_test['num_tags'])
col = [tag for tags in plylst_test['tags_id'] for tag in tags]
dat = np.repeat(1, plylst_test['num_tags'].sum())
test_tags_A = spr.csr_matrix((dat, (row, col)), shape=(n_test, n_tags))

In [8]:
train_merged_feature_csr = hstack([train_songs_A, train_tags_A])
test_merged_feature_csr  = hstack([test_songs_A, test_tags_A])

train_songs_A_T = train_songs_A.T.tocsr()
train_tags_A_T  = train_tags_A.T.tocsr()

whole_datasets_csr       = vstack([train_merged_feature_csr, test_merged_feature_csr])
whole_datasets_csr_T     = whole_datasets_csr.T.tocsr()

In [10]:
whole_datasets_csr_T.shape

(668533, 138086)

In [11]:
# def rec(pids):
#     res = []

#     for pid in tqdm(pids):
#         p = np.zeros((n_songs,1))
#         p[plylst_test.loc[pid,'songs_id']] = 1

#         val = train_songs_A.dot(p).reshape(-1)

#         songs_already = plylst_test.loc[pid, "songs_id"]
#         tags_already = plylst_test.loc[pid, "tags_id"]

#         cand_song = train_songs_A_T.dot(val)
#         cand_song_idx = cand_song.reshape(-1).argsort()[-250:][::-1]

#         cand_song_idx = cand_song_idx[np.isin(cand_song_idx, songs_already) == False][:100]
#         rec_song_idx = [song_sid_id[i] for i in cand_song_idx]

#         cand_tag = train_tags_A_T.dot(val)
#         cand_tag_idx = cand_tag.reshape(-1).argsort()[-15:][::-1]

#         cand_tag_idx = cand_tag_idx[np.isin(cand_tag_idx, tags_already) == False][:10]
#         rec_tag_idx = [tag_tid_id[i] for i in cand_tag_idx]

#         res.append({
#             "id": plylst_nid_id[pid],
#             "songs": rec_song_idx,
#             "tags": rec_tag_idx
#         })
        
#     return res

In [12]:
# res = rec(plylst_test.index)

HBox(children=(FloatProgress(value=0.0, max=23015.0), HTML(value='')))

KeyboardInterrupt: 

In [20]:
n_factor = 128

als_model = ALS(factors=n_factor, regularization=0.08)
als_model.fit(whole_datasets_csr_T * 15)

song_model = ALS(use_gpu=False)
tag_model = ALS(use_gpu=False)
song_model.user_factors = als_model.user_factors
tag_model.user_factors = als_model.user_factors

song_model.item_factors = als_model.item_factors[:n_songs]
tag_model.item_factors  = als_model.item_factors[n_songs:n_songs + n_tags]

print(song_model.item_factors.shape)
print(tag_model.item_factors.shape)

HBox(children=(FloatProgress(value=0.0, max=15.0), HTML(value='')))


(638336, 128)
(30197, 128)


In [21]:
res = []

for u in tqdm(range(test_merged_feature_csr.shape[0])):
    songs_already = plylst_test.iloc[u]["songs_id"]
    song_rec = song_model.recommend(u, test_songs_A, N=200)

    song_rec = np.array([song_sid_id[x[0]] for x in song_rec])    
    song_rec = song_rec[np.isin(song_rec, songs_already) == False][:100]
    
    tags_already = plylst_test.iloc[u]["tags_id"]
    tag_rec = tag_model.recommend(u, test_tags_A, N=20)
    tag_rec = np.array([tag_tid_id[x[0]] for x in tag_rec])
    tag_rec = tag_rec[np.isin(tag_rec, tags_already) == False][:10]
    
    res.append({
            # train test의 vconcat 된 matrix에서 id를 추출하는 것이기 때문에 n_train 이후부터 test 데이터 셋임
            # 따라서 u + n_train이 각각의 test 셋에서의 id
            "id": plylst_nid_id[u + n_train],
            "songs": song_rec.tolist(),
            "tags": tag_rec.tolist()
        })

HBox(children=(FloatProgress(value=0.0, max=23015.0), HTML(value='')))




In [22]:
# 결과 저장
# os.mkdir("song_tag_recommend")
write_json(res, "./song_tag_recommend/results_als_128.json")

In [19]:
evaluator = CustomEvaluator()
evaluator.evaluate("./../best_performance_result/results.json", "./song_tag_recommend/results_als_64.json")

Music nDCG: 0.0290305
Tag nDCG: 0.193909
Score: 0.0537623


In [23]:
evaluator = CustomEvaluator()
evaluator.evaluate("./../best_performance_result/results.json", "./song_tag_recommend/results_als_128.json")

Music nDCG: 0.024141
Tag nDCG: 0.167606
Score: 0.0456607
