In [1]:
import io
import os
import json
import distutils.dir_util
from collections import Counter

import numpy as np

def write_json(data, fname):
    def _conv(o):
        if isinstance(o, np.int64) or isinstance(o, np.int32):
            return int(o)
        raise TypeError

    parent = os.path.dirname(fname)
    distutils.dir_util.mkpath("./" + parent)
    with io.open("./" + fname, "w", encoding="utf8") as f:
        json_str = json.dumps(data, ensure_ascii=False, default=_conv)
        f.write(json_str)


def load_json(fname):
    with open(fname, encoding='utf8') as f:
        json_obj = json.load(f)

    return json_obj


def debug_json(r):
    print(json.dumps(r, ensure_ascii=False, indent=4))

In [2]:
# 채점

import numpy as np

class CustomEvaluator:
    def _idcg(self, l):
        return sum((1.0 / np.log(i + 2) for i in range(l)))
    
    def __init__(self):
        self._idcgs = [self._idcg(i) for i in range(101)]

    def _ndcg(self, gt, rec):
        dcg = 0.0
        for i, r in enumerate(rec):
            if r in gt:
                dcg += 1.0 / np.log(i + 2)

        return dcg / self._idcgs[len(gt)]

    def _eval(self, gt_fname, rec_fname):
        gt_playlists = load_json(gt_fname)
        gt_dict = {g["id"]: g for g in gt_playlists}
        rec_playlists = load_json(rec_fname)
        
        music_ndcg = 0.0
        tag_ndcg = 0.0
        
        for rec in rec_playlists:
            gt = gt_dict[rec["id"]]
            music_ndcg += self._ndcg(gt["songs"], rec["songs"][:100])
            tag_ndcg += self._ndcg(gt["tags"], rec["tags"][:10])
            
        music_ndcg = music_ndcg / len(rec_playlists)
        tag_ndcg = tag_ndcg / len(rec_playlists)
        score = music_ndcg * 0.85 + tag_ndcg * 0.15

        return music_ndcg, tag_ndcg, score

    def evaluate(self, gt_fname, rec_fname):
        try:
            music_ndcg, tag_ndcg, score = self._eval(gt_fname, rec_fname)
            print(f"Music nDCG: {music_ndcg:.6}")
            print(f"Tag nDCG: {tag_ndcg:.6}")
            print(f"Score: {score:.6}")
        except Exception as e:
            print(e)

In [4]:
import pickle
import numpy as np
import pandas as pd
import scipy.sparse as spr
from collections import Counter
from implicit.evaluation import  *
from implicit.als import AlternatingLeastSquares as ALS
from scipy.sparse import hstack, vstack
from tqdm.auto import tqdm

song_meta = pd.read_json("./../data/song_meta.json")
train = pd.read_json("./../data/train.json")
test = pd.read_json("./../data/val.json")

In [5]:
train.head()

Unnamed: 0,tags,id,plylst_title,songs,like_cnt,updt_date
0,[락],61281,여행같은 음악,"[525514, 129701, 383374, 562083, 297861, 13954...",71,2013-12-19 18:36:19.000
1,"[추억, 회상]",10532,요즘 너 말야,"[432406, 675945, 497066, 120377, 389529, 24427...",1,2014-12-02 16:19:42.000
2,"[까페, 잔잔한]",76951,"편하게, 잔잔하게 들을 수 있는 곡.-","[83116, 276692, 166267, 186301, 354465, 256598...",17,2017-08-28 07:09:34.000
3,"[연말, 눈오는날, 캐럴, 분위기, 따듯한, 크리스마스캐럴, 겨울노래, 크리스마스,...",147456,크리스마스 분위기에 흠뻑 취하고 싶을때,"[394031, 195524, 540149, 287984, 440773, 10033...",33,2019-12-05 15:15:18.000
4,[댄스],27616,추억의 노래 ㅋ,"[159327, 553610, 5130, 645103, 294435, 100657,...",9,2011-10-25 13:54:56.000


In [6]:
genre_onehot = np.load("./../data/genre_onehot.npy")
# year_onehot = np.load("./../data/year_onehot.npy")

genre_onehot.shape

(138086, 254)

In [7]:
# genre_header = ['발라드', '발라드', "'80", "'90", "'00", "'10-", '댄스', '댄스', "'80", "'90", "'00", "'10-", '랩/힙합', '랩/힙합', '랩 스타일',
#  '보컬 스타일','언더그라운드 힙합','시대별','R&B/Soul','R&B/Soul''어반','R&B','인디음악','인디음악','포크','록','일렉','힙합','발라드',"'90","'00","'10-",
#  '록/메탈','록/메탈',"'70","'80","'90","'00","'10-",'성인가요','성인가요','신세대트로트','전설의트로트','뽕짝트로트','트로트메들리',"트로트'60-'70","트로트'80-'90",
#  "트로트'00-","성인가요'80-'90","성인가요'00-",'포크/블루스','포크/블루스',"'60-'70","'80-'90","'00","'10-",'POP','POP','얼터너티브팝','올디스','월드팝',
#  "'60-'70","'80-'90","'00","'10-",'록/메탈','록/메탈','모던록','얼터너티브록','프로그레시브/아트록','하드록','헤비메탈','뉴메탈/하드코어','포스트록',"'60","'70",
#  "'80","'90","'00","'10-",'일렉트로니카','일렉트로니카','일렉트로니카','하우스','클럽뮤직','일렉트로닉팝',"'80","'90","'00","'10-",'랩/힙합','랩/힙합','팝랩',
#  '얼터너티브힙합','갱스터/하드코어랩','East&West',"'80","'90","'00","'10-",'R&B/Soul','R&B/Soul','컨템포러리 R&B','소울','어반',"'60-'70","'80-'90",
#  "'00","'10-",'포크/블루스/컨트리','포크/블루스/컨트리','포크','블루스','컨트리',"'60-'70","'80-'90","'00-",'OST','OST','국내영화','국외영화','국내드라마',
#  '국외드라마','애니메이션/게임','국내뮤지컬','국외뮤지컬','애니메이션/웹툰','게임','클래식','클래식','관현악','교향곡','실내악','협주곡','독주곡','오페라','크로스오버',
#  '현대음악','성악/합창곡','발레/무용','지휘/연주자','컴필레이션','교향/관현악','오페라/성악','재즈','재즈','보컬재즈','애시드/퓨전/팝','Bop','보사노바', 'J-Jazz', '라틴재즈',
#  '빅밴드/스윙', '악기별', '컴필레이션', '뉴에이지', '뉴에이지', '이지리스닝', 'J-Newage', '기능성음악', '기타', '피아노', '컴필레이션', 'J-POP', 'J-POP', 'POP', '록', '일렉트로니카',
#  '랩/힙합', 'R&B/Soul', '시부야케이', '뉴에이지', '재즈', '힙합/R&B', "'80-'90", "'00-'10", '월드뮤직', '월드뮤직', '샹송/프렌치팝', '칸초네/이탈리안팝', '중국음악',
#  '켈틱/아이리시', '브라질', '탱고/플라멩코', '라틴', '레게', '파두', 'CCM', 'CCM', '국내CCM', '국외CCM', '워십', '찬송가', '성가', '연주곡', '어린이', '성경', 'NEW',
#  '어린이/태교', '어린이/태교', '창작동요', '영어동요', '영어동요', '영어동요', '영어동화', '만화', '자장가', '태교동화', '태교', '릴렉싱&힐링', '명작동화', '교과서동요',
#  '전래동요', '창작동화', '어린이클래식', '종교음악', '종교음악', '불교', '국악', '국악', '크로스오버', '국악가요', '민요', '판소리/단가', '풍물/사물놀이', '아이돌', '아이돌',
#  '남자 아이돌', '여자 아이돌', '랩/힙합', '발라드', '댄스', '일렉트로니카', '일렉트로니카', '댄스 스타일', '팝 스타일', '시대별', 'EDM', 'EDM', '하우스', '클럽',
#  'DJ', '뮤직테라피', '뮤직테라피', 'ASMR/자연', '힐링/명상/요가', '집중력', '숙면유도', '반려동물', '뮤지컬', '뮤지컬', '국내뮤지컬', '국외뮤지컬', '크리스마스']

# year_header = [0, 1900, 1919, 1927, 1931, 1936, 1939, 1941, 1945, 1946, 1947, 1949, 1950, 1951, 1952, 1953, 1954, 1955,
#  1956, 1957, 1958, 1959, 1960, 1961, 1962, 1963, 1964, 1965, 1966, 1967, 1968, 1969, 1970, 1971, 1972, 1973, 1974, 1975,
#  1976, 1977, 1978, 1979, 1980, 1981, 1982, 1983, 1984, 1985, 1986, 1987, 1988, 1989, 1990, 1991, 1992, 1993, 1994, 1995,
#  1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015,
#  2016, 2017, 2018, 2019, 2020, 2022]

genre_header = [ 'genre_' + str(i) for i in range(254) ]
# year_header  = [ 'year_' + str(i) for i in range(84) ]

genre_onehot = pd.DataFrame(genre_onehot, columns=genre_header).astype('int')
# year_onehot = pd.DataFrame(year_onehot, columns=year_header).astype('int')

genre_onehot.head()

Unnamed: 0,genre_0,genre_1,genre_2,genre_3,genre_4,genre_5,genre_6,genre_7,genre_8,genre_9,...,genre_244,genre_245,genre_246,genre_247,genre_248,genre_249,genre_250,genre_251,genre_252,genre_253
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [8]:
train['istrain'] = 1
test['istrain'] = 0

n_train = len(train)
n_test = len(test)

# train + test
plylst = pd.concat([train, test], ignore_index=True)

# playlist id
plylst["nid"] = range(n_train + n_test)

# id <-> nid
plylst_id_nid = dict(zip(plylst["id"],plylst["nid"]))
plylst_nid_id = dict(zip(plylst["nid"],plylst["id"]))

In [9]:
plylst.head()

Unnamed: 0,tags,id,plylst_title,songs,like_cnt,updt_date,istrain,nid
0,[락],61281,여행같은 음악,"[525514, 129701, 383374, 562083, 297861, 13954...",71,2013-12-19 18:36:19.000,1,0
1,"[추억, 회상]",10532,요즘 너 말야,"[432406, 675945, 497066, 120377, 389529, 24427...",1,2014-12-02 16:19:42.000,1,1
2,"[까페, 잔잔한]",76951,"편하게, 잔잔하게 들을 수 있는 곡.-","[83116, 276692, 166267, 186301, 354465, 256598...",17,2017-08-28 07:09:34.000,1,2
3,"[연말, 눈오는날, 캐럴, 분위기, 따듯한, 크리스마스캐럴, 겨울노래, 크리스마스,...",147456,크리스마스 분위기에 흠뻑 취하고 싶을때,"[394031, 195524, 540149, 287984, 440773, 10033...",33,2019-12-05 15:15:18.000,1,3
4,[댄스],27616,추억의 노래 ㅋ,"[159327, 553610, 5130, 645103, 294435, 100657,...",9,2011-10-25 13:54:56.000,1,4


In [10]:
plylst_tag = plylst['tags']
tag_counter = Counter([tg for tgs in plylst_tag for tg in tgs])
tag_dict = {x: tag_counter[x] for x in tag_counter}

tag_id_tid = dict()
tag_tid_id = dict()
for i, t in enumerate(tag_dict):
    tag_id_tid[t] = i
    tag_tid_id[i] = t

#####
    
n_tags = len(tag_dict)

plylst_song = plylst['songs']
song_counter = Counter([sg for sgs in plylst_song for sg in sgs])
song_dict = {x: song_counter[x] for x in song_counter}

song_id_sid = dict()
song_sid_id = dict()
for i, t in enumerate(song_dict):
    song_id_sid[t] = i
    song_sid_id[i] = t

n_songs = len(song_dict)

In [11]:
plylst['songs_id'] = plylst['songs'].map(lambda x: [song_id_sid.get(s) for s in x if song_id_sid.get(s) != None])
plylst['tags_id'] = plylst['tags'].map(lambda x: [tag_id_tid.get(t) for t in x if tag_id_tid.get(t) != None])

plylst_use = plylst[['istrain','nid','updt_date','songs_id','tags_id']]

# 곡의 개수와 태그의 개수를 할당
plylst_use.loc[:,'num_songs'] = plylst_use['songs_id'].map(len)
plylst_use.loc[:,'num_tags'] = plylst_use['tags_id'].map(len)
plylst_use = plylst_use.set_index('nid')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


In [12]:
# train test split
plylst_train = plylst_use.iloc[:n_train,:]
plylst_test = plylst_use.iloc[n_train:,:]

In [13]:
# training set
row = np.repeat(range(n_train), plylst_train['num_songs'])
col = [song for songs in plylst_train['songs_id'] for song in songs]
dat = np.repeat(1, plylst_train['num_songs'].sum())
train_songs_A = spr.csr_matrix((dat, (row, col)), shape=(n_train, n_songs))

row = np.repeat(range(n_train), plylst_train['num_tags'])
col = [tag for tags in plylst_train['tags_id'] for tag in tags]
dat = np.repeat(1, plylst_train['num_tags'].sum())
train_tags_A = spr.csr_matrix((dat, (row, col)), shape=(n_train, n_tags))

train_genre_A = spr.csr_matrix((genre_onehot.loc[:n_train - 1]))
# train_year_A  = spr.csr_matrix((year_onehot.loc[:n_train - 1]))

# test set
row = np.repeat(range(n_test), plylst_test['num_songs'])
col = [song for songs in plylst_test['songs_id'] for song in songs]
dat = np.repeat(1, plylst_test['num_songs'].sum())
test_songs_A = spr.csr_matrix((dat, (row, col)), shape=(n_test, n_songs))

row = np.repeat(range(n_test), plylst_test['num_tags'])
col = [tag for tags in plylst_test['tags_id'] for tag in tags]
dat = np.repeat(1, plylst_test['num_tags'].sum())
test_tags_A = spr.csr_matrix((dat, (row, col)), shape=(n_test, n_tags))

test_genre_A = spr.csr_matrix((genre_onehot.loc[n_train:]))
# test_year_A  = spr.csr_matrix((year_onehot.loc[n_train:]))

In [14]:
train_songs_A.shape

(115071, 638336)

In [15]:
train_genre_A.shape

(115071, 254)

In [30]:
train_merged_feature_csr = hstack([train_songs_A, train_tags_A, train_genre_A])
test_merged_feature_csr  = hstack([test_songs_A, test_tags_A, test_genre_A])

whole_datasets_csr       = vstack([train_merged_feature_csr, test_merged_feature_csr])
whole_datasets_csr_T     = whole_datasets_csr.T.tocsr()

In [24]:
whole_datasets_csr_T.shape

(668787, 138086)

In [25]:
# def rec(pids):
#     res = []

#     for pid in tqdm(pids):
#         p = np.zeros((n_songs,1))
#         p[plylst_test.loc[pid,'songs_id']] = 1

#         val = train_songs_A.dot(p).reshape(-1)

#         songs_already = plylst_test.loc[pid, "songs_id"]
#         tags_already = plylst_test.loc[pid, "tags_id"]

#         cand_song = train_songs_A_T.dot(val)
#         cand_song_idx = cand_song.reshape(-1).argsort()[-250:][::-1]

#         cand_song_idx = cand_song_idx[np.isin(cand_song_idx, songs_already) == False][:100]
#         rec_song_idx = [song_sid_id[i] for i in cand_song_idx]

#         cand_tag = train_tags_A_T.dot(val)
#         cand_tag_idx = cand_tag.reshape(-1).argsort()[-15:][::-1]

#         cand_tag_idx = cand_tag_idx[np.isin(cand_tag_idx, tags_already) == False][:10]
#         rec_tag_idx = [tag_tid_id[i] for i in cand_tag_idx]

#         res.append({
#             "id": plylst_nid_id[pid],
#             "songs": rec_song_idx,
#             "tags": rec_tag_idx
#         })
        
#     return res

# res = rec(plylst_test.index)

In [26]:
n_factor = 20

als_model = ALS(factors=n_factor, regularization=0.08)
als_model.fit(whole_datasets_csr_T * 10)

song_model = ALS(use_gpu=False)
tag_model = ALS(use_gpu=False)

song_model.user_factors = als_model.user_factors
tag_model.user_factors = als_model.user_factors

song_model.item_factors = als_model.item_factors[:n_songs]
tag_model.item_factors  = als_model.item_factors[n_songs:n_songs + n_tags]

print(song_model.item_factors.shape)
print(tag_model.item_factors.shape)

HBox(children=(FloatProgress(value=0.0, max=15.0), HTML(value='')))


(638336, 20)
(30197, 20)


In [27]:
res = []

for u in tqdm(range(test_merged_feature_csr.shape[0])):
    songs_already = plylst_test.iloc[u]["songs_id"]
    song_rec = song_model.recommend(u, test_songs_A, N=200)

    song_rec = np.array([song_sid_id[x[0]] for x in song_rec])    
    song_rec = song_rec[np.isin(song_rec, songs_already) == False][:100]
    
    tags_already = plylst_test.iloc[u]["tags_id"]
    tag_rec = tag_model.recommend(u, test_tags_A, N=20)
    tag_rec = np.array([tag_tid_id[x[0]] for x in tag_rec])
    tag_rec = tag_rec[np.isin(tag_rec, tags_already) == False][:10]
    
    res.append({
            # train test의 vconcat 된 matrix에서 id를 추출하는 것이기 때문에 n_train 이후부터 test 데이터 셋임
            # 따라서 u + n_train이 각각의 test 셋에서의 id
            "id": plylst_nid_id[u + n_train],
            "songs": song_rec.tolist(),
            "tags": tag_rec.tolist()
        })

HBox(children=(FloatProgress(value=0.0, max=23015.0), HTML(value='')))




In [28]:
# os.mkdir("./genre_onehot")
# write_json(res, "./genre_onehot/results_als_32.json")
write_json(res, "./genre_onehot/results_als_20.json")

In [29]:
evaluator = CustomEvaluator()
evaluator.evaluate("./../best_performance_result/results.json", "./genre_onehot/results_als_20.json")

Music nDCG: 0.0453949
Tag nDCG: 0.254587
Score: 0.0767737
