In [41]:
import distutils.dir_util
import io
import json
import os
import re
from collections import Counter

import numpy as np
import pandas as pd
import scipy.sparse as spr
import sentencepiece as spm
from gensim.models import KeyedVectors
from gensim.models import Word2Vec
from gensim.models.keyedvectors import WordEmbeddingsKeyedVectors
from tqdm import tqdm

In [12]:
def write_json(data, FILE_PATH, fname):
    """Serialize ``data`` as UTF-8 JSON into ``fname`` under ``FILE_PATH``.

    Args:
        data: JSON-serializable object; NumPy scalars and arrays nested
            inside it are converted to plain Python values.
        FILE_PATH: Base directory. A trailing path separator is optional.
        fname: File name relative to ``FILE_PATH``; may contain
            sub-directories, which are created when missing.

    Raises:
        TypeError: If ``data`` contains a value that is neither natively
            JSON-serializable nor a NumPy scalar/array.
    """
    def _conv(o):
        # json.dumps only calls this hook for objects it cannot encode itself.
        if isinstance(o, np.bool_):
            return bool(o)
        if isinstance(o, np.integer):
            return int(o)
        if isinstance(o, np.floating):
            return float(o)
        if isinstance(o, np.ndarray):
            return o.tolist()
        raise TypeError(
            "Object of type %s is not JSON serializable" % type(o).__name__
        )

    # Join instead of concatenating so that FILE_PATH works with or without a
    # trailing separator.
    target = os.path.join(FILE_PATH, fname)
    parent = os.path.dirname(target)
    if parent:
        # os.makedirs('') raises, so only create a directory when there is one.
        os.makedirs(parent, exist_ok=True)
    with io.open(target, "w", encoding="utf-8") as f:
        f.write(json.dumps(data, ensure_ascii=False, default=_conv))

def remove_seen(seen, l):
    """Return the items of ``l`` that do not occur in ``seen``.

    The relative order of ``l`` is kept and duplicates inside ``l`` are not
    collapsed.
    """
    excluded = set(seen)
    kept = []
    for item in l:
        if item in excluded:
            continue
        kept.append(item)
    return kept

 
def is_tag(word,tags_list_all):
    """Return True when ``word`` is contained in ``tags_list_all``, else False."""
    return word in tags_list_all

def before_updt_date(cand_song_idx,updt_date,song_meta):
    """Keep the candidates whose ``issue_date`` is not after ``updt_date``.

    Args:
        cand_song_idx: Iterable of labels used to look rows up in
            ``song_meta`` via ``.loc``.
        updt_date: Date string; hyphens are stripped and the first eight
            remaining characters are parsed as an integer.
        song_meta: Frame with an ``issue_date`` column convertible to int.

    Returns:
        List of the accepted labels, in their input order.
    """
    hyphen_free = re.sub('-','',updt_date)
    cutoff = int(hyphen_free[:8])

    return [
        idx
        for idx in cand_song_idx
        if int(song_meta.loc[idx,'issue_date']) <= cutoff
    ]

In [None]:
class MakeAdditionalTags:
    """Add genre-derived and title-derived tag columns to a playlist frame.

    ``run`` adds ``genre_tags`` and ``title_tags`` columns to the given frame
    and writes the result to ``<fname>_addtags.json`` under ``FILE_PATH``.
    """

    # Frames read from disk, keyed by full path and shared between instances so
    # that creating one instance per split does not re-read the same JSON file.
    _frame_cache = {}

    def __init__(self,FILE_PATH):
        """Remember ``FILE_PATH`` and load ``genre_gn_all.json`` from it."""
        self.FILE_PATH = FILE_PATH
        with open(os.path.join(FILE_PATH, 'genre_gn_all.json'), encoding="utf-8") as f:
            self.genre_gn_all = json.load(f)
        self.res = []

    def _load_frame(self, fname):
        """Return ``FILE_PATH/fname`` as a DataFrame, reading it at most once."""
        path = os.path.join(self.FILE_PATH, fname)
        cache = MakeAdditionalTags._frame_cache
        if path not in cache:
            with open(path, encoding="utf-8") as f:
                cache[path] = pd.DataFrame(json.load(f))
        return cache[path]

    def mk_genre_tags (self,songs_input,song_meta, genre_gn_all, tag_num):
        """Return genre names for the ``tag_num`` most common genres of the songs.

        Args:
            songs_input: Labels used to look rows up in ``song_meta`` via ``.loc``.
            song_meta: Frame with a list-valued ``song_gn_gnr_basket`` column.
            genre_gn_all: Mapping from genre code to a name that may contain
                several names separated by ``/``. ``None`` falls back to the
                mapping loaded in ``__init__``.
            tag_num: Number of most common genre codes to keep.

        Returns:
            Flat list of genre names; codes missing from the mapping are skipped.
        """
        mapping = self.genre_gn_all if genre_gn_all is None else genre_gn_all

        genre_codes = []
        for song in songs_input:
            genre_codes += song_meta.loc[song,'song_gn_gnr_basket']

        genre_tags_list = []
        for code, _count in Counter(genre_codes).most_common(tag_num):
            if code not in mapping:
                continue
            genre_tags_list += mapping[code].split('/')

        return genre_tags_list

    def sentence_piece (self,train):
        """Train the BPE SentencePiece model on playlist titles and tags.

        Writes ``titles.txt`` to the working directory and trains with the
        model prefix ``train``.
        """
        titles_train = train[train['plylst_title'] != '']['plylst_title']

        # Context manager so the corpus file is closed even if a write fails.
        with open('titles.txt', mode='wt', encoding='utf-8') as f:
            for title in titles_train:
                f.write(re.sub(r'[^가-힣a-zA-Z0-9\s]','',title)+'\n')

            for tags in train['tags']:
                for tag in tags:
                    f.write(tag)
                    f.write(' ')
                f.write('\n')

        templates = '--input=titles.txt \
        --model_prefix=train \
        --vocab_size=20000 \
        --character_coverage=1.0 \
        --model_type=bpe \
        '

        spm.SentencePieceTrainer.Train(templates)

    def mk_title_tags (self,data,tag_list_all):
        """Add a ``title_tags`` column holding known tags found in each title.

        Each title is stripped of characters outside Hangul syllables, ASCII
        letters, digits and whitespace, split into SentencePiece pieces with
        ``train.model``, and every piece that is contained in ``tag_list_all``
        is kept. Non-string titles get an empty list.
        """
        sp = spm.SentencePieceProcessor()
        sp.load('train.model')

        # Set membership keeps the per-piece lookup constant-time.
        known_tags = set(tag_list_all)

        sp_title = []
        for title in tqdm(data['plylst_title']):
            # Check the type before cleaning: re.sub raises on non-strings.
            if not isinstance(title, str):
                sp_title.append([])
                continue

            cleaned = re.sub(r'[^가-힣a-zA-Z0-9\s]','',title)
            plus_tag = []
            for piece in sp.encode_as_pieces(cleaned):
                tag = re.sub('▁','',piece)

                # Drop a trailing '과'/'와' before matching against the tags.
                if (len(tag) > 1) and (tag[-1] == '과' or tag[-1] == '와'):
                    tag = tag[:-1]

                if is_tag(tag,known_tags):
                    plus_tag.append(tag)
            sp_title.append(plus_tag)

        data['title_tags'] = sp_title

    def run (self,data,fname,song_meta=None,train=None,val=None):
        """Tag ``data`` in place and write ``<fname>_addtags.json``.

        Args:
            data: Playlist frame with ``tags``, ``id``, ``songs``,
                ``plylst_title``, ``like_cnt`` and ``updt_date`` columns.
            fname: Output file prefix.
            song_meta: Song metadata frame; read from ``song_meta.json`` in
                ``FILE_PATH`` when omitted.
            train: Training playlists, used for the tag vocabulary; read from
                ``train.json`` when omitted.
            val: Validation playlists, used for the tag vocabulary; read from
                ``val.json`` when omitted.
        """
        if song_meta is None:
            song_meta = self._load_frame('song_meta.json')
        if train is None:
            train = self._load_frame('train.json')
        if val is None:
            val = self._load_frame('val.json')

        data['genre_tags'] = data['songs'].apply(
            lambda songs: self.mk_genre_tags(songs,song_meta,self.genre_gn_all,2)
        )

        known_tags = set()
        for frame in (train, val):
            for tags in frame['tags']:
                known_tags.update(tags)
        tags_list_all = list(known_tags)

        # mk_title_tags loads 'train.model'; train it first if it is missing.
        if not os.path.exists('train.model'):
            self.sentence_piece(train)

        self.mk_title_tags(data,tags_list_all)

        # Rewrite the playlists with the added tag columns. Built fresh on
        # every call so repeated runs do not accumulate earlier records.
        columns = ["tags", "id", "songs", "plylst_title", "like_cnt",
                   "updt_date", "genre_tags", "title_tags"]
        self.res = data[columns].to_dict(orient='records')

        write_json(self.res, self.FILE_PATH, fname+'_addtags.json')


In [None]:
class MakeBaselineResults:
    """Baseline song/tag recommender for the question playlists.

    Question playlists with fewer than three songs are answered by tag overlap
    against the train+val playlists (``mk_rec_lack_of_songs``); the others are
    answered with a playlist-song co-occurrence product built from the train
    playlists (``before_matrix_facotization`` + ``matrix_factorization_rec``).
    ``run`` writes the merged answers to ``base_results.json``.
    """

    def __init__(self, FILE_PATH, test_fname='test_addtags.json'):
        """Load the tagged playlists and the song metadata from ``FILE_PATH``.

        Args:
            FILE_PATH: Directory holding the ``*_addtags.json`` files and
                ``song_meta.json``.
            test_fname: File holding the question playlists.
        """
        self.FILE_PATH = FILE_PATH

        self.train = self._read_frame('train_addtags.json')
        self.val = self._read_frame('val_addtags.json')
        self.song_meta = self._read_frame('song_meta.json')
        # NOTE(review): the original never assigned self.test although run()
        # reads it. 'test_addtags.json' is assumed because the MAIN cell writes
        # that file and nothing else consumes it -- confirm.
        self.test = self._read_frame(test_fname)

    def _read_frame(self, fname):
        """Read ``FILE_PATH/fname`` (UTF-8 JSON) into a DataFrame."""
        with open(os.path.join(self.FILE_PATH, fname), encoding="utf-8") as f:
            return pd.DataFrame(json.load(f))

    def _tag_corpus(self):
        """Return train+val as one frame, concatenated once and then reused."""
        corpus = getattr(self, '_corpus', None)
        if corpus is None:
            corpus = pd.concat([self.train, self.val], ignore_index = True)
            self._corpus = corpus
        return corpus

    def intersect_cnt(self,tags,cand_tags,title_tags,genre_tags):
        """Score ``tags`` by weighted overlap with three tag lists.

        Overlap with ``cand_tags`` counts 3 per shared tag, with ``title_tags``
        2, and with ``genre_tags`` 1.
        """
        own = set(tags)
        tags_o = len(own & set(cand_tags)) * 3
        tags_t = len(own & set(title_tags)) * 2
        tags_g = len(own & set(genre_tags))

        return tags_o + tags_t + tags_g

    def mk_rec_lack_of_songs(self, x):
        """Recommend tags and songs for one playlist row by tag overlap.

        Train+val playlists are scored with ``intersect_cnt`` and visited from
        the highest score level downwards until 100 songs and 10 tags have been
        collected or the score reaches zero.

        Args:
            x: Row with ``tags``, ``genre_tags``, ``title_tags``, ``songs`` and
                ``updt_date``.

        Returns:
            ``[tags, songs]`` with at most 10 tags and 100 songs, none of which
            are already in ``x`` and none repeated.
        """
        corpus = self._tag_corpus()

        tags_input = list(x.tags)
        tags_genre = list(x.genre_tags)
        tags_title = list(x.title_tags)

        scores = corpus['tags'].apply(
            lambda tags: self.intersect_cnt(tags,tags_input,tags_title,tags_genre)
        )

        tag_result = []
        song_result = []
        if scores.empty:
            return [tag_result, song_result]

        # Seeded with the playlist's own items so they are never recommended;
        # extended as items are accepted so nothing is recommended twice.
        seen_songs = set(x.songs)
        seen_tags = set(x.tags)

        level = int(scores.max())
        while level > 0:
            at_level = corpus[scores == level]

            if len(at_level) > 0:
                tag_counts = Counter(
                    tag for tags in at_level['tags'] for tag in tags
                ).most_common()
                song_counts = Counter(
                    song for songs in at_level['songs'] for song in songs
                ).most_common()

                # Only date-check songs that could still be added.
                cand_song = [song for song, _ in song_counts if song not in seen_songs]
                for song in before_updt_date(cand_song,x.updt_date,self.song_meta):
                    if song not in seen_songs:
                        seen_songs.add(song)
                        song_result.append(song)

                for tag, _ in tag_counts:
                    if tag not in seen_tags:
                        seen_tags.add(tag)
                        tag_result.append(tag)

            if len(song_result) >= 100 and len(tag_result) >= 10:
                break

            level -= 1

        return [tag_result[:10],song_result[:100]]

    @staticmethod
    def _membership_matrix(id_lists, n_cols):
        """Build a CSR matrix with a 1 at (row, id) for every id in each list."""
        n_rows = len(id_lists)
        lengths = np.fromiter((len(ids) for ids in id_lists), dtype=np.int64, count=n_rows)
        rows = np.repeat(np.arange(n_rows), lengths)
        cols = np.fromiter(
            (i for ids in id_lists for i in ids), dtype=np.int64, count=int(lengths.sum())
        )
        values = np.ones(len(cols), dtype=np.int64)
        return spr.csr_matrix((values, (rows, cols)), shape=(n_rows, n_cols))

    def before_matrix_facotization(self,train,test):
        """Index songs/tags and build the sparse train matrices.

        Songs and tags occurring in ``train`` or ``test`` get consecutive
        integer ids. The playlist-by-song and playlist-by-tag matrices of the
        train rows, their transposes, the id dictionaries and the id counts are
        stored on ``self`` for ``matrix_factorization_rec``.

        Args:
            train: Frame with ``songs`` and ``tags`` list columns.
            test: Frame with the same columns; gains ``songs_id`` and
                ``tags_id`` columns in place.

        Returns:
            ``test`` with the two id columns added.
        """
        n_train = len(train)

        # train + test, so every song/tag of the questions has an id too
        plylst = pd.concat([train, test], ignore_index=True)

        # Assign new indices per tag and per song and keep the reverse
        # dictionaries; needed to build the sparse matrices and to map results back.
        tag_counter = Counter(tg for tgs in plylst['tags'] for tg in tgs)
        tag_id_tid = {tag: tid for tid, tag in enumerate(tag_counter)}
        tag_tid_id = {tid: tag for tag, tid in tag_id_tid.items()}

        song_counter = Counter(sg for sgs in plylst['songs'] for sg in sgs)
        song_id_sid = {song: sid for sid, song in enumerate(song_counter)}
        song_sid_id = {sid: song for song, sid in song_id_sid.items()}

        self.n_tags = len(tag_id_tid)
        self.n_songs = len(song_id_sid)
        self.tag_id_tid = tag_id_tid
        self.tag_tid_id = tag_tid_id
        self.song_id_sid = song_id_sid
        self.song_sid_id = song_sid_id

        train_song_ids = [[song_id_sid[s] for s in songs if s in song_id_sid]
                          for songs in train['songs']]
        train_tag_ids = [[tag_id_tid[t] for t in tags if t in tag_id_tid]
                         for tags in train['tags']]

        self.train_songs_A = self._membership_matrix(train_song_ids, self.n_songs)
        self.train_tags_A = self._membership_matrix(train_tag_ids, self.n_tags)

        # transposes, kept in compressed sparse row format
        self.train_songs_A_T = self.train_songs_A.T.tocsr()
        self.train_tags_A_T = self.train_tags_A.T.tocsr()

        # id columns of the question playlists
        test['songs_id'] = test['songs'].map(
            lambda songs: [song_id_sid[s] for s in songs if s in song_id_sid])
        test['tags_id'] = test['tags'].map(
            lambda tags: [tag_id_tid[t] for t in tags if t in tag_id_tid])

        return test

    def matrix_factorization_rec(self,pid):
        """Recommend songs and tags for question playlists via co-occurrence.

        Requires ``before_matrix_facotization`` to have been called and
        ``self.test_enough`` to hold its returned frame.

        Args:
            pid: One index label of ``self.test_enough`` or an iterable of them.

        Returns:
            List of ``{"id", "songs", "tags"}`` dicts, one per label, with at
            most 100 songs and 10 tags each.
        """
        pids = [pid] if np.isscalar(pid) else list(pid)
        questions = self.test_enough

        res= []
        for q in tqdm(pids):
            songs_already = questions.loc[q, "songs_id"]
            tags_already = questions.loc[q, "tags_id"]

            p = np.zeros((self.n_songs,1))
            p[songs_already] = 1 # set the playlist's song indices to 1 in the zero vector

            # similarity of every train playlist to this one, as a 1-D vector
            val = self.train_songs_A.dot(p).reshape(-1)

            # song recommendation: project the similarities back onto songs
            cand_song = self.train_songs_A_T.dot(val)

            # indices of the 1000 highest scores, best first
            cand_song_idx = cand_song.reshape(-1).argsort()[-1000:][::-1]
            cand_song_idx = cand_song_idx[~np.isin(cand_song_idx, songs_already)]

            # Map matrix columns back to song ids, then keep only songs issued
            # on or before the playlist's updt_date.
            cand_song_ids = [self.song_sid_id[i] for i in cand_song_idx]
            rec_song_idx = before_updt_date(
                cand_song_ids,questions.loc[q,'updt_date'],self.song_meta)[:100]

            # tag recommendation
            cand_tag = self.train_tags_A_T.dot(val)
            cand_tag_idx = cand_tag.reshape(-1).argsort()[-15:][::-1]
            cand_tag_idx = cand_tag_idx[~np.isin(cand_tag_idx, tags_already)][:10]
            rec_tag_idx = [self.tag_tid_id[i] for i in cand_tag_idx]

            res.append({
                "id": questions.loc[q, 'id'],
                "songs": rec_song_idx,
                "tags": rec_tag_idx
            })
        return res

    def run(self):
        """Answer every question playlist and write ``base_results.json``.

        Returns:
            The list of ``{"id", "songs", "tags"}`` dicts that was written.
        """
        # Fewer than 3 songs -> mk_rec_lack_of_songs; 3 or more -> matrix module.
        song_counts = self.test.songs.str.len()
        self.test_enough = self.test[song_counts >= 3].copy()
        self.test_lack = self.test[song_counts < 3].copy()

        lack = self.test_lack
        recs = [self.mk_rec_lack_of_songs(row)
                for _, row in tqdm(lack.iterrows(), total=len(lack))]
        lack['rec_tags'] = pd.Series([r[0] for r in recs], index=lack.index, dtype=object)
        lack['rec_songs'] = pd.Series([r[1] for r in recs], index=lack.index, dtype=object)

        self.test_enough = self.before_matrix_facotization(self.train,self.test_enough)
        enough = self.test_enough
        res = self.matrix_factorization_rec(enough.index)
        enough['rec_tags'] = pd.Series([r['tags'] for r in res], index=enough.index, dtype=object)
        enough['rec_songs'] = pd.Series([r['songs'] for r in res], index=enough.index, dtype=object)

        # Restore the original question order.
        self.answers = pd.concat([enough, lack]).sort_index(ascending=True)

        self.res = [
            {"id": plylst_id, "songs": songs, "tags": tags}
            for plylst_id, songs, tags in zip(
                self.answers['id'], self.answers['rec_songs'], self.answers['rec_tags'])
        ]

        write_json(self.res, self.FILE_PATH, "base_results.json")
        return self.res

In [1]:
class PlaylistEmbedding:
    """Recommend songs and tags from the nearest playlists in embedding space.

    A Word2Vec model is trained on playlists treated as sentences of song ids
    and tags; each playlist vector is the sum of its token vectors. Every test
    playlist is answered from its 200 most similar playlists, with
    ``results.json`` as fallback and padding.
    """

    def __init__(self, FILE_PATH):
        """Load train/val/test playlists and the fallback ``results.json``."""
        self.FILE_PATH = FILE_PATH
        self.min_count = 3
        self.size = 100
        self.window = 210
        # The original value was 5. gensim documents sg as {0, 1} with 1
        # meaning skip-gram; 1 states that explicitly.
        self.sg = 1

        self.p2v_model = WordEmbeddingsKeyedVectors(self.size)

        with open(os.path.join(FILE_PATH, 'train.json'), encoding="utf-8") as f:
            self.train = json.load(f)
        with open(os.path.join(FILE_PATH, 'val.json'), encoding="utf-8") as f:
            self.val = json.load(f)
        with open(os.path.join(FILE_PATH, 'test.json'), encoding="utf-8") as f:
            self.test = json.load(f)
        with open(os.path.join(FILE_PATH, 'results.json'), encoding="utf-8") as f:
            self.most_results = json.load(f)

    # Store the songs and tags of every playlist as dictionaries keyed by id
    def get_dic(self, train, val, test):
        """Build ``song_dic``/``tag_dic`` and the Word2Vec corpus ``total``.

        ``total`` holds one token list per playlist (song ids as strings
        followed by the tags); playlists with fewer than two tokens are dropped.
        """
        data = train + val + test

        song_dic = {}
        tag_dic = {}
        for q in tqdm(data):
            key = str(q['id'])
            song_dic[key] = q['songs']
            tag_dic[key] = q['tags']
        self.song_dic = song_dic
        self.tag_dic = tag_dic

        # total = [[song ids..., tags...], [song ids..., tags...], ...]
        sentences = ([str(song) for song in q['songs']] + list(q['tags']) for q in data)
        self.total = [tokens for tokens in sentences if len(tokens) > 1]

    def get_w2v(self, total, min_count, size, window, sg):
        """Train Word2Vec on ``total`` and save it as ``word2vec.model``."""
        w2v_model = Word2Vec(total, min_count = min_count, size = size, window = window, sg = sg, workers = 4)
        self.w2v_model = w2v_model
        w2v_model.save("word2vec.model")

    # Compute the vector of each playlist
    def update_p2v(self, train, val, test, w2v_model):
        """Add one vector per playlist to ``p2v_model`` and save it.

        A playlist's vector is the sum of the vectors of its songs and tags
        that are in the Word2Vec vocabulary. Playlists without songs, or with
        no known token, get no vector.
        """
        ID = []
        vec = []
        for q in tqdm(train + val + test):
            total_vec = None
            if len(q['songs'])>=1:
                for token in q['songs'] + q['tags']:
                    try:
                        token_vec = w2v_model.wv.get_vector(str(token))
                    except KeyError:
                        # token was dropped from the vocabulary
                        continue
                    total_vec = token_vec.copy() if total_vec is None else total_vec + token_vec
            if total_vec is not None:
                ID.append(str(q['id']))
                vec.append(total_vec)
        self.p2v_model.add(ID, vec)
        self.p2v_model.save('p2v_model.model')

    # Recommend the songs and tags of the most similar playlists
    def get_result(self, p2v_model, song_dic, tag_dic, most_results, test):
        """Fill ``self.answers`` with 100 songs and 10 tags per test playlist.

        Args:
            p2v_model: Playlist vectors keyed by playlist id string.
            song_dic: Playlist id string -> songs.
            tag_dic: Playlist id string -> tags.
            most_results: Fallback answers, positionally aligned with ``test``.
            test: Question playlists.
        """
        answers = []
        for n, q in tqdm(enumerate(test), total = len(test)):
            try:
                most_id = [x[0] for x in p2v_model.most_similar(str(q['id']), topn=200)]
                get_song = []
                get_tag = []
                for ID in most_id:
                    get_song += song_dic[ID]
                    get_tag += tag_dic[ID]
                get_song = list(pd.Series(get_song).value_counts()[:200].index)
                get_tag = list(pd.Series(get_tag).value_counts()[:20].index)
                answers.append({
                    "id": q["id"],
                    "songs": remove_seen(q["songs"], get_song)[:100],
                    "tags": remove_seen(q["tags"], get_tag)[:10],
                })
            except KeyError:
                # The playlist has no vector: fall back to the precomputed
                # answer. Lists are copied so padding never edits the fallback.
                answers.append({
                  "id": most_results[n]["id"],
                  "songs": list(most_results[n]['songs']),
                  "tags": list(most_results[n]["tags"]),
                })

        # check and update answer: pad short answers from the fallback
        for n, q in enumerate(answers):
            if len(q['songs'])!=100:
                q['songs'] += remove_seen(q['songs'], most_results[n]['songs'])[:100-len(q['songs'])]
            if len(q['tags'])!=10:
                q['tags'] += remove_seen(q['tags'], most_results[n]['tags'])[:10-len(q['tags'])]
        self.answers = answers

    def run(self):
        """Run the whole pipeline, write ``results.json``, return the answers."""
        self.get_dic(self.train, self.val,self.test)
        self.get_w2v(self.total, self.min_count, self.size, self.window, self.sg)
        self.update_p2v(self.train, self.val, self.test, self.w2v_model)
        self.get_result(self.p2v_model, self.song_dic, self.tag_dic, self.most_results, self.test)

        # write_json takes (data, FILE_PATH, fname); the directory was missing.
        # NOTE(review): this overwrites the results.json read in __init__ --
        # confirm that is intended.
        write_json(self.answers, self.FILE_PATH, 'results.json')
        return self.answers

In [None]:
#MAIN
# Directory holding train, val, test, song_meta and genre_gn_all JSON files.
# Set the MELON_DATA_DIR environment variable to override the default.
FILE_PATH = os.environ.get('MELON_DATA_DIR', 'C:\\Users\\qorwl\\workingspace\\melon\\data\\')

# Load the inputs explicitly so the cell runs on a fresh kernel.
# MakeAdditionalTags.run receives these frames as its `data` argument.
frames = {}
for split in ('train', 'val', 'test', 'song_meta'):
    with open(os.path.join(FILE_PATH, split + '.json'), encoding="utf-8") as f:
        frames[split] = pd.DataFrame(json.load(f))
train = frames['train']
val = frames['val']
test = frames['test']
song_meta = frames['song_meta']

train_tags = MakeAdditionalTags(FILE_PATH)
# mk_title_tags loads 'train.model', so train the tokenizer once beforehand.
if not os.path.exists('train.model'):
    train_tags.sentence_piece(train)
train_tags.run(train,'train')
val_tags = MakeAdditionalTags(FILE_PATH)
val_tags.run(val,'val')
test_tags= MakeAdditionalTags(FILE_PATH)
test_tags.run(test,'test')

base = MakeBaselineResults(FILE_PATH)
base.run()
results = base.res

U_space = PlaylistEmbedding(FILE_PATH)
U_space.run()
results = U_space.answers