In [None]:
%pip install tqdm
%pip install gensim

In [1]:
import pandas as pd

from tqdm import tqdm

from arena_util import remove_seen
from arena_util import write_json

from gensim.models import Word2Vec
from gensim.models.keyedvectors import WordEmbeddingsKeyedVectors
from gensim.parsing.preprocessing import preprocess_string

### 데이터 불러오기

In [None]:
# --- Word2Vec hyperparameters (consumed by the model-fitting cell) ---
min_count = 10  # forwarded as Word2Vec(min_count=...)
size = 100      # forwarded as Word2Vec(size=...); also the playlist vector size
windows = 100   # forwarded as Word2Vec(window=...)
sg = 1          # forwarded as Word2Vec(sg=...)

# --- Input files ---
# Playlists (train / validation) and per-song metadata.
train = pd.read_json('./data/train.json', encoding='UTF-8')
val = pd.read_json('./data/val.json', encoding='UTF-8')
song_meta = pd.read_json('./data/song_meta.json', encoding='UTF-8')
# Previously produced results, used later as fallback and padding for the answers.
most_results = pd.read_json('results.json', encoding='UTF-8')

In [3]:
# Empty keyed-vector store for playlist embeddings; it is filled with
# (playlist id, vector) pairs once the embeddings have been computed.
p2v_model = WordEmbeddingsKeyedVectors(vector_size=size)

In [24]:
# Register tqdm with pandas so that `progress_apply` can show the progress of apply calls
tqdm.pandas()

### Dictionary 만들기

- song_name_dic  
    plylst id와 song name을 분해한 list를 매칭(plylst id - \[song1 분해, song2 분해, ... \])
- total(pandas.Series)  
    song name 분해한 것 + tag + plylst 제목 분해한 것

In [26]:
# Stack train and validation playlists and key them by playlist id.
data = pd.concat([train, val])
data = data.set_index('id')

song_dic = data['songs'].to_dict()  # plylst id -> song ids
tag_dic = data['tags'].to_dict()    # plylst id -> tags

# A song id can appear in several playlists, so each song name is tokenized
# at most once and the token list is reused afterwards.
_song_token_cache = {}


def _song_name_tokens(song_id):
    """Return the preprocessed tokens of one song's name (memoized per song id)."""
    tokens = _song_token_cache.get(song_id)
    if tokens is None:
        tokens = preprocess_string(song_meta.loc[song_id, 'song_name'])
        _song_token_cache[song_id] = tokens
    return tokens


def _playlist_song_tokens(songs):
    """Concatenate the song-name tokens of every song of a playlist, in order."""
    # A flat comprehension replaces sum(list_of_lists, []), which re-copies the
    # accumulated list on every addition (quadratic in the playlist length).
    return [token for song_id in songs for token in _song_name_tokens(song_id)]


data['song_name_token'] = data['songs'].progress_apply(_playlist_song_tokens)
song_name_dic = data['song_name_token'].to_dict()  # plylst id -> song-name tokens

100%|█████████████████████████████████| 138086/138086 [12:16<00:00, 187.38it/s]


In [27]:
def _playlist_sentence(row):
    """Build one Word2Vec "sentence": song-name tokens + tags + title tokens."""
    plylst_id = row['id']
    song_tokens = song_name_dic[plylst_id]
    tags = tag_dic[plylst_id]
    title_tokens = preprocess_string(row['plylst_title'])
    return song_tokens + tags + title_tokens


# Bring 'id' back as a regular column so each row can look up its own dictionaries.
data = data.reset_index()
total = data.progress_apply(_playlist_sentence, axis=1)

100%|████████████████████████████████| 138086/138086 [00:28<00:00, 4852.95it/s]


### Word2Vec Model Fitting

In [28]:
# Fit Word2Vec on the per-playlist token lists with the hyperparameters set above.
w2v_model = Word2Vec(
    sentences=total,
    min_count=min_count,
    size=size,
    window=windows,
    sg=sg,
)

### Calculate Embedding Vectors

In [31]:
def _weighted_word_vectors(words, weight=1):
    """Return `weight * vector / len(words)` for every word known to `w2v_model`.

    Words missing from the Word2Vec vocabulary are skipped, but they still count
    towards `len(words)`, so each group is normalised by its full token count.
    """
    n_words = len(words)
    contributions = []
    for word in words:
        try:
            contributions.append(weight * w2v_model.wv.get_vector(word) / n_words)
        except KeyError:
            # get_vector raises KeyError for words outside the vocabulary.
            continue
    return contributions


ID = []       # playlist ids (as strings) that received an embedding
vec = []      # embedding vectors, aligned with ID
embedd = {}   # playlist id (string) -> embedding vector

playlists = pd.concat([train, val])
for index, q in tqdm(playlists.iterrows(), total=len(playlists)):
    # Tokenize the title once per playlist instead of once per title word.
    title_tokens = preprocess_string(q['plylst_title'])

    # Song-name tokens and tags contribute their averaged vectors; title tokens
    # are weighted twice as much.
    contributions = (
        _weighted_word_vectors(song_name_dic[q['id']])
        + _weighted_word_vectors(tag_dic[q['id']])
        + _weighted_word_vectors(title_tokens, weight=2)
    )
    if not contributions:
        # No token of this playlist is in the vocabulary: it gets no embedding.
        continue

    tmp_vec = sum(contributions)
    embedd[str(q['id'])] = tmp_vec
    ID.append(str(q['id']))
    vec.append(tmp_vec)

p2v_model.add(ID, vec)

138086it [09:14, 249.09it/s]


In [33]:
answers = []
for index, q in tqdm(val.iterrows(), total=len(val)):
    try:
        # The 200 playlists whose embeddings are closest to this playlist's embedding.
        neighbours = p2v_model.most_similar(str(q['id']), topn=200)
        get_song = []
        get_tag = []
        for neighbour_id, _similarity in neighbours:
            get_song += song_dic[int(neighbour_id)]
            get_tag += tag_dic[int(neighbour_id)]
        # Rank candidates by how many neighbour playlists contain them.
        get_song = list(pd.Series(get_song).value_counts()[:200].index)
        get_tag = list(pd.Series(get_tag).value_counts()[:20].index)
        answers.append({
            "id": q["id"],
            "songs": remove_seen(q["songs"], get_song)[:100],
            "tags": remove_seen(q["tags"], get_tag)[:10],
        })
    except KeyError:
        # Only a missing key (e.g. the playlist has no embedding in p2v_model)
        # triggers the fallback; a bare `except` would also hide interrupts and
        # genuine bugs. Lists are copied so that the padding step below cannot
        # modify `most_results` in place.
        fallback = most_results.loc[index]
        answers.append({
            "id": fallback["id"],
            "songs": list(fallback["songs"]),
            "tags": list(fallback["tags"]),
        })

# check and update answer: pad short answers up to 100 songs / 10 tags
for n, q in enumerate(answers):
    if len(q['songs']) < 100:
        extra_songs = remove_seen(q['songs'], most_results.loc[n]['songs'])
        q['songs'] += extra_songs[:100 - len(q['songs'])]
    if len(q['tags']) < 10:
        extra_tags = remove_seen(q['tags'], most_results.loc[n]['tags'])
        q['tags'] += extra_tags[:10 - len(q['tags'])]

23015it [09:29, 40.38it/s]


In [39]:
# Persist the final answers through arena_util.write_json.
# NOTE(review): 'results.json' is also the file name loaded as `most_results` in the
# data-loading cell — confirm write_json targets a different location, otherwise a
# re-run would read this notebook's own output as its fallback.
write_json(answers, "results.json")

## Class로 만든 버전

In [None]:
# 목표 : 플레이리스트(노래, 태그, 플레이리스트 제목)이 주어지면, K개의 feature를 뽑도록 한다.
# 자연어 처리 모델을 구축
class PlaylistEmbedding:
    """Playlist recommender based on Word2Vec embeddings of playlist text.

    A playlist is described by the tokens of its song names, its tags and the
    tokens of its title. Word vectors of those tokens are summed into one vector
    per playlist, and songs/tags are recommended from the most similar playlists.

    Expected call order: get_dic -> get_w2v -> update_p2v -> get_results.
    """

    def __init__(self):
        """Load the input files and set the Word2Vec hyperparameters."""
        # Inputs: train / validation playlists and per-song metadata.
        self.train = pd.read_json('./data/train.json', encoding='UTF-8')
        self.val = pd.read_json('./data/val.json', encoding='UTF-8')
        self.song_meta = pd.read_json('./data/song_meta.json', encoding='UTF-8')
        # Previously produced results, used by get_results as fallback and padding.
        self.most_results = pd.read_json('results.json', encoding='UTF-8')
        # Hyperparameters forwarded to Word2Vec in get_w2v.
        self.min_count = 10
        self.size = 100
        self.windows = 100
        self.sg = 1
        # Keyed-vector store for playlist embeddings (filled by update_p2v).
        self.p2v_model = WordEmbeddingsKeyedVectors(self.size)

    def get_dic(self, train, val, song_meta):
        """Build the per-playlist dictionaries and the Word2Vec training corpus.

        Sets, all keyed by the playlist id exactly as it appears in the 'id' column:
          - self.song_dic:      plylst id -> song ids
          - self.tag_dic:       plylst id -> tags
          - self.song_name_dic: plylst id -> preprocessed song-name tokens
        and self.total, a Series with one token list per playlist
        (song-name tokens + tags + preprocessed title tokens).
        """
        data = pd.concat([train, val])
        data = data.set_index('id')

        song_dic = data['songs'].to_dict()
        tag_dic = data['tags'].to_dict()

        def song_name_tokens(songs):
            # Flat comprehension instead of sum(list_of_lists, []), which re-copies
            # the accumulated list on every addition.
            return [
                token
                for song_id in songs
                for token in preprocess_string(song_meta.loc[song_id, 'song_name'])
            ]

        data['song_name_token'] = data['songs'].map(song_name_tokens)
        song_name_dic = data['song_name_token'].to_dict()

        self.song_dic = song_dic
        self.song_name_dic = song_name_dic
        self.tag_dic = tag_dic

        data = data.reset_index()
        self.total = data.apply(
            lambda row: song_name_dic[row['id']] + tag_dic[row['id']] + preprocess_string(row['plylst_title']),
            axis=1,
        )

    def get_w2v(self):
        """Fit Word2Vec on self.total and store the model as self.w2v_model."""
        self.w2v_model = Word2Vec(
            self.total,
            min_count=self.min_count,
            size=self.size,
            window=self.windows,
            sg=self.sg,
        )

    @staticmethod
    def _known_vectors(w2v_model, words):
        """Return the vectors of the words that are in the Word2Vec vocabulary."""
        vectors = []
        for word in words:
            try:
                vectors.append(w2v_model.wv.get_vector(word))
            except KeyError:
                # get_vector raises KeyError for out-of-vocabulary words; skip them.
                continue
        return vectors

    def update_p2v(self, train, val, w2v_model):
        """Compute one embedding per playlist and add them to self.p2v_model.

        The embedding is the plain sum of the word vectors of the playlist's
        song-name tokens, tags and title tokens. Playlists without any
        in-vocabulary token get no embedding.

        Returns:
            dict mapping str(playlist id) -> embedding vector (also stored as
            self.embedd).
        """
        ID = []
        vec = []
        embedd = {}
        playlists = pd.concat([train, val])
        for index, q in tqdm(playlists.iterrows(), total=len(playlists)):
            # get_dic keys its dictionaries by the raw playlist id, so the lookup
            # must use q['id'] itself; looking up str(q['id']) raises KeyError.
            plylst_id = q['id']
            vectors = (
                self._known_vectors(w2v_model, self.song_name_dic[plylst_id])
                + self._known_vectors(w2v_model, self.tag_dic[plylst_id])
                + self._known_vectors(w2v_model, preprocess_string(q['plylst_title']))
            )
            if not vectors:
                continue

            # sum() starts from 0, so the result is a fresh array and the model's
            # own vectors are never modified.
            tmp_vec = sum(vectors)
            # The keyed-vector store is addressed by string keys.
            embedd[str(plylst_id)] = tmp_vec
            ID.append(str(plylst_id))
            vec.append(tmp_vec)

        self.embedd = embedd
        self.p2v_model.add(ID, vec)
        return embedd

    def get_results(self):
        """Recommend up to 100 songs and 10 tags for every validation playlist.

        Candidates come from the 200 most similar playlists and are ranked by how
        many of those playlists contain them. When a lookup fails with KeyError
        (e.g. the playlist has no embedding), the matching row of
        self.most_results is used instead. Short answers are padded from
        self.most_results. The result is stored in self.answers.
        """
        answers = []
        for index, q in tqdm(self.val.iterrows(), total=len(self.val)):
            try:
                neighbours = self.p2v_model.most_similar(str(q['id']), topn=200)
                get_song = []
                get_tag = []
                for neighbour_id, _similarity in neighbours:
                    get_song += self.song_dic[int(neighbour_id)]
                    get_tag += self.tag_dic[int(neighbour_id)]
                get_song = list(pd.Series(get_song).value_counts()[:200].index)
                get_tag = list(pd.Series(get_tag).value_counts()[:20].index)
                answers.append({
                    "id": q["id"],
                    "songs": remove_seen(q["songs"], get_song)[:100],
                    "tags": remove_seen(q["tags"], get_tag)[:10],
                })
            except KeyError:
                # Catch only missing keys: a bare `except` would also swallow
                # KeyboardInterrupt and genuine bugs. Lists are copied so that the
                # padding below cannot modify self.most_results in place.
                fallback = self.most_results.loc[index]
                answers.append({
                    "id": fallback["id"],
                    "songs": list(fallback["songs"]),
                    "tags": list(fallback["tags"]),
                })

        # check and update answer: pad short answers up to 100 songs / 10 tags
        for n, q in enumerate(answers):
            if len(q['songs']) < 100:
                extra_songs = remove_seen(q['songs'], self.most_results.loc[n]['songs'])
                q['songs'] += extra_songs[:100 - len(q['songs'])]
            if len(q['tags']) < 10:
                extra_tags = remove_seen(q['tags'], self.most_results.loc[n]['tags'])
                q['tags'] += extra_tags[:10 - len(q['tags'])]
        self.answers = answers

In [None]:
# Instantiate the class-based pipeline; __init__ reads the train/val/song_meta JSON
# files and 'results.json' from disk.
playlist = PlaylistEmbedding()

In [None]:
# Build the playlist dictionaries (song_dic, song_name_dic, tag_dic) and the token
# corpus (total); all of them are stored on the instance.
playlist.get_dic(playlist.train, playlist.val, playlist.song_meta)

In [None]:
# Fit Word2Vec on playlist.total; the model is stored as playlist.w2v_model.
playlist.get_w2v()

In [None]:
# Compute the playlist embeddings and register them in playlist.p2v_model.
# The returned dict (one vector per embedded playlist) is bound to a name so the
# notebook does not render the whole dict as this cell's output.
playlist_embeddings = playlist.update_p2v(playlist.train, playlist.val, playlist.w2v_model)

In [None]:
# Build the recommendations for the validation playlists; stored as playlist.answers.
playlist.get_results()

In [None]:
# Persist the class-based answers through arena_util.write_json.
# NOTE(review): PlaylistEmbedding.__init__ also reads a file named 'results.json' —
# confirm write_json writes to a different location than that input.
write_json(playlist.answers, "results.json")