In [7]:
%python most_popular.py run --train_fname=data/train.json --question_fname=data/val.json

UsageError: %%python is a cell magic, but the cell body is empty.


In [2]:
%pip install tqdm
%pip install gensim

Collecting gensim
  Downloading https://files.pythonhosted.org/packages/0b/66/04faeedb98bfa5f241d0399d0102456886179cabac0355475f23a2978847/gensim-3.8.3-cp37-cp37m-win_amd64.whl (24.2MB)
Collecting smart-open>=1.8.1 (from gensim)
  Downloading https://files.pythonhosted.org/packages/0b/8e/464b06f5efd26f2dc16ce7bd1662c2f31cadf9104fdbcbf5994674cc3a51/smart_open-2.1.0.tar.gz (116kB)
Collecting Cython==0.29.14 (from gensim)
  Downloading https://files.pythonhosted.org/packages/1f/be/b14be5c3ad1ff73096b518be1538282f053ec34faaca60a8753d975d7e93/Cython-0.29.14-cp37-cp37m-win_amd64.whl (1.7MB)
Collecting boto (from smart-open>=1.8.1->gensim)
  Downloading https://files.pythonhosted.org/packages/23/10/c0b78c27298029e4454a472a1919bde20cb182dab1662cec7f2ca1dcc523/boto-2.49.0-py2.py3-none-any.whl (1.4MB)
Collecting boto3 (from smart-open>=1.8.1->gensim)
  Downloading https://files.pythonhosted.org/packages/3c/f4/41c1d8a69b07b2a087a7e552cbed21111ff36706fec2f1ba9983fba95771/boto3-1.14.20-py2.py3-none

In [18]:
import numpy as np
import pandas as pd
from tqdm import tqdm
from gensim.models import Word2Vec
from gensim.models.keyedvectors import WordEmbeddingsKeyedVectors
from gensim.parsing.preprocessing import preprocess_string

In [25]:
# Goal: given a playlist (songs, tags, playlist title), extract a K-dimensional feature vector.
# Approach: build a natural-language (word2vec) model over the words that describe a playlist.
class PlaylistEmbedding:
    """Word2vec-based playlist embedding used to recommend songs and tags.

    Intended call order: ``get_dic`` -> ``get_w2v`` -> ``update_p2v`` -> ``get_results``.
    All per-playlist dictionaries are keyed by ``str(playlist id)`` because the
    same string keys are used as entity names in ``p2v_model``.
    """

    def __init__(self, data_dir='./data', results_fname='results.json'):
        """Load the input tables and set the word2vec hyper-parameters.

        Args:
            data_dir: Directory holding ``train.json``, ``val.json`` and ``song_meta.json``.
            results_fname: JSON file with baseline answers (``id``/``songs``/``tags``
                columns) used as a fallback and as padding in ``get_results``.
        """
        # Read everything from disk: train, val, song_meta and the baseline answers.
        self.train = pd.read_json(f'{data_dir}/train.json', encoding='UTF-8')
        self.val = pd.read_json(f'{data_dir}/val.json', encoding='UTF-8')
        self.song_meta = pd.read_json(f'{data_dir}/song_meta.json', encoding='UTF-8')
        self.most_results = pd.read_json(results_fname, encoding='UTF-8')
        self.min_count = 10
        self.size = 100
        self.windows = 100
        self.sg = 1
        self.p2v_model = WordEmbeddingsKeyedVectors(self.size)

    @staticmethod
    def _tokenize(text):
        """Return the gensim-preprocessed tokens of ``text`` ([] for non-strings such as NaN)."""
        return preprocess_string(text) if isinstance(text, str) else []

    @staticmethod
    def _remove_seen(seen, items):
        """Return ``items`` without the members of ``seen``, preserving order.

        NOTE(review): the notebook never defines or imports the ``remove_seen``
        helper the original code called; this local version assumes it filtered
        ``items`` by membership in ``seen`` -- confirm against the original helper.
        """
        seen = set(seen)
        return [item for item in items if item not in seen]

    @staticmethod
    def _most_frequent(items, limit):
        """Return up to ``limit`` distinct items, most frequent first (ties keep first-seen order)."""
        counts = {}
        for item in items:
            counts[item] = counts.get(item, 0) + 1
        # sorted() is stable, so equally frequent items stay in encounter order.
        return sorted(counts, key=counts.get, reverse=True)[:limit]

    @staticmethod
    def _sum_vectors(word_vectors, tokens):
        """Sum the vectors of all in-vocabulary ``tokens``; return None if none is known."""
        total = None
        for token in tokens:
            try:
                vector = word_vectors.get_vector(token)
            except KeyError:
                continue  # out-of-vocabulary token
            if total is None:
                total = vector.copy()  # copy so the model's own weights are never mutated
            else:
                total += vector
        return total

    # Playlist "sentence": preprocessed song names + tags + preprocessed playlist title.
    def get_dic(self, train, val, song_meta):
        """Build the per-playlist lookup tables and the word2vec training corpus.

        Sets ``song_dic`` (id -> song ids), ``song_name_dic`` (id -> tokens of all
        song names), ``tag_dic`` (id -> tags) and ``total`` (one token list per
        playlist, in train-then-val order).

        Args:
            train, val: Playlist frames with ``id``, ``songs``, ``tags``, ``plylst_title``.
            song_meta: Frame whose index labels are song ids and that has a ``song_name`` column.

        Raises:
            KeyError: If a playlist references a song id missing from ``song_meta``.
        """
        data = pd.concat([train, val], ignore_index=True)
        # A plain dict makes the millions of per-song lookups O(1) each.
        song_names = song_meta['song_name'].to_dict()

        song_dic = {}       # playlist id -> song ids
        song_name_dic = {}  # playlist id -> preprocessed song-name tokens
        tag_dic = {}        # playlist id -> tags
        total = []
        rows = zip(data['id'], data['songs'], data['tags'], data['plylst_title'])
        for plylst_id, songs, tags, title in rows:
            key = str(plylst_id)
            name_tokens = []
            for song_id in songs:
                name_tokens.extend(self._tokenize(song_names[song_id]))
            song_dic[key] = list(songs)
            song_name_dic[key] = name_tokens
            tag_dic[key] = list(tags)
            total.append(name_tokens + tag_dic[key] + self._tokenize(title))

        self.song_dic = song_dic
        self.song_name_dic = song_name_dic
        self.tag_dic = tag_dic
        self.total = total

    # word2vec model
    def get_w2v(self):
        """Train word2vec on ``self.total`` and store it as ``self.w2v_model``."""
        w2v_model = Word2Vec(self.total, min_count = self.min_count, size = self.size, window = self.windows, sg = self.sg)
        self.w2v_model = w2v_model

    # Apply the fitted word2vec model and return the embedded vectors.
    def update_p2v(self, train, val, w2v_model):
        """Embed every playlist as the sum of its word vectors.

        Playlists without any in-vocabulary token get no embedding. The vectors
        are stored in ``self.embedd`` and registered in ``self.p2v_model``.

        Args:
            train, val: Playlist frames with ``id`` and ``plylst_title``.
            w2v_model: Trained model exposing ``wv.get_vector``.

        Returns:
            dict mapping ``str(playlist id)`` to its embedding vector.
        """
        data = pd.concat([train, val], ignore_index=True)
        word_vectors = w2v_model.wv
        ids = []
        vectors = []
        embedd = {}
        rows = zip(data['id'], data['plylst_title'])
        for plylst_id, title in tqdm(rows, total=len(data)):
            key = str(plylst_id)
            tokens = list(self.song_name_dic[key]) + list(self.tag_dic[key]) + self._tokenize(title)
            vector = self._sum_vectors(word_vectors, tokens)
            if vector is None:
                continue
            embedd[key] = vector
            ids.append(key)
            vectors.append(vector)

        self.embedd = embedd
        if ids:
            self.p2v_model.add(ids, vectors)
        return embedd

    def _pad(self, items, seen, fallback, size):
        """Extend ``items`` to ``size`` with unseen, not-yet-chosen entries of ``fallback``."""
        if len(items) < size:
            extra = self._remove_seen(list(seen) + items, fallback)
            items = items + extra[:size - len(items)]
        return items

    def get_results(self):
        """Recommend 100 songs and 10 tags for every playlist in ``self.val``.

        Candidates are the most frequent songs/tags among the 200 most similar
        playlists. Playlists without an embedding fall back to the baseline
        answer in ``self.most_results`` (matched by playlist id); short answers
        are padded from the same baseline. The result is stored in ``self.answers``.
        """
        baseline = self.most_results
        fallback_by_id = {
            plylst_id: (list(songs), list(tags))
            for plylst_id, songs, tags in zip(baseline['id'], baseline['songs'], baseline['tags'])
        }

        answers = []
        rows = zip(self.val['id'], self.val['songs'], self.val['tags'])
        for plylst_id, seen_songs, seen_tags in tqdm(rows, total=len(self.val)):
            fb_songs, fb_tags = fallback_by_id.get(plylst_id, ([], []))
            try:
                most_id = [x[0] for x in self.p2v_model.most_similar(str(plylst_id), topn=200)]
                get_song = []
                get_tag = []
                for ID in most_id:
                    get_song += self.song_dic[ID]
                    get_tag += self.tag_dic[ID]
            except KeyError:
                # No embedding (or unknown neighbour): use copies of the baseline
                # answer so the baseline frame itself is never mutated.
                songs = list(fb_songs)
                tags = list(fb_tags)
            else:
                songs = self._remove_seen(seen_songs, self._most_frequent(get_song, 200))[:100]
                tags = self._remove_seen(seen_tags, self._most_frequent(get_tag, 20))[:10]

            # Check and top up answers that came out short.
            answers.append({
                "id": plylst_id,
                "songs": self._pad(songs, seen_songs, fb_songs, 100),
                "tags": self._pad(tags, seen_tags, fb_tags, 10),
            })
        self.answers = answers

In [26]:
# Instantiate the pipeline; the constructor reads train/val/song_meta and results.json from disk.
playlist = PlaylistEmbedding()

In [27]:
# Peek at the first ten song titles to see what the tokenizer will be fed.
playlist.song_meta['song_name'].head(10)

0                                             Feelings
1    Bach : Partita No. 4 In D Major, BWV 828 - II....
2                      Solsbury Hill (Remastered 2002)
3    Feeling Right (Everything Is Nice) (Feat. Popc...
4                                              그남자 그여자
5                                  Para Los Enamorados
6    Sibelius : Valse Triste Op.44 (시벨리우스 : 슬픈 왈츠 작...
7    Superman March (From &#34;Superman&#34; / Live...
8                        Lovers’ Leap (Feat. Qypthone)
9                                         사랑, 그대라는 멜로디
Name: song_name, dtype: object

In [30]:
# For the first playlists, flatten the preprocessed title tokens of every song into one list.
playlist.train.loc[:10, 'songs'].map(
    lambda song_ids: [
        token
        for song_id in song_ids
        for token in preprocess_string(playlist.song_meta.loc[song_id, 'song_name'])
    ]
)

0     [hei, littl, girl, octagon, road, honeymoon, h...
1     [한사람을, audit, timerock, 기다리다, 둘이서, 바보에게, 바보가, ...
2     [도시의, alright, 좋아하니까, feat, amj, 그랬으면, feat, r...
3     [unknown, frozen, soundtrack, version, duet, 이...
4     [눈물에, 얼굴을, 묻는다, 학원별곡, 學園別曲, 떠나지마, doc와, blue, ...
5     [attent, feat, justin, bieber, quavo, chanc, r...
6     [feat, 서영은, half, moon, feat, 사랑하는, 사람에게, 고백하기...
7     [여자는, 윤은혜, 샐러드송, 샐러드기념일, 상심증후군, feat, daylight...
8     [별을따다, 사랑가, return, intro, 사랑가, rock, dead, ro...
9     [anthem, triumph, heart, despair, strive, fate...
10    [break, diva, hot, issu, 예삐오, abo, supa, dupa,...
Name: songs, dtype: object

In [11]:
# Full pipeline: build the lookup tables and corpus, train word2vec, then embed every playlist.
# update_p2v loops over all train+val playlists, so this cell is the slow one
# (the recorded run below was interrupted after roughly 56 minutes).
playlist.get_dic(playlist.train, playlist.val, playlist.song_meta)
playlist.get_w2v()
playlist.update_p2v(playlist.train, playlist.val, playlist.w2v_model)

78738it [55:54, 23.47it/s]


KeyboardInterrupt: 

In [8]:
# Build the recommendations for every validation playlist; the result is stored on playlist.answers.
playlist.get_results()

23015it [04:39, 82.42it/s]


In [18]:
# Force every answer to exactly 100 unique songs.
# Short answers are topped up with placeholder song ids 0..149 that are not already present.
# Unlike the previous list(set(...)) approach, the ranking order of the real
# recommendations is preserved and the result always has 100 entries.
answers = playlist.answers
for answer in answers:
    songs = list(dict.fromkeys(answer['songs']))  # drop duplicates, keep ranking order
    if len(songs) < 100:
        taken = set(songs)
        songs += [song_id for song_id in range(150) if song_id not in taken]
    answer['songs'] = songs[:100]

In [22]:
# Presumably serialises the answers to "wtf.json".
# NOTE(review): `write_json` is neither defined nor imported in the visible cells -- confirm where it comes from.
write_json(answers, "wtf.json")

In [77]:
def dosomething(song_name_dic, train, tag_dic, val,w2v_model, pllist):
    """Embed each playlist as the sum of the word vectors of its tokens.

    A playlist's tokens are its song-name tokens, its tags and the preprocessed
    words of its title; tokens unknown to the model are ignored, and playlists
    with no known token are left out.

    Args:
        song_name_dic: Maps ``str(playlist id)`` to song-name tokens.
        train, val: Playlist frames with ``id`` and ``plylst_title`` columns.
        tag_dic: Maps ``str(playlist id)`` to tags.
        w2v_model: Model exposing ``wv.get_vector``.
        pllist: Object that receives ``embedd`` and whose ``p2v_model.add`` is
            called with the collected ids and vectors.

    Returns:
        dict mapping ``str(playlist id)`` to the summed vector.
    """
    def known_vectors(words):
        # Yield the vector of every word the model knows; skip the rest.
        for word in words:
            try:
                yield w2v_model.wv.get_vector(word)
            except KeyError:
                continue

    playlist_ids = []
    playlist_vectors = []
    embeddings = {}
    for _, row in tqdm(pd.concat([train, val]).iterrows()):
        key = str(row['id'])
        words = [*song_name_dic[key], *tag_dic[key], *preprocess_string(row['plylst_title'])]
        # sum() starts from the int 0, so an int result means no vector was added.
        summed = sum(known_vectors(words))
        if type(summed) == int:
            continue
        embeddings[key] = summed
        playlist_ids.append(key)
        playlist_vectors.append(summed)

    pllist.embedd = embeddings
    pllist.p2v_model.add(playlist_ids, playlist_vectors)
    return embeddings

In [84]:
def dpresults(pllst, val, p2v_model, song_dic, tag_dic, most_results):
    """Recommend 100 songs and 10 tags for every playlist in ``val``.

    Candidates are the most frequent songs/tags among the 200 playlists most
    similar to the query. A playlist without an embedding falls back to its
    baseline answer in ``most_results`` (matched by playlist id), and answers
    that come out short are padded from the same baseline.

    Args:
        pllst: Object that receives the result as ``pllst.answers``.
        val: Frame with ``id``, ``songs`` and ``tags`` columns.
        p2v_model: Similarity index exposing ``most_similar(key, topn=...)``,
            keyed by ``str(playlist id)``.
        song_dic, tag_dic: Map ``str(playlist id)`` to song ids / tags.
        most_results: Baseline frame with ``id``, ``songs`` and ``tags`` columns.

    Returns:
        list of ``{"id", "songs", "tags"}`` dicts in ``val`` order.
    """
    fallback_by_id = {
        plylst_id: (list(songs), list(tags))
        for plylst_id, songs, tags in zip(most_results['id'], most_results['songs'], most_results['tags'])
    }

    answers = []
    rows = zip(val['id'], val['songs'], val['tags'])
    for plylst_id, seen_songs, seen_tags in tqdm(rows, total=len(val)):
        fb_songs, fb_tags = fallback_by_id.get(plylst_id, ([], []))
        try:
            most_id = [x[0] for x in p2v_model.most_similar(str(plylst_id), topn=200)]
            get_song = []
            get_tag = []
            for ID in most_id:
                get_song += song_dic[ID]
                get_tag += tag_dic[ID]
        except KeyError:
            # Only a missing embedding/neighbour triggers the fallback; any other
            # error is a real bug and must surface. Copies keep `most_results` intact.
            songs = list(fb_songs)
            tags = list(fb_tags)
        else:
            songs = _dp_remove_seen(seen_songs, _dp_most_frequent(get_song, 200))[:100]
            tags = _dp_remove_seen(seen_tags, _dp_most_frequent(get_tag, 20))[:10]

        # Check and top up answers that came out short.
        answers.append({
            "id": plylst_id,
            "songs": _dp_pad(songs, seen_songs, fb_songs, 100),
            "tags": _dp_pad(tags, seen_tags, fb_tags, 10),
        })
    pllst.answers = answers

    return answers


def _dp_remove_seen(seen, items):
    """Return ``items`` without the members of ``seen``, preserving order.

    NOTE(review): the notebook never defines or imports the ``remove_seen``
    helper the original code called; this local version assumes it filtered
    ``items`` by membership in ``seen`` -- confirm against the original helper.
    """
    seen = set(seen)
    return [item for item in items if item not in seen]


def _dp_most_frequent(items, limit):
    """Return up to ``limit`` distinct items, most frequent first (ties keep first-seen order)."""
    counts = {}
    for item in items:
        counts[item] = counts.get(item, 0) + 1
    # sorted() is stable, so equally frequent items stay in encounter order.
    return sorted(counts, key=counts.get, reverse=True)[:limit]


def _dp_pad(items, seen, fallback, size):
    """Extend ``items`` to ``size`` with unseen, not-yet-chosen entries of ``fallback``."""
    if len(items) < size:
        extra = _dp_remove_seen(list(seen) + items, fallback)
        items = items + extra[:size - len(items)]
    return items