In [3]:
import copy
import random
import numpy as np
import pandas as pd
import time
from collections import Counter
import torch
from util.arena_util import load_json
from util.arena_util import write_json
import json
# PyTorch imports
import torch
import torch.nn.functional as F
from torch.autograd import Variable
from torch import nn
from torch.utils.data import Dataset, DataLoader
from models import *
from training import Trainer

torch.manual_seed(0)
# Workspace imports
#from evaluate import evaluate_model
#from utils import train_one_epoch, test, plot_statistics

# Python imports
import argparse
from time import time
import numpy as np
import pickle

# Data Loading


In [4]:
def _split_data(playlists):
    tot = len(playlists)
    train = playlists[:int(tot*0.80)]
    val = playlists[int(tot*0.80):]

    return train, val
def _mask(playlists, mask_cols, del_cols):
    q_pl = copy.deepcopy(playlists)
    a_pl = copy.deepcopy(playlists)

    for i in range(len(playlists)):
        for del_col in del_cols:
            q_pl[i][del_col] = []
            if del_col == 'songs':
                a_pl[i][del_col] = a_pl[i][del_col][:100]
            elif del_col == 'tags':
                a_pl[i][del_col] = a_pl[i][del_col][:10]

        for col in mask_cols:
            mask_len = len(playlists[i][col])
            mask = np.full(mask_len, False)
            mask[:mask_len//2] = True
            np.random.shuffle(mask)

            q_pl[i][col] = list(np.array(q_pl[i][col])[mask])
            a_pl[i][col] = list(np.array(a_pl[i][col])[np.invert(mask)])

    return q_pl, a_pl

def _mask_data(playlists):
    """Build shuffled question/answer playlist pairs from ``playlists``.

    The first 40% of the playlists become "song only" problems (half of the
    songs masked, tags removed) and the remaining 60% become "song & tags"
    problems (half of the songs and half of the tags masked). The tag-only
    and title-only problem types are not generated.

    Args:
        playlists: List of playlist dicts; a deep copy is used internally.

    Returns:
        Tuple ``(q, a)`` of equally long lists, shuffled with one shared
        permutation so ``q[k]`` and ``a[k]`` belong to the same playlist.
    """
    pool = copy.deepcopy(playlists)
    n_total = len(pool)
    boundary = int(n_total * 0.4)
    song_only, song_and_tags = pool[:boundary], pool[boundary:]

    print(f"Total: {n_total}, Song only: {len(song_only)}, Song & Tags: {len(song_and_tags)}")

    # Order matters: both calls consume NumPy's global RNG.
    song_q, song_a = _mask(song_only, ['songs'], ['tags'])
    songtag_q, songtag_a = _mask(song_and_tags, ['songs', 'tags'], [])

    questions = song_q + songtag_q
    answers = song_a + songtag_a

    # One permutation applied to both lists keeps question/answer pairs aligned.
    order = np.arange(len(questions))
    np.random.shuffle(order)

    return [questions[j] for j in order], [answers[j] for j in order]

In [5]:
##train = pd.read_json('../file/train.json', encoding='utf-8')
#song_meta = pd.read_json('../file/song_meta.json', encoding='utf-8')


In [6]:
# Seed both RNGs: `random` drives the playlist shuffle below, while the
# masking helpers (`_mask`, `_mask_data`) draw from NumPy's global RNG via
# `np.random.shuffle`. Seeding only `random` leaves the masks non-reproducible.
SEED = 777
random.seed(SEED)
np.random.seed(SEED)
fname = '../file/train.json'
print("Reading data...\n")
playlists = load_json(fname)
random.shuffle(playlists)
print(f"Total playlists: {len(playlists)}")

# print("Splitting data...")
# train, val = _split_data(playlists)

Reading data...

Total playlists: 115071


In [7]:
train_q, train_a = _mask_data(playlists)
print(len(train_q),len(train_a))

Total: 115071, Song only: 46028, Song & Tags: 69043
115071 115071


In [8]:
train_q_ = train_q[:1000]
train_a_ = train_a[:1000]

In [312]:
print("Masked...")
write_json(train_q, "train_q_new.json")
write_json(train_a, "train_a_new.json")

Masked...


In [313]:
title = [trn['plylst_title'] for trn in train_q_]
tags = [trn['tags'] for trn in train_q_]
songs = [list(map(str, trn['songs'])) for trn in train_q_]

In [314]:
from mecab import Tagger

In [315]:
title_pos = list(map(lambda e: list(zip(*Tagger().parse(e)))[0], title))

In [316]:
import re

In [317]:
pttn = re.compile('[~!@#$%^&*_■♪:.,/?!]')

In [400]:
def pos_tokenizer(sen):
    """Tokenize each sentence with MeCab and keep only informative tokens.

    Args:
        sen: Iterable of sentence strings.

    Returns:
        A list with one token list per input sentence, in input order. A
        token is kept when more than one character remains after removing the
        characters matched by the module-level ``pttn`` regex. A sentence for
        which the tagger yields nothing maps to an empty token list (the
        previous version discarded the whole batch and returned ``[]`` in
        that case).

    NOTE(review): assumes ``Tagger().parse`` yields ``(surface, tag)`` pairs,
    as the original ``zip(*...)[0]`` expression implies, and that a single
    ``Tagger`` instance can be reused across sentences — confirm against the
    installed mecab binding.
    """
    tagger = Tagger()  # built once; the old code constructed one per sentence
    sen_pos = []
    for sentence in sen:
        surfaces = [pair[0] for pair in tagger.parse(sentence)]
        sen_pos.append([tok for tok in surfaces if len(pttn.sub('', tok)) > 1])
    return sen_pos

In [405]:
title_pos = [list(filter(lambda e: len(pttn.sub('',e))>1, t)) for t in title_pos]

In [381]:
from gensim.models import Word2Vec

In [382]:
songs = [list(map(str, song)) for song in songs]

In [383]:
use_songs = get_most_common(songs, 1000)

In [384]:
def get_most_common(nested_list, n_most):
    """Return the ``n_most`` most frequent items over all inner lists, as strings.

    Args:
        nested_list: Iterable of iterables of hashable items.
        n_most: Maximum number of items to return (``None`` returns all).

    Returns:
        List of ``str(item)`` ordered from most to least frequent. Returns an
        empty list when there are no items at all (this used to raise
        ``IndexError``).
    """
    counts = Counter(item for inner in nested_list for item in inner)
    return [str(item) for item, _count in counts.most_common(n_most)]

In [385]:
def get_selected_list(nested_list, use_set):
    """Keep, in every inner list, only the items contained in ``use_set``.

    Args:
        nested_list: Iterable of iterables of items (expected to be hashable,
            e.g. strings).
        use_set: Collection of allowed items. It is converted to a ``set``
            once so each membership test is O(1) instead of a linear scan.

    Returns:
        A list of lists with the same outer length as ``nested_list``; order
        and duplicates inside each inner list are preserved.
    """
    try:
        allowed = set(use_set)
    except TypeError:
        # Unhashable entries cannot go into a set; fall back to the container as given.
        allowed = use_set
    return [[item for item in inner if item in allowed] for inner in nested_list]

In [386]:
def get_dict(use_set):
    """Build lookup tables between vocabulary items and their positions.

    Args:
        use_set: Ordered iterable of vocabulary items.

    Returns:
        Tuple ``(word_idx, idx_word)``: ``word_idx`` maps item -> position as
        a *string*, ``idx_word`` maps integer position -> item. For a
        repeated item, ``word_idx`` keeps its last position.
    """
    idx_word = {}
    word_idx = {}
    for position, word in enumerate(use_set):
        idx_word[position] = word
        word_idx[word] = str(position)
    return word_idx, idx_word

In [387]:
def get_idx_list(nested_list, word_idx):
    """Replace every item of every inner list by ``word_idx[item]``.

    Args:
        nested_list: Iterable of iterables of items.
        word_idx: Mapping from item to its index.

    Returns:
        List of lists of looked-up indices.

    Raises:
        KeyError: If an item is missing from ``word_idx``.
    """
    encoded = []
    for inner in nested_list:
        encoded.append([word_idx[token] for token in inner])
    return encoded

In [425]:
def preprocess(nested_list, n_most):
    """Restrict ``nested_list`` to its ``n_most`` commonest items and index-encode it.

    Args:
        nested_list: List of lists of items.
        n_most: Vocabulary size passed to ``get_most_common``.

    Returns:
        Tuple ``(encoded, word_idx, idx_word)``: the filtered lists with every
        item replaced by its index, plus the two lookup tables from
        ``get_dict``.
    """
    vocabulary = get_most_common(nested_list, n_most)
    word_idx, idx_word = get_dict(vocabulary)
    kept = get_selected_list(nested_list, vocabulary)
    return get_idx_list(kept, word_idx), word_idx, idx_word

In [426]:
songs_, word_idx_song, idx_word_song = preprocess(songs, 1000)
tags_, word_idx_tag, idx_word_tag = preprocess(tags, 100)
titles_, word_idx_title, idx_word_title = preprocess(title_pos, 100)

In [427]:
songs_ = [s if s else [str(len(word_idx_song))] for s in songs_ ]

In [428]:
w2v_title = Word2Vec(titles_)
w2v_tag = Word2Vec(tags_)
w2v_song = Word2Vec(songs_, size=1000)

In [429]:
from keras.models import Sequential
from keras.layers import Dense, Activation

In [430]:
from sklearn.preprocessing import LabelEncoder

In [431]:
train_q_ = pd.DataFrame(train_q_)

In [432]:
train_q_['pos_title'] = title_pos

In [433]:
train_q_['song_idx'] = songs_

In [434]:
train_q_['tags_idx'] = tags_

In [435]:
train_q_['title_idx'] = titles_

In [436]:
train_q_['songs_idx_embed'] = 

Unnamed: 0,tags,id,plylst_title,songs,like_cnt,updt_date,pos_title,song_idx,tags_idx,title_idx
0,[],11043,결국 목소리다 – 시원하게 질러주는 성악 모음,"[688032, 512889, 480493, 291722, 565605, 55572...",67,2018-07-30 22:48:50.000,"[결국, 목소리, 시원, 질러, 성악, 모음]",[1000],[],"[72, 3]"
1,"[주말, 여유, 사색을위한클래식, 아이를위한클래식, 분위기있는클래식]",62865,편안한 명상을 위한 릴랙스요가클래식,"[623782, 172865, 186124, 652748, 495190, 37751...",376,2019-04-27 10:04:38.000,"[편안, 명상, 위한, 랙스, 요가, 클래식]","[890, 891, 892, 893, 894, 895, 896, 897, 898, ...","[52, 66]","[82, 26, 35]"
2,[],14741,"Super Brass, Jim & Friends","[130052, 11689, 516656, 327877, 79841, 319524]",3,2013-04-29 14:16:23.000,"[Super, Brass, Jim, Friends]",[1000],[],[]
3,"[추억, 발라드]",65882,너와 이별했을때 듣던 노래 그리고 숨겨진 이야기,"[651499, 593128, 555781, 651332, 153569, 44817...",0,2020-04-03 13:32:58.000,"[이별, 노래, 그리고, 숨겨진, 이야기]","[389, 18, 903, 390, 175, 904, 3, 905, 391, 392...","[9, 11]","[45, 0, 96]"
4,"[클래식, 성악, 바흐, 칸타타, 휴식힐링]",39812,"바로크 시대에 성행한 악곡형식, 칸타타 대표 작품들","[684783, 480909, 688156, 239226, 557658, 67557...",624,2019-04-16 08:38:13.000,"[바로크, 시대, 성행, 악곡, 형식, 칸타타, 대표, 작품]",[1000],[39],[]
...,...,...,...,...,...,...,...,...,...,...
995,[봄날],100383,설레이는 봄에 설레임을 증폭 시켜 줄 플레이리스트 :),"[313262, 365394, 418192, 409519, 206812, 33239...",5,2017-04-12 11:41:38.000,"[설레이, 설레임, 증폭, 시켜, 플레이, 리스트]","[889, 817, 871]",[79],"[20, 7]"
996,"[기분좋은, 연휴, 카페, 펍, 편안한]",12638,"늦은 저녁 카페 펍에서 듣는 리스트! 팝, 록, 알앤비 etc.","[135272, 304656, 169198, 49771, 358506, 479177...",243,2017-10-19 14:56:43.000,"[저녁, 카페, 에서, 리스트, etc]",[1000],[8],"[78, 16, 4, 7]"
997,"[겨울, 휴식, 카페, 기분전환]",132103,잔잔한 Vibe!!! 좋은 분위기 히팝~!!,"[325871, 652209, 282708, 347966, 662545, 18551...",9,2018-02-05 22:48:41.000,"[잔잔, Vibe, 분위기]","[39, 811]","[31, 3, 8, 1]","[14, 13]"
998,[],6200,봄에 듣기 좋은 노래,"[455668, 122363, 549178, 407828, 343974, 8719,...",1,2019-04-23 15:22:30.000,[노래],"[784, 7, 36, 0, 68, 19, 177, 70, 708]",[],[0]


In [None]:
# Display the tokenized titles (fixes the `trian_q_` typo, which raised NameError).
train_q_.pos_title

In [370]:
# For each playlist, average word2vec vectors for its tags, title and songs.
# NOTE(review): the recorded run of this cell ends in `KeyError: 688032`, the
# first raw song id of the printed playlist. `word_idx_song` is built from
# stringified ids of only the most common songs, so raw integer ids presumably
# cannot be looked up in it — confirm.
# NOTE(review): `train_q_` is turned into a DataFrame in an earlier cell; the
# printed dict suggests this loop ran while it was still a list of dicts.
# NOTE(review): `for _ in tags_tr for __ in _` walks the characters of each tag
# (and of each title character), and the `song_vec` comprehension uses `__`
# although its own loop variable is `_`.
for train in train_q_:
    tags_tr, title_tr, songs_tr = train['tags'], train['plylst_title'], train['songs']
    print(train)
    tag_vec = np.mean([w2v_tag.wv[word_idx_tag[__]] 
                       for _ in tags_tr for __ in _
                       if __ in w2v_tag])
    title_vec = np.mean([w2v_title.wv[word_idx_title[__]] 
                         for _ in title_tr for __ in _
                        if _ in w2v_title])
    song_vec = np.mean([[w2v_song.wv[word_idx_song[__]] 
                         for _ in songs_tr 
]])
    print(tag_vec, title_vec, song_vec)

{'tags': [], 'id': 11043, 'plylst_title': '결국 목소리다 – 시원하게 질러주는 성악 모음', 'songs': [688032, 512889, 480493, 291722, 565605, 555723, 680174, 13106, 301396, 145012, 595590, 198198, 80053, 268090, 23427], 'like_cnt': 67, 'updt_date': '2018-07-30 22:48:50.000'}


  if __name__ == '__main__':


KeyError: 688032

In [53]:
# len(use_song)

In [54]:
# max([len(train['songs']) for train in train_q])

In [29]:
# NOTE(review): unfinished cell. `input_shape=` has no value and `model.` is a
# dangling attribute access, so this cell does not parse as written. The
# recorded ValueError reports that the model was not built because no input
# shape was given; the intended input shape still has to be decided.
model = Sequential()
model.add(Dense(1000), input_shape=)
model.
model.summary()

ValueError: This model has not yet been built. Build the model first by calling `build()` or calling `fit()` with some data, or specify an `input_shape` argument in the first layer(s) for automatic build.

In [28]:
model.compile(optimizer='rmsprop',
              loss='categorical_crossentropy',
              metrics=['accuracy'])   

# Data Preprocessing 

In [4]:
def _index_maps(keys):
    """Return ``(key -> position, position -> key)`` dicts in the iteration order of ``keys``."""
    forward = {key: pos for pos, key in enumerate(keys)}
    backward = dict(enumerate(keys))
    return forward, backward


def _filter_counts(counter, keep):
    """Return ``{key: count}`` for the entries of ``counter`` whose count satisfies ``keep``."""
    return {key: cnt for key, cnt in counter.items() if keep(cnt)}


# --- Tags: occurrence counts and contiguous ids --------------------------------
plylst_tag = plylst['tags']
tag_counter = Counter(tg for tgs in plylst_tag for tg in tgs)
tag_dict = dict(tag_counter)
tag_id_tid, tag_tid_id = _index_maps(tag_dict)

n_tags = len(tag_dict)

# --- Songs: occurrence counts, then frequency-based sub-vocabularies -----------
plylst_song = plylst['songs']
song_counter = Counter(sg for sgs in plylst_song for sg in sgs)
song_dict = dict(song_counter)

song_dict_100 = _filter_counts(song_counter, lambda c: c > 100)
song_dict_10_100 = _filter_counts(song_counter, lambda c: 10 < c <= 100)
song_dict_10 = _filter_counts(song_counter, lambda c: c > 10)
song_dict_0_10 = _filter_counts(song_counter, lambda c: c <= 10)
song_dict_20 = _filter_counts(song_counter, lambda c: c > 20)

# Each vocabulary gets its own contiguous id space (raw id <-> position).
song_id_sid, song_sid_id = _index_maps(song_dict)
song_100_id_sid, song_100_sid_id = _index_maps(song_dict_100)
song_10_id_sid, song_10_sid_id = _index_maps(song_dict_10)
song_20_id_sid, song_20_sid_id = _index_maps(song_dict_20)
song_10_100_id_sid, song_10_100_sid_id = _index_maps(song_dict_10_100)
song_0_10_id_sid, song_0_10_sid_id = _index_maps(song_dict_0_10)

n_songs = len(song_dict)
n_songs_100 = len(song_dict_100)
n_songs_10 = len(song_dict_10)
n_songs_20 = len(song_dict_20)
n_songs_10_100 = len(song_dict_10_100)
n_songs_0_10 = len(song_dict_0_10)
n_plylst = len(plylst)

In [5]:
n_songs, n_songs_100, n_songs_10_100, n_songs_0_10

(615142, 8491, 66587, 540064)

In [6]:
def _lookup_known(ids, mapping):
    """Translate ``ids`` through ``mapping``, dropping ids that ``mapping`` does not contain."""
    return [mapping[key] for key in ids if key in mapping]


def _hier_pair(song):
    """Encode ``song`` as ``[tier, id within tier]``.

    Tier 0 is the >100-occurrence vocabulary, tier 1 the 10<..<=100 one, and
    everything else falls through to tier 2 (the <=10 vocabulary).
    """
    if song in song_100_id_sid:
        return [0, song_100_id_sid[song]]
    if song in song_10_100_id_sid:
        return [1, song_10_100_id_sid[song]]
    return [2, song_0_10_id_sid.get(song)]


# Song ids re-indexed into each frequency-based vocabulary (unknown songs dropped).
plylst['songs_id_10'] = plylst['songs'].map(lambda ids: _lookup_known(ids, song_10_id_sid))
plylst['songs_id_20'] = plylst['songs'].map(lambda ids: _lookup_known(ids, song_20_id_sid))
plylst['songs_id'] = plylst['songs'].map(lambda ids: _lookup_known(ids, song_id_sid))
plylst['songs_id_100'] = plylst['songs'].map(lambda ids: _lookup_known(ids, song_100_id_sid))

plylst['songs_id_10_100'] = plylst['songs'].map(lambda ids: _lookup_known(ids, song_10_100_id_sid))
plylst['songs_id_0_10'] = plylst['songs'].map(lambda ids: _lookup_known(ids, song_0_10_id_sid))

plylst['tags_id'] = plylst['tags'].map(lambda tags: _lookup_known(tags, tag_id_tid))

# Hierarchical encoding: one [tier, id] pair per song, in playlist order.
plylst['songs_id_hier'] = plylst['songs'].map(lambda ids: [_hier_pair(s) for s in ids])


In [7]:
# train_list = plylst[plylst["songs_id_20"].apply(lambda a: len(a)>0)*plylst["songs_id_20"].apply(lambda a: len(a)>0)]["songs_id_20"].tolist()

# train_list_100 = plylst[plylst["songs_id_100"].apply(lambda a: len(a)>0)]["songs_id_100"].tolist()

# train_list_10_100 = plylst[plylst["songs_id_10_100"].apply(lambda a: len(a)>0)]["songs_id_10_100"].tolist()

# train_list_0_10 = plylst[plylst["songs_id_0_10"].apply(lambda a: len(a)>0)]["songs_id_0_10"].tolist()


In [8]:
def _covers_all_tiers(hier_pairs):
    """Return True when ``hier_pairs`` has at least one song in each of tiers 0, 1 and 2.

    An empty playlist simply yields False (the former ``np.array(a)[:,0]``
    expression raised ``IndexError`` on an empty list).
    """
    return {0, 1, 2} <= {pair[0] for pair in hier_pairs}


# Evaluate the filter once and reuse it for both columns so the two lists are
# guaranteed to stay row-aligned (it used to be recomputed for each column).
all_tiers_mask = plylst['songs_id_hier'].apply(_covers_all_tiers)

train_list = plylst.loc[all_tiers_mask, 'songs_id_hier'].tolist()

train_list_a = plylst.loc[all_tiers_mask, 'songs_id'].tolist()

In [9]:
len(train_list), len(train_list_a)

(79409, 79409)

In [10]:
# print(n_songs_100)
n_tags = len(tag_dict)
print(n_tags, n_songs)
#print(len(train_list_100),len(train_list_10_100),len(train_list_0_10))

29160 615142


# Model

In [11]:
# `os` is used below but is not imported in the notebook's import cell
# (presumably it only leaked in through `from models import *`).
import os

from models import *

# Pin the process to the first GPU, ordered by PCI bus id.
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = '0'

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# Derive `use_cuda` from the selected device instead of hard-coding True, so
# the cell also runs on CPU-only machines.
model = MLP5(n_tags, n_songs_100, n_songs_10_100, n_songs_0_10, layers=[1548, 512, 256], dropout=False, use_cuda=(device.type == "cuda"))
if torch.cuda.device_count() > 1:
    model = nn.DataParallel(model)
model.to(device)

MLP5(
  (song_embedding_100): Embedding(8491, 774)
  (song_embedding_10_100): Embedding(66587, 387)
  (song_embedding_0_10): Embedding(540064, 387)
  (fc_layers): ModuleList(
    (0): Linear(in_features=1548, out_features=512, bias=True)
    (1): Linear(in_features=512, out_features=256, bias=True)
  )
  (output_layer): Linear(in_features=256, out_features=615142, bias=True)
)

In [12]:
# #from models import MLP
# os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
# os.environ["CUDA_VISIBLE_DEVICES"] = '0'
# # device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# model = MLP(n_tags, n_songs_20, layers=[4096, 2048, 6192], dropout=False, use_cuda = True)
# if torch.cuda.device_count() > 1:
#     model = nn.DataParallel(model)
# model.to(device)


In [12]:
# map_location remaps the stored tensors onto the active device, so a
# checkpoint written on a GPU can also be restored on a CPU-only machine.
checkpoint = torch.load("./output_model_epoch_0.pth", map_location=device)


In [13]:
model.load_state_dict(checkpoint['model_state_dict'])


<All keys matched successfully>

In [9]:
# Alternative model cell: builds an MLP4 over the `n_songs_20` vocabulary
# (songs occurring more than 20 times) with a long stack of layer sizes.
# NOTE(review): this re-binds `model` and `device`, replacing whatever an
# earlier model cell created; the execution counts are out of order, so confirm
# which model cell is meant to be live before running the training cells.
# NOTE(review): `os` is not imported in the visible import cell; presumably it
# is re-exported by `from models import *` — confirm.
from models import *

os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = '0'

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model = MLP4(n_tags, n_songs_20, layers=[1024, 512, 256, 512, 256, 128, 256, 128, 256, 512, 256, 512, 256, 512, 256, 512, 256, 512, 1024, 2048, 4096], dropout=False, use_cuda = True)
if torch.cuda.device_count() > 1:
    model = nn.DataParallel(model)
model.to(device)

MLP4(
  (song_embedding): Embedding(42109, 1024)
  (fc_layers): ModuleList(
    (0): Linear(in_features=1024, out_features=512, bias=True)
    (1): Linear(in_features=512, out_features=256, bias=True)
    (2): Linear(in_features=256, out_features=512, bias=True)
    (3): Linear(in_features=512, out_features=256, bias=True)
    (4): Linear(in_features=256, out_features=128, bias=True)
    (5): Linear(in_features=128, out_features=256, bias=True)
    (6): Linear(in_features=256, out_features=128, bias=True)
    (7): Linear(in_features=128, out_features=256, bias=True)
    (8): Linear(in_features=256, out_features=512, bias=True)
    (9): Linear(in_features=512, out_features=256, bias=True)
    (10): Linear(in_features=256, out_features=512, bias=True)
    (11): Linear(in_features=512, out_features=256, bias=True)
    (12): Linear(in_features=256, out_features=512, bias=True)
    (13): Linear(in_features=512, out_features=256, bias=True)
    (14): Linear(in_features=256, out_features=512,

In [25]:
#from models import MLP1
# Alternative model cell: builds an MLP2 over the `n_songs_20` vocabulary
# (songs occurring more than 20 times).
# NOTE(review): this re-binds `model` and `device`, replacing whatever an
# earlier model cell created; the execution counts are out of order, so confirm
# which model cell is meant to be live before running the training cells.
# NOTE(review): `os` is not imported in the visible import cell; presumably it
# is re-exported by `from models import *` — confirm.
from models import *

os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = '0'

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model = MLP2(n_tags, n_songs_20, layers=[1024, 512, 256, 512, 256, 128, 256, 128, 256, 512, 256, 512, 256, 512, 256, 512, 256, 512, 1024, 2048], dropout=False, use_cuda = True)
if torch.cuda.device_count() > 1:
    model = nn.DataParallel(model)
model.to(device)

MLP2(
  (song_embedding): Embedding(42109, 1024)
  (fc_layers): ModuleList(
    (0): Linear(in_features=1024, out_features=512, bias=True)
    (1): Linear(in_features=512, out_features=256, bias=True)
    (2): Linear(in_features=256, out_features=512, bias=True)
    (3): Linear(in_features=512, out_features=256, bias=True)
    (4): Linear(in_features=256, out_features=128, bias=True)
    (5): Linear(in_features=128, out_features=256, bias=True)
    (6): Linear(in_features=256, out_features=128, bias=True)
    (7): Linear(in_features=128, out_features=256, bias=True)
    (8): Linear(in_features=256, out_features=512, bias=True)
    (9): Linear(in_features=512, out_features=256, bias=True)
    (10): Linear(in_features=256, out_features=512, bias=True)
    (11): Linear(in_features=512, out_features=256, bias=True)
    (12): Linear(in_features=256, out_features=512, bias=True)
    (13): Linear(in_features=512, out_features=256, bias=True)
    (14): Linear(in_features=256, out_features=512,

In [12]:
from torch import optim
optimizer = optim.Adam(model.parameters(), lr=1e-2)

In [13]:
import torch
import torch.nn.functional as F
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
# Dataset 상속
class CustomDataset(Dataset):
  """Dataset pairing each input id sequence with its label id sequence.

  Args:
    xinput: Indexable collection of input sequences; every item must be
      accepted by ``torch.LongTensor``.
    yinput: Indexable collection of label sequences, indexed in parallel
      with ``xinput``.
  """

  def __init__(self, xinput, yinput):
    self.x_data, self.y_data = xinput, yinput

  def __len__(self):
    """Return the total number of samples (the length of the input collection)."""
    return len(self.x_data)

  def __getitem__(self, idx):
    """Return sample ``idx`` as ``{"input": LongTensor, "label": LongTensor}``.

    The tensors are created on the CPU; moving them to a device is left to
    the consumer of the batch.
    """
    sample_x = self.x_data[idx]
    sample_y = self.y_data[idx]
    return {"input": torch.LongTensor(sample_x), "label": torch.LongTensor(sample_y)}




In [14]:
def make_batch(samples, padding_value=None):
    """Collate variable-length samples into padded, batch-first tensors.

    Usable directly as a ``DataLoader`` ``collate_fn`` (called with
    ``samples`` only).

    Args:
        samples: List of dicts with tensor values under ``'input'`` and
            ``'label'`` (as produced by ``CustomDataset.__getitem__``).
        padding_value: Fill value for positions past a sequence's end. When
            omitted, the module-level ``n_songs_20`` is used, which is the
            value that used to be hard-coded.

    Returns:
        Dict ``{'input': Tensor, 'label': Tensor}``; each tensor stacks the
        corresponding sequences, padded to the longest one in the batch.
    """
    if padding_value is None:
        # Historical default: len(song_dict_20), i.e. one past the largest
        # valid id of the >20-occurrence vocabulary.
        padding_value = n_songs_20

    pad = torch.nn.utils.rnn.pad_sequence
    inputs = pad([sample['input'] for sample in samples],
                 batch_first=True, padding_value=padding_value)
    labels = pad([sample['label'] for sample in samples],
                 batch_first=True, padding_value=padding_value)
    return {'input': inputs,
            'label': labels}

In [15]:
# Named `train_dataset` rather than `input` so the builtin `input()` is not shadowed.
train_dataset = CustomDataset(train_list, train_list_a)
dataloader = DataLoader(train_dataset, batch_size=1, collate_fn=make_batch, shuffle=True)

In [18]:
# Train for two epochs. The third argument is the sum of the three tier
# vocabulary sizes, which equals `n_songs` per the counts displayed earlier.
# NOTE(review): the recorded run of this cell ended with "CUDA out of memory"
# on an 11 GiB GPU; presumably the 615142-wide output layer and the embedding
# tables plus optimizer state dominate — confirm before retrying.
# NOTE(review): `use_cuda = True` is hard-coded; the meaning of this flag
# inside `Trainer` is not visible here.
trainer = Trainer(model, optimizer, n_songs_100 + n_songs_10_100 + n_songs_0_10,
                 print_loss_every=100,
                 record_loss_every=5,
                 use_cuda = True)
trainer.train(dataloader, epochs=2)

RuntimeError: CUDA out of memory. Tried to allocate 798.00 MiB (GPU 0; 11.00 GiB total capacity; 9.37 GiB already allocated; 346.50 MiB free; 9.40 GiB reserved in total by PyTorch)

In [17]:
torch.save({'epoch': 0, 'model_state_dict': model.state_dict(), 'optimizer_state_dict': optimizer.state_dict()},"./output_model_epoch_0.pth")

In [18]:
val_json = load_json('../file/val.json')
val_json_1 = load_json("../file/results.json")

In [19]:
def remove_seen(seen, l):
    """Return the items of ``l`` that do not occur in ``seen``.

    Args:
        seen: Iterable of hashable items to exclude.
        l: Iterable of candidate items.

    Returns:
        List of the surviving candidates; order and duplicates are preserved.
    """
    excluded = set(seen)
    return list(filter(lambda candidate: candidate not in excluded, l))

In [20]:
def _encode_hier(song):
    """Encode a raw song id as ``[tier, id within tier]``, or None for an unknown song.

    Tiers are tried in order: 0 (>100 occurrences), 1 (10 < n <= 100),
    2 (<= 10).
    """
    for tier, id_map in enumerate((song_100_id_sid, song_10_100_id_sid, song_0_10_id_sid)):
        sid = id_map.get(song)
        if sid is not None:
            return [tier, sid]
    return None


for idx, plst in enumerate(val_json):
    if len(plst['songs']) != 0:
        # Unknown songs are dropped explicitly. This replaces the former
        # hard-coded 615142 sentinel and element-wise mask, which depended on
        # the vocabulary size of one particular dataset.
        pairs = [pair for pair in map(_encode_hier, plst['songs']) if pair is not None]
        # reshape keeps the (n, 2) layout even when no song is known (n == 0).
        ipp = [torch.tensor(pairs, dtype=torch.long, device=device).reshape(-1, 2)]
        output = model.predict(ipp)
        rec_song_idx = [song_sid_id[i] for i in output]
        rec_song_idx = remove_seen(plst['songs'], rec_song_idx)
        rec_song_idx = rec_song_idx[:100]
        # NOTE(review): assumes results.json lists playlists in the same order
        # as val.json — confirm.
        val_json_1[idx]['songs'] = rec_song_idx
    if idx % 500 == 0:
        print(idx)


0
500
1000
1500
2000
2500
3000
3500
4000
4500
5000
5500
6000
6500
7000
7500
8000
8500
9000
9500
10000
10500
11000
11500
12000
12500
13000
13500
14000
14500
15000
15500
16000
16500
17000
17500
18000
18500
19000
19500
20000
20500
21000
21500
22000
22500
23000


In [22]:
output

array([375205, 403509, 302814, 577464, 110804, 120282,  53468, 429662,
        46894, 187353,  84157,  24355, 313185, 104931, 451993, 307118,
       182369, 450387, 377740, 595349, 543041,  58227, 471478, 605504,
        28907, 406451, 529940, 524900, 106554, 397420,  37860, 146677,
       114593, 126908, 581766, 502784, 291339, 423164,  85924, 152538,
       409752, 226754,      0,  95480, 304912, 444864,  84648, 556605,
       387161, 431915, 566758, 136839, 212922,  37388, 187268, 557458,
       345563, 434584, 101344, 375776, 213192, 610982, 548901, 598822,
       235126, 466859, 145443, 366366, 449636, 281114, 514351, 348241,
       588003, 289977, 450049, 510930, 178832, 367905, 372123, 279450,
       523010, 556609, 594090, 520071,  79168, 515996, 452334, 514259,
       242194, 475587, 167449, 584090, 273474,  34799, 474495,      1,
       197197, 288069, 127896, 593605, 359027, 445257, 419149, 503400,
       214592, 322273, 568653, 597565, 461940, 297482, 385187, 220822,
      

In [23]:
ipp

[tensor([[    0,  4299],
         [    0,  1498],
         [    0,  3404],
         [    0,  5061],
         [    0,  2919],
         [    0,  1506],
         [    0,  1518],
         [    0,  2252],
         [    0,  2870],
         [    1, 18259],
         [    1, 20741],
         [    0,  1513],
         [    0,  1514],
         [    1, 28290],
         [    0,   522],
         [    0,  6394],
         [    0,  7331],
         [    0,  1270],
         [    0,  1497],
         [    0,  7686]], device='cuda:0')]

In [21]:
write_json(val_json_1, "./year_genre_onehot/results.json")

# test

In [20]:
for x in dataloader:
    data, label = x['input'], x['label']
    print(data)
    print(len(data))
    print("=======")
    print(label)
    break   

tensor([[[     1,  17608],
         [     0,   5097],
         [     2,   2474],
         [     2, 480139],
         [     2, 390715],
         [     2, 480140],
         [     2, 480141],
         [     2, 480142],
         [     2, 480143],
         [     1,  33522],
         [     2, 480144],
         [     2, 480145],
         [     2, 480146],
         [     2, 411721]]])
1
tensor([[ 39421,  18188,   9378, 555217, 465793, 555218, 555219, 555220, 555221,
          76691, 555222, 555223, 555224, 486799]])


In [None]:
for idx, plst in enumerate(val_json):
    if len(plst['songs']) !=0 :
        tmp=np.array(list(map(lambda s: [0,song_100_id_sid.get(s)] if song_100_id_sid.get(s) != None else [1,song_10_100_id_sid.get(s)] if song_10_100_id_sid.get(s) != None else [2,song_0_10_id_sid.get(s)] if song_0_10_id_sid.get(s) != None else [615142,615142], plst['songs'])))
        torch.from_numpy(tmp[tmp!=[615142,615142]].reshape(-1,2)).to(dtype = torch.long, device = device)
    if idx == 5:
        break


In [None]:
for idx, plst in enumerate(val_json):
    if len(plst['songs']) !=0 :
        ipp = np.array([[song_20_id_sid.get(i) for i in val_json[idx]['songs'] if song_20_id_sid.get(i) != None]])
        output = model.predict(ipp)
        output = output[np.isin(output, ipp[0]) == False][:100]
        rec_song_idx = [song_20_sid_id[i] for i in output]
        val_json_1[idx]['songs']=rec_song_idx
    if idx % 500 == 0 :
        print(idx)

# CUDA for PyTorch
use_cuda = torch.cuda.is_available()
device = torch.device("cuda:0" if use_cuda else "cpu")
# cudnn.benchmark = True


def main():
    """Train and evaluate the MLP recommender from command-line arguments.

    Builds a ``CustomDataset`` from the module-level ``train_q_list`` and
    ``train_a_list``, trains an ``MLP`` with Adam and BCE loss for
    ``args.epochs`` epochs, evaluates HR/NDCG at ``topK = 100`` before
    training and then every ``args.verbose`` epochs, and prints the collected
    metrics together with the best evaluation by HR.

    Changes relative to the previous version:
      * ``args.layers`` is parsed with ``ast.literal_eval`` instead of
        ``eval``, so a command-line string can no longer execute code.
      * ``args.lr`` is now passed to the optimizer (it was read but ignored).
      * ``args.verbose == 0`` no longer raises ``ZeroDivisionError``; it
        disables the periodic evaluation.

    NOTE(review): ``parse_args``, ``test``, ``train_one_epoch``, ``MLP``,
    ``train_q_list`` and ``train_a_list`` are not defined in the visible part
    of this file (the ``utils`` import at the top is commented out) — confirm
    they are provided before running. ``args.learner`` is not consulted; the
    optimizer is always Adam.
    """
    import ast  # local import: only needed to parse the layers argument

    args = parse_args()
    layers = ast.literal_eval(args.layers)
    weight_decay = args.weight_decay
    dropout = args.dropout
    learning_rate = args.lr
    batch_size = args.batch_size
    epochs = args.epochs
    verbose = args.verbose

    topK = 100
    print("MLP arguments: %s " % (args))

    # Load data
    t1 = time()
    full_dataset = CustomDataset(train_q_list, train_a_list)
    num_data = len(full_dataset)

    print("Load data done [%.1f s]. #user=%d, #item=%d"
          % (time()-t1, num_data, n_songs_100))

    training_data_generator = DataLoader(
        full_dataset, batch_size=batch_size, shuffle=True, num_workers=0)

    # Build model and transfer it to the GPU, if one is available
    model = MLP(num_data, n_songs_100, layers=layers, dropout=dropout)
    model.to(device)
    if verbose:
        print(model)

    loss_fn = torch.nn.BCELoss()
    optimizer = torch.optim.Adam(
        model.parameters(), lr=learning_rate, weight_decay=weight_decay)

    # Record performance; index 0 holds the metrics of the untrained model.
    hr_list = []
    ndcg_list = []
    BCE_loss_list = []

    hr, ndcg = test(model, full_dataset, topK)
    hr_list.append(hr)
    ndcg_list.append(ndcg)
    BCE_loss_list.append(1)  # placeholder: no loss exists before the first epoch

    for epoch in range(epochs):
        epoch_loss = train_one_epoch(
            model, training_data_generator, loss_fn, optimizer, epoch, device)

        # `verbose` doubles as the evaluation interval; 0 means "never".
        if verbose and epoch % verbose == 0:
            hr, ndcg = test(model, full_dataset, topK)
            hr_list.append(hr)
            ndcg_list.append(ndcg)
            BCE_loss_list.append(epoch_loss)

    print("hr for epochs: ", hr_list)
    print("ndcg for epochs: ", ndcg_list)
    print("loss for epochs: ", BCE_loss_list)

    # "Iteration" indexes the evaluation lists, so 0 is the pre-training check.
    best_iter = np.argmax(np.array(hr_list))
    best_hr = hr_list[best_iter]
    best_ndcg = ndcg_list[best_iter]
    print("End. Best Iteration %d:  HR = %.4f, NDCG = %.4f. " %
          (best_iter, best_hr, best_ndcg))


if __name__ == "__main__":
    print("Device available: {}".format(device))
    main()