In [3]:
import copy
import random
import numpy as np
import pandas as pd
import time
from collections import Counter
import torch
from util.arena_util import load_json
from util.arena_util import write_json
import json
# PyTorch imports
import torch
import torch.nn.functional as F
from torch.autograd import Variable
from torch import nn
from torch.utils.data import Dataset, DataLoader
from models import *
torch.manual_seed(0)
# Workspace imports
#from evaluate import evaluate_model
#from utils import train_one_epoch, test, plot_statistics

# Python imports
import argparse
from time import time
import numpy as np
import pickle

# Data Loading


In [4]:
def _split_data(playlists):
    tot = len(playlists)
    train = playlists[:int(tot*0.80)]
    val = playlists[int(tot*0.80):]

    return train, val
def _mask(playlists, mask_cols, del_cols):
    q_pl = copy.deepcopy(playlists)
    a_pl = copy.deepcopy(playlists)

    for i in range(len(playlists)):
        for del_col in del_cols:
            q_pl[i][del_col] = []
            if del_col == 'songs':
                a_pl[i][del_col] = a_pl[i][del_col][:100]
            elif del_col == 'tags':
                a_pl[i][del_col] = a_pl[i][del_col][:10]

        for col in mask_cols:
            mask_len = len(playlists[i][col])
            mask = np.full(mask_len, False)
            mask[:mask_len//2] = True
            np.random.shuffle(mask)

            q_pl[i][col] = list(np.array(q_pl[i][col])[mask])
            a_pl[i][col] = list(np.array(a_pl[i][col])[np.invert(mask)])

    return q_pl, a_pl

def _mask_data(playlists):
    playlists = copy.deepcopy(playlists)
    tot = len(playlists)
    song_only = playlists[:int(tot * 0.3)]
    song_and_tags = playlists[int(tot * 0.3):int(tot * 0.8)]
    tags_only = playlists[int(tot * 0.8):int(tot * 0.95)]
    title_only = playlists[int(tot * 0.95):]

    print(f"Total: {len(playlists)}, "
            f"Song only: {len(song_only)}, "
            f"Song & Tags: {len(song_and_tags)}, "
            f"Tags only: {len(tags_only)}, "
            f"Title only: {len(title_only)}")

    song_q, song_a = _mask(song_only, ['songs'], ['tags'])
    songtag_q, songtag_a = _mask(song_and_tags, ['songs', 'tags'], [])
    tag_q, tag_a = _mask(tags_only, ['tags'], ['songs'])
    title_q, title_a = _mask(title_only, [], ['songs', 'tags'])

    q = song_q + songtag_q + tag_q + title_q
    a = song_a + songtag_a + tag_a + title_a

    shuffle_indices = np.arange(len(q))
    np.random.shuffle(shuffle_indices)

    q = list(np.array(q)[shuffle_indices])
    a = list(np.array(a)[shuffle_indices])

    return q, a

In [4]:
##train = pd.read_json('../file/train.json', encoding='utf-8')
#song_meta = pd.read_json('../file/song_meta.json', encoding='utf-8')


In [5]:
random.seed(777)
fname = '../file/train.json'
print("Reading data...\n")
playlists = load_json(fname)
random.shuffle(playlists)
print(f"Total playlists: {len(playlists)}")

print("Splitting data...")
train, val = _split_data(playlists)

Reading data...

Total playlists: 115071
Splitting data...


In [5]:
print(len(train),len(val))

92056 23015


In [6]:
train_q, train_a = _mask_data(playlists)
print(len(train_q),len(train_a))


Total: 115071, Song only: 34521, Song & Tags: 57535, Tags only: 17261, Title only: 5754
115071 115071


In [37]:
print("Masked...")
write_json(train_q, "train_q.json")
write_json(train_a, "train_a.json")

Masked...


In [5]:
plylst = pd.read_json('../file/train.json', encoding='utf-8')
plylst_train_q = pd.read_json('arena_data/train_q.json', encoding='utf-8')
plylst_train_a = pd.read_json('arena_data/train_a.json', encoding='utf-8')
#song_meta = pd.read_json('../file/song_meta.json', encoding='utf-8')


In [6]:
train_q = load_json('arena_data/train_q.json')
train_a = load_json('arena_data/train_a.json')

In [58]:
# Data Preprocessing 

In [7]:
plylst_tag = plylst['tags']
tag_counter = Counter([tg for tgs in plylst_tag for tg in tgs])
tag_dict = {x: tag_counter[x] for x in tag_counter}

tag_id_tid = dict()
tag_tid_id = dict()
for i, t in enumerate(tag_dict):
    tag_id_tid[t] = i
    tag_tid_id[i] = t

n_tags = len(tag_dict)

plylst_song = plylst['songs']
#song_selected = [_[0] for _ in list(filter(lambda e: e[1] > 100, Counter(songs).items()))]
song_counter = Counter([sg for sgs in plylst_song for sg in sgs])
song_dict = {x: song_counter[x] for x in song_counter}

song_dict_100 = {x: song_counter[x] for x in song_counter if song_counter[x]>100}

song_id_sid = dict()
song_sid_id = dict()

song_100_id_sid = dict()
song_100_sid_id = dict()

for i, t in enumerate(song_dict):
    song_id_sid[t] = i
    song_sid_id[i] = t

for i, t in enumerate(song_dict_100):
    song_100_id_sid[t] = i
    song_100_sid_id[i] = t

n_songs = len(song_dict)
n_songs_100 = len(song_dict_100)
n_plylst = len(plylst)

In [8]:
plylst['songs_id'] = plylst['songs'].map(lambda x: [song_100_id_sid.get(s) for s in x if song_100_id_sid.get(s) != None])
plylst['tags_id'] = plylst['tags'].map(lambda x: [tag_id_tid.get(t) for t in x if tag_id_tid.get(t) != None])


plylst_train_q['songs_id'] = plylst_train_q['songs'].map(lambda x: [song_100_id_sid.get(s) for s in x if song_100_id_sid.get(s) != None])
plylst_train_q['tags_id'] = plylst_train_q['tags'].map(lambda x: [tag_id_tid.get(t) for t in x if tag_id_tid.get(t) != None])


plylst_train_a['songs_id'] = plylst_train_a['songs'].map(lambda x: [song_100_id_sid.get(s) for s in x if song_100_id_sid.get(s) != None])
plylst_train_a['tags_id'] = plylst_train_a['tags'].map(lambda x: [tag_id_tid.get(t) for t in x if tag_id_tid.get(t) != None])

In [9]:
train_q_list = plylst_train_q['songs_id'].tolist()
train_a_list = plylst_train_a['songs_id'].tolist()
print(len(train_q_list))


115071


In [8]:
print(n_songs_100)
n_tags = len(tag_dict)
print(n_tags)

8491
29160


In [74]:
def one_hot(train):
    X = np.array([enc.transform(np.array(song_list).reshape(-1,1)).toarray().sum(axis=0) if song_list != [] else np.zeros(n_songs_100) for song_list in train.songs_id])
    return X

In [75]:
X_q = one_hot(plylst_train_q)
X_a = one_hot(plylst_train_a)

In [78]:
print(X_q.shape, X_a.shape)

(115071, 8491) (115071, 8491)


# Model

In [10]:
#from models import MLP
model = MLP(n_tags, n_songs, layers=[1024, 256, 1024], dropout=False, use_cuda = True)
'''
    args = parse_args()
    path = args.path
    dataset = args.dataset
    layers = eval(args.layers)
    weight_decay = args.weight_decay
    num_negatives_train = args.num_neg_train
    num_negatives_test = args.num_neg_test
    dropout = args.dropout
    learner = args.learner
    learning_rate = args.lr
    batch_size = args.batch_size
    epochs = args.epochs
    verbose = args.verbose
'''

'\n    args = parse_args()\n    path = args.path\n    dataset = args.dataset\n    layers = eval(args.layers)\n    weight_decay = args.weight_decay\n    num_negatives_train = args.num_neg_train\n    num_negatives_test = args.num_neg_test\n    dropout = args.dropout\n    learner = args.learner\n    learning_rate = args.lr\n    batch_size = args.batch_size\n    epochs = args.epochs\n    verbose = args.verbose\n'

In [11]:
from torch import optim
optimizer = optim.Adam(model.parameters(), lr=5e-4)

In [12]:
from training import Trainer
trainer = Trainer(model, optimizer,
                 print_loss_every=50,
                 record_loss_every=5,
                 use_cuda = True)

In [13]:
import torch
import torch.nn.functional as F
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
# Dataset 상속
class CustomDataset(Dataset): 
  def __init__(self, xinput, yinput, batchsize):
    self.x_data = xinput
    self.y_data = yinput
    self.batch_size = batchsize

  # 총 데이터의 개수를 리턴
  def __len__(self): 
    return len(self.x_data)

  # 인덱스를 입력받아 그에 맵핑되는 입출력 데이터를 파이토치의 Tensor 형태로 리턴
  def __getitem__(self, idx): 
    x = torch.FloatTensor(self.x_data[idx])
    y = torch.FloatTensor(self.y_data[idx])
    return x, y

mydata = CustomDataset(train_q_list, train_a_list, 50)


In [14]:
for i in range(5):
    x,y=mydata[i]
    print(x,y)

tensor([]) tensor([5724., 2634., 1332.])
tensor([ 959., 2739., 4066., 2748.]) tensor([6475., 2737., 1342., 2796., 2741., 2750.])
tensor([1691.,  147.,  148., 1048., 1694.]) tensor([1362.,  149., 1693.,  150., 1049.])
tensor([]) tensor([1455., 4941., 5575., 3786., 5576., 5577., 5357., 5358., 4243., 5201.,
        5583., 3443., 1456., 3444., 6684., 6579., 3788.,  603.,  842., 1565.,
        4889., 2123., 1881., 3342., 4062., 1336., 4069., 1956.,  877., 3689.,
         278.,  292., 5350., 4516.,  384., 1963.,  437., 2252., 3350., 1446.,
        1325.,  305., 2997., 4719.,  851., 4051., 1401., 6819., 5352., 4237.,
        4040., 3384., 4036., 4728., 4240., 5580., 8464., 4068., 5581., 4033.,
        4049., 3437., 4045., 4047., 4953., 4070., 3781., 5582., 3440., 2255.,
        3441., 3442.,  981., 5572., 3759.])
tensor([4.8450e+03, 1.4600e+03, 3.0000e+00, 4.1280e+03, 4.2990e+03, 1.4980e+03,
        6.9990e+03, 2.7200e+02, 8.2260e+03, 6.5800e+03, 1.5640e+03, 1.2960e+03,
        3.3750e+03, 1.

In [15]:
trainer = Trainer(model, optimizer)
trainer.train(mydata)

RuntimeError: Expected tensor for argument #1 'indices' to have scalar type Long; but got torch.FloatTensor instead (while checking arguments for embedding)

# main

In [63]:
model = MLP(n_tags, n_songs)

In [68]:
from torch import optim
# Build optimizer
optimizer = optim.Adam(model.parameters(), lr=5e-4)

In [101]:
# full_dataset = CustomDataset(train_q, train_a)
train_loader_q = DataLoader(train_q, batch_size=batch_size, shuffle=False, num_workers=0)
train_loader_a = DataLoader(train_a, batch_size=batch_size, shuffle=False, num_workers=0)

# .train(epochs=50)

In [102]:
type(train_loader_q)

torch.utils.data.dataloader.DataLoader

# CUDA for PyTorch
use_cuda = torch.cuda.is_available()
device = torch.device("cuda:0" if use_cuda else "cpu")
# cudnn.benchmark = True


def main():

    args = parse_args()
    path = args.path
    dataset = args.dataset
    layers = eval(args.layers)
    weight_decay = args.weight_decay
    num_negatives_train = args.num_neg_train
    num_negatives_test = args.num_neg_test
    dropout = args.dropout
    learner = args.learner
    learning_rate = args.lr
    batch_size = args.batch_size
    epochs = args.epochs
    verbose = args.verbose

    topK = 100
    print("MLP arguments: %s " % (args))
    # model_out_file = 'Pretrain/%s_MLP_%s_%d.h5' %(args.dataset, args.layers, time())

    # Load data

    t1 = time()
    full_dataset = CustomDataset(train_q_list, train_a_list)

    train_data_q, train_data_a = full_dataset.x_data, full_dataset.y_data
    num_data = len(full_dataset)

    print("Load data done [%.1f s]. #user=%d, #item=%d"
          % (time()-t1, num_data, n_songs_100))
    
    #dataloader = DataLoader(dataset, batch_size=2, shuffle=True)
    
    training_data_generator = DataLoader(
        full_dataset, batch_size=batch_size, shuffle=True, num_workers=0)

    # Build model
    model = MLP(num_data, n_songs_100, layers=layers, dropout=dropout)
    # Transfer the model to GPU, if one is available
    model.to(device)
    if verbose:
        print(model)

    loss_fn = torch.nn.BCELoss()
    # Use Adam optimizer
    optimizer = torch.optim.Adam(model.parameters(), weight_decay=weight_decay)

    # Record performance
    hr_list = []
    ndcg_list = []
    BCE_loss_list = []

    # Check Init performance
    hr, ndcg = test(model, full_dataset, topK)
    hr_list.append(hr)
    ndcg_list.append(ndcg)
    BCE_loss_list.append(1)
    # do the epochs now

    for epoch in range(epochs):
        epoch_loss = train_one_epoch( model, training_data_generator, loss_fn, optimizer, epoch, device)

        if epoch % verbose == 0:
            hr, ndcg = test(model, full_dataset, topK)
            hr_list.append(hr)
            ndcg_list.append(ndcg)
            BCE_loss_list.append(epoch_loss)
            # if hr > best_hr:
            #     best_hr, best_ndcg, best_iter = hr, ndcg, epoch
            #     if args.out > 0:
            #         model.save(model_out_file, overwrite=True)
    print("hr for epochs: ", hr_list)
    print("ndcg for epochs: ", ndcg_list)
    print("loss for epochs: ", BCE_loss_list)
    # plot_statistics(hr_list, ndcg_list, BCE_loss_list,model.get_alias(), "./figs")
    # with open("metrics", 'wb') as fp:
    #     pickle.dump(hr_list, fp)
    #     pickle.dump(ndcg_list, fp)

    best_iter = np.argmax(np.array(hr_list))
    best_hr = hr_list[best_iter]
    best_ndcg = ndcg_list[best_iter]
    print("End. Best Iteration %d:  HR = %.4f, NDCG = %.4f. " %
          (best_iter, best_hr, best_ndcg))
    # if args.out > 0:
    #     print("The best MLP model is saved to %s" %(model_out_file))


if __name__ == "__main__":
    print("Device available: {}".format(device))
    main()