In [1]:
import os
import numpy as np
import pandas as pd
from scipy.sparse import *
from collections import Counter
from sklearn.utils import shuffle
os.environ['KMP_DUPLICATE_LIB_OK']='True'

from implicit.evaluation import  *
from implicit.als import AlternatingLeastSquares as ALS
from implicit.bpr import BayesianPersonalizedRanking as BPR

In [2]:
data_path = "../data/"

train = pd.read_json(os.path.join(data_path, "train.json"))
test  = pd.read_json(os.path.join(data_path, "test.json"))

In [3]:
tag_count = dict(Counter([__ for _ in train.tags for __ in _]))
len(tag_count)

29160

In [4]:
sorted_tag = sorted(tag_count.items(), key=lambda x: -x[1])
top_tags   = [x[0] for x in sorted_tag[:10]]
top_tags

['기분전환', '감성', '휴식', '발라드', '잔잔한', '드라이브', '힐링', '사랑', '새벽', '밤']

In [17]:
# train test split
songs_train = train.songs.tolist()
songs_test = test.songs.tolist()

tags_train = train.tags.tolist()
tags_test  = test.tags.tolist()
ids_test   = test.id.tolist()

print(f"train set: {len(songs_train)}, test set: {len(songs_test)}")

train set: 115071, test set: 10740


In [6]:
# vectorizor

from itertools import groupby
vectored_train    = []
vectored_item_ids = {}
vectored_tags     = {}
idx = 0

for i, l in enumerate(songs_train):
    view = l
    for item_id in view:
        if item_id not in vectored_item_ids:
            vectored_item_ids[item_id] = idx
            idx += 1
    view = [vectored_item_ids[x] for x in view]
    vectored_train.append(view)

n_items = 0
for i, tags in enumerate(tags_train):
    for tag in tags:
        if tag not in vectored_tags:
            vectored_tags[tag] = n_items + idx
            idx += 1
    vectored_train[i].extend([vectored_tags[x] for x in tags])
n_items = len(vectored_item_ids)
n_tags = len(vectored_tags)

print(f"number of item: {n_items}, number of tags: {n_tags}")

number of item: 615142, number of tags: 29160


In [7]:
vectored_test = []

for i, l in enumerate(songs_test):
    view = l
    ret = []
    for item_id in view:
        if item_id not in vectored_item_ids:
            continue
        ret.append(vectored_item_ids[item_id])
    vectored_test.append(ret)

for i, tags in enumerate(tags_test):
    ret = []
    for tag in tags:
        if tag not in vectored_tags:
            continue
        ret.append(tag)
    vectored_test[i].extend([vectored_tags[x] for x in ret])

In [8]:
vectored_train = shuffle(vectored_train)
vectored_item_ids = {x:y for(y,x) in vectored_item_ids.items()}
vectored_tags = {(x - n_items):y for(y,x) in vectored_tags.items()}

In [9]:
from scipy.sparse import csr_matrix

row, col, value = [],[],[]
for i, n in enumerate(vectored_train):
    row.extend([i]*len(n))
    col.extend(np.arange(len(n)))
    value.extend(n)

csr_train = csr_matrix((value, (row, col)))


In [10]:
row, col, value = [],[],[]
for i, n in enumerate(vectored_train):
    row.extend([i]*len(n))
    col.extend(np.arange(len(n)))
    value.extend(n)

csr_test  = csr_matrix((value, (row, col)))

In [11]:
from scipy.sparse import vstack
csr_whole = vstack([csr_test, csr_train])
csr_whole = csr_matrix(csr_whole)


In [13]:
als_model = ALS(factors=128, regularization=0.08)
als_model.fit(csr_whole.T * 15.0)

HBox(children=(FloatProgress(value=0.0, max=15.0), HTML(value='')))




In [14]:
item_model = ALS(use_gpu=False)
tag_model = ALS(use_gpu=False)
item_model.user_factors = als_model.user_factors
tag_model.user_factors = als_model.user_factors

In [15]:
item_model.item_factors = als_model.item_factors[:n_items]
tag_model.item_factors = als_model.item_factors[n_items:]

item_rec_csr = csr_train[:, :n_items]
tag_rec_csr = csr_train[:, n_items:]

In [16]:
item_ret = []
tag_ret = []
from tqdm.auto import tqdm
for u in tqdm(range(csr_test.shape[0])):
    item_rec = item_model.recommend(u, item_rec_csr, N=100)
    item_rec = [vectored_item_ids[x[0]] for x in item_rec]
    tag_rec = tag_model.recommend(u, tag_rec_csr, N=100)
    tag_rec = [vectored_tags[x[0]] for x in tag_rec if x[0] in vectored_tags]
    item_ret.append(item_rec)
    tag_ret.append(tag_rec)

HBox(children=(FloatProgress(value=0.0, max=115071.0), HTML(value='')))




In [18]:
tag_model.item_factors

array([], shape=(0, 128), dtype=float32)

In [19]:
returnval = []
for _id, rec, tag_rec in zip(ids_test, item_ret, tag_ret):
    returnval.append({
        "id": _id,
        "songs": rec[:100],
        "tags": tag_rec[:10]
    })

In [20]:
returnval

[{'id': 70107,
  'songs': [489432,
   359101,
   598239,
   146266,
   327354,
   314344,
   343677,
   231897,
   487106,
   33244,
   402984,
   222305,
   10505,
   402784,
   304687,
   181670,
   635753,
   568089,
   348422,
   96545,
   342582,
   371498,
   394489,
   418694,
   231154,
   63146,
   324136,
   250477,
   684625,
   430005,
   664555,
   617473,
   405687,
   51834,
   349120,
   583375,
   457451,
   135950,
   157283,
   214115,
   84285,
   360131,
   398171,
   261084,
   102889,
   295010,
   65059,
   569867,
   47409,
   643573,
   664191,
   311997,
   657895,
   199965,
   296376,
   700050,
   556250,
   493762,
   307985,
   263364,
   559269,
   239410],
  'tags': []},
 {'id': 7461,
  'songs': [336743,
   645103,
   489432,
   567076,
   503552,
   35784,
   327354,
   153029,
   300104,
   231154,
   100335,
   631142,
   587291,
   348801,
   243754,
   542735,
   159327,
   25155,
   402984,
   222305,
   24275,
   203558,
   284990,
   394031,
  

In [21]:
import json
with open('ret.json', 'w', encoding='utf-8') as f:
    f.write(json.dumps(returnval, ensure_ascii=False))