# 파일 입출력 설정

In [1]:
import io
import os
import json
import distutils.dir_util
from collections import Counter

import numpy as np

def write_json(data, fname):
    def _conv(o):
        if isinstance(o, np.int64) or isinstance(o, np.int32):
            return int(o)
        raise TypeError

    parent = os.path.dirname(fname)
    distutils.dir_util.mkpath("./" + parent)
    with io.open("./" + fname, "w", encoding="utf8") as f:
        json_str = json.dumps(data, ensure_ascii=False, default=_conv)
        f.write(json_str)


def load_json(fname):
    with open(fname, encoding='utf8') as f:
        json_obj = json.load(f)

    return json_obj


def debug_json(r):
    print(json.dumps(r, ensure_ascii=False, indent=4))

In [2]:
import pickle
import numpy as np
import pandas as pd
import scipy.sparse as spr
from collections import Counter
from implicit.evaluation import  *
from implicit.als import AlternatingLeastSquares as ALS
from scipy.sparse import hstack, vstack
from tqdm.auto import tqdm

song_meta = pd.read_json("./../file/song_meta.json")
# train = pd.read_json("./../file/train_pos.json")
# test = pd.read_json("./../file/validation_pos.json")

In [3]:
train = pd.read_pickle('../file/train_pos.pickle')

In [11]:
test = pd.read_pickle('../file/validation_pos.pickle')

In [13]:
train['all_word'] = train.tags + train.genre_kor + train.plyst_title_pos
test['all_word'] = test.tags + test.genre_kor + test.plyst_title_pos

In [15]:
from gensim.models import Word2Vec

In [18]:
tt = train.all_word + test.all_word
tt = tt.dropna()
tt = list(tt)

In [19]:
w2v = Word2Vec(tt, size = 500)

In [20]:
def get_wv_mean(word_list):
    wv_mean = []
    for w in word_list:
        try:
            wv_mean.append(np.array(w2v.wv[w]))
        except Exception as e:
            pass
    if not wv_mean:
        return []
    return np.array(wv_mean).mean(axis=0)

In [21]:
train['vectored'] = train.all_word.apply(lambda e: get_wv_mean(e))
test['vectored'] = test.all_word.apply(lambda e: get_wv_mean(e))

# 데이터셋 생성

In [28]:
train['istrain'] = 1
test['istrain'] = 0

n_train = len(train)
n_test = len(test)

# train + test
plylst = pd.concat([train, test], ignore_index=True)

# playlist id
plylst["nid"] = range(n_train + n_test)

# id <-> nid
plylst_id_nid = dict(zip(plylst["id"],plylst["nid"]))
plylst_nid_id = dict(zip(plylst["nid"],plylst["id"]))

# index 부여

In [29]:
plylst_tag = plylst['tags']
tag_counter = Counter([tg for tgs in plylst_tag for tg in tgs])
tag_dict = {x: tag_counter[x] for x in tag_counter}

tag_id_tid = dict()
tag_tid_id = dict()
for i, t in enumerate(tag_dict):
    tag_id_tid[t] = i
    tag_tid_id[i] = t

n_tags = len(tag_dict)

plylst_song = plylst['songs']
song_counter = Counter([sg for sgs in plylst_song for sg in sgs])
song_dict = {x: song_counter[x] for x in song_counter}

song_id_sid = dict()
song_sid_id = dict()
for i, t in enumerate(song_dict):
    song_id_sid[t] = i
    song_sid_id[i] = t

n_songs = len(song_dict)

0         [-0.13327028, -0.04235815, -0.24068263, -0.233...
1         [-0.08894731, -0.003893503, -0.0747035, 0.0303...
2         [-0.07311141, -0.124093056, 0.17151015, 0.0105...
3         [-0.0033125186, -0.0040749004, 0.12479437, 0.1...
4         [-0.094459996, 0.2342832, -0.044105448, 0.1128...
                                ...                        
138081    [-0.0699301, -0.07824149, 0.11491039, 0.140681...
138082    [-0.15912956, -0.0077224365, 0.14708532, 0.190...
138083    [-0.0027779818, -0.10349687, -0.2909028, -0.38...
138084    [0.06729251, -0.12507215, 0.035921358, -0.0133...
138085    [-0.20647204, -0.004208231, 0.1137497, 0.06362...
Length: 138086, dtype: object

In [52]:
plylst_use.shape

(138086, 6)

In [37]:
plylst['songs_id'] = plylst['songs'].map(lambda x: [song_id_sid.get(s) for s in x if song_id_sid.get(s) != None])
plylst['tags_id'] = plylst['tags'].map(lambda x: [tag_id_tid.get(t) for t in x if tag_id_tid.get(t) != None])
plylst['w2v'] = plylst['vectored']
plylst_use = plylst[['istrain','nid','updt_date','songs_id','tags_id']]

# 곡의 개수와 태그의 개수를 할당
plylst_use.loc[:,'num_songs'] = plylst_use['songs_id'].map(len)
plylst_use.loc[:,'num_tags'] = plylst_use['tags_id'].map(len)
plylst_use = plylst_use.set_index('nid')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


In [56]:
plylst_use['vectored'] = pd.Series(list(train.vectored) + list(test.vectored))

In [90]:
# train test split
plylst_train = plylst_use.iloc[:n_train,:]
plylst_test = plylst_use.iloc[n_train:,:]

In [75]:
# csr (compressed sparse row matrix 생성)
row = np.repeat(range(n_train), plylst_train['num_songs'])
col = [song for songs in plylst_train['songs_id'] for song in songs]
dat = np.repeat(1, plylst_train['num_songs'].sum())
train_songs_A = spr.csr_matrix((dat, (row, col)), shape=(n_train, n_songs))

row = np.repeat(range(n_train), plylst_train['num_tags'])
col = [tag for tags in plylst_train['tags_id'] for tag in tags]
dat = np.repeat(1, plylst_train['num_tags'].sum())
train_tags_A = spr.csr_matrix((dat, (row, col)), shape=(n_train, n_tags))

# csr (compressed sparse row matrix 생성)
row = np.repeat(range(n_test), plylst_test['num_songs'])
col = [song for songs in plylst_test['songs_id'] for song in songs]
dat = np.repeat(1, plylst_test['num_songs'].sum())
test_songs_A = spr.csr_matrix((dat, (row, col)), shape=(n_test, n_songs))

row = np.repeat(range(n_test), plylst_test['num_tags'])
col = [tag for tags in plylst_test['tags_id'] for tag in tags]
dat = np.repeat(1, plylst_test['num_tags'].sum())
test_tags_A = spr.csr_matrix((dat, (row, col)), shape=(n_test, n_tags))

In [91]:
plylst_train['vectored'] = plylst_train.vectored.apply(lambda e: e if len(e) else np.random.random(500))
plylst_test['vectored'] = plylst_test.vectored.apply(lambda e: e if len(e) else np.random.random(500))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [97]:
for i, wv_trn in enumerate(w2v_train):
    vectored_train[i].extend(wv_trn)

ValueError: row, column, and data array must all be the same length

In [None]:
train_merged_feature_csr = hstack([train_songs_A, train_tags_A, train_w2v_a])
test_merged_feature_csr  = hstack([test_songs_A, test_tags_A, test_w2v_a])

train_songs_A_T = train_songs_A.T.tocsr()
train_tags_A_T  = train_tags_A.T.tocsr()

whole_datasets_csr       = vstack([train_merged_feature_csr, test_merged_feature_csr])
whole_datasets_csr_T     = whole_datasets_csr.T.tocsr()

In [9]:
whole_datasets_csr_T.toarray().shape

(668533, 138086)

In [42]:
def rec(pids):
    res = []

    for pid in tqdm(pids):
        p = np.zeros((n_songs,1))
        p[plylst_test.loc[pid,'songs_id']] = 1

        val = train_songs_A.dot(p).reshape(-1)

        songs_already = plylst_test.loc[pid, "songs_id"]
        tags_already = plylst_test.loc[pid, "tags_id"]

        cand_song = train_songs_A_T.dot(val)
        cand_song_idx = cand_song.reshape(-1).argsort()[-250:][::-1]

        cand_song_idx = cand_song_idx[np.isin(cand_song_idx, songs_already) == False][:100]
        rec_song_idx = [song_sid_id[i] for i in cand_song_idx]

        cand_tag = train_tags_A_T.dot(val)
        cand_tag_idx = cand_tag.reshape(-1).argsort()[-15:][::-1]

        cand_tag_idx = cand_tag_idx[np.isin(cand_tag_idx, tags_already) == False][:10]
        rec_tag_idx = [tag_tid_id[i] for i in cand_tag_idx]

        res.append({
            "id": plylst_nid_id[pid],
            "songs": rec_song_idx,
            "tags": rec_tag_idx
        })
        
    return res

In [43]:
res = rec(plylst_test.index)

HBox(children=(FloatProgress(value=0.0, max=23015.0), HTML(value='')))




In [None]:
# n_factor = 128
# n_epoch  = 15.0

# als_model = ALS(factors=n_factor, regularization=0.08)
# als_model.fit(whole_datasets_csr_T * n_epoch)

# song_model = ALS(use_gpu=False)
# tag_model = ALS(use_gpu=False)
# song_model.user_factors = als_model.user_factors
# tag_model.user_factors = als_model.user_factors

# song_model.item_factors = als_model.item_factors[:n_songs]
# tag_model.item_factors  = als_model.item_factors[n_songs:]

# print(song_model.item_factors.shape)
# print(tag_model.item_factors.shape)

In [None]:
# res = []



# for u in tqdm(range(test_merged_feature_csr.shape[0])):
#     song_rec = song_model.recommend(u, test_songs_A, N=100)
#     song_rec = [song_sid_id[x[0]] for x in song_rec]
    
#     tag_rec = tag_model.recommend(u, test_tags_A, N=10)
#     tag_rec = [tag_tid_id[x[0]] for x in tag_rec]
    
#     res.append({
#             # train test의 vconcat 된 matrix에서 id를 추출하는 것이기 때문에 n_train 이후부터 test 데이터 셋임
#             # 따라서 u + n_train이 각각의 test 셋에서의 id
#             "id": plylst_nid_id[u + n_train],
#             "songs": song_rec,
#             "tags": tag_rec
#         })

In [45]:
# 결과 저장
write_json(res, "results.json")