## 필요한 모듈 불러오기

In [22]:
import numpy as np
import pandas as pd
import json
from sklearn.preprocessing import MinMaxScaler
from scipy.sparse import csr_matrix
from implicit.evaluation import  *
from implicit.als import AlternatingLeastSquares
import os; os.environ['KMP_DUPLICATE_LIB_OK']='True'
from sklearn.utils import shuffle
from collections import Counter
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline
mpl.pyplot.rc('font', family='Malgun Gothic')
plt.rcParams["figure.figsize"] = (20, 10)
plt.style.use('ggplot')

## 데이터셋 불러오기

In [6]:
# 3. 플레이리스트 데이터
playlists_df = pd.read_json('train.json', typ = 'frame')
# playlists_df.head()

In [7]:
# 플레이리스트 아이디에 대한 노래id, 좋아요 수
df_mf = playlists_df[['id', 'songs', 'like_cnt']]
df_mf.head()

Unnamed: 0,id,songs,like_cnt
0,61281,"[525514, 129701, 383374, 562083, 297861, 13954...",71
1,10532,"[432406, 675945, 497066, 120377, 389529, 24427...",1
2,76951,"[83116, 276692, 166267, 186301, 354465, 256598...",17
3,147456,"[394031, 195524, 540149, 287984, 440773, 10033...",33
4,27616,"[159327, 553610, 5130, 645103, 294435, 100657,...",9


## Matrix Factorization을 위한 R 행렬 만들기

In [8]:
# 1. 플레이리스트ID - 개별 곡 - 좋아요 수 데이터 만들기 
LF_Matrix = np.dstack(
    (
        np.repeat((df_mf.id.values), list(map(len, df_mf.songs))),
        np.concatenate(df_mf.songs.values),
        np.repeat(df_mf.like_cnt.values, list(map(len, df_mf.songs)))
    )
)

In [9]:
LF_Matrix

array([[[ 61281, 525514,     71],
        [ 61281, 129701,     71],
        [ 61281, 383374,     71],
        ...,
        [100389, 640239,     17],
        [100389,  13759,     17],
        [100389, 154078,     17]]])

In [10]:
# 2. unnested 데이터프레임 생성 : song_gnr_map
LF_MATRIX = pd.DataFrame(data = LF_Matrix[0], columns = df_mf.columns)

In [11]:
LF_MATRIX['id'] = LF_MATRIX['id'].astype(str)
LF_MATRIX['songs'] = LF_MATRIX['songs'].astype(str)
#del LF_Matrix

In [12]:
LF_MATRIX.head()

Unnamed: 0,id,songs,like_cnt
0,61281,525514,71
1,61281,129701,71
2,61281,383374,71
3,61281,562083,71
4,61281,297861,71


In [13]:
# 3. 좋아요 수 0~1 구간으로 스케일링(적용은 X)
tmp = LF_MATRIX['like_cnt'].values
tmp = np.reshape(tmp, (len(tmp),1))
scaler = MinMaxScaler()
scaler.fit(tmp)
tmp = scaler.transform(tmp)
tmp = np.reshape(tmp, (len(tmp), ))
LF_MATRIX['ratings'] = tmp

In [14]:
LF_MATRIX.head()

Unnamed: 0,id,songs,like_cnt,ratings
0,61281,525514,71,0.001334
1,61281,129701,71,0.001334
2,61281,383374,71,0.001334
3,61281,562083,71,0.001334
4,61281,297861,71,0.001334


In [15]:
# LF_MATRIX = LF_MATRIX.drop(['like_cnt'], axis=1)

In [16]:
# pivot_LF = LF_MATRIX.pivot_table('like_cnt', index='id', columns='songs')
# pivot_LF.head()

In [18]:
LF_MATRIX.id.values

array(['61281', '61281', '61281', ..., '100389', '100389', '100389'],
      dtype=object)

In [19]:
# 4. CSR(Compressed Sparse Row) 매트릭스의 인덱스로 압축하여 저장한다. 0이 많은 경우 압축률이 좋음
R = csr_matrix((LF_MATRIX['like_cnt'].astype('int'), 
                   (LF_MATRIX['id'].astype('int'), 
                    LF_MATRIX['songs'].astype('int'))))

In [20]:
print("{}".format(R.shape))
print('='*50)
print("{}".format(R))

(153429, 707989)
  (1, 47805)	2
  (1, 117747)	2
  (1, 308020)	2
  (1, 418970)	2
  (1, 662131)	2
  (2, 12130)	11
  (2, 53980)	11
  (2, 92908)	11
  (2, 115311)	11
  (2, 147122)	11
  (2, 160086)	11
  (2, 244008)	11
  (2, 356975)	11
  (2, 379267)	11
  (2, 424482)	11
  (2, 577345)	11
  (2, 618641)	11
  (2, 642016)	11
  (2, 653647)	11
  (2, 669547)	11
  (2, 672598)	11
  (4, 10961)	5
  (4, 13960)	5
  (4, 16293)	5
  (4, 64641)	5
  :	:
  (153428, 302451)	24
  (153428, 309158)	24
  (153428, 329704)	24
  (153428, 367027)	24
  (153428, 375895)	24
  (153428, 377651)	24
  (153428, 383749)	24
  (153428, 388090)	24
  (153428, 411756)	24
  (153428, 415774)	24
  (153428, 426013)	24
  (153428, 430137)	24
  (153428, 438796)	24
  (153428, 452764)	24
  (153428, 472379)	24
  (153428, 474886)	24
  (153428, 529352)	24
  (153428, 539892)	24
  (153428, 565582)	24
  (153428, 577573)	24
  (153428, 586043)	24
  (153428, 594017)	24
  (153428, 603581)	24
  (153428, 657833)	24
  (153428, 697746)	24


## R 행렬을 가지고 Latent Factor 분석 모델 만들기

In [23]:
# initialize a model
model = AlternatingLeastSquares(factors=128, regularization=0.08, use_gpu=False)



In [24]:
# train the model on a sparse matrix of item/user/confidence weights
model.fit(R)

HBox(children=(FloatProgress(value=0.0, max=15.0), HTML(value='')))




In [25]:
print("{}".format(model))

<implicit.als.AlternatingLeastSquares object at 0x7f07c76baa20>


In [26]:
# 유저-아이템 / 아이템-유저 행렬의 잠재요인 결과 보기
user_factors = model.user_factors
item_factors = model.item_factors.T

In [27]:
print(user_factors.shape)
print(item_factors.shape)

(707989, 128)
(128, 153429)


In [28]:
# 각 플레이리스트 id에 대한 유저들의 곡 선호도 TOP N개 확인
recommendations = model.recommend(userid=5, user_items=R, N=5)
recommendations

[(44509, 0.61547005),
 (104507, 0.52632225),
 (93910, 0.4115721),
 (49387, 0.3939622),
 (73935, 0.3841496)]

## R 행렬 데이터 저장

In [50]:
from scipy import sparse

#Save
sparse.save_npz('R_data.npz', R)