In [1]:
import pandas as pd
import os

# 데이터탐색 & 전처리

In [2]:
fname = '../../data/rec_data/usersha1-artmbid-artname-plays.tsv'
col_names = ['user_id', 'artist_MBID', 'artist', 'play']   # 임의로 지정한 컬럼명
data = pd.read_csv(fname, sep='\t', names= col_names)      # sep='\t'로 주어야 tsv를 열 수 있습니다.  
data.head(3)

Unnamed: 0,user_id,artist_MBID,artist,play
0,00000c289a1829a808ac09c00daf10bc3c4e223b,3bd73256-3905-4f3a-97e2-8b341527f805,betty blowtorch,2137
1,00000c289a1829a808ac09c00daf10bc3c4e223b,f2fb0ff0-5679-42ec-a55c-15109ce6e320,die Ärzte,1099
2,00000c289a1829a808ac09c00daf10bc3c4e223b,b3ae82c2-e60b-4551-a76d-6620f1b456aa,melissa etheridge,897


In [3]:
#사용할 컬럼 필터링
using_cols = ['user_id', 'artist', 'play']
data = data[using_cols]
data.head(3)

Unnamed: 0,user_id,artist,play
0,00000c289a1829a808ac09c00daf10bc3c4e223b,betty blowtorch,2137
1,00000c289a1829a808ac09c00daf10bc3c4e223b,die Ärzte,1099
2,00000c289a1829a808ac09c00daf10bc3c4e223b,melissa etheridge,897


In [4]:
 #아티스트명 소문자로
data['artist'] = data['artist'].str.lower()
data.head(3)

Unnamed: 0,user_id,artist,play
0,00000c289a1829a808ac09c00daf10bc3c4e223b,betty blowtorch,2137
1,00000c289a1829a808ac09c00daf10bc3c4e223b,die ärzte,1099
2,00000c289a1829a808ac09c00daf10bc3c4e223b,melissa etheridge,897


In [5]:
#첫번째 유저의 재생 이력 확인
condition = (data['user_id'] == data.loc[0, 'user_id'])
data[condition][:10]

Unnamed: 0,user_id,artist,play
0,00000c289a1829a808ac09c00daf10bc3c4e223b,betty blowtorch,2137
1,00000c289a1829a808ac09c00daf10bc3c4e223b,die ärzte,1099
2,00000c289a1829a808ac09c00daf10bc3c4e223b,melissa etheridge,897
3,00000c289a1829a808ac09c00daf10bc3c4e223b,elvenking,717
4,00000c289a1829a808ac09c00daf10bc3c4e223b,juliette & the licks,706
5,00000c289a1829a808ac09c00daf10bc3c4e223b,red hot chili peppers,691
6,00000c289a1829a808ac09c00daf10bc3c4e223b,magica,545
7,00000c289a1829a808ac09c00daf10bc3c4e223b,the black dahlia murder,507
8,00000c289a1829a808ac09c00daf10bc3c4e223b,the murmurs,424
9,00000c289a1829a808ac09c00daf10bc3c4e223b,lunachicks,403


## EDA

In [6]:
# 유저 수
data['user_id'].nunique()

358868

In [7]:
# 아티스트 수
data['artist'].nunique()

291346

In [8]:
# 인기 많은 아티스트
artist_count = data.groupby('artist')['user_id'].count()
artist_count.sort_values(ascending=False).head(30)

artist
radiohead                77254
the beatles              76245
coldplay                 66658
red hot chili peppers    48924
muse                     46954
metallica                45233
pink floyd               44443
the killers              41229
linkin park              39773
nirvana                  39479
system of a down         37267
queen                    34174
u2                       33206
daft punk                33001
the cure                 32624
led zeppelin             32295
placebo                  32072
depeche mode             31916
david bowie              31862
bob dylan                31799
death cab for cutie      31482
arctic monkeys           30348
foo fighters             30144
air                      29795
the rolling stones       29754
nine inch nails          28946
sigur rós                28901
green day                28732
massive attack           28691
moby                     28232
Name: user_id, dtype: int64

In [9]:
# 유저별 몇 명의 아티스트를 듣고 있는지에 대한 통계
user_count = data.groupby('user_id')['artist'].count()
user_count.describe()

count    358868.000000
mean         48.863234
std           8.524272
min           1.000000
25%          46.000000
50%          49.000000
75%          51.000000
max         166.000000
Name: artist, dtype: float64

In [10]:
# 유저별 play횟수 중앙값에 대한 통계
user_median = data.groupby('user_id')['play'].median()
user_median.describe()

count    358868.000000
mean        142.187676
std         213.089902
min           1.000000
25%          32.000000
50%          83.000000
75%         180.000000
max       50142.000000
Name: play, dtype: float64

In [11]:
# 테스트용 좋아하는 아티스트 목록
my_favorite = ['black eyed peas' , 'maroon5' ,'jason mraz' ,'coldplay' ,'beyoncé']

# 'sc'라는 user_id가 위 아티스트의 노래를 30회씩 들었다고 가정
my_playlist = pd.DataFrame({'user_id': ['sc']*5, 'artist': my_favorite, 'play':[30]*5})

if not data.isin({'user_id':['sc']})['user_id'].any():
    data = pd.concat([data, my_playlist])                           

data.tail(10)       # 잘 추가되었는지 확인

Unnamed: 0,user_id,artist,play
17535650,"sep 20, 2008",turbostaat,12
17535651,"sep 20, 2008",cuba missouri,11
17535652,"sep 20, 2008",little man tate,11
17535653,"sep 20, 2008",sigur rós,10
17535654,"sep 20, 2008",the smiths,10
0,sc,black eyed peas,30
1,sc,maroon5,30
2,sc,jason mraz,30
3,sc,coldplay,30
4,sc,beyoncé,30


In [12]:
# 고유한 유저, 아티스트를 찾아내는 코드
user_unique = data['user_id'].unique()
artist_unique = data['artist'].unique()

# 유저, 아티스트 indexing 하는 코드 idx는 index의 약자입니다.
user_to_idx = {v:k for k,v in enumerate(user_unique)}
artist_to_idx = {v:k for k,v in enumerate(artist_unique)}

In [13]:
# 인덱싱이 잘 되었는지 확인
print(user_to_idx['sc'])    # 358869명의 유저 중 마지막 (358868)
print(artist_to_idx['black eyed peas'])

358868
376


In [14]:
# 혹시 정상적으로 인덱싱되지 않은 row가 있다면 인덱스가 NaN이 될 테니 dropna()로 제거.
temp_user_data = data['user_id'].map(user_to_idx.get).dropna()
if len(temp_user_data) == len(data):   # 모든 row가 정상적으로 인덱싱되었다면
    print('user_id column indexing OK!!')
    data['user_id'] = temp_user_data   # data['user_id']을 인덱싱된 Series로 교체해 줍니다. 
else:
    print('user_id column indexing Fail!!')

user_id column indexing OK!!


In [15]:
# artist_to_idx을 통해 artist 컬럼도 동일한 방식으로 인덱싱
temp_artist_data = data['artist'].map(artist_to_idx.get).dropna()
if len(temp_artist_data) == len(data):
    print('artist column indexing OK!!')
    data['artist'] = temp_artist_data
else:
    print('artist column indexing Fail!!')

data

artist column indexing OK!!


Unnamed: 0,user_id,artist,play
0,0,0,2137
1,0,1,1099
2,0,2,897
3,0,3,717
4,0,4,706
...,...,...,...
0,358868,376,30
1,358868,270115,30
2,358868,3746,30
3,358868,62,30


## 사용자의 명시적/암묵적 평가

In [16]:
# 1회만 play한 데이터의 비율
only_one = data[data['play']==1]
one, all_data = len(only_one), len(data)
print(f'{one}, {all_data}, {one/all_data*100}')
print(f'Ratio of only_one over all data is {one/all_data:.2%}')

147739, 17535660, 0.8425060704872244
Ratio of only_one over all data is 0.84%


## CSR(Compressed Sparse Row) Matrix

In [17]:
# CSR(Compressed Sparse Row) Matrix 만들어보기
from scipy.sparse import csr_matrix

num_user = data['user_id'].nunique()
num_artist = data['artist'].nunique()

csr_data = csr_matrix(
    (data.play, (data.user_id, data.artist)), # (data, (row_idx, col_idx))
    shape=(num_user, num_artist) # shape
)
csr_data

<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 17535578 stored elements and shape (358869, 291347)>

## MF 모델 학습

In [18]:
from implicit.als import AlternatingLeastSquares
import numpy as np

In [19]:
# implicit 라이브러리에서 권장하고 있는 환경변수설정
os.environ['OPENBLAS_NUM_THREADS']='1'
os.environ['KMP_DUPLICATE_LIB_OK']='True'
os.environ['MKL_NUM_THREADS']='1'

In [20]:
# Implicit AlternatingLeastSquares 모델의 선언
als_model = AlternatingLeastSquares(factors=100, regularization=0.01, use_gpu=False, iterations=15, dtype=np.float32)

In [21]:
# 모델 훈련
als_model.fit(csr_data)

  0%|          | 0/15 [00:00<?, ?it/s]

In [25]:
sc, black_eyed_peas = user_to_idx['sc'], artist_to_idx['black eyed peas']
sc_vector, black_eyed_peas_vector = als_model.user_factors[sc], als_model.item_factors[black_eyed_peas]

In [28]:
sc_vector

array([ 0.22877179, -0.20303473,  0.03338177,  0.81861335, -1.0573502 ,
        0.18578032,  0.7440194 ,  0.40050024, -0.41305432,  1.1429905 ,
       -0.14661738, -0.02663615, -0.7883047 ,  0.96973383,  1.8102185 ,
       -0.8332633 ,  0.11872415, -1.0978347 , -0.36584115,  0.2190496 ,
       -0.588063  ,  0.63987356, -0.0998169 , -0.39215705, -1.0178028 ,
        0.24335589, -0.73412293,  0.40103364, -0.03996487,  0.20778863,
        0.01815015, -0.23485501,  1.2326763 ,  0.439516  ,  1.7664452 ,
       -0.75958556,  0.17412156,  0.71230704, -0.18795915, -0.61646724,
        0.8209467 ,  0.06973193,  0.37967268,  0.9837757 , -0.44979855,
        0.8329533 , -0.38038942,  0.09299395,  0.04024565, -0.24275444,
       -0.07028056,  0.57091105,  0.6539769 , -0.2154316 ,  0.5271818 ,
       -0.59330374, -0.87650794, -0.40584984, -0.89523906,  0.7875021 ,
       -0.42861605,  0.08665853,  0.61858636,  0.09190568,  0.5465875 ,
       -0.10426328,  0.07752798,  0.10605694,  0.22754465,  0.84

In [29]:
black_eyed_peas_vector

array([ 6.6756690e-03, -4.0325774e-03,  1.6454007e-02,  8.8718329e-03,
       -6.5689920e-03,  5.8151535e-03,  2.2353617e-02,  5.0004520e-03,
        1.2782225e-02,  2.3339881e-02, -5.0714780e-03,  2.0591859e-02,
       -4.5059761e-03,  7.5416113e-03,  3.1171365e-02, -2.2960309e-02,
        2.5239255e-04, -2.2173034e-02,  5.6305481e-03,  1.2074351e-02,
        8.3560189e-03,  5.6849336e-03,  7.4799298e-03,  1.2177659e-02,
       -3.7543878e-03,  1.1574897e-02,  4.2602848e-03,  1.3727057e-02,
        1.8429419e-03,  5.2951057e-03,  7.7034337e-03,  2.2800285e-02,
        1.9814445e-02,  1.6927820e-03,  3.3953328e-02, -8.5230600e-03,
        4.1770977e-03,  9.8319259e-03,  1.2878497e-02,  6.0163043e-03,
        1.6983021e-02,  6.7961207e-03,  1.3638249e-02,  9.3473634e-03,
        1.5637218e-03,  2.0881215e-02,  9.1851186e-03, -7.5588245e-03,
       -3.2283575e-03,  5.5615515e-03,  9.2305476e-03, -9.1791543e-04,
        5.9463866e-03, -8.9421915e-03,  1.1648799e-02, -2.0267228e-03,
      

In [37]:
np.dot(sc_vector, black_eyed_peas_vector)

np.float32(0.50566137)

In [31]:
queen = artist_to_idx['queen']
queen_vector = als_model.item_factors[queen]
np.dot(zimin_vector, queen_vector)

np.float32(0.3002196)

In [41]:
#iterations 늘리기 & 훈련
als_model = AlternatingLeastSquares(factors=100, regularization=0.01, use_gpu=False, iterations=30, dtype=np.float32)
als_model.fit(csr_data)

  0%|          | 0/30 [00:00<?, ?it/s]

In [59]:
sc_vector, black_eyed_peas_vector = als_model.user_factors[sc], als_model.item_factors[black_eyed_peas]
np.dot(sc_vector, black_eyed_peas_vector)

np.float32(0.5538964)

## 비슷한 아티스트 찾기

In [43]:
# coldplay와 비슷한 아티스트 찾기
favorite_artist = 'coldplay'
artist_id = artist_to_idx[favorite_artist]
similar_artist = als_model.similar_items(artist_id, N=15)
similar_artist

(array([  62,  277,    5,  473,  217,   28,  247,  418,  490,  910,  268,
         782,   75, 1018,  694], dtype=int32),
 array([1.        , 0.9899748 , 0.97755647, 0.97195065, 0.96783453,
        0.96483654, 0.9638269 , 0.94786745, 0.94740206, 0.94391966,
        0.9358689 , 0.92589045, 0.92497736, 0.92239434, 0.9194513 ],
       dtype=float32))

In [56]:
#artist_to_idx 를 뒤집어, index로부터 artist 이름을 얻는 dict를 생성
idx_to_artist = {v:k for k,v in artist_to_idx.items()}
[idx_to_artist[i] for i in similar_artist[0]]

['coldplay',
 'muse',
 'red hot chili peppers',
 'placebo',
 'radiohead',
 'the killers',
 'the beatles',
 'u2',
 'oasis',
 'nirvana',
 'pink floyd',
 'the white stripes',
 'queen',
 'the smashing pumpkins',
 'foo fighters']

In [57]:
def get_similar_artist(artist_name: str):
    artist_id = artist_to_idx[artist_name]
    similar_artist = als_model.similar_items(artist_id)
    similar_artist = [idx_to_artist[i] for i in similar_artist[0]]
    return similar_artist

In [58]:
get_similar_artist('2pac')

['2pac',
 'dr. dre',
 'snoop dogg',
 'the game',
 '50 cent',
 'notorious b.i.g.',
 'nas',
 'jay-z',
 'ice cube',
 'busta rhymes']

In [60]:
get_similar_artist('lady gaga')

['lady gaga',
 'katy perry',
 'britney spears',
 'rihanna',
 'beyoncé',
 'the pussycat dolls',
 'christina aguilera',
 'leona lewis',
 'justin timberlake',
 'pink']

## 유저에게 아티스트 추천하기

In [63]:
user = user_to_idx['sc']
# recommend에서는 user*item CSR Matrix를 받음
artist_recommended = als_model.recommend(user, csr_data, N=20, filter_already_liked_items=False)
artist_recommended

(array([ 376, 3746,  350,  550, 1800,  564,  396,  369, 2249,  274,  354,
         409,   62,  391, 5555,  901,  355,    5,  627,  277], dtype=int32),
 array([0.5538964 , 0.50244755, 0.48266408, 0.47490972, 0.46886438,
        0.46731347, 0.46058226, 0.45981163, 0.44372168, 0.42067707,
        0.41200268, 0.41038087, 0.40974933, 0.40713897, 0.40102652,
        0.40080065, 0.39169136, 0.38622856, 0.38587174, 0.38487607],
       dtype=float32))

In [65]:
[idx_to_artist[i] for i in artist_recommended[0]]

['black eyed peas',
 'jason mraz',
 'rihanna',
 'britney spears',
 'lady gaga',
 'kanye west',
 'beyoncé',
 'justin timberlake',
 'katy perry',
 'michael jackson',
 'nelly furtado',
 'amy winehouse',
 'coldplay',
 'christina aguilera',
 'timbaland',
 'pink',
 'madonna',
 'red hot chili peppers',
 'maroon 5',
 'muse']

In [68]:
rihanna = artist_to_idx['jason mraz']
explain = als_model.explain(user, csr_data, itemid=rihanna)

In [69]:
[(idx_to_artist[i[0]], i[1]) for i in explain[1]]

[('jason mraz', np.float64(0.36319771740026896)),
 ('black eyed peas', np.float64(0.0683760212826993)),
 ('coldplay', np.float64(0.0448427250223399)),
 ('beyoncé', np.float64(0.02623640756587933)),
 ('maroon5', np.float64(8.234267972287348e-05))]

#

**유튜브 뮤직의 첫화면에서 처음 접속한 사용자에게 좋아하는 아티스트 5명 이상의 정보를 요구하는 이유는 무엇일까?**

처음 접속한 사용자라면 이 사용자에 대한 아무런 초기 정보가 없는 상황이어서 사용자 기반 추천을 전혀 할 수 없어 콘텐츠 기반 필터링 방식의 추천만 가능한데, 이를 보완하여 처음부터 사용자 특성에 따른 맞춤형 서비스를 제공하기 위해서입니다.

사용자 맞춤형 추천을 제공하지 못한다면 모든 초기 사용자에게 같은 화면을 제시할 수밖에 없는데, 이러면 모든 사용자들이 처음 추천받은 콘텐츠 위주로만 시청하게 되어 추천시스템이 빠지기 쉬운 **필터 버블**의 문제을 야기하거나, 추천의 다양성을 저해할 수 있는 위험이 생깁니다.

참고자료

https://dl.moazine.com/viewer/index.asp?libraryid=9MtJb2T3nzH3yk7c212iu821DPFsY2ECOso0&a_i=8wwot4affaz3r6syx11VOYm2&keyword=&s_i=9#page/2