# EXPLORATION 09

# 9. 아이유팬이 좋아할 만한 다른 아티스트 찾기

* Writier : 송영석
* Date : 2021.11.02

## 1) 데이터 준비와 전처리

In [240]:
import os
import pandas as pd
rating_file_path=os.getenv('HOME') + '/aiffel/recommendata_iu/data/ml-1m/ratings.dat'
ratings_cols = ['user_id', 'movie_id', 'ratings', 'timestamp']
ratings = pd.read_csv(rating_file_path, sep='::', names=ratings_cols, engine='python', encoding = "ISO-8859-1")
orginal_data_size = len(ratings)
ratings.head(100)

Unnamed: 0,user_id,movie_id,ratings,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291
...,...,...,...,...
95,2,2490,3,978299966
96,2,1834,4,978298813
97,2,3471,5,978298814
98,2,589,4,978299773


In [241]:
# 3점 이상만 남깁니다.
ratings = ratings[ratings['ratings']>=3]
filtered_data_size = len(ratings)

print(f'orginal_data_size: {orginal_data_size}, filtered_data_size: {filtered_data_size}')
print(f'Ratio of Remaining Data is {filtered_data_size / orginal_data_size:.2%}')

orginal_data_size: 1000209, filtered_data_size: 836478
Ratio of Remaining Data is 83.63%


In [242]:
# ratings 컬럼의 이름을 counts로 바꿉니다.
ratings.rename(columns={'ratings':'counts'}, inplace=True)
ratings.tail(30)

Unnamed: 0,user_id,movie_id,counts,timestamp
1000170,6040,2728,5,957717123
1000172,6040,1784,3,997454464
1000174,6040,2745,3,956716157
1000178,6040,3703,4,964828575
1000179,6040,2762,4,956704584
1000180,6040,1036,3,956715455
1000181,6040,508,4,956704972
1000182,6040,1041,4,957717678
1000183,6040,3735,4,960971654
1000184,6040,2791,4,956715569


In [243]:
# 영화 제목을 보기 위해 메타 데이터를 읽어옵니다.

import re
movie_file_path=os.getenv('HOME') + '/aiffel/recommendata_iu/data/ml-1m/movies.dat'
cols = ['movie_id', 'title', 'genre'] 
movies = pd.read_csv(movie_file_path, sep='::', names=cols, engine='python', encoding='ISO-8859-1')

movies["year"] = movies["title"].apply(lambda i : int(re.sub(r'[^0-9]', '', i)))
movies

Unnamed: 0,movie_id,title,genre,year
0,1,Toy Story (1995),Animation|Children's|Comedy,1995
1,2,Jumanji (1995),Adventure|Children's|Fantasy,1995
2,3,Grumpier Old Men (1995),Comedy|Romance,1995
3,4,Waiting to Exhale (1995),Comedy|Drama,1995
4,5,Father of the Bride Part II (1995),Comedy,1995
...,...,...,...,...
3878,3948,Meet the Parents (2000),Comedy,2000
3879,3949,Requiem for a Dream (2000),Drama,2000
3880,3950,Tigerland (2000),Drama,2000
3881,3951,Two Family House (2000),Drama,2000


In [244]:
ratings=pd.merge(left = ratings , right = movies, how = "inner", on = "movie_id")
ratings.drop(['genre','year'],axis = 1 ,inplace = True)
ratings.tail(10)

Unnamed: 0,user_id,movie_id,counts,timestamp,title
836468,5494,3530,4,959816296,Smoking/No Smoking (1993)
836469,5556,2198,3,959445515,Modulations (1998)
836470,5949,2198,5,958846401,Modulations (1998)
836471,5675,2703,3,976029116,Broken Vessels (1998)
836472,5717,2258,4,958509389,Master Ninja I (1984)
836473,5851,3607,5,957756608,One Little Indian (1973)
836474,5854,3026,4,958346883,Slaughterhouse (1987)
836475,5854,690,3,957744257,"Promise, The (Versprechen, Das) (1994)"
836476,5938,2909,4,957273353,"Five Wives, Three Secretaries and Me (1998)"
836477,5948,1360,5,1016563709,Identification of a Woman (Identificazione di ...


In [245]:
#[[ your code ]]
user_unique = ratings['user_id'].unique()
movie_unique = ratings['movie_id'].unique()

# 유저, 아티스트 indexing 하는 코드 idx는 index의 약자입니다.
user_to_idx = {v:k for k,v in enumerate(user_unique)}
movie_to_idx = {v:k for k,v in enumerate(movie_unique)}

print(user_to_idx[6039])    # 358869명의 유저 중 마지막으로 추가된 유저이니 358868이 나와야 합니다. 
print(movie_to_idx[1360])

# indexing을 통해 데이터 컬럼 내 값을 바꾸는 코드
# dictionary 자료형의 get 함수는 https://wikidocs.net/16 을 참고하세요.

# user_to_idx.get을 통해 user_id 컬럼의 모든 값을 인덱싱한 Series를 구해 봅시다. 
# 혹시 정상적으로 인덱싱되지 않은 row가 있다면 인덱스가 NaN이 될 테니 dropna()로 제거합니다. 
temp_user_data = ratings['user_id'].map(user_to_idx.get).dropna()
if len(temp_user_data) == len(ratings):   # 모든 row가 정상적으로 인덱싱되었다면
    print('user_id column indexing OK!!')
    ratings['user_id'] = temp_user_data   # data['user_id']을 인덱싱된 Series로 교체해 줍니다. 
else:
    print('user_id column indexing Fail!!')

# artist_to_idx을 통해 artist 컬럼도 동일한 방식으로 인덱싱해 줍니다. 
temp_movie_data = ratings['movie_id'].map(movie_to_idx.get).dropna()
if len(temp_movie_data) == len(ratings):
    print('movie_id column indexing OK!!')
    ratings['movie_id'] = temp_movie_data
else:
    print('movie column indexing Fail!!')

ratings

1940
3627
user_id column indexing OK!!
movie_id column indexing OK!!


Unnamed: 0,user_id,movie_id,counts,timestamp,title
0,0,0,5,978300760,One Flew Over the Cuckoo's Nest (1975)
1,1,0,5,978298413,One Flew Over the Cuckoo's Nest (1975)
2,2,0,4,978220179,One Flew Over the Cuckoo's Nest (1975)
3,3,0,4,978199279,One Flew Over the Cuckoo's Nest (1975)
4,4,0,5,978158471,One Flew Over the Cuckoo's Nest (1975)
...,...,...,...,...,...
836473,1621,3623,5,957756608,One Little Indian (1973)
836474,3481,3624,4,958346883,Slaughterhouse (1987)
836475,3481,3625,3,957744257,"Promise, The (Versprechen, Das) (1994)"
836476,4159,3626,4,957273353,"Five Wives, Three Secretaries and Me (1998)"


## 2)분석해 봅시다

In [246]:
#[[ your code ]]#[[ your code ]]
print(type(ratings['user_id'][0]))
# 영화 수
ratings['title'].nunique()



<class 'numpy.int64'>


3628

In [247]:

# 유저 수
ratings['user_id'].nunique()


6039

In [248]:
# 인기 많은 영화
movie_count = ratings.groupby('movie_id')['user_id'].count()
movie_count.sort_values(ascending=False).head(30)

movie_id
99     3211
44     2910
117    2885
64     2716
48     2561
92     2509
121    2498
120    2473
22     2460
124    2434
107    2413
38     2385
51     2371
87     2314
175    2297
23     2257
5      2252
126    2213
224    2210
157    2194
607    2167
110    2121
26     2102
170    2066
243    2051
222    2030
160    2022
200    2019
40     2000
141    1941
Name: user_id, dtype: int64

## 3) 내가 선호하는 영화를 5가지 골라서 ratings에 추가해 줍시다.

In [255]:
# 본인이 좋아하시는 아티스트 데이터로 바꿔서 추가하셔도 됩니다! 단, 이름은 꼭 데이터셋에 있는 것과 동일하게 맞춰주세요. 
my_favorite_name = ["One Flew Over the Cuckoo's Nest (1975)",'One Little Indian (1973)','Slaughterhouse (1987)' ,'Promise, The (Versprechen, Das) (1994)','Five Wives, Three Secretaries and Me (1998)']
my_favorite = [1193,3607,3026,690,2909]

# 'young'이라는 user_id가 위 영화를 5점씩 주었다고 가정하겠습니다.
my_playlist = pd.DataFrame({'user_id': ['young']*5, 'movie_id': my_favorite,'title':my_favorite_name ,'counts':[5]*5})

if not ratings.isin({'user_id':['young']})['user_id'].any():  # user_id에 'zimin'이라는 데이터가 없다면
    ratings = ratings.append(my_playlist)  
user_unique = ratings['user_id'].unique()
movie_unique = ratings['movie_id'].unique()

user_to_idx = {v:k for k,v in enumerate(user_unique)}
movie_to_idx = {v:k for k,v in enumerate(movie_unique)}
temp_user_data = ratings['user_id'].map(user_to_idx.get).dropna()
if len(temp_user_data) == len(ratings):   # 모든 row가 정상적으로 인덱싱되었다면
    print('user_id column indexing OK!!')
    ratings['user_id'] = temp_user_data   # data['user_id']을 인덱싱된 Series로 교체해 줍니다. 
else:
    print('user_id column indexing Fail!!')

# artist_to_idx을 통해 artist 컬럼도 동일한 방식으로 인덱싱해 줍니다. 
temp_movie_data = ratings['movie_id'].map(movie_to_idx.get).dropna()
if len(temp_movie_data) == len(ratings):
    print('movie_id column indexing OK!!')
    ratings['movie_id'] = temp_movie_data
else:
    print('movie column indexing Fail!!')

ratings

ratings.tail(10)


user_id column indexing OK!!
movie_id column indexing OK!!


Unnamed: 0,user_id,movie_id,counts,timestamp,title
0,6039,1193,5,,One Flew Over the Cuckoo's Nest (1975)
1,6039,3607,5,,One Little Indian (1973)
2,6039,3026,5,,Slaughterhouse (1987)
3,6039,690,5,,"Promise, The (Versprechen, Das) (1994)"
4,6039,2909,5,,"Five Wives, Three Secretaries and Me (1998)"
0,6040,1193,5,,One Flew Over the Cuckoo's Nest (1975)
1,6040,3607,5,,One Little Indian (1973)
2,6040,3026,5,,Slaughterhouse (1987)
3,6040,690,5,,"Promise, The (Versprechen, Das) (1994)"
4,6040,2909,5,,"Five Wives, Three Secretaries and Me (1998)"


## 4) CSR matrix를 직접 만들어 봅시다.

In [256]:
#[[ your code ]]
#[[ your code ]]

from scipy.sparse import csr_matrix


num_user = ratings['user_id'].nunique()
num_movie = ratings['movie_id'].nunique()
print(num_user)
print(len(ratings['user_id']))
print(num_movie)

print(len(ratings.counts))
csr_data = csr_matrix((ratings.counts, (ratings.user_id  , ratings.movie_id)), shape = (num_user, num_movie))



6041
836488
3628
836488


## 5) als_model = AlternatingLeastSquares 모델을 직접 구성하여 훈련시켜 봅시다.

In [257]:
#[[ your code ]]
from implicit.als import AlternatingLeastSquares
import os
import numpy as np

# implicit 라이브러리에서 권장하고 있는 부분입니다. 학습 내용과는 무관합니다.
os.environ['OPENBLAS_NUM_THREADS']='1'
os.environ['KMP_DUPLICATE_LIB_OK']='True'
os.environ['MKL_NUM_THREADS']='1'

# Implicit AlternatingLeastSquares 모델의 선언
als_model = AlternatingLeastSquares(factors=100, regularization=0.01, use_gpu=False, iterations=15, dtype=np.float32)
# als 모델은 input으로 (item X user 꼴의 matrix를 받기 때문에 Transpose해줍니다.)
csr_data_transpose = csr_data.T
csr_data_transpose

als_model.fit(csr_data_transpose)

  0%|          | 0/15 [00:00<?, ?it/s]

## 6) 내가 선호하는 5가지 영화 중 하나와 그 외의 영화 하나를 골라 훈련된 모델이 예측한 나의 선호도를 파악해 보세요.

In [259]:
# 내 선호도 벡터
vector_6039 = user_to_idx['young']
vector_6039 = als_model.user_factors[vector_6039]

In [274]:
#my_favorite = [1193,3607,3026,690,2909]
# 선호도 중 영화 하나 벡터
vector_1193 = movie_to_idx[1193]
vector_1193 = als_model.item_factors[vector_1193]


In [275]:
np.dot(vector_6039, vector_1193)

0.24149477

* 사용자와 아이템 벡터 내적수치가 의미있게 형성되었다.

## 7) 내가 좋아하는 영화와 비슷한 영화를 추천받아 봅시다.

In [276]:
favorite_movie = 1193
similar_movie = als_model.similar_items(1193, N=15)
similar_movie

[(1193, 1.0000001),
 (1067, 0.7120524),
 (1331, 0.6174883),
 (1147, 0.59569347),
 (1155, 0.5912552),
 (946, 0.57553774),
 (564, 0.57485884),
 (169, 0.56524855),
 (504, 0.5596797),
 (1391, 0.5506435),
 (961, 0.53628755),
 (553, 0.5279422),
 (13, 0.525114),
 (163, 0.5215399),
 (1144, 0.5124436)]

In [277]:
idx_to_movie = {v:k for k,v in movie_to_idx.items()}
l_list=[idx_to_movie[i[0]] for i in similar_movie]
l_list

[1193,
 1067,
 1331,
 1147,
 1155,
 946,
 564,
 169,
 504,
 1391,
 961,
 553,
 13,
 163,
 1144]

* 추천 제목 출력

In [278]:
for i in range(0,len(l_list)):
    print(movies[movies['movie_id']==l_list[i]]['title'])

1176    One Flew Over the Cuckoo's Nest (1975)
Name: title, dtype: object
1053    Damsel in Distress, A (1937)
Name: title, dtype: object
1311    Audrey Rose (1977)
Name: title, dtype: object
1131    When We Were Kings (1996)
Name: title, dtype: object
1139    Invitation, The (Zaproszenie) (1986)
Name: title, dtype: object
934    To Be or Not to Be (1942)
Name: title, dtype: object
560    Chasers (1994)
Name: title, dtype: object
167    Free Willy 2: The Adventure Home (1995)
Name: title, dtype: object
500    No Escape (1994)
Name: title, dtype: object
1370    Mars Attacks! (1996)
Name: title, dtype: object
949    Little Lord Fauntleroy (1936)
Name: title, dtype: object
549    Tombstone (1993)
Name: title, dtype: object
12    Balto (1995)
Name: title, dtype: object
161    Desperado (1995)
Name: title, dtype: object
1128    Line King: Al Hirschfeld, The (1996)
Name: title, dtype: object


## 8) 내가 가장 좋아할 만한 영화들을 추천받아 봅시다.

In [279]:
#[[ your code ]]
idx_to_movie = {v:k for k,v in movie_to_idx.items()}
[idx_to_movie[i[0]] for i in similar_movie]
user = user_to_idx['young']

# recommend에서는 user*item CSR Matrix를 받습니다.
movie_recommended = als_model.recommend(user, csr_data, N=15, filter_already_liked_items=True)
movie_recommended

[(1067, 0.14951903),
 (961, 0.1402563),
 (1144, 0.13679396),
 (410, 0.12336925),
 (163, 0.11069439),
 (169, 0.10479815),
 (112, 0.10284999),
 (1391, 0.09768808),
 (1275, 0.09483361),
 (119, 0.09478698),
 (1147, 0.094153754),
 (504, 0.09357867),
 (655, 0.09203082),
 (1000, 0.09184674),
 (1331, 0.09173066)]

In [280]:
# 내가 선호하는 영화들과 비슷한 영화들
#"One Flew Over the Cuckoo's Nest (1975)",'One Little Indian (1973)','Slaughterhouse (1987)' ,'Promise, The (Versprechen, Das) (1994)','Five Wives, Three Secretaries and Me (1998)']
m_dict=[idx_to_movie[i[0]] for i in movie_recommended]
print(m_dict)
for i in range(0,len(m_dict)):
    print(movies[movies['movie_id']==m_dict[i]]['title'])

[1067, 961, 1144, 410, 163, 169, 112, 1391, 1275, 119, 1147, 504, 655, 1000, 1331]
1053    Damsel in Distress, A (1937)
Name: title, dtype: object
949    Little Lord Fauntleroy (1936)
Name: title, dtype: object
1128    Line King: Al Hirschfeld, The (1996)
Name: title, dtype: object
406    Addams Family Values (1993)
Name: title, dtype: object
161    Desperado (1995)
Name: title, dtype: object
167    Free Willy 2: The Adventure Home (1995)
Name: title, dtype: object
110    Rumble in the Bronx (1995)
Name: title, dtype: object
1370    Mars Attacks! (1996)
Name: title, dtype: object
1255    Highlander (1986)
Name: title, dtype: object
117    Steal Big, Steal Little (1995)
Name: title, dtype: object
1131    When We Were Kings (1996)
Name: title, dtype: object
500    No Escape (1994)
Name: title, dtype: object
649    Mutters Courage (1995)
Name: title, dtype: object
987    Curdled (1996)
Name: title, dtype: object
1311    Audrey Rose (1977)
Name: title, dtype: object


In [281]:
movie_910 = movie_to_idx[898]
explain = als_model.explain(user, csr_data, itemid = movie_910)

In [282]:
l_dict=[(idx_to_movie[i[0]], i[1]) for i in explain[1]]
l_dict

[(1193, 0.015549563290544202),
 (690, 0.007879980411913855),
 (2909, -3.3664322045618005e-05),
 (3607, -0.0009322632338305766),
 (3026, -0.0018900356709840695)]

In [283]:
k_list=[idx_to_movie[i[0]] for i in l_dict]
print(k_list)


[1193, 690, 2909, 3607, 3026]


In [284]:
for i in range(0,len(k_list)):
    print(movies[movies['movie_id']==k_list[i]]['title'])

1176    One Flew Over the Cuckoo's Nest (1975)
Name: title, dtype: object
681    Promise, The (Versprechen, Das) (1994)
Name: title, dtype: object
2840    Five Wives, Three Secretaries and Me (1998)
Name: title, dtype: object
3538    One Little Indian (1973)
Name: title, dtype: object
2957    Slaughterhouse (1987)
Name: title, dtype: object


# 회고

1. CSR matrix를 만들었지만 정확한 원리를 이해하지 못하고 있는거 같아서 아쉬웠습니다. 특히 indexing을 하고 안하고에 있어서 오류 발생 차이가 있는데 이유를 알기 어려웠습니다.
2. csr 함수에서 shape을 지워주면 고정크기가 없으므로 작동 하였지만 뒤의 경우를 위해 indexing을 해주었습니다.
3. dot product를 통해 내가 선호하는 영화의 내적 값이 큰 것을 보아 제대로 학습 되었다는 것을 알 수 있었습니다.
4. One Flew Over the Cuckoo's Nest (1975)가 Damsel in Distress, A (1937)추천에 가장 큰 기여를 하였습니다.
