[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/oreilly-japan/RecommenderSystems/blob/main/chapter5/colab/NMF.ipynb)

# 비음수 행렬 분해(Non-negative Matrix Factorization, NMF)

In [1]:
# Colab용 notebook입니다. 이 notebook 한 장에서 여러 데이터의 다운로드부터, 추천까지 완결하도록 되어 있습니다(예측 평가는 미포함)
# MovieLens 데이터를 아직 다운로드 하지 않았다면, 이 셀을 실행해서 다운로드합니다.
# MovieLens 데이터 분석은 data_download.ipynb를 참조합니다.

# 데이터 다운로드와 압축 풀기
!wget -nc --no-check-certificate https://files.grouplens.org/datasets/movielens/ml-10m.zip -P ../data
!unzip -n ../data/ml-10m.zip -d ../data/

--2022-12-27 05:36:13--  https://files.grouplens.org/datasets/movielens/ml-10m.zip
Resolving files.grouplens.org (files.grouplens.org)... 128.101.65.152
Connecting to files.grouplens.org (files.grouplens.org)|128.101.65.152|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 65566137 (63M) [application/zip]
Saving to: ‘../data/ml-10m.zip’


2022-12-27 05:36:14 (64.1 MB/s) - ‘../data/ml-10m.zip’ saved [65566137/65566137]

Archive:  ../data/ml-10m.zip
   creating: ../data/ml-10M100K/
  inflating: ../data/ml-10M100K/allbut.pl  
  inflating: ../data/ml-10M100K/movies.dat  
  inflating: ../data/ml-10M100K/ratings.dat  
  inflating: ../data/ml-10M100K/README.html  
  inflating: ../data/ml-10M100K/split_ratings.sh  
  inflating: ../data/ml-10M100K/tags.dat  


In [2]:
# Movielens 데이터 로딩(데이터량이 많으므로, 로딩에 시간이 걸릴 수 있습니다)
import pandas as pd

# movieID와 제목만 사용
m_cols = ['movie_id', 'title', 'genre']
movies = pd.read_csv('../data/ml-10M100K/movies.dat', names=m_cols, sep='::' , encoding='latin-1', engine='python')

# genre를 list 형식으로 저장한다
movies['genre'] = movies.genre.apply(lambda x:x.split('|'))


# 사용자가 부여한 영화의 태그 정보를 로딩한다
t_cols = ['user_id', 'movie_id', 'tag', 'timestamp']
user_tagged_movies = pd.read_csv('../data/ml-10M100K/tags.dat', names=t_cols, sep='::', engine='python')

# tag를 소문자로 바꾼다
user_tagged_movies['tag'] = user_tagged_movies['tag'].str.lower()


# tag를 영화별로 list 형식으로 저장한다
movie_tags = user_tagged_movies.groupby('movie_id').agg({'tag':list})

# 태그 정보를 결합한다
movies = movies.merge(movie_tags, on='movie_id', how='left')

# 평갓값 데이터만 로딩한다
r_cols = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings = pd.read_csv('../data/ml-10M100K/ratings.dat', names=r_cols, sep='::', engine='python')


# 데이터량이 많으므로 사용자수를 1000으로 줄여서 시험해본다
valid_user_ids = sorted(ratings.user_id.unique())[:1000]
ratings = ratings[ratings["user_id"].isin(valid_user_ids)]


# 영화 데이터와 평가 데이터를 결합한다
movielens = ratings.merge(movies, on='movie_id')

print(f'unique_users={len(movielens.user_id.unique())}, unique_movies={len(movielens.movie_id.unique())}')

# 학습용과 데이터용으로 데이터를 나눈다
# 각 사용자의 최근 5건의 영화를 평가용으로 사용하고, 나머지는 학습용으로 사용한다
# 우선, 각 사용자가 평가한 영화의 순서를 계산한다
# 최근 부여한 영화부터 순서를 부여한다(1에서 시작)

movielens['timestamp_rank'] = movielens.groupby(
    'user_id')['timestamp'].rank(ascending=False, method='first')
movielens_train = movielens[movielens['timestamp_rank'] > 5]
movielens_test = movielens[movielens['timestamp_rank']<= 5]

unique_users=1000, unique_movies=6736


In [3]:
# 결손값을 채우는 방법
fillna_with_zero = True
# 인자 수
factors = 5

In [4]:
# 평갓값을 사용자 x 영화 행렬로 변환한다. 결손값은 평균값 또는 0으로 채운다.
user_movie_matrix = movielens_train.pivot(index="user_id", columns="movie_id", values="rating")
user_id2index = dict(zip(user_movie_matrix.index, range(len(user_movie_matrix.index))))
movie_id2index = dict(zip(user_movie_matrix.columns, range(len(user_movie_matrix.columns))))
if fillna_with_zero:
    matrix = user_movie_matrix.fillna(0).to_numpy()
else:
    matrix = user_movie_matrix.fillna(movielens_train.rating.mean()).to_numpy()
matrix

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [5., 0., 3., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [5., 0., 0., ..., 0., 0., 0.]])

In [5]:
from sklearn.decomposition import NMF

# NMF 실행
nmf = NMF(n_components=factors)
nmf.fit(matrix)




NMF(n_components=5)

In [6]:
P = nmf.fit_transform(matrix)
P



array([[0.02183306, 0.53589472, 0.        , 0.        , 0.        ],
       [0.21316649, 0.10881363, 0.        , 0.        , 0.        ],
       [0.        , 0.02976153, 0.05290897, 0.15364729, 0.02014201],
       ...,
       [0.13635649, 0.14281654, 0.        , 0.        , 0.00656846],
       [0.09753435, 0.01077205, 0.        , 0.13237952, 0.13516597],
       [0.99758645, 0.        , 0.        , 0.64704016, 0.0597298 ]])

In [7]:
Q = nmf.components_
Q

array([[1.96418272, 0.50117827, 0.39871676, ..., 0.        , 0.        ,
        0.        ],
       [1.15067071, 0.96077987, 0.40196914, ..., 0.        , 0.00289232,
        0.00216924],
       [0.47176639, 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.79016059, 0.30417491, 0.        , ..., 0.02803924, 0.01446954,
        0.01085215],
       [0.29616062, 0.30060155, 0.17846866, ..., 0.        , 0.        ,
        0.        ]])

In [8]:
import numpy as np
# 예측 평갓값 행렬
pred_matrix = np.dot(P, Q)
pred_matrix

array([[6.59522475e-01, 5.25819116e-01, 2.24118350e-01, ...,
        0.00000000e+00, 1.54997784e-03, 1.16248338e-03],
       [5.43906588e-01, 2.11380357e-01, 1.28732773e-01, ...,
        0.00000000e+00, 3.14723598e-04, 2.36042698e-04],
       [1.86577692e-01, 8.13846441e-02, 1.55579328e-02, ...,
        4.30815294e-03, 2.30928521e-03, 1.73196391e-03],
       ...,
       [4.34109190e-01, 2.07528655e-01, 1.12947724e-01, ...,
        0.00000000e+00, 4.13070817e-04, 3.09803112e-04],
       [3.48602292e-01, 1.40129296e-01, 6.73415036e-02, ...,
        3.71182091e-03, 1.94662680e-03, 1.45997010e-03],
       [2.48839731e+00, 7.14736905e-01, 4.08414332e-01, ...,
        1.81425136e-02, 9.36237290e-03, 7.02177967e-03]])

In [9]:
# 학습용에 나타나지 않는 사용자나 영화의 예측 평갓값은 평균 평갓값으로 한다
average_score = movielens_train.rating.mean()
movie_rating_predict = movielens_test.copy()
pred_results = []
for i, row in movielens_test.iterrows():
    user_id = row["user_id"]
    if user_id not in user_id2index or row["movie_id"] not in movie_id2index:
        pred_results.append(average_score)
        continue
    user_index = user_id2index[row["user_id"]]
    movie_index = movie_id2index[row["movie_id"]]
    pred_score = pred_matrix[user_index, movie_index]
    pred_results.append(pred_score)
movie_rating_predict["rating_pred"] = pred_results

In [10]:
from collections import defaultdict

# 각 사용자에 대한 추천 영화는 해당 사용자가 아직 평가하지 않은 영화 중에서 예측값이 높은 순으로 한다
pred_user2items = defaultdict(list)
user_evaluated_movies = movielens_train.groupby("user_id").agg({"movie_id": list})["movie_id"].to_dict()
for user_id in movielens_train.user_id.unique():
    if user_id not in user_id2index:
        continue
    user_index = user_id2index[row["user_id"]]
    movie_indexes = np.argsort(-pred_matrix[user_index, :])
    for movie_index in movie_indexes:
        movie_id = user_movie_matrix.columns[movie_index]
        if movie_id not in user_evaluated_movies[user_id]:
            pred_user2items[user_id].append(movie_id)
        if len(pred_user2items[user_id]) == 10:
            break
pred_user2items

defaultdict(list,
            {139: [4993, 5952, 4226, 7153, 4306, 3996, 6539, 5445, 6874, 4963],
             149: [4226, 2571, 2762, 47, 3996, 318, 6539, 5445, 4963, 1704],
             182: [47, 6874, 2329, 3793, 4011, 7361, 4878, 7438, 6333, 2502],
             215: [2329, 527, 6711, 3147, 2683, 8360, 5669, 6502, 3948, 1246],
             281: [2959, 4993, 5952, 4226, 2571, 2858, 7153, 3578, 2762, 4306],
             326: [2959, 4993, 5952, 4226, 2571, 2858, 7153, 3578, 2762, 47],
             351: [2959, 4993, 5952, 4226, 7153, 3578, 2762, 47, 4306, 3996],
             357: [2959, 4993, 5952, 4226, 2858, 7153, 3578, 2762, 296, 47],
             426: [2502,
              33794,
              3949,
              7147,
              33493,
              5902,
              44191,
              33166,
              4848,
              6016],
             456: [2959, 4993, 5952, 4226, 2571, 2858, 7153, 3578, 2762, 4306],
             459: [318, 4886, 2997, 50, 4995, 4973, 7361, 4878, 3

In [11]:
# user_id=2의 사용자가 학습 데이터에 평가를 부여한 영화 목록
movielens_train[movielens_train.user_id==2]

Unnamed: 0,user_id,movie_id,rating,timestamp,title,genre,tag,timestamp_rank
4732,2,110,5.0,868245777,Braveheart (1995),"[Action, Drama, War]","[bullshit history, medieval, bloodshed, hero, ...",8.0
5246,2,260,5.0,868244562,Star Wars: Episode IV - A New Hope (a.k.a. Sta...,"[Action, Adventure, Sci-Fi]","[desert, quotable, lucas, gfei own it, seen mo...",17.0
5798,2,590,5.0,868245608,Dances with Wolves (1990),"[Adventure, Drama, Western]","[afi 100, lame, native, biopic, american india...",11.0
6150,2,648,2.0,868244699,Mission: Impossible (1996),"[Action, Adventure, Mystery, Thriller]","[confusing, confusing plot, memorable sequence...",12.0
6531,2,733,3.0,868244562,"Rock, The (1996)","[Action, Adventure, Thriller]","[gfei own it, alcatraz, nicolas cage, sean con...",18.0
6813,2,736,3.0,868244698,Twister (1996),"[Action, Adventure, Romance, Thriller]","[disaster, disaster, storm, bill paxton, helen...",13.0
7113,2,780,3.0,868244698,Independence Day (a.k.a. ID4) (1996),"[Action, Adventure, Sci-Fi, War]","[action, alien invasion, aliens, will smith, a...",14.0
7506,2,786,3.0,868244562,Eraser (1996),"[Action, Drama, Thriller]","[arnold schwarzenegger, action, arnold, arnold...",19.0
7661,2,802,2.0,868244603,Phenomenon (1996),"[Drama, Romance]","[interesting concept, own, john travolta, john...",15.0
7779,2,858,2.0,868245645,"Godfather, The (1972)","[Crime, Drama]","[oscar (best picture), marlon brando, classic,...",9.0


In [12]:
pred_user2items[2]

[2959, 4993, 5952, 4226, 2571, 2858, 7153, 3578, 2762, 296]

In [13]:
# user_id=2에 대한 추천(2959, 4993, 5952)
movies[movies.movie_id.isin([2959, 4993, 5952])]

Unnamed: 0,movie_id,title,genre,tag
2874,2959,Fight Club (1999),"[Action, Crime, Drama, Thriller]","[based on a book, chuck palahniuk, edward nort..."
4899,4993,"Lord of the Rings: The Fellowship of the Ring,...","[Action, Adventure, Fantasy]","[based on a book, big budget, new zealand, sce..."
5852,5952,"Lord of the Rings: The Two Towers, The (2002)","[Action, Adventure, Fantasy]","[based on a book, big budget, new zealand, sce..."
