In [14]:
# pip install scikit-surprise
import surprise

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

## 1. 데이터 로드 

In [5]:
# Load the built-in MovieLens 100k ratings through Surprise.
# On a machine without a cached copy this asks for confirmation before downloading.
data = surprise.Dataset.load_builtin(name='ml-100k')

Dataset ml-100k could not be found. Do you want to download it? [Y/n] Y
Trying to download dataset from http://files.grouplens.org/datasets/movielens/ml-100k.zip...
Done! Dataset ml-100k has been saved to C:\Users\okso6/.surprise_data/ml-100k


In [8]:
# Tabulate the raw rating tuples and keep only the user / item / rate columns.
# NOTE(review): the fourth field is labelled "id" here; Surprise documents it as
# a timestamp -- confirm before reusing that column.
df = pd.DataFrame(
    data.raw_ratings,
    columns=["user", "item", "rate", "id"],
).drop(columns="id")
df.head(3)

Unnamed: 0,user,item,rate
0,196,242,3.0
1,186,302,3.0
2,22,377,1.0
3,244,51,2.0
4,166,346,1.0
5,298,474,4.0
6,115,265,2.0
7,253,465,5.0
8,305,451,3.0
9,6,86,3.0


## 2. 전처리 & EDA 

In [9]:
# Reshape the long ratings table into a user x item grid of "rate" values:
# users become the row index, items are spread across the columns, and
# (user, item) pairs without a rating end up as NaN.
df_table = df.set_index(["user", "item"]).unstack(level="item")
df_table.shape

(943, 1682)

In [10]:
# Peek at the first two users; the grid is sparse, so most cells are NaN.
df_table.iloc[:2]

Unnamed: 0_level_0,rate,rate,rate,rate,rate,rate,rate,rate,rate,rate,rate,rate,rate,rate,rate,rate,rate,rate,rate,rate,rate
item,1,10,100,1000,1001,1002,1003,1004,1005,1006,...,990,991,992,993,994,995,996,997,998,999
user,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
1,5.0,3.0,5.0,,,,,,,,...,,,,,,,,,,
10,4.0,,5.0,,,,,,,,...,,,,,,,,,,


### 결측치 처리 

In [11]:
# Show a 10 x 9 window of the grid with missing ratings blanked out.
# fillna returns a copy, so df_table itself keeps its NaN values.
df_table.iloc[212:222, 808:817].fillna(value="")

Unnamed: 0_level_0,rate,rate,rate,rate,rate,rate,rate,rate,rate
item,211,212,213,214,215,216,217,218,219
user,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
290,3.0,,,,,4.0,,2.0,
291,,4.0,,4.0,4.0,,,4.0,4.0
292,,,,3.0,,,,,
293,4.0,,3.0,,4.0,4.0,3.0,2.0,
294,,,,,,,,,
295,,,5.0,,5.0,5.0,4.0,5.0,
296,4.0,,,,,,,,
297,4.0,,3.0,,2.0,4.0,,3.0,
298,5.0,,3.0,,5.0,,,,
299,4.0,4.0,5.0,,,5.0,,,


## 3. 학습
### 베이스라인 모형

In [19]:
from surprise.model_selection import cross_validate

# Baseline (bias-only) model. The biases are fitted with ALS, so the options
# below are the ALS ones (regularization weights and epoch count); SGD-style
# options such as a learning rate do not apply to this configuration.
bsl_options = {
    'method': 'als',
    'n_epochs': 5,
    'reg_u': 12,  # regularization weight for users (default: 15)
    'reg_i': 5,   # regularization weight for items (default: 10)
}
algo = surprise.BaselineOnly(bsl_options=bsl_options)

# Seed the 5-fold split so every run (and every model evaluated with the same
# seed) is scored on identical folds; unseeded folds make the scores
# non-reproducible and the model comparison noisy.
folds = surprise.model_selection.KFold(n_splits=5, random_state=0)
cross_validate(algo, data, cv=folds)

Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...


{'test_rmse': array([0.9384446 , 0.94651657, 0.93612815, 0.94221861, 0.94428787]),
 'test_mae': array([0.74477853, 0.75124267, 0.73975393, 0.745764  , 0.74659098]),
 'fit_time': (0.11500716209411621,
  0.14100313186645508,
  0.15200018882751465,
  0.15700125694274902,
  0.1489582061767578),
 'test_time': (0.07503962516784668,
  0.13699960708618164,
  0.07204055786132812,
  0.147996187210083,
  0.10202622413635254)}

### CF (협업 필터링)
#### NN (최근접 이웃) — MSD 유사도

In [20]:
# k-NN collaborative filtering using the 'msd' similarity measure.
sim_options = {'name': 'msd'}
algo = surprise.KNNBasic(sim_options=sim_options)

# Fixed seed -> the same 5 folds on every run, so the mean MAE is reproducible
# and comparable with other models evaluated on the same seeded split.
folds = surprise.model_selection.KFold(n_splits=5, random_state=0)
cross_validate(algo, data, cv=folds)["test_mae"].mean()

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.


0.7726801901092284

#### 코사인 유사도

In [21]:
# k-NN collaborative filtering using the 'cosine' similarity measure.
sim_options = {'name': 'cosine'}
algo = surprise.KNNBasic(sim_options=sim_options)

# Fixed seed -> the same 5 folds on every run, so the mean MAE is reproducible
# and comparable with other models evaluated on the same seeded split.
folds = surprise.model_selection.KFold(n_splits=5, random_state=0)
cross_validate(algo, data, cv=folds)["test_mae"].mean()

Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.


0.8046567723959086

#### 피어슨 유사도

In [22]:
# k-NN collaborative filtering using the 'pearson' similarity measure.
sim_options = {'name': 'pearson'}
algo = surprise.KNNBasic(sim_options=sim_options)

# Fixed seed -> the same 5 folds on every run, so the mean MAE is reproducible
# and comparable with other models evaluated on the same seeded split.
folds = surprise.model_selection.KFold(n_splits=5, random_state=0)
cross_validate(algo, data, cv=folds)["test_mae"].mean()

Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.


0.8032778978216127

### Latent Factor 
#### SVD 

In [23]:
# SVD matrix factorization with 100 latent factors.
# The factors are randomly initialised, so both the algorithm and the fold
# split are seeded; without this the reported MAE changes on every run.
algo = surprise.SVD(n_factors=100, random_state=0)
folds = surprise.model_selection.KFold(n_splits=5, random_state=0)
cross_validate(algo, data, cv=folds)["test_mae"].mean()

0.7382901597606505

#### NMF

In [24]:
# Non-negative matrix factorization with 100 latent factors.
# The factors are randomly initialised, so both the algorithm and the fold
# split are seeded; without this the reported MAE changes on every run.
algo = surprise.NMF(n_factors=100, random_state=0)
folds = surprise.model_selection.KFold(n_splits=5, random_state=0)
cross_validate(algo, data, cv=folds)["test_mae"].mean()

0.8376599766948564

참고 출처: https://datascienceschool.net/