# Нормализация рейтингов

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
r_df = pd.read_csv("/Users/tural/Datasets/ml-20m/ratings.csv")
r_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,2,3.5,1112486027
1,1,29,3.5,1112484676
2,1,32,3.5,1112484819
3,1,47,3.5,1112484727
4,1,50,3.5,1112484580


# Минимальное количество рейтингов для пользователя и объекта

In [7]:
def clean_df(df, min_rating_per_uid, min_rating_per_iid):
    """Функция удаляет из df строки, соответствующие пользователям и объектам, 
    у которых меньше min_review_per_uid и min_review_per_iid отзывов соответственно
    """
    _df = df.copy()
    while True:
        rating_per_uid = _df.groupby("userId").rating.count()
        bad_uids = rating_per_uid[rating_per_uid < min_rating_per_uid].index
    
        rating_per_iid = _df.groupby("movieId").rating.count()
        bad_iids = rating_per_iid[rating_per_iid < min_rating_per_iid].index
        
        if bad_uids.shape[0] > 0 or bad_iids.shape[0] > 0:
            _df = _df[(~_df.userId.isin(bad_uids)) & (~_df.movieId.isin(bad_iids))]
        else:
            break
    return _df

In [9]:
print(r_df.shape)
r_df = clean_df(r_df, 10, 10)
print(r_df.shape)

(19984024, 4)
(19964833, 4)


## Построение матрицы

In [10]:
from scipy.sparse import csr_matrix

def load_data(df):
    rows = []
    cols = []
    data = []
    
    uid_to_row = {}
    iid_to_col = {}
    
    for t in df.itertuples():
        row_id = uid_to_row.setdefault(t.userId, len(uid_to_row))
        col_id = iid_to_col.setdefault(t.movieId, len(iid_to_col))
        rating = t.rating
        
        rows.append(row_id)
        cols.append(col_id)
        data.append(rating)
        
    ui_m = csr_matrix((data, (rows, cols)))
    return ui_m, uid_to_row, iid_to_col

In [11]:
ui_m, uid_to_row, iid_to_col = load_data(r_df)

In [12]:
print("Density", ui_m.nnz / (ui_m.shape[0] * ui_m.shape[1]))
print("Max rating", ui_m.data.max())
print("Min rating", ui_m.data.min())
print("Shape", ui_m.shape)

Density 0.009329991891886498
Max rating 5.0
Min rating 0.5
Shape (138493, 15451)


## Mean-centering (User-based CF)

$h(r_{ui}) = r_{ui} - \bar{r}_u$

In [16]:
from sklearn.preprocessing import binarize

In [17]:
# мы хотим увидеть средний среди существующих рейтингов
user_avg_rating = ui_m.sum(axis=1) / binarize(ui_m).sum(axis=1)
user_avg_rating

matrix([[ 3.74285714],
        [ 4.        ],
        [ 4.12299465],
        ..., 
        [ 2.68181818],
        [ 4.09756098],
        [ 4.17292225]])

In [18]:
n_ui_m = ui_m - binarize(ui_m).multiply(user_avg_rating)

In [20]:
print("Density", n_ui_m.nnz / (n_ui_m.shape[0] * n_ui_m.shape[1]))
print("Max rating", n_ui_m.data.max())
print("Min rating", n_ui_m.data.min())
print("Shape", n_ui_m.shape)

Density 0.009310822839111887
Max rating 4.28571428571
Min rating -4.3932238193
Shape (138493, 15451)


In [21]:
ui_m[0].data

array([ 3.5,  3.5,  3.5,  3.5,  3.5,  3.5,  4. ,  4. ,  4. ,  4. ,  4. ,
        4. ,  4. ,  3.5,  3.5,  4. ,  3.5,  3.5,  3. ,  3.5,  3.5,  3.5,
        4. ,  4. ,  3.5,  3.5,  4. ,  4. ,  3.5,  3.5,  4.5,  4.5,  4. ,
        3. ,  3.5,  4. ,  4. ,  3.5,  4. ,  3.5,  4. ,  3. ,  3.5,  4. ,
        4. ,  4. ,  3.5,  3.5,  4. ,  4. ,  3.5,  3. ,  4. ,  4. ,  3.5,
        3.5,  4. ,  3. ,  4. ,  4. ,  3. ,  3.5,  3.5,  3.5,  3.5,  4. ,
        3.5,  3.5,  4. ,  4. ,  4. ,  4. ,  4. ,  4. ,  4. ,  4. ,  4. ,
        3.5,  3.5,  4. ,  4. ,  4. ,  4. ,  3.5,  3.5,  3.5,  3.5,  3.5,
        3.5,  3. ,  4. ,  3.5,  4. ,  3.5,  4. ,  3.5,  4. ,  4. ,  3.5,
        3. ,  3.5,  4. ,  4. ,  3.5,  3.5,  3.5,  4. ,  4. ,  4. ,  4. ,
        3. ,  4. ,  3.5,  4. ,  4. ,  3.5,  4. ,  3. ,  3.5,  4. ,  3.5,
        4. ,  4. ,  3.5,  4. ,  3.5,  4. ,  4. ,  3. ,  3.5,  3.5,  5. ,
        4. ,  4. ,  3. ,  3.5,  4. ,  4. ,  3.5,  4. ,  4. ,  3.5,  5. ,
        3.5,  4. ,  3.5,  4. ,  3.5,  4. ,  4. ,  3

In [22]:
n_ui_m[0].data

array([-0.24285714, -0.24285714, -0.24285714, -0.24285714, -0.24285714,
       -0.24285714,  0.25714286,  0.25714286,  0.25714286,  0.25714286,
        0.25714286,  0.25714286,  0.25714286, -0.24285714, -0.24285714,
        0.25714286, -0.24285714, -0.24285714, -0.74285714, -0.24285714,
       -0.24285714, -0.24285714,  0.25714286,  0.25714286, -0.24285714,
       -0.24285714,  0.25714286,  0.25714286, -0.24285714, -0.24285714,
        0.75714286,  0.75714286,  0.25714286, -0.74285714, -0.24285714,
        0.25714286,  0.25714286, -0.24285714,  0.25714286, -0.24285714,
        0.25714286, -0.74285714, -0.24285714,  0.25714286,  0.25714286,
        0.25714286, -0.24285714, -0.24285714,  0.25714286,  0.25714286,
       -0.24285714, -0.74285714,  0.25714286,  0.25714286, -0.24285714,
       -0.24285714,  0.25714286, -0.74285714,  0.25714286,  0.25714286,
       -0.74285714, -0.24285714, -0.24285714, -0.24285714, -0.24285714,
        0.25714286, -0.24285714, -0.24285714,  0.25714286,  0.25

## Z-score (Item-based CF)

$\sigma_i = \sqrt{\frac{\sum_{u \in U_i}(r_{ui} - \bar{r}_i)^2}{|U_i|}}$

In [23]:
item_avg_rating = ui_m.sum(axis=0) / binarize(ui_m).sum(axis=0)
item_avg_rating

matrix([[ 3.2119768 ,  3.95223005,  3.89805469, ...,  3.55      ,
          4.04545455,  3.25      ]])

In [40]:
def get_sigmas(ui_m, item_avg_rating):
    n_ui_m = ui_m - binarize(ui_m).multiply(item_avg_rating)  # r_{ui} - \bar{r}_i
    n_ui_m.data = n_ui_m.data ** 2  # (r_{ui} - \bar{r}_i)^2
    n_ui_m_sum = n_ui_m.sum(axis=0)  # \sum(r_{ui} - \bar{r}_i)^2
    ratings_per_item = binarize(ui_m).sum(axis=0)  # |U_{i}|
    sigmas = np.sqrt(n_ui_m_sum / ratings_per_item)
    return sigmas

$h(r_{ui}) = \frac{r_{ui} - \bar{r}_i}{\sigma_i}$

In [41]:
sigmas = get_sigmas(ui_m, item_avg_rating)
sigmas

matrix([[ 0.95112853,  0.97558563,  0.86712575, ...,  0.47169906,
          0.44997704,  0.71589105]])

In [52]:
inv_sigmas = 1 / sigmas
z_score = (ui_m - binarize(ui_m).multiply(item_avg_rating)).multiply(inv_sigmas)

In [53]:
print("Density", z_score.nnz / (z_score.shape[0] * z_score.shape[1]))
print("Max rating", z_score.data.max())
print("Min rating", z_score.data.min())
print("Shape", z_score.shape)

Density 0.00932873900345702
Max rating 5.33020596972
Min rating -5.50088634047
Shape (138493, 15451)


## Проверка

In [54]:
i_ratings = ui_m.getcol(0).data
i_ratings

array([ 3.5,  3. ,  3. , ...,  3. ,  3. ,  4. ])

In [55]:
avg_rating = np.mean(i_ratings)
avg_rating

3.2119768016904193

In [56]:
sigma = np.std(i_ratings)
sigma

0.95112852894804079

In [57]:
(ui_m[0, 0] - avg_rating) / sigma

0.30282258342953677

In [58]:
z_score.tocsr()[0, 0]

0.3028225834295224