In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
r_df = pd.read_csv("/Users/tural/Datasets/ml-20m/ratings.csv")
r_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,2,3.5,1112486027
1,1,29,3.5,1112484676
2,1,32,3.5,1112484819
3,1,47,3.5,1112484727
4,1,50,3.5,1112484580


In [3]:
def clean_df(df, min_rating_per_uid, min_rating_per_iid):
    """Функция удаляет из df строки, соответствующие пользователям и объектам, 
    у которых меньше min_review_per_uid и min_review_per_iid отзывов соответственно
    """
    _df = df.copy()
    while True:
        rating_per_uid = _df.groupby("userId").rating.count()
        bad_uids = rating_per_uid[rating_per_uid < min_rating_per_uid].index
    
        rating_per_iid = _df.groupby("movieId").rating.count()
        bad_iids = rating_per_iid[rating_per_iid < min_rating_per_iid].index
        
        if bad_uids.shape[0] > 0 or bad_iids.shape[0] > 0:
            _df = _df[(~_df.userId.isin(bad_uids)) & (~_df.movieId.isin(bad_iids))]
        else:
            break
    return _df

In [4]:
print(r_df.shape)
r_df = clean_df(r_df, 20, 20)
print(r_df.shape)

(20000263, 4)
(19931488, 4)


In [5]:
from scipy.sparse import csr_matrix

def load_data(df):
    rows = []
    cols = []
    data = []
    
    uid_to_row = {}
    iid_to_col = {}
    
    for t in df.itertuples():
        row_id = uid_to_row.setdefault(t.userId, len(uid_to_row))
        col_id = iid_to_col.setdefault(t.movieId, len(iid_to_col))
        rating = t.rating
        
        rows.append(row_id)
        cols.append(col_id)
        data.append(rating)
        
    ui_m = csr_matrix((data, (rows, cols)))
    return ui_m, uid_to_row, iid_to_col

In [6]:
ui_m, uid_to_row, iid_to_col = load_data(r_df)
del r_df

In [7]:
print("Density", ui_m.nnz / (ui_m.shape[0] * ui_m.shape[1]))
print("Max rating", ui_m.data.max())
print("Min rating", ui_m.data.min())
print("Shape", ui_m.shape)

Density 0.010967655568293887
Max rating 5.0
Min rating 0.5
Shape (138408, 13130)


## Significance weightning

In [8]:
from sklearn.metrics.pairwise import cosine_similarity
ii_sim_m = cosine_similarity(ui_m.T, dense_output=False).tocsr()

In [9]:
print("Density", ii_sim_m.nnz / (ii_sim_m.shape[0] * ii_sim_m.shape[1]))
print("Max sim", ii_sim_m.data.max())
print("Min sim", ii_sim_m.data.min())
print("Shape", ii_sim_m.shape)

Density 0.8909167856266557
Max sim 1.0
Min sim 1.64361107679e-05
Shape (13130, 13130)


In [10]:
from sklearn.preprocessing import binarize
intersection_cnt = binarize(ui_m.T).dot(ui_m).tocsr()

In [11]:
ii_sim_m.nnz, intersection_cnt.nnz

(153591292, 153591292)

$w'_{ij} = \frac{min\{|U_{ij}|, \gamma\}}{\gamma}w_{ij}$

In [12]:
gamma = 40
intersection_cnt.data /= 40
intersection_cnt.data[intersection_cnt.data > 1] = 1

In [13]:
ii_sim_m = ii_sim_m.multiply(intersection_cnt)
del intersection_cnt

In [14]:
def nullify_main_diagonal(m):
    positions = range(m.shape[0])
    eye = csr_matrix((np.ones(len(positions)), (positions, positions)), m.shape)
    return m - m.multiply(eye)

In [15]:
ii_sim_m = nullify_main_diagonal(ii_sim_m)

In [16]:
print("Density", ii_sim_m.nnz / (ii_sim_m.shape[0] * ii_sim_m.shape[1]))
print("Max sim", ii_sim_m.data.max())
print("Min sim", ii_sim_m.data.min())
print("Shape", ii_sim_m.shape)

Density 0.8908406241643556
Max sim 0.889068217233
Min sim 2.05451384599e-07
Shape (13130, 13130)
