In [1]:
import warnings
warnings.simplefilter('ignore')

import numpy as np
import pandas as pd

import matplotlib
import matplotlib.pyplot as plt
import matplotlib.cm as cm
from matplotlib import dates
%matplotlib inline
plt.rcParams["figure.figsize"] = [30, 4]
import seaborn as sns

import json
from tqdm import tqdm_notebook

from sklearn.metrics import mean_squared_error
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.base import BaseEstimator

from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer

In [2]:
import gzip

def parse(path):
  g = gzip.open(path, 'rb')
  for l in g:
    yield json.loads(l)

def getDF(path):
  i = 0
  df = {}
  for d in parse(path):
    df[i] = d
    i += 1
  return pd.DataFrame.from_dict(df, orient='index')

In [3]:
df = getDF('Video_Games_5.json.gz')

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 497577 entries, 0 to 497576
Data columns (total 12 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   overall         497577 non-null  float64
 1   verified        497577 non-null  bool   
 2   reviewTime      497577 non-null  object 
 3   reviewerID      497577 non-null  object 
 4   asin            497577 non-null  object 
 5   reviewerName    497501 non-null  object 
 6   reviewText      497419 non-null  object 
 7   summary         497468 non-null  object 
 8   unixReviewTime  497577 non-null  int64  
 9   vote            107793 non-null  object 
 10  style           289237 non-null  object 
 11  image           3634 non-null    object 
dtypes: bool(1), float64(1), int64(1), object(9)
memory usage: 46.0+ MB


In [5]:
df.sort_values(by=["unixReviewTime"], ignore_index=True, inplace=True)

In [6]:
df.head()

Unnamed: 0,overall,verified,reviewTime,reviewerID,asin,reviewerName,reviewText,summary,unixReviewTime,vote,style,image
0,5.0,False,"10 14, 1999",A2AXQTB83VMK4L,B0000296O5,Amazon Customer,I'm having the most fun I've ever had on PlayS...,Best RPG Ever!,939859200,,{'Format:': ' Video Game'},
1,4.0,False,"11 5, 1999",A2T04VAIXSKJH2,B00002NDRY,Stefan,I'm usually not crazy about real-time strategy...,Good real time strategy game,941760000,3.0,,
2,4.0,False,"11 10, 1999",AMGJMFJ63DWWH,B000021XYY,Ed Matuskey,"This game actually scared me a couple times, a...","A good game, but way too short!",942192000,10.0,,
3,5.0,False,"11 10, 1999",A1QA8K3LD9K892,B000021Y5F,Chris Adamson,Williams made games for hard-core arcade gamer...,A cool 80's artifact,942192000,68.0,,
4,5.0,True,"11 10, 1999",A3VWWQT4XDSBGQ,B00000K4AX,Joshua W. Fenton,"If you loved Half-Life, this is a must buy. I ...",AWESOME!,942192000,5.0,,


In [7]:
data = df[["overall", "reviewerID", "asin", "unixReviewTime"]]

In [8]:
data.rename(columns={"overall": "rating", "reviewerID": "userId", "asin": "itemId", "unixReviewTime": "timestamp"}, inplace=True)

In [9]:
data = data.iloc[-50000 :, :]

In [10]:
data.head()

Unnamed: 0,rating,userId,itemId,timestamp
447577,5.0,AUOQA5SOYSCI5,B00NMST9G8,1483574400
447578,5.0,A2A3L1JBYLBTS8,B004RMK57U,1483574400
447579,5.0,AEKTRRHPH99MI,B017GY07L4,1483574400
447580,1.0,A8LY6O6PNNMT2,B0149HT55K,1483574400
447581,5.0,A2CMJSSIIYG8TN,B00CJXYR3W,1483574400


In [11]:
df = getDF('meta_Video_Games.json.gz')

In [12]:
df = df[["asin", "title"]]
df.rename(columns={"asin": "itemId"}, inplace=True)

In [13]:
data = data.merge(df, how='left', on="itemId")
data.drop_duplicates(["userId", "itemId", "timestamp"], ignore_index=True, inplace=True)

In [14]:
data.head()

Unnamed: 0,rating,userId,itemId,timestamp,title
0,5.0,AUOQA5SOYSCI5,B00NMST9G8,1483574400,Xbox One Kinect Sensor with Dance Central Spot...
1,5.0,A2A3L1JBYLBTS8,B004RMK57U,1483574400,Playstation Plus: 3 Month Membership [Digital ...
2,5.0,AEKTRRHPH99MI,B017GY07L4,1483574400,Nights of Azure - PlayStation 4
3,1.0,A8LY6O6PNNMT2,B0149HT55K,1483574400,Mega Man Legacy Collection - PS4 [Digital Code]
4,5.0,A2CMJSSIIYG8TN,B00CJXYR3W,1483574400,The Evil Within - PC


In [15]:
data.loc[:, "rating"].value_counts()

5.0    33551
4.0     6994
3.0     3919
1.0     3053
2.0     1924
Name: rating, dtype: int64

In [16]:
data.loc[:, "userId"].value_counts()

AV4L1ENY6YOCM     64
A328B6CD2BRMG9    64
A2FDY4Y7DAUR7H    54
A31H1ECKRMHRRQ    52
A3TAU1P95J1PGX    52
                  ..
A3Q8X9MNQTADUC     1
A3G4BFD7FN8VF1     1
A36Y6O6893JQAT     1
A334J1SZQ4HY9S     1
A1TCZLL90BWIWH     1
Name: userId, Length: 15065, dtype: int64

In [17]:
data.loc[:, ["itemId"]].value_counts()

itemId    
B00ZQC73O8    506
B004RMK57U    357
B01GW3POY0    257
B00BGA9X9W    225
B00LCHZRIK    209
             ... 
B0025UMW4A      1
B0025P9I9C      1
B0025KZV8E      1
B00255V7JQ      1
B004YVC1CG      1
Length: 9873, dtype: int64

Разбиваем данные на тренировочные и тестовые.

In [18]:
def train_test_split(X, ratio=0.2, user_col='userId', item_col='itemId',
                     rating_col='rating', time_col='timestamp'):
    # список всех юзеров
    userIds = X[user_col].unique()

    X_train = X[[user_col, item_col]].copy()
    y_train = X[rating_col].values.copy()

    idxs_test = []
    for userId in tqdm_notebook(userIds):
        curUser = X[X[user_col] == userId]
        if curUser.shape[0] < 3: continue
        # определяем позицию, по которой делим выборку и размещаем данные по массивам
        idx = int(curUser.shape[0] * (1 - ratio))

        idxs_test += list(curUser.index[idx :].values)

    idxs_test.sort()

    y_train[idxs_test] = 0.                                                      # Зануляем рейтинги тестовых данных

    X_test = X[[user_col, item_col]].iloc[idxs_test, :].copy()
    y_test = X[rating_col].iloc[idxs_test].values.copy()

    return X_train, X_test, y_train, y_test, idxs_test

In [19]:
X_train, X_test, y_train, y_test, idxs_test = train_test_split(data, 0.2)

  0%|          | 0/15065 [00:00<?, ?it/s]

In [20]:
X_train.shape, len(y_train), X_test.shape, len(y_test)

((49441, 2), 49441, (9802, 2), 9802)

**Collaborative filtering.**

**User-based, item-based models.**

In [21]:
class UserItemBased(BaseEstimator):
    def __init__(self, isUserBasedModel):
        super().__init__()
        self.isUserBasedModel = isUserBasedModel

    def fit(self, X, y, user_col='userId', item_col='itemId'):
        X = X.copy()
        X['rating'] = y

        self.userIds = list(X[user_col].unique())
        self.itemIds = list(X[item_col].unique())
        ratings = X[[user_col, item_col, "rating"]].groupby([user_col, item_col]).mean().to_dict()["rating"]
        
        # Создаем разряженный массив рейтингов
        self.user_item_ratings = np.zeros((len(self.userIds), len(self.itemIds)))
        # Заполняем массив
        for key in list(ratings.keys()):
            userId, itemId = key
            userIdd = self.userIds.index(userId)    # Цифровой идентификатор пользователя
            itemIdd = self.itemIds.index(itemId)    # Цифровой идентификатор товара

            self.user_item_ratings[userIdd, itemIdd] = ratings[key]

        if not self.isUserBasedModel: self.user_item_ratings = np.transpose(self.user_item_ratings)       # Транспонируем разряженный массив

        # Массив рейтингов минус средний рейтинг по каждому пользователю (товару)
        arr_counts = np.count_nonzero(self.user_item_ratings, axis=1)
        arr_counts[arr_counts == 0] = 1
        self.user_item_ratings_mean = np.sum(self.user_item_ratings, axis=1) / arr_counts

        self.user_item_ratings_without_mean = self.user_item_ratings - self.user_item_ratings_mean[:, np.newaxis]
        self.user_item_ratings_without_mean[self.user_item_ratings == 0.] = 0.   # 0 - значит нет значения, поэтому обнуляем ячейки, где изначально был 0

        # Cчитаем попарную схожесть между юзерами (товарами)
        self.similarity = cosine_similarity(self.user_item_ratings_without_mean)

        return self

    def predict_rating(self, userId, itemId):
        # если в обучающей выборке нет такого предмета или пользователя, то вернём 0
        if not (userId in self.userIds) or not (itemId in self.itemIds): return 0.

        userIdd = self.userIds.index(userId)    # Цифровой идентификатор пользователя
        itemIdd = self.itemIds.index(itemId)    # Цифровой идентификатор товара

        if self.isUserBasedModel:
            rowId = userIdd; colId = itemIdd
        else:
            rowId = itemIdd; colId = userIdd

        if np.sum(np.abs(self.similarity[rowId, :])) == 1.: return 0.

        return self.user_item_ratings_mean[rowId] + \
               np.dot(self.similarity[rowId, :], self.user_item_ratings_without_mean[:, colId]) / (np.sum(np.abs(self.similarity[rowId, :])) - 1.)
    
    def predict(self, X, user_col='userId', item_col='itemId'):
        y = X[[user_col, item_col]].apply(lambda row: self.predict_rating(row[0], row[1]), axis=1).values
        return y

In [22]:
rmse = lambda y_true, y_pred: np.sqrt(mean_squared_error(y_true, y_pred))

**User-based model.**

In [23]:
uib = UserItemBased(isUserBasedModel=True).fit(X_train, y_train)

y_pred_ub = uib.predict(X_test)

print('rmse = {}'.format(rmse(y_test, y_pred_ub)))

rmse = 1.0440882803245444


**Item-based model.**

In [24]:
uib = UserItemBased(isUserBasedModel=False).fit(X_train, y_train)

y_pred = uib.predict(X_test)

print('rmse = {}'.format(rmse(y_test, y_pred)))

rmse = 1.792067635152466


Видим, что модель user-based хорошего качества, а item-based не очень.

In [25]:
pred_data = data.iloc[idxs_test, :].copy()

In [26]:
pred_data["pred_rating"] = y_pred_ub

In [27]:
data.loc[:, "userId"].value_counts()[150:160]

A34WYPF54VM9YP    17
AR7XJL057YUYT     17
A10JO6RV8UFYBV    17
A1KHY5UZ047RH0    17
A2E9BLUVKD88OV    17
A3119W966RPMKQ    17
AQPJOQ6R91OI6     17
A37TNAKEF9SMBX    17
AEX8BTN3JNTWH     17
ANBSEQJQOSUQG     17
Name: userId, dtype: int64

Рекомендуемые товары для пользователя с указанным идентификатором.

In [28]:
userId = "A39OON9L4QLPCF"

In [29]:
data_user = pred_data[pred_data["userId"] == userId]
data_user[data_user["pred_rating"] >= 4]

Unnamed: 0,rating,userId,itemId,timestamp,title,pred_rating
24576,5.0,A39OON9L4QLPCF,B00DC7G2W8,1499644800,Mario Kart 8 - Nintendo Wii U,4.818202
24581,3.0,A39OON9L4QLPCF,B014GEE6FK,1499644800,Nintendo Selects: Super Mario 3D World - Wii U...,4.818182
24583,5.0,A39OON9L4QLPCF,B01AC3ZD06,1499644800,Nintendo Selects: Super Mario 3D World,4.824349


Купленные товары пользователем с указанным  идентификатором.

In [30]:
data_ = data[data["userId"] == userId].copy()

data_.drop(labels = data_user.index, axis = 0, inplace=True)

data_[data_["rating"] >= 4.]

Unnamed: 0,rating,userId,itemId,timestamp,title
18564,5.0,A39OON9L4QLPCF,B0158TCVCC,1493942400,Super Mario Maker Console Deluxe Set - Nintend...
24485,5.0,A39OON9L4QLPCF,B00LCHZRIK,1499644800,The Legend of Zelda: Breath of the Wild - Wii U
24504,5.0,A39OON9L4QLPCF,B00KWG4HG0,1499644800,Mario Party 10
24516,5.0,A39OON9L4QLPCF,B0046EC9ZK,1499644800,Wii Remote Plus (Red)
24517,5.0,A39OON9L4QLPCF,B002BFVAV0,1499644800,Retro Bit - Retrolink USB SEGA Saturn Classic...
24524,5.0,A39OON9L4QLPCF,B014GEF6QI,1499644800,Super Mario Maker - Wii U [Digital Code]
24548,5.0,A39OON9L4QLPCF,B013EISYRU,1499644800,"LEGO Dimensions, Exclusive Ninjago Jay Fun Pac..."
24553,5.0,A39OON9L4QLPCF,B014KXME60,1499644800,Resident Evil Origins Collection - Xbox One St...
24560,5.0,A39OON9L4QLPCF,B00PHDC224,1499644800,PowerA Wired Mini Controllers For Xbox One
24575,5.0,A39OON9L4QLPCF,B00DD0B0BM,1499644800,Super Smash Bros. - Nintendo Wii U


In [31]:
data_[data_["rating"] < 4.]

Unnamed: 0,rating,userId,itemId,timestamp,title
24573,3.0,A39OON9L4QLPCF,B01GW3LR8M,1499644800,Dead Rising 4 - Xbox One


Видно, что рекомендуется игры, в которые он еще не играл и они по смыслу совпадают с уже проигранными, т.е. он играл в Марио и ему рекомендуются другие разновидности игр с Марио.

**Контентная фильтрация.**

In [32]:
cont_data = data.copy()
cont_data.drop_duplicates(subset=['title'], inplace=True)

In [33]:
tfidf = TfidfVectorizer(analyzer='word', stop_words='english')

titles = cont_data['title']

titles = titles.apply(lambda text: ' '.join([st for st in str(text).split(' ') if not st.isdigit()]))  # Удаляем все цифры

tfidf_matrix = tfidf.fit_transform(titles)

In [34]:
cosine_similarities = cosine_similarity(tfidf_matrix)

In [35]:
similarities = {}
for i in range(len(cosine_similarities)):
    # сортировка по схожести и запись индексов
    similar_indices = cosine_similarities[i].argsort()[:-50:-1] 
    # 50 самых похожих товаров
    similarities[cont_data['itemId'].iloc[i]] = [(cosine_similarities[i][x], cont_data.iloc[x]['title'], cont_data.iloc[x]['itemId']) for x in similar_indices][1:]

In [36]:
class ContentBasedRecommender:
    def __init__(self, matrix):
        self.matrix_similar = matrix

    def _print_message(self, item, recom_item):
        rec_items = len(recom_item)
        
        print(f'The {rec_items} recommended items are:')
        for i in range(rec_items):
            print(f"Number {i+1}:")
            print(f"{recom_item[i][1]} with {round(recom_item[i][0], 3)} similarity score") 
            print("--------------------")
        
    def recommend(self, recommendation, verbose = True):
        # Выбрать товар для поиска рекомендаций
        item = recommendation['item']
        # Выбрать число товаров для рекомендаций
        number_items = recommendation['item_number']
        # Выбрать товары, наиболее похожее из матрицы
        recom_item = self.matrix_similar[item][:number_items]
        # распечатать
        if verbose:
            self._print_message(item=item, recom_item=recom_item)
        return recom_item

In [37]:
recommedations = ContentBasedRecommender(similarities)

In [38]:
itemId = "B01GW3POY0"

In [39]:
data[data["itemId"] == itemId].head()

Unnamed: 0,rating,userId,itemId,timestamp,title
3774,5.0,A1AWYWB3FIPZEB,B01GW3POY0,1485216000,Resident Evil 7: Biohazard - PS4 Digital Code
3780,5.0,AM5Z8E4KPGL0K,B01GW3POY0,1485216000,Resident Evil 7: Biohazard - PS4 Digital Code
3784,4.0,A4VF4V6A4W0H7,B01GW3POY0,1485216000,Resident Evil 7: Biohazard - PS4 Digital Code
3802,1.0,A1U86MCXJLE57H,B01GW3POY0,1485216000,Resident Evil 7: Biohazard - PS4 Digital Code
3809,5.0,AXIQ99RS1E2JW,B01GW3POY0,1485216000,Resident Evil 7: Biohazard - PS4 Digital Code


In [40]:
recommendation = {
    "item": itemId,
    "item_number": 10
}

recom_item = recommedations.recommend(recommendation)

The 10 recommended items are:
Number 1:
Resident Evil 7 Biohazard - Xbox One with 0.819 similarity score
--------------------
Number 2:
Resident Evil 7 Biohazard [Online Game Code] with 0.802 similarity score
--------------------
Number 3:
Resident Evil (PS4 HD Remaster) - PS4 [Digital Code] with 0.617 similarity score
--------------------
Number 4:
Resident Evil 5 Gold Edition (Digital) - PS3 [Digital Code] with 0.604 similarity score
--------------------
Number 5:
Resident Evil 4 with 0.596 similarity score
--------------------
Number 6:
Resident Evil 2 with 0.596 similarity score
--------------------
Number 7:
Resident Evil 6 with 0.596 similarity score
--------------------
Number 8:
Resident Evil Essentials (Resident Evil Code: Veronica X / Resident Evil Outbreak / Resident Evil 4) with 0.578 similarity score
--------------------
Number 9:
Resident Evil 6 - PlayStation 4 with 0.569 similarity score
--------------------
Number 10:
Resident Evil 6 - Playstation 3 with 0.569 similarit

Видно, что рекомендуемые игры являются разновидносями указанной игры.

**Collaborative filtering.**

**Алгоритм SVD.**

In [41]:
#!pip install surprise

In [42]:
from surprise import Dataset, SVD
from surprise import Reader
from surprise import accuracy
from surprise.model_selection import train_test_split

In [43]:
train_data = X_train.copy()
train_data["y"] = y_train

# A reader is still needed but only the rating_scale param is requiered.
reader = Reader(rating_scale=(1, 5))

# The columns must correspond to user id, item id and ratings (in that order).
trainset = Dataset.load_from_df(train_data, reader).build_full_trainset()

In [44]:
test_data = X_test.copy()
test_data["y"] = y_test

testset = list(test_data.itertuples(index=False))

In [45]:
# We'll use the famous SVD algorithm.
algo = SVD()

# Train the algorithm on the trainset, and predict ratings for the testset
algo.fit(trainset)
predictions = algo.test(testset)

# Then compute RMSE
accuracy.rmse(predictions)

RMSE: 2.4415


2.4414873771098224

Полученная модель плохого качества.

In [54]:
# A reader is still needed but only the rating_scale param is requiered.
reader = Reader(rating_scale=(1, 5))

# The columns must correspond to user id, item id and ratings (in that order).
dataset = Dataset.load_from_df(data[["userId", "itemId", "rating"]], reader)

In [55]:
trainset, testset = train_test_split(dataset, test_size=0.2)

In [56]:
# Train the algorithm on the trainset, and predict ratings for the testset
algo.fit(trainset)
predictions = algo.test(testset)

# Then compute RMSE
accuracy.rmse(predictions)

RMSE: 1.0808


1.0807911585799033

Если разбить на тренировочные и тестовые данные средствами самой библиотеки, то получается модель хорошего качества.