# Обучим и протестируем модель

In [2]:
!pip install lightfm

Collecting lightfm
  Downloading lightfm-1.16.tar.gz (310 kB)
[K     |████████████████████████████████| 310 kB 1.5 MB/s eta 0:00:01
Building wheels for collected packages: lightfm
  Building wheel for lightfm (setup.py) ... [?25ldone
[?25h  Created wheel for lightfm: filename=lightfm-1.16-cp38-cp38-linux_x86_64.whl size=923738 sha256=4d5f7eaf50ba40dc0835af0301d57711876032bb5ee8e7d16751985243b4e2b3
  Stored in directory: /home/pavel/.cache/pip/wheels/ec/bb/51/9c487d021c1373b691d13cadca0b65b6852627b1f3f43550fa
Successfully built lightfm
Installing collected packages: lightfm
Successfully installed lightfm-1.16


In [3]:
import numpy as np
import pandas as pd
import scipy.sparse as sparse
from lightfm import LightFM
from lightfm.cross_validation import random_train_test_split
from lightfm.evaluation import precision_at_k, recall_at_k
import pickle

In [4]:
# Load the four raw CSV tables from the local `data/` directory in one pass.
# NOTE: `tags` is reassigned from data/tags_cleaned.csv in a later cell.
ratings, books, tags, book_tags = map(
    pd.read_csv,
    (
        'data/ratings.csv',
        'data/books.csv',
        'data/tags.csv',
        'data/book_tags.csv',
    ),
)

In [5]:
# Preview the first rows of the books table to inspect the available columns.
books.head()

Unnamed: 0,book_id,goodreads_book_id,best_book_id,work_id,books_count,isbn,isbn13,authors,original_publication_year,original_title,...,ratings_count,work_ratings_count,work_text_reviews_count,ratings_1,ratings_2,ratings_3,ratings_4,ratings_5,image_url,small_image_url
0,1,2767052,2767052,2792775,272,439023483,9780439000000.0,Suzanne Collins,2008.0,The Hunger Games,...,4780653,4942365,155254,66715,127936,560092,1481305,2706317,https://images.gr-assets.com/books/1447303603m...,https://images.gr-assets.com/books/1447303603s...
1,2,3,3,4640799,491,439554934,9780440000000.0,"J.K. Rowling, Mary GrandPré",1997.0,Harry Potter and the Philosopher's Stone,...,4602479,4800065,75867,75504,101676,455024,1156318,3011543,https://images.gr-assets.com/books/1474154022m...,https://images.gr-assets.com/books/1474154022s...
2,3,41865,41865,3212258,226,316015849,9780316000000.0,Stephenie Meyer,2005.0,Twilight,...,3866839,3916824,95009,456191,436802,793319,875073,1355439,https://images.gr-assets.com/books/1361039443m...,https://images.gr-assets.com/books/1361039443s...
3,4,2657,2657,3275794,487,61120081,9780061000000.0,Harper Lee,1960.0,To Kill a Mockingbird,...,3198671,3340896,72586,60427,117415,446835,1001952,1714267,https://images.gr-assets.com/books/1361975680m...,https://images.gr-assets.com/books/1361975680s...
4,5,4671,4671,245494,1356,743273567,9780743000000.0,F. Scott Fitzgerald,1925.0,The Great Gatsby,...,2683664,2773745,51992,86236,197621,606158,936012,947718,https://images.gr-assets.com/books/1490528560m...,https://images.gr-assets.com/books/1490528560s...


In [6]:
# Lookup table: Goodreads book id -> `book_id` column of books.csv.
mapper = {
    goodreads_id: book_id
    for goodreads_id, book_id in zip(books.goodreads_book_id, books.book_id)
}

In [7]:
# Keep only the (book, tag) rows whose tag is listed in tags_cleaned.csv.
tags = pd.read_csv('data/tags_cleaned.csv')
# .copy() makes the filtered frame independent of the original one, so adding
# a column below does not raise pandas' SettingWithCopyWarning.
book_tags = book_tags.loc[book_tags.tag_id.isin(tags.tag_id)].copy()
# Attach the books.csv `book_id` to every row; mapper[x] raises KeyError for an
# unknown Goodreads id instead of silently producing a missing value.
book_tags['id'] = book_tags.goodreads_book_id.apply(lambda x: mapper[x])

In [8]:
# Preview the filtered book-tag rows together with the newly added `id` column.
book_tags.head()

Unnamed: 0,goodreads_book_id,tag_id,count,id
1,1,11305,37174,27
4,1,33114,12716,27
5,1,11743,9954,27
6,1,14017,7169,27
10,1,27199,3857,27


In [9]:
# Interaction matrix: rows are user ids, columns are book ids, values are ratings.
ratings_coo = sparse.coo_matrix(
    (
        ratings.rating,
        (ratings.user_id, ratings.book_id),
    )
)

# Item-feature matrix: rows are book ids, columns are tag ids, with a 1 for
# every (book, tag) row of book_tags.
# NOTE(review): both shapes are inferred from the largest ids present; confirm
# the number of rows here matches the number of columns of ratings_coo.
feature_ratings = sparse.coo_matrix(
    (
        [1] * len(book_tags),
        (book_tags.id, book_tags.tag_id),
    )
)

Объявим вспомогательные константы для обучения модели:

In [10]:
# Number of CPU threads passed to LightFM. Set to 1 because, per the original
# author's note, lightfm is installed without OpenMP on macOS.
NUM_THREADS = 1

# Dimensionality of the latent vectors (passed to LightFM as `no_components`)
NUM_COMPONENTS = 60

# Number of training epochs
NUM_EPOCHS = 10 

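На этапе создания модели мы используем библиотеку LightFM, чтобы сделать матричное разложение наших рейтингов книг и получить два набора векторов: для пользователей и для книг. Обратите внимание: LightFM обучается стохастическим градиентным спуском (здесь — с функцией потерь WARP), а не методом ALS.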

In [11]:
#Создаём модель
model = LightFM(learning_rate=0.05, loss='warp', no_components=NUM_COMPONENTS)
 
#Разбиваем наш датасет на обучающую и тестовую выборки
train, test = random_train_test_split(ratings_coo, test_percentage=0.2, random_state=None)

#Обучаем модель
model = model.fit(train, epochs=NUM_EPOCHS, num_threads=NUM_THREADS,item_features =feature_ratings)

In [12]:
import datetime
# Print the current wall-clock time as a record of when this cell was run.
print(datetime.datetime.now())

2022-03-02 12:06:11.464171


Протестируем модель

In [13]:
#Тестируем нашу модель
# Evaluate on the held-out interactions. Both metrics use the same settings,
# so the shared keyword arguments are defined once.
metric_kwargs = dict(
    num_threads=NUM_THREADS,
    k=10,
    item_features=feature_ratings,
)

# Mean precision@10 and recall@10 over the test users
precision_score = precision_at_k(model, test, **metric_kwargs).mean()
recall_score = recall_at_k(model, test, **metric_kwargs).mean()

# Printed order: recall first, then precision
print(recall_score, precision_score)

0.04034600709208142 0.08727304


Сохраним модель

In [14]:
# Serialize the trained model to model.pkl using the highest pickle protocol
# available in this Python version.
with open('model.pkl', 'wb') as model_file:
    pickle.dump(
        model,
        model_file,
        protocol=pickle.HIGHEST_PROTOCOL,
    )

# Добавим эмбеддинги к модели и посмотрим, что получилось

In [15]:
# Restore the model saved above.
# NOTE: pickle.load can execute arbitrary code; only load files you trust.
with open('model.pkl', 'rb') as model_file:
    model = pickle.load(model_file)

In [16]:
# Достаём эбмеддинги
# Extract the item representations computed from the book-tag feature matrix:
# a bias per item and an embedding vector per item.
item_biases, item_embeddings = model.get_item_representations(features=feature_ratings)

In [18]:
!pip install nmslib

Collecting nmslib
  Downloading nmslib-2.1.1-cp38-cp38-manylinux2010_x86_64.whl (13.4 MB)
[K     |████████████████████████████████| 13.4 MB 3.1 MB/s eta 0:00:01
Collecting pybind11<2.6.2
  Downloading pybind11-2.6.1-py2.py3-none-any.whl (188 kB)
[K     |████████████████████████████████| 188 kB 3.7 MB/s eta 0:00:01
Installing collected packages: pybind11, nmslib
Successfully installed nmslib-2.1.1 pybind11-2.6.1


In [19]:
import nmslib

In [20]:
#Создаём наш граф для поиска
nms_idx = nmslib.init(method='hnsw', space='cosinesimil')
 
#Начинаем добавлять наши книги в граф
nms_idx.addDataPointBatch(item_embeddings)
nms_idx.createIndex(print_progress=True)

In [21]:
#Вспомогательная функция для поиска по графу
def nearest_books_nms(book_id, index, n=10):
    """Query `index` for the `n` nearest neighbours of one book.

    The query vector is row `book_id` of the module-level `item_embeddings`.

    Args:
        book_id: Row position of the book in `item_embeddings`.
        index: Search index exposing a `knnQuery(vector, k=...)` method.
        n: Number of neighbours to request.

    Returns:
        Whatever `index.knnQuery` returns; the notebook takes element [0] of
        the result as the neighbour ids.
    """
    query_vector = item_embeddings[book_id]
    return index.knnQuery(query_vector, k=n)

Найдем id книги «The Silence of the Lambs» («Молчание ягнят»)

In [26]:
# Case-sensitive literal substring search on the original title; rows with a
# missing title never match. Show at most two hits.
books[books.original_title.str.contains('Lambs', regex=False, na=False)].head(2)

Unnamed: 0,book_id,goodreads_book_id,best_book_id,work_id,books_count,isbn,isbn13,authors,original_publication_year,original_title,...,ratings_count,work_ratings_count,work_text_reviews_count,ratings_1,ratings_2,ratings_3,ratings_4,ratings_5,image_url,small_image_url
208,209,23807,23807,22533,187,99446782,9780099000000.0,Thomas Harris,1988.0,The Silence of the Lambs,...,351107,366112,3866,10268,12845,55427,123652,163920,https://images.gr-assets.com/books/1390426249m...,https://images.gr-assets.com/books/1390426249s...


Теперь найдем все похожие книги и посмотрим на них

In [27]:
# 209 is the book_id of "The Silence of the Lambs" found in the previous cell.
# Element [0] of the query result holds the neighbour ids.
nbm = nearest_books_nms(book_id=209, index=nms_idx)[0]

In [28]:
# Show author and title of every neighbour. Rows come out in the order of the
# books table, not in order of similarity.
books.loc[books.book_id.isin(nbm), ['authors', 'original_title']]

Unnamed: 0,authors,original_title
208,Thomas Harris,The Silence of the Lambs
226,John Grisham,The Client
430,Thomas Harris,Red Dragon
767,Dennis Lehane,Shutter Island
1175,Dennis Lehane,Mystic River
1484,James Ellroy,The Black Dahlia
1801,Thomas Harris,Hannibal
3261,Patricia Highsmith,The Talented Mr. Ripley
5312,Scott B. Smith,A Simple Plan
9792,Patricia Highsmith,Strangers on a Train


Сохраним эмбеддинги

In [25]:
# Serialize the item embeddings to item_embeddings.pkl using the highest
# pickle protocol available in this Python version.
with open('item_embeddings.pkl', 'wb') as embeddings_file:
    pickle.dump(
        item_embeddings,
        embeddings_file,
        protocol=pickle.HIGHEST_PROTOCOL,
    )

# Прототип на Streamlit

In [31]:
!pip install streamlit

Collecting streamlit
  Downloading streamlit-1.6.0-py2.py3-none-any.whl (9.7 MB)
[K     |████████████████████████████████| 9.7 MB 5.2 MB/s eta 0:00:01
Collecting pympler>=0.9
  Downloading Pympler-1.0.1-py3-none-any.whl (164 kB)
[K     |████████████████████████████████| 164 kB 5.4 MB/s eta 0:00:01
Collecting altair>=3.2.0
  Downloading altair-4.2.0-py3-none-any.whl (812 kB)
[K     |████████████████████████████████| 812 kB 5.2 MB/s eta 0:00:01
[?25hCollecting base58
  Downloading base58-2.1.1-py3-none-any.whl (5.6 kB)
Collecting validators
  Downloading validators-0.18.2-py3-none-any.whl (19 kB)
Collecting semver
  Downloading semver-2.13.0-py2.py3-none-any.whl (12 kB)
Collecting astor
  Downloading astor-0.8.1-py2.py3-none-any.whl (27 kB)
Collecting tzlocal
  Downloading tzlocal-4.1-py3-none-any.whl (19 kB)
Collecting gitpython!=3.1.19
  Downloading GitPython-3.1.27-py3-none-any.whl (181 kB)
[K     |████████████████████████████████| 181 kB 4.5 MB/s eta 0:00:01
[?25hCollecting bli

Collecting backports.zoneinfo
  Downloading backports.zoneinfo-0.2.1-cp38-cp38-manylinux1_x86_64.whl (74 kB)
[K     |████████████████████████████████| 74 kB 2.4 MB/s eta 0:00:01
[?25hCollecting pytz-deprecation-shim
  Downloading pytz_deprecation_shim-0.1.0.post0-py2.py3-none-any.whl (15 kB)
Collecting tzdata
  Downloading tzdata-2021.5-py2.py3-none-any.whl (339 kB)
[K     |████████████████████████████████| 339 kB 4.6 MB/s eta 0:00:01
[?25hBuilding wheels for collected packages: blinker
  Building wheel for blinker (setup.py) ... [?25ldone
[?25h  Created wheel for blinker: filename=blinker-1.4-py3-none-any.whl size=13451 sha256=187aa17426faf718dbc61752ee7df805482a7d70a1cb7f1c92c4bf37411cfb4d
  Stored in directory: /home/pavel/.cache/pip/wheels/b7/a5/68/fe632054a5eadd531c7a49d740c50eb6adfbeca822b4eab8d4
Successfully built blinker
Installing collected packages: tzdata, smmap, backports.zoneinfo, pytz-deprecation-shim, gitdb, validators, tzlocal, semver, pympler, pydeck, pyarrow, gi

In [32]:
import streamlit as st
import numpy as np
import pandas as pd
import lightfm as lf
import nmslib
import pickle
import scipy.sparse as sparse

In [33]:
def nearest_books_nms(book_id, index, n=10):
    """Query `index` for the `n` nearest neighbours of one book.

    The query vector is row `book_id` of the module-level `item_embeddings`.

    Args:
        book_id: Row position of the book in `item_embeddings`.
        index: Search index exposing a `knnQuery(vector, k=...)` method.
        n: Number of neighbours to request.

    Returns:
        The result of `index.knnQuery` for that vector (the query result, not
        the index itself).
    """
    query_vector = item_embeddings[book_id]
    return index.knnQuery(query_vector, k=n)

In [34]:
def get_names(index):
    """Build a display string (title and author) for each book id.

    Titles are looked up in the module-level `name_mapper` and authors in the
    module-level `author_mapper`.

    Args:
        index: Iterable of book ids.

    Returns:
        list[str]: One formatted string per id, in input order.
    """
    return [
        'Book name:  {} '.format(name_mapper[book_idx])
        + '  Book Author: {}'.format(author_mapper[book_idx])
        for book_idx in index
    ]

In [35]:
def read_files(folder_name='data'):
    """Read the ratings and books tables and lower-case the book titles.

    Args:
        folder_name: Directory that contains `ratings.csv` and `books.csv`.

    Returns:
        tuple: (ratings, books) DataFrames; `books.title` is lower-cased.
    """
    ratings_path = folder_name + '/ratings.csv'
    books_path = folder_name + '/books.csv'

    ratings = pd.read_csv(ratings_path)
    # Overwrite the existing `title` column with its lower-cased version
    books = pd.read_csv(books_path).assign(
        title=lambda frame: frame.title.str.lower()
    )
    return ratings, books

In [36]:
def make_mappers():
    """Create lookup dicts from book id to title and from book id to authors.

    Reads the module-level `books` DataFrame.

    Returns:
        tuple: (name_mapper, author_mapper), both keyed by `book_id`.
    """
    book_ids = books.book_id
    title_by_id = dict(zip(book_ids, books.title))
    authors_by_id = dict(zip(book_ids, books.authors))
    return title_by_id, authors_by_id

In [37]:
def load_embeddings():
    """Load the pickled item embeddings and build a kNN index over them.

    Reads `item_embeddings.pkl` from the current working directory, then
    creates an nmslib HNSW index in cosine-similarity space that contains
    every loaded embedding.

    Returns:
        tuple: (item_embeddings, nms_idx): the unpickled embeddings and the
        nmslib index built from them.
    """
    # NOTE: pickle.load can execute arbitrary code; only load files you trust.
    with open('item_embeddings.pkl', 'rb') as embeddings_file:
        vectors = pickle.load(embeddings_file)

    # nmslib provides the fast nearest-neighbour search
    knn_index = nmslib.init(method='hnsw', space='cosinesimil')
    knn_index.addDataPointBatch(vectors)
    knn_index.createIndex(print_progress=True)

    return vectors, knn_index

In [38]:
#Загружаем данные
# Load the data: the tables, the id -> title/author lookups, and the
# embeddings together with their nearest-neighbour index.
ratings, books  = read_files(folder_name='data') 
name_mapper, author_mapper = make_mappers()
item_embeddings, nms_idx = load_embeddings()

2022-03-02 13:19:37.180 INFO    nmslib: M                   = 16
2022-03-02 13:19:37.180 INFO    nmslib: indexThreadQty      = 6
2022-03-02 13:19:37.181 INFO    nmslib: efConstruction      = 200
2022-03-02 13:19:37.181 INFO    nmslib: maxM			          = 16
2022-03-02 13:19:37.182 INFO    nmslib: maxM0			          = 32
2022-03-02 13:19:37.182 INFO    nmslib: mult                = 0.360674
2022-03-02 13:19:37.183 INFO    nmslib: skip_optimized_index= 0
2022-03-02 13:19:37.183 INFO    nmslib: delaunay_type       = 2
2022-03-02 13:19:37.183 INFO    nmslib: Set HNSW query-time parameters:
2022-03-02 13:19:37.184 INFO    nmslib: ef(Search)         =20
2022-03-02 13:19:37.184 INFO    nmslib: algoType           =2
2022-03-02 13:19:37.387 INFO    nmslib: 
The vector space is CosineSimilarity
2022-03-02 13:19:37.387 INFO    nmslib: Vector length=60
2022-03-02 13:19:37.388 INFO    nmslib: searchMethod			  = 3
2022-03-02 13:19:37.389 INFO    nmslib: Making optimized index
2022-03-02 13:19:37.392 I

---

Теперь нужно в командной строке написать:

In [43]:
streamlit run app.py

SyntaxError: invalid syntax (<ipython-input-43-718866ff34b9>, line 1)