In [1]:
import numpy as np
import pandas as pd
import scipy.sparse as sparse
from lightfm import LightFM
from lightfm.cross_validation import random_train_test_split
from lightfm.evaluation import precision_at_k, recall_at_k

# Load the raw tables from the local data/ directory, in the order listed.
# NOTE(review): 'tags_cleaned.xls' is parsed with read_csv, so despite its
# extension it is presumably plain CSV text — confirm.
ratings, books, tags, tags_cleaned, book_tags = (
    pd.read_csv(csv_path)
    for csv_path in (
        'data/ratings.csv',
        'data/books.csv',
        'data/tags.csv',
        'data/tags_cleaned.xls',
        'data/book_tags.csv',
    )
)

In [2]:
# Summary statistics for every column of `books` (include='all' covers the
# non-numeric columns too), transposed so each row describes one column.
books.describe(include='all').transpose()

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
book_id,10000,,,,5000.5,2886.9,1.0,2500.75,5000.5,7500.25,10000.0
goodreads_book_id,10000,,,,5264700.0,7575460.0,1.0,46275.8,394966.0,9382230.0,33288600.0
best_book_id,10000,,,,5471210.0,7827330.0,1.0,47911.8,425124.0,9636110.0,35534200.0
work_id,10000,,,,8646180.0,11751100.0,87.0,1008840.0,2719520.0,14517700.0,56399600.0
books_count,10000,,,,75.7127,170.471,1.0,23.0,40.0,67.0,3455.0
isbn,9300,9300.0,590431978,1.0,,,,,,,
isbn13,9415,,,,9755040000000.0,442862000000.0,195170000.0,9780320000000.0,9780450000000.0,9780830000000.0,9790010000000.0
authors,10000,4664.0,Stephen King,60.0,,,,,,,
original_publication_year,9979,,,,1981.99,152.577,-1750.0,1990.0,2004.0,2011.0,2017.0
original_title,9415,9274.0,,5.0,,,,,,,


In [3]:
# Display the books table; pandas shortens long frames in the rendered output.
books
# Leftover exploratory snippets, kept disabled:
# book_tags.tag_id.count
# tags_cleaned
# tags.tag_id.count

Unnamed: 0,book_id,goodreads_book_id,best_book_id,work_id,books_count,isbn,isbn13,authors,original_publication_year,original_title,...,ratings_count,work_ratings_count,work_text_reviews_count,ratings_1,ratings_2,ratings_3,ratings_4,ratings_5,image_url,small_image_url
0,1,2767052,2767052,2792775,272,439023483,9.780439e+12,Suzanne Collins,2008.0,The Hunger Games,...,4780653,4942365,155254,66715,127936,560092,1481305,2706317,https://images.gr-assets.com/books/1447303603m...,https://images.gr-assets.com/books/1447303603s...
1,2,3,3,4640799,491,439554934,9.780440e+12,"J.K. Rowling, Mary GrandPré",1997.0,Harry Potter and the Philosopher's Stone,...,4602479,4800065,75867,75504,101676,455024,1156318,3011543,https://images.gr-assets.com/books/1474154022m...,https://images.gr-assets.com/books/1474154022s...
2,3,41865,41865,3212258,226,316015849,9.780316e+12,Stephenie Meyer,2005.0,Twilight,...,3866839,3916824,95009,456191,436802,793319,875073,1355439,https://images.gr-assets.com/books/1361039443m...,https://images.gr-assets.com/books/1361039443s...
3,4,2657,2657,3275794,487,61120081,9.780061e+12,Harper Lee,1960.0,To Kill a Mockingbird,...,3198671,3340896,72586,60427,117415,446835,1001952,1714267,https://images.gr-assets.com/books/1361975680m...,https://images.gr-assets.com/books/1361975680s...
4,5,4671,4671,245494,1356,743273567,9.780743e+12,F. Scott Fitzgerald,1925.0,The Great Gatsby,...,2683664,2773745,51992,86236,197621,606158,936012,947718,https://images.gr-assets.com/books/1490528560m...,https://images.gr-assets.com/books/1490528560s...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,9996,7130616,7130616,7392860,19,441019455,9.780441e+12,Ilona Andrews,2010.0,Bayou Moon,...,17204,18856,1180,105,575,3538,7860,6778,https://images.gr-assets.com/books/1307445460m...,https://images.gr-assets.com/books/1307445460s...
9996,9997,208324,208324,1084709,19,067973371X,9.780680e+12,Robert A. Caro,1990.0,Means of Ascent,...,12582,12952,395,303,551,1737,3389,6972,https://s.gr-assets.com/assets/nophoto/book/11...,https://s.gr-assets.com/assets/nophoto/book/50...
9997,9998,77431,77431,2393986,60,039330762X,9.780393e+12,Patrick O'Brian,1977.0,The Mauritius Command,...,9421,10733,374,11,111,1191,4240,5180,https://images.gr-assets.com/books/1455373531m...,https://images.gr-assets.com/books/1455373531s...
9998,9999,8565083,8565083,13433613,7,61711527,9.780062e+12,Peggy Orenstein,2011.0,Cinderella Ate My Daughter: Dispatches from th...,...,11279,11994,1988,275,1002,3765,4577,2375,https://images.gr-assets.com/books/1279214118m...,https://images.gr-assets.com/books/1279214118s...


In [4]:
# Lookup table: goodreads_book_id -> book_id, both taken from `books`.
mapper = dict(zip(books.goodreads_book_id, books.book_id))

# Reuse the cleaned tag list already loaded as `tags_cleaned` instead of
# parsing the same file from disk a second time. A copy keeps `tags`
# independent of `tags_cleaned`, as it was when read separately.
tags = tags_cleaned.copy()

# Keep only the tag assignments whose tag_id appears in the cleaned list and
# attach the mapped book id. .assign() builds a fresh DataFrame, so no column
# is written into a filtered slice (which triggers SettingWithCopyWarning)
# and re-running the cell produces the same frame.
book_tags = (
    book_tags[book_tags.tag_id.isin(tags.tag_id)]
    .assign(id=lambda frame: frame.goodreads_book_id.map(mapper))
)

# .map() leaves NaN for ids that are absent from `mapper`; fail loudly in that
# case, as the previous per-row dict lookup did.
assert book_tags['id'].notna().all(), \
    'book_tags contains goodreads_book_id values that are missing from books'

book_tags

Unnamed: 0,goodreads_book_id,tag_id,count,id
1,1,11305,37174,27
4,1,33114,12716,27
5,1,11743,9954,27
6,1,14017,7169,27
10,1,27199,3857,27
...,...,...,...,...
999877,33288638,9886,10,8892
999879,33288638,3358,10,8892
999880,33288638,1679,10,8892
999889,33288638,1659,9,8892


In [5]:
# Interaction matrix: rows are indexed by user_id, columns by book_id, and the
# stored values are the ratings.
ratings_coo = sparse.coo_matrix((ratings.rating, (ratings.user_id, ratings.book_id)))

# Item-feature matrix: rows are indexed by the mapped book id, columns by
# tag_id, with a 1 for every (book, tag) assignment.
# The shape is given explicitly so the matrix always has at least as many rows
# as the interaction matrix has item columns. Without it the row count is
# inferred from the largest tagged book id, and would come out too small if
# the highest-numbered books had no tags left after filtering.
n_item_rows = max(ratings_coo.shape[1], int(book_tags.id.max()) + 1)
n_tag_cols = int(book_tags.tag_id.max()) + 1
feature_ratings = sparse.coo_matrix(
    (np.ones(len(book_tags), dtype=int), (book_tags.id, book_tags.tag_id)),
    shape=(n_item_rows, n_tag_cols),
)

In [6]:
# Training configuration.
NUM_THREADS = 8      # number of CPU threads to use
NUM_COMPONENTS = 30  # number of components in each embedding vector
NUM_EPOCHS = 10      # number of training epochs

In [7]:
# Fixed seed so that the train/test split and the model initialisation are
# repeatable after a kernel restart (previously both were unseeded).
# NOTE(review): training with several threads may still not be bit-for-bit
# reproducible — confirm if exact repeatability matters.
RANDOM_SEED = 42

# Create the model.
model = LightFM(
    learning_rate=0.05,
    loss='warp',
    no_components=NUM_COMPONENTS,
    random_state=RANDOM_SEED,
)

# Split the interactions into training and test sets (20% held out for test).
train, test = random_train_test_split(
    ratings_coo,
    test_percentage=0.2,
    random_state=np.random.RandomState(RANDOM_SEED),
)

# Train the model on the training interactions, using the item features.
model = model.fit(
    train,
    epochs=NUM_EPOCHS,
    num_threads=NUM_THREADS,
    item_features=feature_ratings,
    verbose=True,
)

Epoch 0
Epoch 1
Epoch 2
Epoch 3
Epoch 4
Epoch 5
Epoch 6
Epoch 7
Epoch 8
Epoch 9


In [8]:
import pickle

# Serialize the trained model to 'savefile.pickle' with the newest pickle
# protocol available.
# NOTE: only unpickle files you trust — loading a pickle can run arbitrary code.
with open('savefile.pickle', mode='wb') as model_file:
    pickle.dump(model, model_file, protocol=pickle.HIGHEST_PROTOCOL)

In [9]:
# Evaluate the model on the test interactions.
# Both metrics take the same settings, so they are written once.
eval_kwargs = dict(
    num_threads=NUM_THREADS,
    k=10,
    item_features=feature_ratings,
)

prec_score = precision_at_k(model, test, **eval_kwargs).mean()

# The result gets its own name: assigning it to `recall_at_k` would replace
# the imported function with a number, so any later call — including simply
# re-running this cell — would fail.
recall_score = recall_at_k(model, test, **eval_kwargs).mean()

print(recall_score, prec_score)

0.037623900663425476 0.08167303


In [10]:
# Extract the item representations from the trained model, computed through
# the item-feature matrix. The call yields two values: biases and embeddings.
item_representations = model.get_item_representations(features=feature_ratings)
item_biases, item_embeddings = item_representations

In [14]:
import nmslib

# Create the search graph: HNSW method in the 'cosinesimil' space.
nms_idx = nmslib.init(space='cosinesimil', method='hnsw')

# Add all book embeddings to the graph, then build the index.
nms_idx.addDataPointBatch(item_embeddings)
nms_idx.createIndex(print_progress=True)

In [15]:
# Helper for searching the graph.
def nearest_books_nms(book_id, index, n=10, embeddings=None):
    """Query a nearest-neighbour index for items close to one book.

    Args:
        book_id: Key of the query vector inside `embeddings` (a row position
            when `embeddings` is an array).
        index: Search index exposing `knnQuery(vector, k=...)`.
        n: Number of neighbours to request.
        embeddings: Container of item vectors indexable by `book_id`.
            When omitted, the notebook-level `item_embeddings` is used, which
            keeps existing three-argument calls working unchanged.

    Returns:
        The value returned by `index.knnQuery`, unmodified. Callers in this
        notebook take element [0] of it as the neighbour ids.
    """
    if embeddings is None:
        # Fall back to the notebook-level embeddings.
        embeddings = item_embeddings
    return index.knnQuery(embeddings[book_id], k=n)

# Проверяем модель

In [16]:
# Let's produce recommendations for a book, for example George Orwell's "1984".
# Search the original titles for '1984'; rows without a title never match.
matches_1984 = books[books.original_title.str.contains('1984', regex=False, na=False)]

# The book used as the query has id 846. The search result used to be
# discarded (only a cell's last expression is displayed), which left the
# hard-coded id unchecked; now it is verified against the search.
QUERY_BOOK_ID = 846
assert QUERY_BOOK_ID in matches_1984.book_id.values, \
    'book_id 846 is not among the titles containing 1984'

# Look up similar books; element [0] of the query result is used as their ids.
nbm = nearest_books_nms(QUERY_BOOK_ID, nms_idx)[0]

# Show the similar books.
books[books.book_id.isin(nbm)]

Unnamed: 0,book_id,goodreads_book_id,best_book_id,work_id,books_count,isbn,isbn13,authors,original_publication_year,original_title,...,ratings_count,work_ratings_count,work_text_reviews_count,ratings_1,ratings_2,ratings_3,ratings_4,ratings_5,image_url,small_image_url
12,13,5470,5470,153313,995,451524934,9780452000000.0,"George Orwell, Erich Fromm, Celâl Üster",1949.0,Nineteen Eighty-Four,...,1956832,2053394,45518,41845,86425,324874,692021,908229,https://images.gr-assets.com/books/1348990566m...,https://images.gr-assets.com/books/1348990566s...
13,14,7613,7613,2207778,896,452284244,9780452000000.0,George Orwell,1945.0,Animal Farm: A Fairy Story,...,1881700,1982987,35472,66854,135147,433432,698642,648912,https://images.gr-assets.com/books/1424037542m...,https://images.gr-assets.com/books/1424037542s...
54,55,5129,5129,3204877,515,60929871,9780061000000.0,Aldous Huxley,1932.0,Brave New World,...,1022601,1079135,20095,26367,60328,219895,389379,383166,https://images.gr-assets.com/books/1487389574m...,https://images.gr-assets.com/books/1487389574s...
78,79,1381,1381,3356006,1703,143039954,9780143000000.0,"Homer, Robert Fagles, E.V. Rieu, Frédéric Mugl...",-720.0,Ὀδύσσεια,...,670326,710757,8101,29703,65629,183082,224120,208223,https://images.gr-assets.com/books/1390173285m...,https://images.gr-assets.com/books/1390173285s...
845,846,5472,5472,2966408,51,151010269,9780151000000.0,"George Orwell, Christopher Hitchens",1950.0,Animal Farm & 1984,...,116197,118761,1293,1212,3276,16511,40583,57179,https://images.gr-assets.com/books/1327959366m...,https://images.gr-assets.com/books/1327959366s...
902,903,667,667,287946,460,452281253,9780452000000.0,Ayn Rand,1938.0,Anthem,...,95620,106766,7096,6095,10982,27984,34074,27631,https://s.gr-assets.com/assets/nophoto/book/11...,https://s.gr-assets.com/assets/nophoto/book/50...
2141,2142,1375,1375,1474309,255,147712556,9780148000000.0,"Homer, Robert Fagles, Bernard Knox",-762.0,Ἰλιάς ; Ὀδύσσεια,...,47825,51098,537,916,2608,10439,17404,19731,https://s.gr-assets.com/assets/nophoto/book/11...,https://s.gr-assets.com/assets/nophoto/book/50...
6154,6155,989313,989313,19248724,7,1593080212,9781593000000.0,"Joseph Conrad, A. Michael Matin",1899.0,,...,18873,19392,192,1061,1793,4477,5759,6302,https://images.gr-assets.com/books/1328851164m...,https://images.gr-assets.com/books/1328851164s...
6767,6768,616828,616828,1785260,64,451526570,9780452000000.0,Joseph Conrad,1910.0,Heart of Darkness,...,13649,15143,739,1110,1867,4127,4552,3487,https://s.gr-assets.com/assets/nophoto/book/11...,https://s.gr-assets.com/assets/nophoto/book/50...


In [34]:
# Recommendations for "The Silence of the Lambs": take the book_id of the
# first row whose original title contains that text, query the index, and
# list the titles of the returned neighbours.
title_mask = books.original_title.str.contains(
    'The Silence of the Lambs', regex=False, na=False
)
book_id = books.loc[title_mask, 'book_id'].values[0]
nbm = nearest_books_nms(book_id, nms_idx)[0]
books.loc[books.book_id.isin(nbm), 'original_title']

208     The Silence of the Lambs
273                The Godfather
430                   Red Dragon
767               Shutter Island
1484            The Black Dahlia
1801                    Hannibal
4421             Hannibal Rising
5312               A Simple Plan
9792        Strangers on a Train
Name: original_title, dtype: object