### Требуется построить модель рекомендаций на основе скрытых факторов (implicit) на основе dataset’а https://grouplens.org/datasets/hetrec-2011/ (Delicious Bookmarks)”

In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm_notebook

import scipy.sparse as sparse

%matplotlib inline
import warnings
warnings.simplefilter('ignore')
from implicit.als import AlternatingLeastSquares

In [8]:
raw_data = pd.read_table('user_taggedbookmarks-timestamps.dat',sep='\\n',delimiter='\\t')

In [11]:
raw_data.drop(raw_data.columns[3],axis=1, inplace=True)

In [13]:
raw_data.head(1)

Unnamed: 0,userID,bookmarkID,tagID
0,8,1,1


In [25]:
# tags per bookmark and unique user
raw_data = raw_data.groupby(['userID','bookmarkID'],as_index=False)['tagID'].count()
raw_data.rename(columns={'tagID':'tags_number'},inplace=True)

In [26]:
raw_data.head()

Unnamed: 0,userID,bookmarkID,tags_number
0,8,1,1
1,8,2,1
2,8,7,1
3,8,8,1
4,8,9,1


In [None]:
# remove null rows

In [27]:
data = raw_data.dropna()

In [49]:
# range unique userID
unique_id = range(len(set(data['userID'].unique())))

In [58]:
# range unique bookmarkID
unique_id_book = range(len(set(data['bookmarkID'].unique())))

In [64]:
# dictionaries
userID_lookup = dict(zip(unique_id,data['userID'].unique()))
bookmarkID_lookup = dict(zip(unique_id_book,data['bookmarkID'].unique()))

In [77]:
# make dictionary to match unique user ID with user number
user_dic = {user:number for number,user in userID_lookup.items()}

In [None]:
# make dictionary to match unique bookmarkID with bookmark number

In [80]:
bookmark_dic = {book:number for number,book in bookmarkID_lookup.items()}

In [81]:
# apply map function to update data
data['userID'] = data['userID'].map(user_dic)
data['bookmarkID'] = data['bookmarkID'].map(bookmark_dic)

In [82]:
data.head()

Unnamed: 0,userID,bookmarkID,tags_number
0,0,0,1
1,0,1,1
2,0,2,1
3,0,3,1
4,0,4,1


In [88]:
url_id = pd.read_table('bookmarks.dat',sep='\\n', delimiter='\\t')

In [90]:
url_id.head(1)

Unnamed: 0,id,md5,title,url,md5Principal,urlPrincipal
0,1,ab4954b633ddaf5b5bba6e9b71aa6b70,IFLA - The official website of the Internation...,http://www.ifla.org/,7f431306c428457bc4e12b15634484f,www.ifla.org


In [91]:
# remove useless columns
url_id.drop(['md5','md5Principal'],axis=1,inplace=True)

In [92]:
url_id.head(1)

Unnamed: 0,id,title,url,urlPrincipal
0,1,IFLA - The official website of the Internation...,http://www.ifla.org/,www.ifla.org


In [93]:
ids = list(np.sort(data['userID'].unique()))
urls = list(np.sort(data['bookmarkID'].unique()))

In [95]:
rows = data['userID']
cols = data['bookmarkID']
tags_num = data['tags_number']

In [96]:
len(tags_num), len(cols), len(rows)

(104799, 104799, 104799)

In [97]:
data_sparse = sparse.csr_matrix((tags_num, (cols, rows)), shape=(len(urls), len(ids)))

In [98]:
model = AlternatingLeastSquares(factors=50)
model.fit(data_sparse)

100%|██████████| 15.0/15 [00:32<00:00,  2.17s/it]


In [101]:
userid = 345
print('рекомендации для пользователя {}'.format(user_id_lookup[userid]))
recommendations = model.recommend(userid, data_sparse)

рекомендации для пользователя 13191


In [109]:
recommendations

[(5932, 0.2477181),
 (6623, 0.20902392),
 (6430, 0.19734591),
 (19992, 0.18046561),
 (19989, 0.1755175),
 (20006, 0.1748682),
 (19969, 0.17420708),
 (19970, 0.17194818),
 (19990, 0.17123522),
 (19968, 0.17106175)]

In [126]:
list_rec = []
for r in recommendations:
    list_rec.append(bookmarkID_lookup[(r[0])])

In [127]:
list_rec

[8294, 9320, 8981, 29335, 29331, 29352, 29308, 29309, 29332, 29306]

In [128]:
# what we will recommend user 13191
url_id.loc[list_rec]

Unnamed: 0,id,title,url,urlPrincipal
8294,11905,9 Super Annoying Twitter Personality Types,http://www.socialtimes.com/2010/10/annoying-tw...,www.socialtimes.com
9320,13291,Body By Victoria - Secure Computing: Sec-C,http://www.hackerfactor.com/blog/index.php?/ar...,www.hackerfactor.com
8981,12756,» EuroIA 10 report: day 2 Johnny Holland – It&...,http://johnnyholland.org/2010/09/25/euroia-10-...,johnnyholland.org
29335,43663,Hire Art: Five Artists on What It Means to Wor...,http://www.good.is/post/hire-art-five-artists-...,www.good.is
29331,43659,Jailed Chinese Dissident&#039;s &#039;Final St...,http://thelede.blogs.nytimes.com/2010/10/08/ja...,thelede.blogs.nytimes.com
29352,43687,Berenstain Bears . Games | PBS Kids,http://pbskids.org/berenstainbears/games/story...,pbskids.org
29308,43618,A Hidden Genius at the Apple Store / Duncan Da...,http://duncandavidson.com/blog/2010/10/apple-s...,duncandavidson.com
29309,43619,The making of Ephemicropolis on Vimeo,http://vimeo.com/10875342,vimeo.com
29332,43660,HOW TO: Export Your Illustrator Images to HTML...,http://mashable.com/2010/10/12/how-to-export-y...,mashable.com
29306,43614,blekko | /about,http://blekko.com/ws/+/about?h=1,blekko.com


In [131]:
itemid = 2233
print('закладки схожие с {}'.format(bookmark_id_lookup[itemid]))

закладки схожие с 3116


In [132]:
related = model.similar_items(itemid)

In [133]:
related

[(2208, 0.10685859),
 (2234, 0.10685859),
 (2209, 0.10685859),
 (2217, 0.10685859),
 (2236, 0.10685859),
 (2237, 0.10685859),
 (2229, 0.10685859),
 (2211, 0.10685859),
 (2210, 0.10685859),
 (2212, 0.10685859)]

In [135]:
list_rec = []
for r in related:
    list_rec.append(bookmarkID_lookup[(r[0])])

In [136]:
list_rec

[3082, 3119, 3083, 3093, 3125, 3126, 3111, 3086, 3085, 3087]

In [137]:
# what is related to bookmark 3116
url_id.loc[list_rec]

Unnamed: 0,id,title,url,urlPrincipal
3082,4415,The Mansurovs,http://mansurovs.com/,mansurovs.com
3119,4455,Security Lessons Learned From The Diaspora Lau...,http://www.kalzumeus.com/2010/09/22/security-l...,www.kalzumeus.com
3083,4416,17 Useful Htaccess Tricks and Tips,http://www.queness.com/post/5421/17-useful-hta...,www.queness.com
3093,4427,Chocomoko,http://www.chocomoko.com/sweetfm,www.chocomoko.com
3125,4463,ericmiraglia.com » The 2010 Web Application (R...,http://ericmiraglia.com/blog/?p=425,ericmiraglia.com
3126,4464,Chris Shiflett: Twitter OAuth,http://shiflett.org/blog/2010/sep/twitter-oauth,shiflett.org
3111,4446,Why we don&#039;t have a parent selector - Sno...,http://snook.ca/archives/html_and_css/css-pare...,snook.ca
3086,4420,What Happened to Yahoo,http://paulgraham.com/yahoo.html,paulgraham.com
3085,4418,Firesheep countermeasure tool BlackSheep,http://www.net-security.org/secworld.php?id=10118,www.net-security.org
3087,4421,Avoiding Cache Stampedes with Pseudo-locks | P...,http://purplerockscissors.com/php-development/...,purplerockscissors.com
