# **Recommendation Engine With Collaborative Filtering**


In [1]:
pip install scikit-surprise

Collecting scikit-surprise
  Downloading scikit_surprise-1.1.4.tar.gz (154 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/154.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m154.4/154.4 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (pyproject.toml) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.4-cp310-cp310-linux_x86_64.whl size=2357281 sha256=30f9e47ef028a603ddf110426eaf381e18c8b3b5607768f0c04859deba9aeaa2
  Stored in directory: /root/.cache/pip/wheels/4b/3f/df/6acbf0a40397d9bf3ff97f582cc22fb9ce66adde75bc71fd54
Successfully built scikit-surprise
Installing collected packages: scikit-surprise
Succe

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

## The Dataset

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
folder_dir = '/content/drive/MyDrive/Project/data rcm system'

In [5]:
animes = pd.read_csv(folder_dir + '/anime.csv')
animes.head()

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266


In [6]:
animes.describe()

Unnamed: 0,anime_id,rating,members
count,12294.0,12064.0,12294.0
mean,14058.221653,6.473902,18071.34
std,11455.294701,1.026746,54820.68
min,1.0,1.67,5.0
25%,3484.25,5.88,225.0
50%,10260.5,6.57,1550.0
75%,24794.5,7.18,9437.0
max,34527.0,10.0,1013917.0


In [7]:
animes.shape

(12294, 7)

In [8]:
ratings = pd.read_csv(folder_dir + '/rating.csv')
ratings.head()

Unnamed: 0,user_id,anime_id,rating
0,1,20,-1
1,1,24,-1
2,1,79,-1
3,1,226,-1
4,1,241,-1


In [9]:
ratings.describe()

Unnamed: 0,user_id,anime_id,rating
count,7813737.0,7813737.0,7813737.0
mean,36727.96,8909.072,6.14403
std,20997.95,8883.95,3.7278
min,1.0,1.0,-1.0
25%,18974.0,1240.0,6.0
50%,36791.0,6213.0,7.0
75%,54757.0,14093.0,9.0
max,73516.0,34519.0,10.0


In [10]:
ratings.shape

(7813737, 3)

In [11]:
ratings['user_id'].value_counts()

user_id
48766    10227
42635     3747
53698     2905
57620     2702
59643     2633
         ...  
11323        1
48775        1
11328        1
48743        1
46734        1
Name: count, Length: 73515, dtype: int64

In [12]:
ratings['user_id'].unique().shape

(73515,)

In [13]:
x = ratings['user_id'].value_counts() > 30 #ng rate trên 30 anime
# trả về True cho ng rate trên 30, ngược lại False
x[x].shape # boolean indexing
# dòng này lọc những giá trị True, sau đó lấy shape

(47883,)

In [14]:
x[x].head()

user_id
48766    True
42635    True
53698    True
57620    True
59643    True
Name: count, dtype: bool

In [15]:
y = x[x].index
y

Index([48766, 42635, 53698, 57620, 59643, 51693, 27364, 45659,  7345, 66021,
       ...
       26961, 26578, 36244, 64769, 45504, 61899,  2936, 21623, 33295, 38285],
      dtype='int64', name='user_id', length=47883)

In [16]:
ratings = ratings[ratings['user_id'].isin(y)]
# hàm isin dùng để filter & select data trong dataframe dựa trên dkien đã cho

In [17]:
ratings

Unnamed: 0,user_id,anime_id,rating
0,1,20,-1
1,1,24,-1
2,1,79,-1
3,1,226,-1
4,1,241,-1
...,...,...,...
7813730,73515,13659,8
7813731,73515,14345,7
7813732,73515,16512,7
7813733,73515,17187,9


In [18]:
ratings.shape

(7499111, 3)

## Merge anime with ratings via anime_id

In [19]:
ratings_with_anime = ratings.merge(animes, on='anime_id')

In [20]:
ratings_with_anime.head()

Unnamed: 0,user_id,anime_id,rating_x,name,genre,type,episodes,rating_y,members
0,1,20,-1,Naruto,"Action, Comedy, Martial Arts, Shounen, Super P...",TV,220,7.81,683297
1,3,20,8,Naruto,"Action, Comedy, Martial Arts, Shounen, Super P...",TV,220,7.81,683297
2,5,20,6,Naruto,"Action, Comedy, Martial Arts, Shounen, Super P...",TV,220,7.81,683297
3,6,20,-1,Naruto,"Action, Comedy, Martial Arts, Shounen, Super P...",TV,220,7.81,683297
4,21,20,8,Naruto,"Action, Comedy, Martial Arts, Shounen, Super P...",TV,220,7.81,683297


In [21]:
ratings_with_anime.shape

(7499101, 9)

In [22]:
# ratings/anime
import html

num_ratings = ratings_with_anime.groupby('name')['rating_x'].count().reset_index()
num_ratings['name'] = num_ratings['name'].apply(html.unescape) # xử lý lỗi ký tự "" trong tên anime
num_ratings.head()

# groupby: gom giá trị giống nhau lại (ở đây là 'name') rồi
# tính toán (count/mean/sum..) trên rating_x

Unnamed: 0,name,rating_x
0,"""0""",26
1,"""Aesop"" no Ohanashi yori: Ushi to Kaeru, Yokub...",1
2,"""Bungaku Shoujo"" Kyou no Oyatsu: Hatsukoi",778
3,"""Bungaku Shoujo"" Memoire",805
4,"""Bungaku Shoujo"" Movie",1523


In [23]:
num_ratings.rename(columns={'rating_x': 'num_of_ratings'}, inplace=True)
num_ratings.head()

Unnamed: 0,name,num_of_ratings
0,"""0""",26
1,"""Aesop"" no Ohanashi yori: Ushi to Kaeru, Yokub...",1
2,"""Bungaku Shoujo"" Kyou no Oyatsu: Hatsukoi",778
3,"""Bungaku Shoujo"" Memoire",805
4,"""Bungaku Shoujo"" Movie",1523


In [24]:
final_ratings = ratings_with_anime.merge(num_ratings, on='name')
final_ratings

Unnamed: 0,user_id,anime_id,rating_x,name,genre,type,episodes,rating_y,members,num_of_ratings
0,1,20,-1,Naruto,"Action, Comedy, Martial Arts, Shounen, Super P...",TV,220,7.81,683297,21541
1,3,20,8,Naruto,"Action, Comedy, Martial Arts, Shounen, Super P...",TV,220,7.81,683297,21541
2,5,20,6,Naruto,"Action, Comedy, Martial Arts, Shounen, Super P...",TV,220,7.81,683297,21541
3,6,20,-1,Naruto,"Action, Comedy, Martial Arts, Shounen, Super P...",TV,220,7.81,683297,21541
4,21,20,8,Naruto,"Action, Comedy, Martial Arts, Shounen, Super P...",TV,220,7.81,683297,21541
...,...,...,...,...,...,...,...,...,...,...
7386997,65682,30450,8,Dr. Slump: Hoyoyo! Arale no Himitsu Dai Koukai...,"Comedy, Sci-Fi, Shounen",Special,1,6.17,248,1
7386998,69497,33484,10,Shiroi Zou,"Action, Historical, Kids",Movie,1,4.71,45,1
7386999,70463,29481,-1,Kakinoki Mokkii,"Fantasy, Kids",Special,1,4.33,61,1
7387000,72404,34412,-1,Hashiri Hajimeta bakari no Kimi ni,Music,Music,1,6.76,239,1


In [25]:
final_ratings = final_ratings[final_ratings['num_of_ratings'] >= 50]
final_ratings.sample(10)

Unnamed: 0,user_id,anime_id,rating_x,name,genre,type,episodes,rating_y,members,num_of_ratings
5493220,16830,31376,7,Flying Witch,"Comedy, Magic, Shounen, Slice of Life, Superna...",TV,12,7.65,95933,1628
376449,42194,8074,8,Highschool of the Dead,"Action, Ecchi, Horror, Supernatural",TV,12,7.46,535892,20916
3802854,62208,790,7,Ergo Proxy,"Mystery, Psychological, Sci-Fi",TV,23,8.03,265005,7280
5883906,49620,8476,8,Otome Youkai Zakuro,"Demons, Historical, Military, Romance, Seinen,...",TV,13,7.6,61957,2303
6302503,11943,7588,9,Saraiya Goyou,"Historical, Samurai, Seinen",TV,12,7.93,40819,995
4989002,28433,10161,10,No.6,"Action, Sci-Fi",TV,11,7.76,168017,5864
1576406,20153,205,10,Samurai Champloo,"Action, Adventure, Comedy, Historical, Samurai...",TV,26,8.5,390076,12840
1626811,65871,5341,-1,Ookami to Koushinryou II,"Adventure, Fantasy, Historical, Romance",TV,12,8.46,210491,8279
3060055,54508,6045,7,Kimi ni Todoke,"Romance, School, Shoujo, Slice of Life",TV,25,8.19,309339,10729
3367855,52358,4192,9,Hayate no Gotoku!!,"Action, Comedy, Harem, Parody, Romance",TV,25,7.92,67323,2970


In [26]:
final_ratings.drop_duplicates(['user_id', 'name'], inplace=True)
final_ratings.shape

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_ratings.drop_duplicates(['user_id', 'name'], inplace=True)


(7323746, 10)

## Pivot table for animes ratings


In [27]:
import html
import re
# Giải mã các ký tự HTML trong cột `name`
final_ratings['name'] = final_ratings['name'].apply(html.unescape)

# Loại bỏ các ký tự đặc biệt không mong muốn
final_ratings['name'] = final_ratings['name'].apply(lambda x: re.sub(r'[^\w\s]', '', x))

# Kiểm tra dữ liệu đã làm sạch
print(final_ratings['name'].unique())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_ratings['name'] = final_ratings['name'].apply(html.unescape)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_ratings['name'] = final_ratings['name'].apply(lambda x: re.sub(r'[^\w\s]', '', x))


['Naruto' 'School Rumble' 'Shuffle' ... 'Magic Knight Rayearth Omake'
 'Taisei Kensetsu Sri Lanka Kousokudouro'
 'Doraemon Movie 21 Nobita no Taiyou Ou Densetsu']


In [28]:
animes_pivot = final_ratings.pivot_table(columns = 'user_id', index = 'name', values = 'rating_x')
animes_pivot.head()

user_id,1,3,4,5,6,7,11,13,14,17,...,73499,73500,73501,73502,73503,73504,73507,73510,73513,73515
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
009 ReCyborg,,,,,,,,,,,...,,,,,,,,,,
0091,,,,,,,,,,,...,,,,,,,,,,
07Ghost,,,,,,,,,,,...,,8.0,,,,,,,,
1000nen Joou Queen Millennia,,,,,,,,,,,...,,,,,,,,,,
1001 Nights,,,,,,,,,,,...,,,,,,,,,,


In [29]:
animes_pivot.shape

(5483, 47883)

In [30]:
animes_pivot.fillna(0, inplace = True)
animes_pivot

user_id,1,3,4,5,6,7,11,13,14,17,...,73499,73500,73501,73502,73503,73504,73507,73510,73513,73515
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
009 ReCyborg,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0091,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
07Ghost,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,8.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1000nen Joou Queen Millennia,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1001 Nights,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
xxxHOLiC,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,10.0,8.0,0.0,10.0,0.0,0.0,0.0
xxxHOLiC Kei,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,10.0,8.0,0.0,9.0,0.0,0.0,0.0
xxxHOLiC Movie Manatsu no Yoru no Yume,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,10.0,0.0,0.0,9.0,0.0,0.0,0.0
xxxHOLiC Rou,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,8.0,0.0,0.0,0.0,0.0,0.0


In [31]:
from scipy.sparse import csr_matrix
# ma trận thưa lưu giữ giá trị khác 0

animes_sparse = csr_matrix(animes_pivot) # chuyển từ dataframe sang ma trận thưa
animes_sparse

<5483x47883 sparse matrix of type '<class 'numpy.float64'>'
	with 7278494 stored elements in Compressed Sparse Row format>

## Build model

In [32]:
from sklearn.neighbors import NearestNeighbors

model_knn = NearestNeighbors(metric = 'cosine', algorithm = 'brute')
model_knn.fit(animes_sparse)

In [34]:
distance, suggestion = model_knn.kneighbors(animes_pivot.iloc[100:101].values.reshape(1,-1), n_neighbors=6)

In [35]:
distance

array([[0.        , 0.48654633, 0.54721484, 0.78022049, 0.78048756,
        0.78393401]])

In [36]:
suggestion

array([[ 100,  101,   99, 1337, 1885,  437]])

In [37]:
for i in range(len(suggestion)):
  print(animes_pivot.index[suggestion[i]])

Index(['Air Gear Kuro no Hane to Nemuri no Mori  Break on the Sky',
       'Air Gear Special', 'Air Gear', 'Freezing', 'Highschool of the Dead',
       'BenTo'],
      dtype='object', name='name')


In [38]:
animes_pivot.index[100]

'Air Gear Kuro no Hane to Nemuri no Mori  Break on the Sky'

In [39]:
animes_name = animes_pivot.index
animes_name

Index(['009 ReCyborg', '0091', '07Ghost', '1000nen Joou Queen Millennia',
       '1001 Nights', '11eyes', '11eyes Momoiro Genmutan',
       '11eyes Picture Drama', '11nin Iru', '12Paradise',
       ...
       'hackThe Movie Sekai no Mukou ni', 'hackUnison',
       'hackVersus The Thanatos Report', 'iDOLMSTER Xenoglossia', 'sCRYed',
       'xxxHOLiC', 'xxxHOLiC Kei', 'xxxHOLiC Movie Manatsu no Yoru no Yume',
       'xxxHOLiC Rou', 'xxxHOLiC Shunmuki'],
      dtype='object', name='name', length=5483)

In [41]:
import pickle
pickle.dump(model_knn,open('model_knn.pkl','wb'))
pickle.dump(animes_name,open('animes_name.pkl','wb'))
pickle.dump(final_ratings,open('final_ratings.pkl','wb'))
pickle.dump(animes_pivot,open('animes_pivot.pkl','wb'))

In [42]:
def recommend_anime(anime_name):
    anime_id = np.where(animes_pivot.index == animes_name)[0][0]
    distance, suggestion = model_knn.kneighbors(animes_pivot.iloc[anime_id,:].values.reshape(1,-1), n_neighbors=6)

    for i in range(len(suggestion)):
        anime = animes_pivot.index[suggestion[i]]
        for j in anime:
            print(j)

In [43]:
anime_name = "11eyes Picture Drama"
recommend_anime(anime_name)

009 ReCyborg
Cyborg 009 VS Devilman
Arve Rezzle Kikaijikake no Youseitachi
Mardock Scramble The First Compression
Captain Harlock
Towa no Quon 1 Utakata no Kaben
