# Find similar anime using learned embeddings
---

In [1]:
!pip install -q fastai==2.2.7

[K     |████████████████████████████████| 194kB 20.8MB/s 
[K     |████████████████████████████████| 776.8MB 20kB/s 
[K     |████████████████████████████████| 12.8MB 12.0MB/s 
[K     |████████████████████████████████| 61kB 8.5MB/s 
[31mERROR: torchtext 0.9.1 has requirement torch==1.8.1, but you'll have torch 1.7.1 which is incompatible.[0m
[?25h

In [None]:
!wget https://github.com/opalchonlapat/anime-recommend/raw/master/anime.zip # Download dataset
!unzip anime.zip -d anime-recommedations-datbase
!wget https://github.com/opalchonlapat/anime-recommend/raw/master/models/model.pth # Download model

In [5]:
from pathlib import Path

from fastai.collab import *
from fastai.tabular.all import *
from fastai import *
from ipywidgets import interact, interactive, fixed

import numpy as np
import pandas as pd
import torch
import ipywidgets as widgets

In [6]:
# Read the raw rating events and the anime metadata table.
rating_df = pd.read_csv("anime-recommedations-datbase/rating.csv")
anime_df = pd.read_csv("anime-recommedations-datbase/anime.csv")

# Keep only rows whose rating is non-negative, then renumber the rows from zero.
rating_df = rating_df.loc[lambda df: df['rating'] >= 0].reset_index(drop=True)

# Count how many anime each user rated.
cnt_df = (
    rating_df.groupby('user_id')['anime_id']
    .count()
    .rename('num_rate_anime')
    .reset_index()
)
# Restrict the ratings to users with at least 10 rated anime.
user_id_min = cnt_df.loc[cnt_df['num_rate_anime'].ge(10), 'user_id'].to_numpy()
rating_df_min = rating_df.loc[rating_df['user_id'].isin(user_id_min)]
rating_df_min

Unnamed: 0,user_id,anime_id,rating
5,3,20,8
6,3,154,6
7,3,170,9
8,3,199,10
9,3,225,9
...,...,...,...
6337234,73515,13659,8
6337235,73515,14345,7
6337236,73515,16512,7
6337237,73515,17187,9


In [7]:
# Per-user hold-out: sample 20% of every user's ratings with a fixed seed.
# NOTE(review): the embedding tables loaded later are sized from the training data, so this
# split presumably has to stay exactly as it was when the saved model was trained — confirm
# before changing the sampling.
rating_grouped_df = rating_df_min.groupby(['user_id'])
valid_df = rating_grouped_df.apply(lambda x: x.sample(frac=.2, random_state=1))
# Drop the outer index level added by groupby/apply so the index can be compared with
# rating_df_min's index on the next line.
valid_df.index = valid_df.index.droplevel(level=0)
# Every row that was not sampled above stays in the training set.
train_df = rating_df_min[~rating_df_min.index.isin(valid_df.index)]

# Renumber the held-out rows, then sample half of each user's held-out ratings as the test set;
# the rows that were not sampled remain the validation set.
valid_df = valid_df.reset_index(drop=True)
valid_grouped_df = valid_df.groupby(['user_id'])
test_df = valid_grouped_df.apply(lambda x: x.sample(frac=.5, random_state=1))
test_df.index = test_df.index.droplevel(level=0)
valid_df = valid_df[~valid_df.index.isin(test_df.index)]

In [9]:
# Number of distinct users, anime and rating values in the training split. The three columns
# are selected explicitly so the unpacking does not depend on how many columns train_df has;
# without this, re-running the cell fails once `is_valid` has been added below.
n_users, n_animes, n_ratings = train_df[['user_id', 'anime_id', 'rating']].nunique().tolist()

# Flag each row with its split: 0 = training, 1 = validation. `assign` returns a new frame,
# so the frames produced by the split cell are not modified in place.
train_df = train_df.assign(is_valid=0)
valid_df = valid_df.assign(is_valid=1)

# Stack both splits into the single table that the data loaders are built from.
train_valid_df = pd.concat([train_df, valid_df], axis=0)
train_valid_df

Unnamed: 0,user_id,anime_id,rating,is_valid
5,3,20,8,0
6,3,154,6,0
8,3,199,10,0
9,3,225,9,0
10,3,341,6,0
...,...,...,...,...
1255382,73515,2889,8,1
1255384,73515,935,9,1
1255393,73515,3653,8,1
1255394,73515,1361,6,1


In [10]:
# Column roles in the ratings table.
user_name, item_name, rating_name = 'user_id', 'anime_id', 'rating'
cat_names = [user_name, item_name]

# Split the rows into training / validation index lists based on the `is_valid` column.
splits = ColSplitter(col='is_valid')(train_valid_df)

# Categorify encodes the user and anime ids; the rating column is the target.
to = TabularCollab(
    train_valid_df,
    procs=[Categorify],
    cat_names=cat_names,
    y_names=[rating_name],
    y_block=TransformBlock(),
    splits=splits,
)
dls = to.dataloaders(bs=256)

In [12]:
# Requested embedding width for each categorical column.
emb_szs = {'user_id': 50, 'anime_id': 50}
szs = get_emb_sz(dls.train_ds, emb_szs)
print(f"Embedding size (user, item): {szs}, {emb_szs}")

learn = collab_learner(
    dls,                  # DataLoaders used to train the model
    y_range=(0.5, 10.5),  # range of the predicted rating score
    use_nn=True,          # use the neural-network model
    emb_szs=emb_szs,      # embedding sizes defined above
    metrics=[mse, mae],   # metrics used for evaluation
)
# Switch the learner to mixed-precision (fp16) mode.
learn = learn.to_fp16()

Embedding size (user, item): [(55119, 50), (9726, 50)], {'user_id': 50, 'anime_id': 50}




In [13]:
# Load the pretrained weights. model.pth is downloaded into the working directory, so the
# path is built from the current directory instead of hard-coding Colab's /content folder
# (on Colab both resolve to the same file). The ".pth" extension is appended by fastai.
learn.load(Path.cwd() / 'model')

  elif with_opt: warn("Saved filed doesn't contain an optimizer state.")


<fastai.learner.Learner at 0x7fdd7a83c7d0>

In [14]:
# Display the network layout (embedding tables followed by the linear layers).
learn.model

EmbeddingNN(
  (embeds): ModuleList(
    (0): Embedding(55119, 50)
    (1): Embedding(9726, 50)
  )
  (emb_drop): Dropout(p=0.0, inplace=False)
  (bn_cont): BatchNorm1d(0, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (layers): Sequential(
    (0): LinBnDrop(
      (0): BatchNorm1d(100, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (1): Linear(in_features=100, out_features=50, bias=False)
      (2): ReLU(inplace=True)
    )
    (1): LinBnDrop(
      (0): Linear(in_features=50, out_features=1, bias=True)
    )
    (2): SigmoidRange(low=0.5, high=10.5)
  )
)

In [15]:
# Weight matrices of the two embedding layers: users first, anime second.
user_embeds, item_embeds = (emb.weight for emb in learn.model.embeds)
# Category vocabularies of the training set for the same two columns.
user_ids, item_ids = (learn.dls.train_ds.classes[col] for col in ('user_id', 'anime_id'))

In [16]:
def find_similar_anime(anime_name):
    """Return the five most and five least similar anime to ``anime_name``.

    Similarity is the cosine similarity between rows of the module-level
    ``item_embeds`` matrix; ``item_ids`` maps anime ids to row positions and
    ``anime_df`` supplies the metadata.

    Args:
        anime_name: Value to look up in the ``name`` column of ``anime_df``.

    Returns:
        DataFrame holding the matching ``anime_df`` rows (five most similar
        first, then the five least similar) plus a ``similar`` column with the
        cosine similarity. Selected entries without a row in ``anime_df`` are
        skipped.

    Raises:
        IndexError: if ``anime_name`` does not occur in ``anime_df``.
        KeyError: if the anime has no entry in the embedding vocabulary.
    """
    print(f"Anime name: {anime_name}")

    matches = anime_df.loc[anime_df['name'] == anime_name, 'anime_id']
    if matches.empty:
        raise IndexError(f"Anime {anime_name!r} not found in anime_df")
    ani_id = matches.iat[0]

    # .get() instead of [...]: o2i may be a defaultdict that maps unknown ids to position 0,
    # which would silently compare against the wrong embedding row.
    arr_idx = item_ids.o2i.get(ani_id)
    if arr_idx is None:
        raise KeyError(f"Anime id {ani_id} has no entry in the embedding vocabulary")

    # Only a ranking is needed, so skip autograd bookkeeping.
    with torch.no_grad():
        similar = nn.CosineSimilarity(dim=1)(item_embeds, item_embeds[arr_idx][None])
    sim_arr_idx = similar.argsort(descending=True)
    # Remove the query itself explicitly instead of assuming it is ranked first.
    sim_arr_idx = sim_arr_idx[sim_arr_idx != arr_idx]
    most_sim_idx = sim_arr_idx[:5]
    least_sim_idx = sim_arr_idx[-5:]

    picked_idx = torch.cat((most_sim_idx, least_sim_idx))
    picked_scores = similar[picked_idx].detach().cpu().numpy()
    # Look the ids up one at a time so a non-numeric vocabulary entry cannot change the
    # type of the other ids.
    picked_ids = [item_ids[pos] for pos in picked_idx.tolist()]

    # Attach each score to its own metadata rows; an id without metadata is skipped, so the
    # scores can never get out of step with the rows.
    frames = []
    for picked_id, score in zip(picked_ids, picked_scores):
        rows = anime_df[anime_df['anime_id'] == picked_id]
        if not rows.empty:
            frames.append(rows.assign(similar=score))

    if not frames:
        return anime_df.iloc[:0].assign(similar=np.array([], dtype=picked_scores.dtype))
    # Single concat instead of growing a DataFrame row by row with DataFrame.append.
    return pd.concat(frames)

In [17]:
# Number of ratings each anime received.
anime_cnt = rating_df.groupby(['anime_id'])['user_id'].count().reset_index(name='cnt')
# Ids of the 50 most-rated anime. head() selects by position; a label slice such as
# .loc[:50] would instead run up to wherever index label 50 lands after sorting, which is
# not the top 50.
anime_id_cnt = anime_cnt.sort_values(['cnt'], ascending=False).head(50)['anime_id'].values

In [18]:
# Dropdown choices: distinct names of the anime whose ids are in anime_id_cnt.
names_arr = anime_df[anime_df['anime_id'].isin(anime_id_cnt)]['name'].unique()
# Build a dropdown that calls find_similar_anime with the selected name.
interact(find_similar_anime, anime_name=names_arr)

interactive(children=(Dropdown(description='anime_name', options=('Fullmetal Alchemist: Brotherhood', 'Steins;…

<function __main__.find_similar_anime>