### Dựa vào bảng embedding_anime đã tạo sẵn sau bước phân tích và xử lý riêng biệt trước đó, ta phân tích phim người dùng thích dựa vào dữ liệu đó
    - Dữ liệu bao gồm anime_id, name và description -> ta cần ánh xạ đoạn description này thành embedding
    - Sau đó tính cosine similarity với những phim mà người dùng thích trong tập train và trả về tập dữ đoán

In [1]:
import nltk
from nltk.stem import PorterStemmer
import pandas as pd
import numpy as np
import metrics_eval

from functools import *
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

pd.set_option('display.max_colwidth', None)

In [2]:
embedding_anime = pd.read_csv("Dataset/embedding_anime.csv", index_col=0)

In [3]:
embedding_anime.head(10)

Unnamed: 0,anime_id,name,description
0,32281,Kimi no Na wa.,"Movie, Top-rated, Viral, Short Anime, Drama,Romance,School,Supernatural"
1,5114,Fullmetal Alchemist: Brotherhood,"Television, Top-rated, Viral, Long-running Anime, Action,Adventure,Drama,Fantasy,Magic,Military,Shounen"
2,28977,Gintama°,"Television, Top-rated, Viral, Long-running Anime, Action,Comedy,Historical,Parody,Samurai,Sci-Fi,Shounen"
3,9253,Steins;Gate,"Television, Top-rated, Viral, Long-running Anime, Sci-Fi,Thriller"
4,9969,Gintama&#039;,"Television, Top-rated, Viral, Long-running Anime, Action,Comedy,Historical,Parody,Samurai,Sci-Fi,Shounen"
5,32935,Haikyuu!!: Karasuno Koukou VS Shiratorizawa Gakuen Koukou,"Television, Top-rated, Viral, Long-running Series, Comedy,Drama,School,Shounen,Sports"
6,11061,Hunter x Hunter (2011),"Television, Top-rated, Viral, Long-running Series, Action,Adventure,Shounen,Super Power"
7,820,Ginga Eiyuu Densetsu,"Original Video Animation, Top-rated, Viral, Long-running Series, Drama,Military,Sci-Fi,Space"
8,15335,Gintama Movie: Kanketsu-hen - Yorozuya yo Eien Nare,"Movie, Top-rated, Viral, Short Anime, Action,Comedy,Historical,Parody,Samurai,Sci-Fi,Shounen"
9,15417,Gintama&#039;: Enchousen,"Television, Top-rated, Viral, Long-running Anime, Action,Comedy,Historical,Parody,Samurai,Sci-Fi,Shounen"


In [4]:
embedding_anime.shape

(12017, 3)

In [5]:
ps = PorterStemmer()

In [6]:
def stems(text: str):
    l = list(map(ps.stem, text.split()))
    
    return " ".join(l)

In [7]:
embedding_anime['tags'] = embedding_anime['description'].apply(stems)

In [8]:
embedding_anime.head(5)

Unnamed: 0,anime_id,name,description,tags
0,32281,Kimi no Na wa.,"Movie, Top-rated, Viral, Short Anime, Drama,Romance,School,Supernatural","movie, top-rated, viral, short anime, drama,romance,school,supernatur"
1,5114,Fullmetal Alchemist: Brotherhood,"Television, Top-rated, Viral, Long-running Anime, Action,Adventure,Drama,Fantasy,Magic,Military,Shounen","television, top-rated, viral, long-run anime, action,adventure,drama,fantasy,magic,military,shounen"
2,28977,Gintama°,"Television, Top-rated, Viral, Long-running Anime, Action,Comedy,Historical,Parody,Samurai,Sci-Fi,Shounen","television, top-rated, viral, long-run anime, action,comedy,historical,parody,samurai,sci-fi,shounen"
3,9253,Steins;Gate,"Television, Top-rated, Viral, Long-running Anime, Sci-Fi,Thriller","television, top-rated, viral, long-run anime, sci-fi,thril"
4,9969,Gintama&#039;,"Television, Top-rated, Viral, Long-running Anime, Action,Comedy,Historical,Parody,Samurai,Sci-Fi,Shounen","television, top-rated, viral, long-run anime, action,comedy,historical,parody,samurai,sci-fi,shounen"


In [9]:
cv = CountVectorizer(max_features=1000, stop_words='english')

vector = cv.fit_transform(embedding_anime['tags']).toarray()


In [10]:
vector.shape

(12017, 93)

In [11]:
vector = vector.astype(np.float32)
row_norm = np.linalg.norm(vector, axis=1, keepdims=True) + 1e-12
vector = vector / row_norm

item_ids = embedding_anime['anime_id'].to_numpy()
row_of = {int(a): i for i, a in enumerate(item_ids)}
n_items, dim = vector.shape

topk_neighbors = 200  


import faiss
index = faiss.IndexFlatIP(dim)
index.add(vector.astype('float32'))
sim_all, idx_all = index.search(vector.astype('float32'), topk_neighbors + 1)

# loại self ở cột 0
nbr_idx = idx_all[:, 1:].astype(np.int32)
nbr_sim = sim_all[:, 1:].astype(np.float32)


In [12]:
nbr_idx

array([[  208,    60,  1111, ...,   944,   911,   881],
       [  200,   290,   268, ...,   119,    43,    16],
       [    4,     2,    12, ...,  2539,  2419,  2355],
       ...,
       [11612, 11996, 11932, ...,  6747,  6690, 11954],
       [12013, 12012, 12011, ..., 10209, 10205, 10160],
       [11989, 10052, 10042, ...,  6956,  6930,  6895]],
      shape=(12017, 200), dtype=int32)

- Mỗi phim chỉ còn top 200 phim gần nhất thôi

- similarity là sự tương đồng giữa từng phim đôi một với nhau

In [13]:
def similar_items(anime_id: int, topk: int = 20):
    i = row_of.get(int(anime_id), None)
    if i is None:
        return []
    
    idxs = nbr_idx[i, :topk].tolist()
    return [int(item_ids[idx])for idx in idxs]

In [14]:
anime_recommend = list(map(lambda x: embedding_anime[embedding_anime['anime_id'] == x]['name'].iloc[0], similar_items(2076, 20)))

In [15]:
for anime in anime_recommend:
    print(anime)

Takamiya Nasuno Desu!: Teekyuu Spin-off
Kindaichi Shounen no Jikenbo Returns
Kindaichi Shounen no Jikenbo Returns 2nd Season
Ring ni Kakero 1
Teekyuu 8
Teekyuu 6
Teekyuu 7
Ring ni Kakero 1: Nichibei Kessen-hen
Teekyuu 5
Haunted Junction
Ring ni Kakero 1: Kage Dou-hen
Tonkatsu DJ Agetarou
Mini Hama: Minimum Hamatora
Noramimi
Kara The Animation
Kaitou Tenshi Twin Angel: Kyun Kyun☆Tokimeki Paradise!!
Lemon Angel (1988/II)
Saa Ikou! Tamagotchi
Juusou Kikou Dancougar Nova
Kakko Kawaii Sengen! 2


- Top 20 Similar Anime của Anime có id là 2076

In [16]:
train_rating = pd.read_csv('Dataset/train_rating.csv', index_col=0)
test_rating = pd.read_csv('Dataset/test_rating.csv', index_col=0)

In [17]:

from collections import defaultdict

liked_threshold = 8          
limit_liked_per_user = 30    
rank_decay = 0.9             
k_recommend = 20             

df_likes = (train_rating[
        (train_rating['rating'] >= liked_threshold) | (train_rating['rating'] == -1)
    ]
            .sort_values(['user_id', 'rating'], ascending=[True, False]))
user_likes_map = df_likes.groupby('user_id')['anime_id'].apply(list).to_dict()

item_popularity = train_rating['anime_id'].value_counts()
cold_start_top = [int(a) for a in item_popularity.index if int(a) in row_of][:k_recommend]

def recommend_for_user(user_id: int,
                            k: int = k_recommend,
                            rank_decay_val: float = rank_decay,
                            l_cap: int = limit_liked_per_user):
    liked_ids = user_likes_map.get(int(user_id), [])
    if not liked_ids:
        return cold_start_top[:k]

    # cắt L theo rating cao nhất trước
    if l_cap is not None and len(liked_ids) > l_cap:
        liked_ids = liked_ids[:l_cap]

    liked_rows = np.fromiter((row_of[a] for a in liked_ids if a in row_of), dtype=np.int32, count=-1)
    if liked_rows.size == 0:
        return cold_start_top[:k]

    cand_idx = nbr_idx[liked_rows]    # (L, K)
    cand_sim = nbr_sim[liked_rows]    # (L, K)

    if rank_decay_val != 1.0:
        ranks = np.arange(cand_idx.shape[1], dtype=np.float32)  # 0..K-1
        cand_sim = cand_sim * (rank_decay_val ** ranks)[None, :]

    # cộng dồn vector hoá
    scores = np.zeros(n_items, dtype=np.float32)
    np.add.at(scores, cand_idx.ravel(), cand_sim.ravel())

    # loại item đã xem
    scores[liked_rows] = -np.inf

    # top-k bằng argpartition
    topk_idx = np.argpartition(-scores, k)[:k]
    topk_idx = topk_idx[np.argsort(-scores[topk_idx])]
    return [int(item_ids[i]) for i in topk_idx]


In [18]:


from joblib import Parallel, delayed

all_users = np.array(list(user_likes_map.keys()), dtype=np.int64)

def _one(u):
    return int(u), recommend_for_user(int(u), k=k_recommend)

predicted_list = Parallel(n_jobs=-1, backend="loky", batch_size=256)(
    delayed(_one)(u) for u in all_users
)
predicted = dict(predicted_list)





In [19]:
ground_truth = (
    test_rating
    .groupby('user_id')['anime_id']
    .apply(set)
    .to_dict()
)

In [20]:
result = metrics_eval.evaluate_all(predicted, ground_truth, 15)
print(result)

{'Precision@15': 0.05221795451122157, 'Recall@15': 0.0716569991881475, 'MAP@15': 0.03175242839238951}


- Đã chạy lại (6)