# Importing the necessary libraries

In [2]:
# from transformers import BertTokenizer, BertModel
from transformers import DistilBertTokenizer, DistilBertModel
from sklearn.decomposition import PCA

import torch
from sklearn.metrics.pairwise import cosine_similarity
import ast
import pandas as pd
import numpy as np
import torch
import warnings
import faiss

from tqdm import tqdm

warnings.filterwarnings("ignore")

# Reading data

In [15]:


# Load the raw manga table from the Kaggle input directory.
dataset_path = "/kaggle/input/manga-information-dataset/manga_dataset.csv"
df = pd.read_csv(dataset_path)

# Per-column count of missing values (shown as the cell output).
df.isnull().sum()


id                  0
image_url           0
title               0
score               0
rank             7586
synopsis         7782
background      38017
genres              0
themes              0
demographics        0
status              0
authors             0
chapters        13289
volumes         10345
dtype: int64

In [16]:
# Keep only the columns used to build the recommendation text.
# The explicit .copy() makes `data` an independent frame: the following cells
# assign new columns to it, and doing that on a bare selection of `df` is the
# chained-assignment pattern pandas warns about (SettingWithCopyWarning).
feature_columns = ["title","score","synopsis","genres","background","themes","authors","demographics"]
data = df[feature_columns].copy()
data.head()


0                    ['Award Winning', 'Drama', 'Mystery']
1        ['Action', 'Adventure', 'Award Winning', 'Dram...
2          ['Award Winning', 'Drama', 'Mystery', 'Sci-Fi']
3             ['Award Winning', 'Sci-Fi', 'Slice of Life']
4                              ['Award Winning', 'Sports']
                               ...                        
48120                                          ['Fantasy']
48121                             ['Boys Love', 'Erotica']
48122                             ['Boys Love', 'Erotica']
48123                                ['Comedy', 'Romance']
48124                       ['Comedy', 'Fantasy', 'Ecchi']
Name: genres, Length: 48125, dtype: object

# Data Preprocessing
1. Converting stringified lists into comma-separated strings
2. Creating features for recommendation (i.e., tags)

In [17]:

def list_to_str(row):
  """Turn the text of a Python list literal into a comma-joined string.

  Example: "['Action', 'Drama']" -> "Action,Drama".
  """
  # literal_eval parses the literal without executing arbitrary code;
  # the parsed items are then glued together with bare commas.
  return ",".join(ast.literal_eval(row))
# Flatten each list-valued column into a comma-separated string, in the same
# order as before: genres, themes, authors, demographics.
for column in ("genres", "themes", "authors", "demographics"):
  data[column] = data[column].apply(list_to_str)


def create_tags(row):
  """Build the comma-separated text used as the embedding input for one row.

  Concatenates genres, themes, authors and demographics, followed by the
  synopsis when it is present (not NaN).

  Parameters
  ----------
  row : mapping-like (e.g. a DataFrame row)
      Must provide the keys "genres", "themes", "authors", "demographics"
      and "synopsis".

  Returns
  -------
  str
      The non-empty fields joined with ",".
  """
  parts = [row["genres"], row["themes"], row["authors"], row["demographics"]]
  if pd.notna(row["synopsis"]):
    parts.append(row["synopsis"])
  # Skip empty fields (e.g. a title with no themes) so the text does not
  # contain runs like ",," or a trailing "," that add only noise tokens.
  return ",".join(part for part in parts if part)

# Row-wise application: each row is handed to create_tags.
data["tags"] = data.apply(create_tags, axis="columns")




# Generating word embeddings using the DistilBERT model

In [18]:
# Run on the GPU when torch reports one, otherwise on the CPU.
use_gpu = torch.cuda.is_available()
device = torch.device('cuda' if use_gpu else 'cpu')

# Tokenizer and encoder are loaded from the same pretrained checkpoint.
checkpoint = 'distilbert-base-uncased'
tokenizer = DistilBertTokenizer.from_pretrained(checkpoint)
model = DistilBertModel.from_pretrained(checkpoint)
model = model.to(device)
def get_bert_embedding(text):
  """Embed `text` with the module-level DistilBERT model.

  Returns the hidden state at token position 0 (the slot the original code
  calls the CLS embedding) as a flattened NumPy array.
  """
  # Tokenise with truncation at max_length=100 and move every tensor to the
  # device the model lives on.
  encoded = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=100)
  encoded = {name: tensor.to(device) for name, tensor in encoded.items()}

  # Forward pass only; gradient tracking is switched off.
  with torch.no_grad():
    hidden_states = model(**encoded).last_hidden_state

  # Take position 0 of each sequence, bring it back to the CPU and flatten.
  return hidden_states[:, 0, :].cpu().numpy().flatten()
# Register `progress_apply` on pandas objects, then embed every tag string
# while showing a progress bar.
tqdm.pandas()
tag_embeddings = data["tags"].progress_apply(get_bert_embedding)
data["embedding_distillBert"] = tag_embeddings


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

100%|██████████| 48125/48125 [06:30<00:00, 123.09it/s]


In [None]:
def convert_to_list(row):
    """Return one embedding as a plain list of Python floats.

    Accepts either form the embedding column can hold:

    * a numeric array-like (the ndarray produced by ``get_bert_embedding``
      in this notebook), which is flattened and converted directly;
    * the text form of such an array or list, e.g. ``"[0.1 -0.2\n 0.3]"`` or
      ``"[0.1, -0.2, 0.3]"``, as found after a round trip through CSV.

    Previously only the string form was handled, so running the notebook
    top to bottom failed with AttributeError (ndarray has no ``replace``).
    """
    if not isinstance(row, str):
        # Already numeric: no text parsing needed.
        return [float(x) for x in np.asarray(row).ravel()]
    # Treat commas and any whitespace (including newlines) as separators.
    tokens = row.strip().strip('[]').replace(",", " ").split()
    return [float(x) for x in tokens]
# Coerce every stored embedding to a list of floats, write it back, and stack
# the rows into one float32 matrix.
parsed_embeddings = data["embedding_distillBert"].apply(convert_to_list)
data["embedding_distillBert"] = parsed_embeddings
embeddings = np.array(parsed_embeddings.tolist(), dtype="float32")
embeddings.shape

# Reducing the dimension of the embeddings using PCA

In [None]:
# Project the embeddings onto 530 principal components.
# random_state pins the result: depending on the scikit-learn version and the
# input size, PCA may choose a randomized SVD solver, and without a seed the
# projection (and everything saved from it) would differ between runs.
pca = PCA(n_components=530, random_state=42)
embeddings_pc = pca.fit_transform(embeddings)

# FAISS Search with Cosine Similarity

In [None]:
# Cosine similarity equals the inner product of unit-length vectors, so the
# rows are L2-normalised and stored in an inner-product index.
# normalize_L2 modifies the array in place (its result is not captured here),
# so hand it a C-contiguous float32 buffer; the PCA output is not guaranteed
# to have that layout/dtype. This is a no-op when it already does.
embeddings_pc = np.ascontiguousarray(embeddings_pc, dtype="float32")
faiss.normalize_L2(embeddings_pc)
index = faiss.IndexFlatIP(embeddings_pc.shape[1])
index.add(embeddings_pc)
print(f"Number of embeddings added to index : {index.ntotal}")


# Saving the embeddings and the faiss index

In [None]:
# Persist the (normalised) reduced embeddings and the FAISS index to the
# current working directory.
embeddings_path = "embeddings_pca.npy"
index_path = "faiss_index.bin"
np.save(embeddings_path, embeddings_pc)
faiss.write_index(index, index_path)
