This notebook builds a content-based anime recommender from word2vec embeddings of the synopses, following this tutorial: https://www.kdnuggets.com/2020/08/content-based-recommendation-system-word-embeddings.html

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Read both source tables: `anime_data` is the table later joined onto the
# recommendations for display, `animes` carries the synopsis text.
anime_data, animes = (
    pd.read_csv(csv_path)
    for csv_path in ('data/anime.csv', 'data/anime_with_synopsis.csv')
)

animes

Unnamed: 0,MAL_ID,Name,Score,Genres,Synopsis
0,1,Cowboy Bebop,8.78,"Action, Adventure, Comedy, Drama, Sci-Fi, Space","In the year 2071, humanity has colonized sever..."
1,5,Cowboy Bebop: Tengoku no Tobira,8.39,"Action, Drama, Mystery, Sci-Fi, Space","other day, another bounty—such is the life of ..."
2,6,Trigun,8.24,"Action, Sci-Fi, Adventure, Comedy, Drama, Shounen","Vash the Stampede is the man with a $$60,000,0..."
3,7,Witch Hunter Robin,7.27,"Action, Mystery, Police, Supernatural, Drama, ...",ches are individuals with special powers like ...
4,8,Bouken Ou Beet,6.98,"Adventure, Fantasy, Shounen, Supernatural",It is the dark century and the people are suff...
...,...,...,...,...,...
16209,48481,Daomu Biji Zhi Qinling Shen Shu,Unknown,"Adventure, Mystery, Supernatural",No synopsis information has been added to this...
16210,48483,Mieruko-chan,Unknown,"Comedy, Horror, Supernatural",ko is a typical high school student whose life...
16211,48488,Higurashi no Naku Koro ni Sotsu,Unknown,"Mystery, Dementia, Horror, Psychological, Supe...",Sequel to Higurashi no Naku Koro ni Gou .
16212,48491,Yama no Susume: Next Summit,Unknown,"Adventure, Slice of Life, Comedy",New Yama no Susume anime.


In [3]:
# Remove anime where the synopsis is NaN, then renumber the rows.
# dropna() leaves gaps in the index labels; resetting to 0..n-1 keeps every label
# equal to its row position, which matters later when an index label is used to
# pick a row out of the (purely positional) similarity matrix.

animes.dropna(subset=['Synopsis'], inplace=True)
animes.reset_index(drop=True, inplace=True)

In [4]:
# Lowercase every synopsis so the later stopword filtering and vocabulary lookups
# do not depend on capitalisation.
animes['Synopsis'] = animes['Synopsis'].str.lower()

In [5]:
# Turn each synopsis string into a list of word tokens

from nltk.tokenize import word_tokenize
animes['Synopsis'] = animes['Synopsis'].map(word_tokenize)

In [6]:
# Remove stopwords
from nltk.corpus import stopwords

# Dataset-specific words to drop, in addition to NLTK's English stopword list
custom_stopwords = ['s', 'source', 'ann', 'episode', 'series', 'anime', 'synopsis']
custom_stopwords.extend(stopwords.words('english'))

# A list membership test scans the whole list for every token; a frozenset makes
# each lookup constant-time. `custom_stopwords` itself stays a list.
stopword_set = frozenset(custom_stopwords)

animes['Synopsis'] = animes['Synopsis'].apply(lambda x: [word for word in x if word not in stopword_set])

In [7]:
# Remove punctuation

import string
punc = string.punctuation.replace('.', '') # keep the periods as it will be used to split into sentences

# Build the deletion table once instead of once per word
punc_table = str.maketrans('', '', punc)


def _strip_punctuation(tokens):
    """Delete the characters in `punc` from every token and drop tokens left empty."""
    stripped = (word.translate(punc_table) for word in tokens)
    return [word for word in stripped if word]


animes['Synopsis'] = animes['Synopsis'].apply(_strip_punctuation)

In [8]:
# Lemmatisation: replace every token by its WordNet lemma

# # If not downloaded:
# import nltk
# nltk.download('wordnet')

from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
animes['Synopsis'] = animes['Synopsis'].apply(
    lambda tokens: list(map(lemmatizer.lemmatize, tokens))
)

In [9]:
# Collapse each token list back into one space-separated string

animes['Synopsis'] = animes['Synopsis'].apply(' '.join)

In [10]:
# Build the training corpus: one token list per sentence.
# The synopses are glued together with ' . ' (so an entry that does not end in a
# period still closes its last sentence) and the combined text is cut at every period.

corpus = [
    word_tokenize(sentence)
    for sentence in animes['Synopsis'].str.cat(sep=' . ').split('.')
]

In [11]:
# Remove periods from synopses

# regex=False makes '.' a literal period. Interpreted as a regular expression, '.'
# matches any character, so regex=True deletes the whole synopsis on pandas
# versions that compile single-character patterns (pandas >= 2.0).
animes['Synopsis'] = animes['Synopsis'].str.replace('.', '', regex=False)

In [12]:
# Train the word2vec model on the sentence corpus

from gensim.models import Word2Vec
model = Word2Vec(
    corpus,
    vector_size=100,  # dimensionality of each word vector
    window=10,        # maximum distance between the current and the predicted word
    sg=1,             # 1 = skip-gram (0 would be CBOW)
    hs=0,             # no hierarchical softmax ...
    negative=5,       # ... use negative sampling with 5 noise words instead
    min_count=2,      # ignore words that occur fewer than 2 times
    workers=16,       # number of training threads
    epochs=10,        # passes over the corpus
)

In [13]:
# Generate the average word2vec vector for each anime synopsis

def vectors():
    """Build one averaged word2vec embedding per row of ``animes``.

    Each synopsis in ``animes['Synopsis']`` is split on whitespace and the vectors
    of the words present in ``model``'s vocabulary are averaged. A synopsis that
    contains no known word gets an all-zero vector instead of being skipped, so
    the result always holds exactly one entry per row of ``animes``, in row order
    (skipping would shift every later embedding onto the wrong anime).

    Returns:
        list of numpy.ndarray: the embeddings. The same list is also stored in the
        module-level ``word_embeddings`` for callers that read the global.
    """
    global word_embeddings

    vocabulary = model.wv.key_to_index
    embeddings = []

    for synopsis in animes['Synopsis']:
        known_vectors = [model.wv[word] for word in synopsis.split() if word in vocabulary]
        if known_vectors:
            embeddings.append(np.mean(known_vectors, axis=0))
        else:
            # Placeholder that keeps row alignment; a zero vector has cosine
            # similarity 0 to everything under scikit-learn's cosine_similarity.
            embeddings.append(np.zeros(model.wv.vector_size, dtype=np.float32))

    word_embeddings = embeddings
    return word_embeddings

In [14]:
# Recommend similar anime

from sklearn.metrics.pairwise import cosine_similarity

def recommendations(anime):
    """Rank every other anime by synopsis similarity to ``anime``.

    Args:
        anime: exact title, matched against ``animes['Name']``; the first match
            is used.

    Returns:
        pandas.DataFrame: the rows of ``anime_data`` (joined on ``MAL_ID``) with the
        cosine similarity in a leading ``Score`` column, sorted from most to least
        similar, excluding the queried anime itself, indexed 0..n-1.

    Raises:
        IndexError: if no row of ``animes`` has the given name.
        ValueError: if ``vectors()`` did not yield one embedding per row of
            ``animes`` (scores could then not be matched to the right anime).
    """
    # Rebuild the synopsis embeddings (stored in the global word_embeddings)
    vectors()
    embeddings = np.asarray(word_embeddings)

    if len(embeddings) != len(animes):
        raise ValueError(
            f"expected one embedding per anime, got {len(embeddings)} embeddings "
            f"for {len(animes)} rows"
        )

    # Locate the query by row *position*: the embeddings are positional, whereas
    # the index labels of `animes` may have gaps after rows were dropped.
    matches = np.flatnonzero((animes['Name'] == anime).to_numpy())
    if matches.size == 0:
        raise IndexError(f"no anime named {anime!r} in animes['Name']")
    position = matches[0]

    # Only the query's row of the similarity matrix is needed, not all N x N entries
    similarities = cosine_similarity(embeddings[position:position + 1], embeddings)[0]

    # Pair every score with its MAL_ID before joining, so the result does not
    # depend on `anime_data` having the same row order or membership as `animes`.
    scores = pd.DataFrame({'Score': similarities, 'MAL_ID': animes['MAL_ID'].to_numpy()})
    scores = scores.drop(index=position)  # remove the queried anime itself

    # display the result
    anime_information = scores.merge(anime_data, on='MAL_ID', how='inner')
    anime_information = anime_information.sort_values('Score', ascending=False)  # sort by score
    anime_information.reset_index(drop=True, inplace=True)  # reset indices

    return anime_information

In [15]:
# Ten most similar titles for a sample query
recs = recommendations('Shigatsu wa Kimi no Uso')
recs.iloc[:10]

Unnamed: 0,Score,MAL_ID,Name,Average Rating,Genres,English name,Japanese name,Type,Episodes,Premiered,Studios,Members,Favorites
0,0.95679,34177,Tenshi no 3P!,6.66,"Music, School, Slice of Life",Angel's 3Piece!,天使の3P！〈スリーピース〉,TV,12,Summer 2017,Project No.9,49863,77
1,0.953683,29511,Ongaku Shoujo,6.24,"Music, Slice of Life",Unknown,音楽少女,Movie,1,Unknown,Studio Deen,11297,8
2,0.947712,1698,Nodame Cantabile,8.32,"Music, Slice of Life, Comedy, Drama, Romance, ...",Nodame Cantabile,のだめカンタービレ,TV,23,Winter 2007,J.C.Staff,262005,4913
3,0.947228,12531,Sakamichi no Apollon,8.36,"Drama, Josei, Music, Romance, School",Kids on the Slope,坂道のアポロン,TV,12,Spring 2012,"Tezuka Productions, MAPPA",284901,5914
4,0.94497,37541,Kimi no Iru Basho,6.01,"Sci-Fi, Music",Unknown,君のいる場所,Music,1,Unknown,Unknown,339,0
5,0.940135,30913,Mekakucity Days,7.51,"Music, Psychological, Sci-Fi",Mekakucity Days,メカクシティデイズ,Music,5,Unknown,Unknown,7951,73
6,0.939203,13759,Sakura-sou no Pet na Kanojo,8.19,"Slice of Life, Comedy, Drama, Romance, School",The Pet Girl of Sakurasou,さくら荘のペットな彼女,TV,24,Fall 2012,J.C.Staff,920871,25403
7,0.939028,34972,Hakubo,6.37,Drama,Twilight,薄暮,Movie,1,Unknown,Twilight Studio,16631,18
8,0.93881,36793,3D Kanojo: Real Girl,6.78,"Romance, School, Shoujo",Real Girl,３Ｄ彼女　リアルガール,TV,12,Spring 2018,Hoods Entertainment,210853,1034
9,0.938121,25801,Shouwa Genroku Rakugo Shinjuu: Yotarou Hourou-hen,7.6,"Drama, Josei",Unknown,昭和元禄落語心中 与太郎放浪篇,OVA,2,Unknown,Studio Deen,8788,12


In [16]:
# recs.to_csv('content-based-recs.csv',index=False)