In [1]:
import numpy as np

import pandas as pd

from tqdm import tqdm

import spacy

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
tqdm.pandas()

In [3]:
df = pd.read_csv('archive/anime_with_synopsis.csv', index_col='MAL_ID', dtype=str)

In [4]:
nlp = spacy.load('en_core_web_sm')

In [5]:
sypnopsis = df.sypnopsis.dropna()

In [6]:
docs = sypnopsis.progress_apply(nlp)

100%|██████████| 16206/16206 [02:21<00:00, 114.79it/s]


In [7]:
stopwords = spacy.lang.en.stop_words.STOP_WORDS

def get_tokens(doc):
    return [
        token.text
        for token in doc
    ]
    
def get_lemmas(doc):
    return ' '.join([
        token.lemma_
        for token in doc
        if token.lemma_ not in stopwords
        and token.lemma_.isalpha()
    ])

In [8]:
lemmas = docs.apply(get_lemmas)

In [9]:
tfidf = TfidfVectorizer(min_df=100, max_df=0.9, ngram_range=(2, 10))

In [10]:
vec_data = tfidf.fit_transform(lemmas)

In [11]:
cos_sim = cosine_similarity(vec_data)
cos_sim = pd.DataFrame(cos_sim, index=lemmas.index, columns=lemmas.index)

In [12]:
cos_sim.index

Int64Index([    1,     5,     6,     7,     8,    15,    16,    17,    18,
               19,
            ...
            48456, 48466, 48470, 48471, 48480, 48481, 48483, 48488, 48491,
            48492],
           dtype='int64', name='MAL_ID', length=16206)

In [13]:
sizes = {
    idx: row[(row > np.percentile(row, 99.9))&(row < 1)].size
    for idx, row in cos_sim.iterrows()
}

In [14]:
size = np.median([*sizes.values()]).astype(int)

In [15]:
size

0

In [16]:
similarities = {
    idx: row[~np.isclose(row, 1)].nlargest(size)
    for idx, row in cos_sim.iterrows()
}

In [17]:
anime_id = pd.DataFrame({
    key: value.index.values
    for key, value in similarities.items()
}).T

anime_weight = pd.DataFrame({
    key: value.values / value.values.sum()
    for key, value in similarities.items()
}).T

In [18]:
weights = pd.concat((anime_id, anime_weight), keys=('MAL_ID', 'WEIGHT'), axis=1)

In [19]:
weights.to_csv('models/weights.csv')