In [15]:
import numpy as np

import pandas as pd

from tqdm import tqdm

import spacy

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

In [16]:
tqdm.pandas()

In [8]:
df = pd.read_csv('raw/anime_with_synopsis.csv', index_col='MAL_ID', dtype=str)

In [9]:
nlp = spacy.load('en_core_web_sm')

In [14]:
sypnopsis = df.sypnopsis.dropna()

In [17]:
docs = sypnopsis.progress_apply(nlp)

100%|██████████| 16206/16206 [02:19<00:00, 116.35it/s]


In [18]:
stopwords = spacy.lang.en.stop_words.STOP_WORDS

def get_tokens(doc):
    return [
        token.text
        for token in doc
    ]
    
def get_lemmas(doc):
    return ' '.join([
        token.lemma_
        for token in doc
        if token.lemma_ not in stopwords
        and token.lemma_.isalpha()
    ])

In [19]:
lemmas = docs.apply(get_lemmas)

In [48]:
tfidf = TfidfVectorizer(min_df=100, max_df=0.9)

In [49]:
vec_data = tfidf.fit_transform(lemmas)

In [50]:
cos_sim = cosine_similarity(vec_data)
cos_sim = pd.DataFrame(cos_sim, index=lemmas.index, columns=lemmas.index)

In [51]:
cos_sim.index

Int64Index([    1,     5,     6,     7,     8,    15,    16,    17,    18,
               19,
            ...
            48456, 48466, 48470, 48471, 48480, 48481, 48483, 48488, 48491,
            48492],
           dtype='int64', name='MAL_ID', length=16206)

In [59]:
sizes = {
    idx: row[(row > np.percentile(row, 99.9))&(row < 1)].size
    for idx, row in cos_sim.iterrows()
}

In [60]:
size = np.median([*sizes.values()]).astype(int)

In [61]:
size

16

In [62]:
similarities = {
    idx: row[~np.isclose(row, 1)].nlargest(size)
    for idx, row in cos_sim.iterrows()
}

In [63]:
anime_id = pd.DataFrame({
    key: value.index.values
    for key, value in similarities.items()
}).T

anime_weight = pd.DataFrame({
    key: value.values / value.values.sum()
    for key, value in similarities.items()
}).T

  key: value.values / value.values.sum()


In [64]:
weights = pd.concat((anime_id, anime_weight), keys=('MAL_ID', 'WEIGHT'), axis=1)

In [65]:
weights.to_csv('models/weights.csv')