In [346]:
import pandas as pd
from jikanpy import Jikan
import time
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.cluster import AgglomerativeClustering
from scipy.cluster.hierarchy import dendrogram
from matplotlib import pyplot as plt
import numpy as np
import os
import yaml
import ast
import difflib

# Shared Jikan API client used by the search and top-list helpers below.
jikan = Jikan()

# Columns kept from every result frame ('type' is only used for filtering).
columns = [
    'type',
    'mal_id',
    'title_english',
    'popularity',
    'rating',
    'genres',
    'studios',
    'themes',
    'demographics',
    'synopsis',
]

# CSV cache of the raw top-anime listing.
file = './top_anime.csv'

# Silence pandas' chained-assignment warning (pandas' default is 'warn').
pd.set_option('mode.chained_assignment', None)

In [347]:
def cell_splitter(temp_df):
    """Replace each cell's list of ``{'name': ...}`` records with the names.

    Every cell is expected to hold a list of mappings with a ``'name'`` key,
    or a string that parses (as YAML) to such a list.  The cell is overwritten
    in place with the list of names; when the list has exactly one record the
    bare name is stored instead of a one-element list.

    Parameters
    ----------
    temp_df : pandas.Series
        Column to convert.  It is modified in place.

    Returns
    -------
    pandas.Series
        The same object that was passed in.
    """
    for position, cell in enumerate(temp_df):
        # NOTE(review): yaml.Loader can construct arbitrary Python objects;
        # only feed it trusted text.
        records = yaml.load(cell, Loader=yaml.Loader) if isinstance(cell, str) else cell
        names = [record['name'] for record in records]
        # NOTE(review): single-record cells become a scalar, so the column ends
        # up mixing scalars and lists.
        temp_df.iloc[position] = names[0] if len(records) == 1 else names

    return temp_df

In [348]:
def nlp(text, num_of_words=10):
    """Return the highest-weighted words of a single document.

    The document is vectorised with ``CountVectorizer`` (English stop words
    removed) and a one-topic ``LatentDirichletAllocation`` model is fitted on
    it.  The words with the largest weights in that topic are returned.

    NOTE: a later cell of this notebook defines ``nlp`` again; whichever cell
    was executed last is the definition in effect.

    Parameters
    ----------
    text : pandas.Series or str
        Either a Series whose first element is the document (only
        ``text.values[0]`` is read) or the document itself as a string.
    num_of_words : int, default 10
        Maximum number of words to return.  Values <= 0 yield an empty list.

    Returns
    -------
    list
        Up to ``num_of_words`` vocabulary entries, ordered by ascending topic
        weight (the strongest word is last).

    Raises
    ------
    ValueError
        Propagated from ``CountVectorizer`` when the document contains no
        usable words.
    """
    # A plain ``[-0:]`` slice would return the whole vocabulary, so handle
    # non-positive requests up front.
    if num_of_words <= 0:
        return []

    document = text if isinstance(text, str) else text.values[0]

    cv = CountVectorizer(stop_words='english')
    dtm = cv.fit_transform([document])

    # Hyper-parameters kept exactly as originally chosen (the original comment
    # attributes them to a grid search).
    lda_model = LatentDirichletAllocation(n_components=1,
                                          learning_decay=0.5,
                                          max_iter=50,
                                          learning_method='online',
                                          random_state=50,
                                          batch_size=100,
                                          evaluate_every=-1,
                                          n_jobs=-1)
    lda_model.fit(dtm)

    # Look the vocabulary up once instead of once per word.
    feature_names = cv.get_feature_names_out()
    # n_components=1, so there is exactly one topic row.
    topic = lda_model.components_[0]
    return [feature_names[i] for i in topic.argsort()[-num_of_words:]]

In [397]:
def anime_search(search, df=None):
    """Find TV anime matching ``search`` and return them in cleaned-up form.

    Parameters
    ----------
    search : str
        Title (or part of a title) to look for.
    df : pandas.DataFrame, optional
        Previously downloaded listing to search locally.  When it is ``None``
        or empty the Jikan search endpoint is queried instead.

    Returns
    -------
    pandas.DataFrame
        Rows of type ``'TV'`` with no missing values in ``columns``, without
        the ``'type'`` column and with a fresh 0..n-1 index.  The
        genres/studios/themes/demographics cells are converted by
        ``cell_splitter`` and every synopsis is replaced by its keyword list
        from ``nlp``.  The frame is empty when nothing matches.
    """
    if df is None or df.empty:
        results = jikan.search('anime', search)
        df = pd.DataFrame(results['data'])
    else:
        # Case-insensitive literal match against whichever title columns the
        # cached listing has.  'title_english' is checked as well because the
        # rest of the notebook works with English titles.
        matches = pd.Series(False, index=df.index)
        for title_col in ('title', 'title_english'):
            if title_col in df.columns:
                titles = df[title_col].fillna('').astype(str)
                matches |= titles.str.contains(search, case=False, regex=False)
        df = df[matches]

    temp_df = df[columns].dropna()
    temp_df = (
        temp_df[temp_df['type'] == 'TV']
        .drop(columns='type')
        .reset_index(drop=True)
    )

    # cell_splitter mutates the Series it receives, so hand it a copy and
    # assign the result back explicitly.
    for list_col in ('genres', 'studios', 'themes', 'demographics'):
        temp_df[list_col] = cell_splitter(temp_df[list_col].copy())

    # Convert every synopsis (not only row 0) so the column has one type and
    # callers may pick any row.  apply() on an empty column is a no-op, so an
    # empty search result no longer crashes.
    temp_df['synopsis'] = temp_df['synopsis'].apply(
        lambda synopsis: nlp(pd.Series([synopsis]))
    )

    return temp_df

In [376]:
# NOTE: this cell re-defines ``nlp``, which an earlier cell already defines
# with the same signature; the definition executed last is the one in effect.
def nlp(text, num_of_words=10):
    """Return the highest-weighted words of a single document.

    The document is vectorised with ``CountVectorizer`` (English stop words
    removed) and a one-topic ``LatentDirichletAllocation`` model is fitted on
    it.  The words with the largest weights in that topic are returned.

    Parameters
    ----------
    text : pandas.Series or str
        Either a Series whose first element is the document (only
        ``text.values[0]`` is read) or the document itself as a string.
    num_of_words : int, default 10
        Maximum number of words to return.  Values <= 0 yield an empty list.

    Returns
    -------
    list
        Up to ``num_of_words`` vocabulary entries, ordered by ascending topic
        weight (the strongest word is last).

    Raises
    ------
    ValueError
        Propagated from ``CountVectorizer`` when the document contains no
        usable words.
    """
    # A plain ``[-0:]`` slice would return the whole vocabulary, so handle
    # non-positive requests up front.
    if num_of_words <= 0:
        return []

    document = text if isinstance(text, str) else text.values[0]

    cv = CountVectorizer(stop_words='english')
    dtm = cv.fit_transform([document])

    # Hyper-parameters kept exactly as originally chosen (the original comment
    # attributes them to a grid search).
    lda_model = LatentDirichletAllocation(n_components=1,
                                          learning_decay=0.5,
                                          max_iter=50,
                                          learning_method='online',
                                          random_state=50,
                                          batch_size=100,
                                          evaluate_every=-1,
                                          n_jobs=-1)
    lda_model.fit(dtm)

    # Look the vocabulary up once instead of once per word.
    feature_names = cv.get_feature_names_out()
    # n_components=1, so there is exactly one topic row.
    topic = lda_model.components_[0]
    return [feature_names[i] for i in topic.argsort()[-num_of_words:]]

In [350]:
def top_anime(pages=80):
    """Load the top-anime listing and return its cleaned-up TV entries.

    With the default ``pages=80`` the raw listing is read from the CSV cache
    at ``file`` when that exists.  Otherwise ``pages`` pages are fetched from
    the Jikan "top" endpoint (one request per second) and written to ``file``.

    Parameters
    ----------
    pages : int, default 80
        Number of listing pages to download when the cache is not used.

    Returns
    -------
    pandas.DataFrame
        Rows of type ``'TV'`` with no missing values in ``columns``, without
        the ``'type'`` column.  The genres/studios/themes/demographics cells
        are converted by ``cell_splitter`` and every synopsis is replaced by
        its keyword list from ``nlp``.  The original row labels are kept.
    """
    if pages == 80 and os.path.exists(file):
        df = pd.read_csv(file, index_col=0)
    else:
        page_frames = []
        for i in range(pages):
            # NOTE(review): requests start at page=0 -- confirm whether the
            # API's pages are 1-based.
            results = jikan.top(type='anime', page=i)
            page_frames.append(pd.DataFrame(results['data']))
            time.sleep(1)  # stay polite between API requests
        # One concat at the end instead of re-copying the frame every page.
        df = pd.concat(page_frames, ignore_index=True) if page_frames else pd.DataFrame()
        # NOTE(review): this overwrites the cache even for a non-default
        # ``pages``; a later default call will then read the shorter file.
        df.to_csv(file)

    pop_df = df[columns].dropna()
    pop_df = pop_df[pop_df['type'] == 'TV'].drop(columns='type')

    # cell_splitter mutates the Series it receives, so hand it a copy and
    # assign the result back explicitly.
    for list_col in ('genres', 'studios', 'themes', 'demographics'):
        pop_df[list_col] = cell_splitter(pop_df[list_col].copy())

    # Column-level assignment; writing through ``pop_df['synopsis'].iloc[j]``
    # is chained indexing and is not guaranteed to reach ``pop_df``.
    pop_df['synopsis'] = pop_df['synopsis'].apply(
        lambda synopsis: nlp(pd.Series(synopsis))
    )

    return pop_df

In [396]:
# Build `fav_df`: one row per favourite title, taken from the most popular
# (lowest 'popularity' rank) TV match of each search.
fav_anime = ['fullmetal alchemist: brotherhood','dragon ball z','steins;gate','psycho-pass','my hero academia','code geass: lelouch of','rurouni kenshin','attack on titan','madoka magica','one punch man']

# Search the cached listing when it exists; otherwise anime_search falls back
# to the API and we pause between requests.
search_df = None
sleep_timer = 1
if os.path.exists(file):
    search_df = pd.read_csv(file,index_col=0)
    sleep_timer = 0

fav_rows = []
for fav in fav_anime:
    print(fav)
    df = anime_search(fav, search_df)
    if df.empty:
        # Nothing to take a first row from; report and move on.
        print(f'  no TV match found for {fav!r}; skipping')
    else:
        # sort_values returns a new frame, so its result must be used.
        temp_df = df.sort_values('popularity').iloc[[0]]
        fav_rows.append(temp_df)
    time.sleep(sleep_timer)

# Concatenate once; fall back to an empty frame with the expected columns.
if fav_rows:
    fav_df = pd.concat(fav_rows, ignore_index=True)
else:
    fav_df = pd.DataFrame(columns=[col for col in columns if col != 'type'])
fav_df

fullmetal alchemist: brotherhood
dragon ball z
steins;gate
psycho-pass
my hero academia
code geass: lelouch of rebellion
rurouni kenshin
attack on titan
puella magi madoka magica
one punch man


Unnamed: 0,mal_id,title_english,popularity,rating,genres,studios,themes,demographics,synopsis
0,5114,Fullmetal Alchemist: Brotherhood,3,R - 17+ (violence & profanity),"[Action, Adventure, Drama, Fantasy]",Bones,Military,Shounen,"[rewrite, exchange, winry, elric, alchemist, a..."
1,813,Dragon Ball Z,104,PG-13 - Teens 13 or older,"[Action, Adventure, Comedy, Fantasy]",Toei Animation,"[Martial Arts, Super Power]",Shounen,"[annihilated, martial, balls, protect, warns, ..."
2,9253,Steins;Gate,13,PG-13 - Teens 13 or older,"[Drama, Sci-Fi, Suspense]",White Fox,"[Psychological, Time Travel]",[],"[ire, using, caused, meets, send, friend, time..."
3,13601,Psycho-Pass,58,R - 17+ (violence & profanity),"[Action, Sci-Fi, Suspense]",Production I.G,"[Adult Cast, Detective, Psychological]",[],"[criminal, signs, jaded, akane, known, uphold,..."
4,31964,My Hero Academia,6,PG-13 - Teens 13 or older,Action,Bones,"[School, Super Power]",Shounen,"[months, chosen, prestigious, wanted, inherite..."
5,1575,Code Geass: Lelouch of the Rebellion,19,R - 17+ (violence & profanity),"[Action, Drama, Sci-Fi]",Sunrise,"[Mecha, Military, School, Super Power]",[],"[onslaught, rewrite, lamperouge, 11, area, jap..."
6,45,Rurouni Kenshin,414,PG-13 - Teens 13 or older,"[Action, Adventure, Comedy, Romance]","[Gallop, Studio Deen]","[Historical, Samurai]",Shounen,"[protecting, unmatched, kamiya, kaoru, years, ..."
7,16498,Attack on Titan,1,R - 17+ (violence & profanity),"[Action, Award Winning, Drama, Suspense]",Wit Studio,"[Gore, Military, Survival]",Shounen,"[born, join, shattered, enormous, titans, surv..."
8,9756,Puella Magi Madoka Magica,94,PG-13 - Teens 13 or older,"[Award Winning, Drama, Suspense]",Shaft,"[Mahou Shoujo, Psychological]",[],"[sayaka, dreams, mal, madoka, kyuubey, akemi, ..."
9,30276,One Punch Man,4,R - 17+ (violence & profanity),"[Action, Comedy]",Madhouse,"[Adult Cast, Parody, Super Power]",Seinen,"[member, capable, pursue, heroes, order, assoc..."


In [None]:
# Cached, de-duplicated result of top_anime().
pop_df_file = './pop_df.csv'
if not os.path.exists(pop_df_file):
    (
        top_anime()
        .drop_duplicates(subset=['title_english'])
        .reset_index(drop=True)
        .to_csv(pop_df_file)
    )
# Always load from the CSV so the list-valued columns have the same (string)
# representation whether or not the cache existed; the encoding cell below
# parses them with ast.literal_eval.
pop_df = pd.read_csv(pop_df_file,index_col=0)
pop_df

In [None]:
# Row positions in `pop_df` of each favourite, in `fav_df` order.  Positions
# (not index labels) are stored because a later cell uses them with `.iloc`.
indices = []
missing_ids = []
pop_mal_ids = pop_df['mal_id'].to_numpy()
for mal_id in fav_df['mal_id']:
    positions = np.flatnonzero(pop_mal_ids == mal_id)
    if positions.size:
        indices.append(int(positions[0]))
    else:
        # The favourite is not part of the top listing; skip it rather than
        # fail with a bare IndexError.
        missing_ids.append(mal_id)
if missing_ids:
    print(f'Favourites not present in pop_df (skipped): {missing_ids}')

In [None]:
def _as_label_list(cell):
    """Return ``cell`` as a list of labels.

    Accepts a real list/tuple/set, the string form of a list (as read back
    from CSV), or a bare scalar string such as ``Bones`` (one label).
    Non-string, non-list values yield no labels.
    """
    if isinstance(cell, (list, tuple, set)):
        return list(cell)
    if not isinstance(cell, str):
        return []
    try:
        parsed = ast.literal_eval(cell)
    except (ValueError, SyntaxError):
        # Not a Python literal, so the cell is a single plain label.
        return [cell]
    if isinstance(parsed, (list, tuple, set)):
        return list(parsed)
    return [cell]


mlb = MultiLabelBinarizer()
ml_df = pd.get_dummies(pop_df,columns=['rating'])

# One multi-hot frame per list-valued column, aligned on pop_df's index so the
# concat below cannot misalign rows.
list_columns = ['genres', 'studios', 'themes', 'demographics', 'synopsis']
encoded = {}
for list_col in list_columns:
    encoded[list_col] = pd.DataFrame(
        mlb.fit_transform(pop_df[list_col].apply(_as_label_list)),
        columns=mlb.classes_,
        index=pop_df.index,
    )
genres, studios, themes, demographics, synopsis = (encoded[c] for c in list_columns)

# Drop the non-feature columns before concatenating, so an encoded label that
# shares a name with one of them is not dropped too.  'score' may be absent
# and 'cluster' may be present from a later cell, hence errors='ignore'.
non_feature_columns = ['score', 'popularity', 'mal_id', 'title_english', 'cluster', *list_columns]
rating_features = ml_df.drop(columns=non_feature_columns, errors='ignore')
dummy_df = pd.concat([rating_features, genres, studios, themes, demographics, synopsis], axis=1).fillna(0)
dummy_df


In [None]:
# Rescale every feature column to the [0, 1] range.
normalizer = MinMaxScaler()
normalizer.fit(dummy_df)
normal = normalizer.transform(dummy_df)
normal_df = pd.DataFrame(normal, columns=dummy_df.columns)

In [None]:
# Agglomerative clustering of the scaled feature matrix into a fixed number
# of groups; merge distances are computed as well.
n_clusters = 38
model = AgglomerativeClustering(
    n_clusters=n_clusters,
    compute_distances=True,
)
model.fit(normal_df)
y = model.labels_

In [398]:
# Attach each title's cluster label, then show the favourites followed by all
# members of cluster 30.
pop_df['cluster'] = y
display(
    pop_df.iloc[indices],
    pop_df.loc[pop_df['cluster'] == 30],
)

Unnamed: 0,mal_id,title_english,score,popularity,rating,genres,studios,themes,demographics,synopsis,cluster
217,813,Dragon Ball Z,8.16,104,PG-13 - Teens 13 or older,"['Action', 'Adventure', 'Comedy', 'Fantasy']",['Toei Animation'],"['Martial Arts', 'Super Power']",['Shounen'],"['annihilated', 'martial', 'balls', 'protect',...",25
3,9253,Steins;Gate,9.08,13,PG-13 - Teens 13 or older,"['Drama', 'Sci-Fi', 'Suspense']",['White Fox'],"['Psychological', 'Time Travel']",[],"['ire', 'using', 'caused', 'meets', 'send', 'f...",22
128,13601,Psycho-Pass,8.34,58,R - 17+ (violence & profanity),"['Action', 'Sci-Fi', 'Suspense']",['Production I.G'],"['Adult Cast', 'Detective', 'Psychological']",[],"['criminal', 'signs', 'jaded', 'akane', 'known...",12
1,5114,Fullmetal Alchemist: Brotherhood,9.11,3,R - 17+ (violence & profanity),"['Action', 'Adventure', 'Drama', 'Fantasy']",['Bones'],['Military'],['Shounen'],"['rewrite', 'exchange', 'winry', 'elric', 'alc...",37
171,49918,My Hero Academia Season 6,8.25,573,PG-13 - Teens 13 or older,['Action'],['Bones'],"['School', 'Super Power']",['Shounen'],"['intel', 'confirms', 'rages', 'criminal', 'un...",30
14,2904,Code Geass: Lelouch of the Rebellion R2,8.91,47,R - 17+ (violence & profanity),"['Action', 'Drama', 'Sci-Fi']",['Sunrise'],"['Mecha', 'Military', 'Super Power']",[],"['written', 'lost', 'group', 'accept', 'missin...",8
153,45,Rurouni Kenshin,8.29,414,PG-13 - Teens 13 or older,"['Action', 'Adventure', 'Comedy', 'Romance']","['Gallop', 'Studio Deen']","['Historical', 'Samurai']",['Shounen'],"['protecting', 'unmatched', 'kamiya', 'kaoru',...",29
5,38524,Attack on Titan Season 3 Part 2,9.06,27,R - 17+ (violence & profanity),"['Action', 'Drama']",['Wit Studio'],"['Gore', 'Military', 'Survival']",['Shounen'],"['arlert', 'sacrifices', 'countless', 'strive'...",23
119,9756,Puella Magi Madoka Magica,8.36,94,PG-13 - Teens 13 or older,"['Award Winning', 'Drama', 'Suspense']",['Shaft'],"['Mahou Shoujo', 'Psychological']",[],"['sayaka', 'dreams', 'mal', 'madoka', 'kyuubey...",22
80,30276,One Punch Man,8.51,4,R - 17+ (violence & profanity),"['Action', 'Comedy']",['Madhouse'],"['Adult Cast', 'Parody', 'Super Power']",['Seinen'],"['member', 'capable', 'pursue', 'heroes', 'ord...",7


Unnamed: 0,mal_id,title_english,score,popularity,rating,genres,studios,themes,demographics,synopsis,cluster
22,37510,Mob Psycho 100 II,8.81,79,PG-13 - Teens 13 or older,"['Action', 'Comedy', 'Supernatural']",['Bones'],['Super Power'],[],"['strong', 'legends', 'anymore', 'tone', 'dark...",30
47,50172,Mob Psycho 100 III,8.66,505,PG-13 - Teens 13 or older,"['Action', 'Comedy', 'Supernatural']",['Bones'],['Super Power'],[],"['naivety', 'forward', 'assist', 'paranormal',...",30
81,30654,Assassination Classroom Second Season,8.5,85,PG-13 - Teens 13 or older,"['Action', 'Comedy']",['Lerche'],['School'],['Shounen'],"['works', 'hard', 'middle', 'head', 'prevent',...",30
88,32182,Mob Psycho 100,8.49,33,PG-13 - Teens 13 or older,"['Action', 'Comedy', 'Supernatural']",['Bones'],['Super Power'],[],"['order', 'change', 'realizes', 'young', 'grad...",30
168,50709,Lycoris Recoil,8.25,671,PG-13 - Teens 13 or older,['Action'],['A-1 Pictures'],[],[],"['unfortunately', 'belonged', 'number', 'situa...",30
171,49918,My Hero Academia Season 6,8.25,573,PG-13 - Teens 13 or older,['Action'],['Bones'],"['School', 'Super Power']",['Shounen'],"['intel', 'confirms', 'rages', 'criminal', 'un...",30
220,30503,Noragami Aragoto,8.16,76,PG-13 - Teens 13 or older,"['Action', 'Fantasy']",['Bones'],['Mythology'],['Shounen'],"['normal', 'hopes', 'cheerily', 'prepares', 'd...",30
233,1604,Reborn!,8.14,355,PG-13 - Teens 13 or older,"['Action', 'Comedy']",['Artland'],"['Organized Crime', 'Super Power']",['Shounen'],"['classmates', 'rewrite', 'idol', 'reborn', 'f...",30
234,34572,Black Clover,8.13,67,PG-13 - Teens 13 or older,"['Action', 'Comedy', 'Fantasy']",['Pierrot'],[],['Shounen'],"['tries', 'magic', 'power', 'wizard', 'clover'...",30
235,33486,My Hero Academia Season 2,8.13,14,PG-13 - Teens 13 or older,['Action'],['Bones'],"['School', 'Super Power']",['Shounen'],"['pitted', 'disrupt', 'sharp', 'midoriya', 'ac...",30


In [1]:
# Scratch code (disabled): dump every column of a raw Jikan search result.
# xresults = jikan.search('anime', 'full metal alechmist: brotherhood')
# print(xresults)
# xdf = pd.DataFrame(xresults['data'])
# for col in xdf.columns:
#     print(col)
#     display(xdf[col])

# Look up a title through the `anime_clusters` module and display the frame
# that `get_cluster` returns.
# NOTE(review): presumably the rows sharing the title's cluster -- confirm
# against anime_clusters.get_cluster.
from anime_clusters import get_cluster
df = get_cluster('Steins;Gate')
df

Unnamed: 0,mal_id,title_english,score,popularity,rating,genres,studios,themes,demographics,synopsis,cluster
3,9253,Steins;Gate,9.08,13,PG-13 - Teens 13 or older,"['Drama', 'Sci-Fi', 'Suspense']",['White Fox'],"['Psychological', 'Time Travel']",[],"['ire', 'using', 'caused', 'meets', 'send', 'f...",22
12,4181,Clannad: After Story,8.94,108,PG-13 - Teens 13 or older,"['Drama', 'Romance', 'Supernatural']",['Kyoto Animation'],[],[],"['produced', 'winter', 'graduate', 'purpose', ...",22
44,33352,Violet Evergarden,8.67,51,PG-13 - Teens 13 or older,"['Drama', 'Fantasy']",['Kyoto Animation'],[],[],"['written', 'guardian', 'new', 'memory', 'work...",22
79,30484,Steins;Gate 0,8.52,191,PG-13 - Teens 13 or older,"['Drama', 'Sci-Fi', 'Suspense']",['White Fox'],['Time Travel'],[],"['college', 'person', 'time', 'forum', 'simula...",22
107,37965,Run with the Wind,8.4,769,PG-13 - Teens 13 or older,"['Drama', 'Sports']",['Production I.G'],['Adult Cast'],[],"['unfortunately', 'involved', 'ekiden', 'stude...",22
119,9756,Puella Magi Madoka Magica,8.36,94,PG-13 - Teens 13 or older,"['Award Winning', 'Drama', 'Suspense']",['Shaft'],"['Mahou Shoujo', 'Psychological']",[],"['sayaka', 'dreams', 'mal', 'madoka', 'kyuubey...",22
125,39247,Miss Kobayashi's Dragon Maid S,8.34,398,PG-13 - Teens 13 or older,"['Fantasy', 'Slice of Life']",['Kyoto Animation'],[],[],"['radical', 'intrigued', 'maid', 'life', 'day'...",22
126,30,Neon Genesis Evangelion,8.34,46,PG-13 - Teens 13 or older,"['Action', 'Avant Garde', 'Award Winning', 'Dr...","['Gainax', 'Tatsunoko Production']","['Mecha', 'Psychological']",[],"['known', 'year', 'despite', 'evangelion', 'ge...",22
130,338,The Rose of Versailles,8.33,1986,PG-13 - Teens 13 or older,"['Drama', 'Romance']",['Tokyo Movie Shinsha'],"['Historical', 'Military']",['Shoujo'],"['alliance', 'men', 'revered', 'impoverished',...",22
137,5420,The Beast Player Erin,8.32,2080,PG-13 - Teens 13 or older,"['Drama', 'Fantasy', 'Slice of Life']","['Production I.G', 'Trans Arts']",[],[],"['shin', 'tensions', 'war', 'army', 'mother', ...",22
