In [69]:
import os
from itertools import combinations

import Levenshtein
import numpy as np
import polars as pl
from IPython.display import display
from scipy.cluster.hierarchy import DisjointSet
from tqdm import tqdm

from src.config import Config

# Let polars render up to 50 rows when a table is displayed.
pl.Config.set_tbl_rows(50)
# Project configuration object; a later cell reads cfg.data.anime_path from it.
cfg = Config.get_cnf()


In [78]:
# Read the anime CSV located at cfg.data.anime_path (relative to the parent
# directory) and turn the literal string "Unknown" in japanese_name into null.
anime_df = pl.read_csv(
    os.path.join("../", cfg.data.anime_path),
    try_parse_dates=True,
).with_columns(
    pl.when(pl.col("japanese_name") == "Unknown")
    .then(None)
    .otherwise(pl.col("japanese_name"))
    .alias("japanese_name")
)
anime_df.head()


anime_id,genres,japanese_name,type,episodes,aired,producers,licensors,studios,source,duration,rating,members,watching,completed,on_hold,dropped,plan_to_watch
str,str,str,str,str,str,str,str,str,str,str,str,i64,i64,i64,i64,i64,i64
"""000ba7f7e34e107e7544""","""Comedy, Sci-Fi, Seinen, Slice …","""宇宙兄弟""","""TV""","""99""","""Apr 1, 2012 to Mar 22, 2014""","""Aniplex, Dentsu, YTV, Trinity …","""Sentai Filmworks""","""A-1 Pictures""","""Manga""","""24 min. per ep.""","""PG-13 - Teens 13 or older""",150428,16552,37234,13009,6948,76685
"""00427279d72064e7fb69""","""Adventure, Slice of Life, Myst…","""蟲師""","""TV""","""26""","""Oct 23, 2005 to Jun 19, 2006""","""Avex Entertainment, Marvelous,…","""Funimation""","""Artland""","""Manga""","""25 min. per ep.""","""PG-13 - Teens 13 or older""",620736,55482,235371,42786,20017,267080
"""00444b67aaabdf740a68""","""Adventure, Slice of Life, Myst…","""蟲師 続章""","""TV""","""10""","""Apr 5, 2014 to Jun 21, 2014""","""Aniplex, Kodansha, Delfi Sound""","""Aniplex of America""","""Artland""","""Manga""","""24 min. per ep.""","""PG-13 - Teens 13 or older""",226522,12585,113559,6095,2606,91677
"""00839a3507ab168abe75""","""Comedy, Ecchi, Fantasy, School""","""星刻の竜騎士""","""TV""","""12""","""Apr 5, 2014 to Jun 21, 2014""","""Media Factory, AT-X, Sony Musi…","""Funimation""","""C-Station""","""Light novel""","""24 min. per ep.""","""R+ - Mild Nudity""",170220,8723,118202,3753,8034,31508
"""0192331235e110fe4f76""","""Comedy, Harem, Romance, Sci-Fi…","""天地無用！""","""TV""","""26""","""Apr 2, 1995 to Sep 24, 1995""","""TV Tokyo, Pioneer LDC""","""Funimation, Geneon Entertainme…","""AIC""","""Original""","""23 min. per ep.""","""PG-13 - Teens 13 or older""",62599,2565,39890,2093,1986,16065


In [87]:
def get_original_work_name(df, threshold=0.35):
    """Cluster rows by title similarity and attach an ``original_work_name`` label.

    Every pair of ``japanese_name`` values is scored with the mean of two
    normalized string distances, ``1 - Levenshtein.ratio`` and
    ``1 - Levenshtein.jaro_winkler``. Pairs whose score is below ``threshold``
    are merged in a disjoint-set, so the resulting clusters are the connected
    components of the "is similar" relation. Two titles that are not directly
    similar can therefore end up in the same cluster through a chain of
    intermediate titles.

    Args:
        df: polars DataFrame containing a ``japanese_name`` column; null
            values are allowed.
        threshold: Upper bound (exclusive) on the mean distance at which two
            titles are merged. Smaller values produce stricter clusters.

    Returns:
        A new DataFrame with the columns of ``df`` plus ``original_work_name``,
        which holds the shortest non-null title of the row's cluster (null if
        the cluster contains no non-null title).

    Note:
        All pairs of rows are compared, so the cost grows quadratically with
        the number of rows.
    """
    titles = df["japanese_name"].to_list()
    n_rows = df.height

    clusters = DisjointSet(list(range(n_rows)))
    # combinations() has no length, so give tqdm the pair count explicitly to
    # get a real progress bar (percentage / ETA) instead of a bare counter.
    n_pairs = n_rows * (n_rows - 1) // 2
    for i, j in tqdm(combinations(range(n_rows), 2), total=n_pairs):
        left, right = titles[i], titles[j]
        if left is None or right is None:
            # polars converts nulls to None (not np.nan) in to_list(); a
            # missing title gets a neutral distance instead of being passed
            # to the string metrics.
            lv_dist, jw_dist = 0.5, 0.5
        else:
            # The library returns similarities; convert them to distances so
            # that a higher similarity means a smaller distance.
            lv_dist = 1 - Levenshtein.ratio(left, right)
            jw_dist = 1 - Levenshtein.jaro_winkler(left, right)
        mean_dist = (lv_dist + jw_dist) / 2

        if mean_dist < threshold:
            clusters.merge(i, j)

    labels = [None] * n_rows
    for subset in clusters.subsets():
        # Use the shortest title in the cluster as its representative name.
        label = min(
            (titles[i] for i in subset if titles[i] is not None),
            key=len,
            default=None,
        )
        for element in subset:
            labels[element] = label

    return df.with_columns(pl.Series("original_work_name", labels))


# Cluster the titles, then compare the number of distinct names before
# (japanese_name) and after (original_work_name) clustering.
processed_anime_df = get_original_work_name(anime_df)
print(f"raw - japanese_name nunique: {anime_df['japanese_name'].n_unique()}")
print(f"processed - original_work_name nunique: {processed_anime_df['original_work_name'].n_unique()}")
display(processed_anime_df.head(4))


0it [00:00, ?it/s]

1999000it [00:02, 803204.39it/s]

raw - japanese_name nunique: 1931
processed - japanese_name nunique: 1304





anime_id,genres,japanese_name,type,episodes,aired,producers,licensors,studios,source,duration,rating,members,watching,completed,on_hold,dropped,plan_to_watch,original_work_name
str,str,str,str,str,str,str,str,str,str,str,str,i64,i64,i64,i64,i64,i64,str
"""000ba7f7e34e107e7544""","""Comedy, Sci-Fi, Seinen, Slice …","""宇宙兄弟""","""TV""","""99""","""Apr 1, 2012 to Mar 22, 2014""","""Aniplex, Dentsu, YTV, Trinity …","""Sentai Filmworks""","""A-1 Pictures""","""Manga""","""24 min. per ep.""","""PG-13 - Teens 13 or older""",150428,16552,37234,13009,6948,76685,"""宇宙兄弟"""
"""00427279d72064e7fb69""","""Adventure, Slice of Life, Myst…","""蟲師""","""TV""","""26""","""Oct 23, 2005 to Jun 19, 2006""","""Avex Entertainment, Marvelous,…","""Funimation""","""Artland""","""Manga""","""25 min. per ep.""","""PG-13 - Teens 13 or older""",620736,55482,235371,42786,20017,267080,"""蟲師"""
"""00444b67aaabdf740a68""","""Adventure, Slice of Life, Myst…","""蟲師 続章""","""TV""","""10""","""Apr 5, 2014 to Jun 21, 2014""","""Aniplex, Kodansha, Delfi Sound""","""Aniplex of America""","""Artland""","""Manga""","""24 min. per ep.""","""PG-13 - Teens 13 or older""",226522,12585,113559,6095,2606,91677,"""蟲師"""
"""00839a3507ab168abe75""","""Comedy, Ecchi, Fantasy, School""","""星刻の竜騎士""","""TV""","""12""","""Apr 5, 2014 to Jun 21, 2014""","""Media Factory, AT-X, Sony Musi…","""Funimation""","""C-Station""","""Light novel""","""24 min. per ep.""","""R+ - Mild Nudity""",170220,8723,118202,3753,8034,31508,"""星刻の竜騎士"""


In [88]:
# Count rows per clustered title and list the clusters that contain more than
# one row, largest first.
(
    processed_anime_df
    .group_by("original_work_name")
    .len()
    .filter(pl.col("len") > 1)
    .sort("len", descending=True)
)


original_work_name,len
str,u32
"""モンスター""",27
"""ドラゴンボール""",23
"""ARIA The NATURAL""",13
"""進撃の巨人""",13
"""ハイスコアガール""",12
"""僕のヒーローアカデミア""",12
"""ディアーズ""",12
"""ソードアート・オンライン""",10
"""機動戦士ガンダム""",10
"""食戟のソーマ""",8


In [91]:
# Inspect the rows of a single cluster.
# NOTE(review): in the saved output this cluster mixes "ハイスコアガール" rows with
# "ハイスクールD×D" rows, i.e. two different works share one label, so the
# similarity threshold / merging rule produces false merges here.
processed_anime_df.filter(pl.col("original_work_name") == "ハイスコアガール")


anime_id,genres,japanese_name,type,episodes,aired,producers,licensors,studios,source,duration,rating,members,watching,completed,on_hold,dropped,plan_to_watch,original_work_name
str,str,str,str,str,str,str,str,str,str,str,str,i64,i64,i64,i64,i64,i64,str
"""06fbaba4843c19234002""","""Ecchi, Comedy, Harem, Romance,…","""ハイスクールD×Dスペシャル""","""Special""","""6""","""Mar 21, 2012 to Aug 29, 2012""","""Genco, Lantis""","""Funimation""","""TNK""","""Light novel""","""4 min. per ep.""","""R+ - Mild Nudity""",157348,3103,138426,1149,925,13745,"""ハイスコアガール"""
"""0928df168d8caf6276e7""","""Comedy, Demons, Ecchi, Harem, …","""ハイスクールD×D""","""TV""","""12""","""Jan 6, 2012 to Mar 23, 2012""","""Genco, Lantis, AT-X, PRA, Show…","""Funimation""","""TNK""","""Light novel""","""24 min. per ep.""","""R+ - Mild Nudity""",995532,34261,812823,14742,34031,99675,"""ハイスコアガール"""
"""387c100fbeea11422f16""","""Action, Comedy, Demons, Ecchi,…","""ハイスクールDxD HERO""","""TV""","""12""","""Apr 17, 2018 to Jul 3, 2018""","""AT-X""","""Funimation""","""Passione""","""Light novel""","""23 min. per ep.""","""R+ - Mild Nudity""",309467,22899,202846,7545,11680,64497,"""ハイスコアガール"""
"""4d775dfaa13d3fa071da""","""Comedy, Game, Romance, School,…","""ハイスコアガール II""","""TV""","""9""","""Oct 26, 2019 to Dec 21, 2019""","""Square Enix, Mainichi Broadcas…","""Netflix""","""J.C.Staff""","""Manga""","""23 min. per ep.""","""PG-13 - Teens 13 or older""",83181,4663,54935,1463,1004,21116,"""ハイスコアガール"""
"""81ea3d02f77c804fe686""","""Action, Ecchi, Comedy, Harem, …","""ハイスクールD×D BorN OVA 蘇らない不死鳥""","""OVA""","""1""","""Dec 9, 2015""","""Genco, Lantis""","""Unknown""","""TNK""","""Light novel""","""24 min.""","""R+ - Mild Nudity""",92915,1843,75467,669,479,14457,"""ハイスコアガール"""
"""9af8ef7dd84dda6b4896""","""Comedy, Demons, Ecchi, Romance…","""ハイスクールD×D OVA""","""OVA""","""2""","""Sep 6, 2012 to May 31, 2013""","""Genco, Lantis, AT-X""","""Unknown""","""TNK""","""Light novel""","""23 min. per ep.""","""R+ - Mild Nudity""",209100,14602,168222,3212,1731,21333,"""ハイスコアガール"""
"""a00cec15a2e6d6214d97""","""Action, Harem, Comedy, Demons,…","""ハイスクールD×D NEW""","""TV""","""12""","""Jul 7, 2013 to Sep 22, 2013""","""Genco, AT-X, Fujimi Shobo""","""Funimation""","""TNK""","""Light novel""","""24 min. per ep.""","""R+ - Mild Nudity""",668482,19634,585660,7406,9498,46284,"""ハイスコアガール"""
"""b5af4da89b1b7f6bf52f""","""Game, Comedy, Romance, School,…","""ハイスコアガール""","""TV""","""12""","""Jul 14, 2018 to Sep 29, 2018""","""Square Enix, Warner Bros. Japa…","""Netflix""","""J.C.Staff""","""Manga""","""24 min. per ep.""","""PG-13 - Teens 13 or older""",169402,9893,102347,4306,6644,46212,"""ハイスコアガール"""
"""bdf7bfe317605327fe97""","""Comedy, Demons, Ecchi, Romance…","""ハイスクールD×D NEW OVA おっぱい、包みます！""","""OVA""","""1""","""Mar 10, 2015""","""Genco""","""Unknown""","""TNK""","""Light novel""","""24 min.""","""R+ - Mild Nudity""",116137,2139,96284,772,600,16342,"""ハイスコアガール"""
"""d8a6c6bcf24ad2414e45""","""Action, Comedy, Demons, Ecchi,…","""ハイスクールD×D BorN""","""TV""","""12""","""Apr 4, 2015 to Jun 20, 2015""","""Genco, Lantis, AT-X, Fujimi Sh…","""Funimation""","""TNK""","""Light novel""","""24 min. per ep.""","""R+ - Mild Nudity""",571266,20004,484271,7532,9772,49687,"""ハイスコアガール"""
