In [1]:
import os
from itertools import combinations

import Levenshtein
import numpy as np
import polars as pl
from IPython.display import display
from scipy.cluster.hierarchy import DisjointSet
from tqdm import tqdm

from src.config import Config

# Let polars show up to 50 rows when a DataFrame is rendered in the notebook.
pl.Config.set_tbl_rows(50)
# Project configuration object; later cells read cfg.data.anime_path from it.
cfg = Config.get_cnf()


In [20]:
# Load the anime table from the CSV path given in the project config
# (resolved relative to the parent directory of the notebook).
anime_df = pl.read_csv(
    os.path.join("../", cfg.data.anime_path),
    try_parse_dates=True,
)
# Turn the placeholder string "Unknown" in japanese_name into a null.
anime_df = anime_df.with_columns(
    pl.when(pl.col("japanese_name") == "Unknown")
    .then(None)
    .otherwise(pl.col("japanese_name"))
    .alias("japanese_name")
)
anime_df.head(1)


anime_id,genres,japanese_name,type,episodes,aired,producers,licensors,studios,source,duration,rating,members,watching,completed,on_hold,dropped,plan_to_watch
str,str,str,str,str,str,str,str,str,str,str,str,i64,i64,i64,i64,i64,i64
"""000ba7f7e34e107e7544""","""Comedy, Sci-Fi, Seinen, Slice …","""宇宙兄弟""","""TV""","""99""","""Apr 1, 2012 to Mar 22, 2014""","""Aniplex, Dentsu, YTV, Trinity …","""Sentai Filmworks""","""A-1 Pictures""","""Manga""","""24 min. per ep.""","""PG-13 - Teens 13 or older""",150428,16552,37234,13009,6948,76685


In [21]:
def get_original_work_name(df, threshold=0.3):
    """Group rows whose titles look like the same original work and label each group.

    Every pair of rows is compared on ``japanese_name``. The pair distance is the
    mean of ``1 - Levenshtein.ratio`` and ``1 - Levenshtein.jaro_winkler``; pairs
    with a missing title or with different ``source`` values get distance 1.
    Pairs whose distance is below ``threshold`` are merged in a disjoint set, so
    grouping is transitive: a chain of similar titles ends up in one group.
    Each group is labelled with its shortest non-null title.

    Note: all ``n * (n - 1) / 2`` pairs are evaluated, so the cost grows
    quadratically with the number of rows.

    Args:
        df: polars DataFrame with ``japanese_name`` and ``source`` columns.
        threshold: Pairs with a mean distance strictly below this value are
            merged into the same group.

    Returns:
        ``df`` with an extra ``original_work_name`` column. The label is null
        for a group in which every title is null.
    """
    _feature = df["japanese_name"].to_list()
    # Rows whose ``source`` differs are treated as distance 1 (different original work).
    _source = df["source"].to_list()
    _n = df.height

    _disjoint_set = DisjointSet(list(range(_n)))
    # Give tqdm the pair count up front so it can show progress and an ETA.
    _n_pairs = _n * (_n - 1) // 2
    for i, j in tqdm(combinations(range(_n), 2), total=_n_pairs):
        name_i, name_j = _feature[i], _feature[j]
        # ``to_list()`` yields None (not NaN) for null strings, so test for None;
        # this keeps missing titles away from the string-similarity functions.
        if name_i is None or name_j is None or _source[i] != _source[j]:
            _d = 1.0
        else:
            # Use similarity scores and convert them to distances:
            # the more similar two titles are, the smaller the distance.
            lv_dist = 1 - Levenshtein.ratio(name_i, name_j)
            jw_dist = 1 - Levenshtein.jaro_winkler(name_i, name_j)
            _d = (lv_dist + jw_dist) / 2

        if _d < threshold:
            _disjoint_set.merge(i, j)

    _labels = [None] * _n
    for subset in _disjoint_set.subsets():
        # Within a group of similar titles, adopt the shortest title as the label.
        label = min([_feature[i] for i in subset if _feature[i] is not None], key=len, default=None)
        for element in subset:
            _labels[element] = label
    df = df.with_columns(pl.Series("original_work_name", _labels))

    return df


# Attach the grouped title label (original_work_name) using the default threshold.
processed_anime_df = get_original_work_name(anime_df)
# Compare the number of distinct raw titles with the number of distinct group labels.
print(f"raw - japanese_name nunique: {anime_df['japanese_name'].n_unique()}")
# NOTE(review): the label below says japanese_name, but the value counted is original_work_name.
print(f"processed - japanese_name nunique: {processed_anime_df['original_work_name'].n_unique()}")
display(processed_anime_df.head(3))


0it [00:00, ?it/s]

1999000it [00:01, 1912447.29it/s]

raw - japanese_name nunique: 1931
processed - japanese_name nunique: 1436





anime_id,genres,japanese_name,type,episodes,aired,producers,licensors,studios,source,duration,rating,members,watching,completed,on_hold,dropped,plan_to_watch,original_work_name
str,str,str,str,str,str,str,str,str,str,str,str,i64,i64,i64,i64,i64,i64,str
"""000ba7f7e34e107e7544""","""Comedy, Sci-Fi, Seinen, Slice …","""宇宙兄弟""","""TV""","""99""","""Apr 1, 2012 to Mar 22, 2014""","""Aniplex, Dentsu, YTV, Trinity …","""Sentai Filmworks""","""A-1 Pictures""","""Manga""","""24 min. per ep.""","""PG-13 - Teens 13 or older""",150428,16552,37234,13009,6948,76685,"""宇宙兄弟"""
"""00427279d72064e7fb69""","""Adventure, Slice of Life, Myst…","""蟲師""","""TV""","""26""","""Oct 23, 2005 to Jun 19, 2006""","""Avex Entertainment, Marvelous,…","""Funimation""","""Artland""","""Manga""","""25 min. per ep.""","""PG-13 - Teens 13 or older""",620736,55482,235371,42786,20017,267080,"""蟲師"""
"""00444b67aaabdf740a68""","""Adventure, Slice of Life, Myst…","""蟲師 続章""","""TV""","""10""","""Apr 5, 2014 to Jun 21, 2014""","""Aniplex, Kodansha, Delfi Sound""","""Aniplex of America""","""Artland""","""Manga""","""24 min. per ep.""","""PG-13 - Teens 13 or older""",226522,12585,113559,6095,2606,91677,"""蟲師"""


In [22]:
# List the labels shared by more than one row, largest groups first.
(
    processed_anime_df.group_by("original_work_name")
    .len()
    .filter(pl.col("len") > 1)
    .sort("len", descending=True)
)


original_work_name,len
str,u32
"""ポケットモンスター""",19
"""ドラゴンボール""",17
"""機動戦士ガンダム""",9
"""ハイスクールD×D""",9
"""僕のヒーローアカデミア""",9
"""ソードアート・オンライン""",8
"""食戟のソーマ""",8
"""To LOVEる -とらぶる-""",8
"""劇場版 空の境界 the Garden of sinners…",8
"""進撃の巨人""",7


In [25]:
# Spot-check one large group: show the rows labelled "ドラゴンボール" (Dragon Ball).
processed_anime_df.filter(pl.col("original_work_name") == "ドラゴンボール")


anime_id,genres,japanese_name,type,episodes,aired,producers,licensors,studios,source,duration,rating,members,watching,completed,on_hold,dropped,plan_to_watch,original_work_name
str,str,str,str,str,str,str,str,str,str,str,str,i64,i64,i64,i64,i64,i64,str
"""183899c73d1a95dafcbc""","""Action, Sci-Fi, Adventure, Com…","""ドラゴンボールZ オラの悟飯をかえせッ!!""","""Movie""","""1""","""Jul 15, 1989""","""Unknown""","""Funimation""","""Toei Animation""","""Manga""","""41 min.""","""PG-13 - Teens 13 or older""",95872,546,90532,288,237,4269,"""ドラゴンボール"""
"""1d107f1b5bbec18a36ce""","""Action, Adventure, Super Power…","""ドラゴンボールZ 神と神""","""Movie""","""1""","""Mar 30, 2013""","""Unknown""","""Funimation""","""Toei Animation""","""Manga""","""1 hr. 25 min.""","""PG-13 - Teens 13 or older""",153734,1124,145351,327,311,6621,"""ドラゴンボール"""
"""34a5b46ac7364664ec62""","""Action, Adventure, Comedy, Fan…","""ドラゴンボールZ 超戦士撃破!!勝のはオレだ""","""Movie""","""1""","""Jul 9, 1994""","""Unknown""","""Funimation""","""Toei Animation""","""Manga""","""50 min.""","""PG-13 - Teens 13 or older""",92592,560,86298,316,305,5113,"""ドラゴンボール"""
"""3a21fb3f57c49b7f9790""","""Action, Adventure, Comedy, Sup…","""ドラゴンボール超（スーパー）""","""TV""","""131""","""Jul 5, 2015 to Mar 25, 2018""","""Yomiko Advertising, Fuji TV""","""Funimation""","""Toei Animation""","""Manga""","""23 min. per ep.""","""PG-13 - Teens 13 or older""",536891,99165,345941,22202,27883,41700,"""ドラゴンボール"""
"""3e1f1b4fe63e9457d20d""","""Action, Adventure, Comedy, Fan…","""ドラゴンボールZ 極限バトル!!三大超サイヤ人""","""Movie""","""1""","""Jul 11, 1992""","""Unknown""","""Funimation""","""Toei Animation""","""Manga""","""45 min.""","""PG-13 - Teens 13 or older""",93363,513,87540,309,250,4751,"""ドラゴンボール"""
"""4b1ca80941f1130469ae""","""Action, Adventure, Comedy, Fan…","""ドラゴンボールZ""","""TV""","""291""","""Apr 26, 1989 to Jan 31, 1996""","""Fuji TV""","""Funimation""","""Toei Animation""","""Manga""","""24 min. per ep.""","""PG-13 - Teens 13 or older""",888982,32608,772421,19285,23682,40986,"""ドラゴンボール"""
"""5335c0c989bf6754fa44""","""Action, Adventure, Comedy, Sup…","""ドラゴンボール改""","""TV""","""61""","""Apr 6, 2014 to Jun 28, 2015""","""Unknown""","""Funimation""","""Toei Animation""","""Manga""","""23 min. per ep.""","""PG-13 - Teens 13 or older""",127879,7268,98522,3206,3797,15086,"""ドラゴンボール"""
"""5778343cc74e5a493866""","""Adventure, Comedy, Fantasy, Ma…","""ドラゴンボール""","""TV""","""153""","""Feb 26, 1986 to Apr 12, 1989""","""Fuji TV""","""Funimation""","""Toei Animation""","""Manga""","""24 min. per ep.""","""PG-13 - Teens 13 or older""",735546,37544,600337,27310,23902,46453,"""ドラゴンボール"""
"""5d17b17847a070a6523e""","""Action, Adventure, Fantasy, Sc…","""ドラゴンボールZ 超サイヤ人だ孫悟空""","""Movie""","""1""","""Mar 9, 1991""","""Unknown""","""Funimation""","""Toei Animation""","""Manga""","""51 min.""","""PG-13 - Teens 13 or older""",91878,520,86174,295,247,4642,"""ドラゴンボール"""
"""73c2f7a54b844be20d25""","""Action, Sci-Fi, Adventure, Com…","""ドラゴンボールZ この世で一番強いヤツ""","""Movie""","""1""","""Mar 10, 1990""","""Unknown""","""Funimation""","""Toei Animation""","""Manga""","""1 hr.""","""PG-13 - Teens 13 or older""",89156,486,83680,283,245,4462,"""ドラゴンボール"""
