## Artist objects with the same name or with first name/last name swapped

In [1]:
from collections import Counter

# Key each artist by its lower-cased words in sorted order, so that
# "Akira Toriyama" and "toriyama akira" end up under the same key.
name_counts = Counter(
    ' '.join(sorted(name.lower().split()))
    for name in Artist.objects.values_list('name', flat=True)
)

# Histogram of the counts: how many keys occur once, twice, ...
occurrence_histogram = Counter(name_counts.values())
for occurrences, nb_keys in occurrence_histogram.most_common():
    print(occurrences, 'occurrence :', nb_keys, 'artiste(s)')
print()

# most_common() yields counts in descending order, so the listing can stop
# at the first key seen fewer than three times.
for key, occurrences in name_counts.most_common():
    if occurrences < 3:
        break
    print(key, ':', occurrences, 'occurrences')

1 occurrence : 2416 artiste(s)
2 occurrence : 48 artiste(s)
3 occurrence : 7 artiste(s)
4 occurrence : 1 artiste(s)

harold sakuishi : 4 occurrences
araki hirohiko : 3 occurrences
kawahara reki : 3 occurrences
otonaka sawaki : 3 occurrences
akira toriyama : 3 occurrences
miki yoshikawa : 3 occurrences
sukeno yoshiaki : 3 occurrences
ai yazawa : 3 occurrences


## How many works share a title?

In [2]:
# Count works per (title, category slug) pair and list the pairs used by
# at least two works.
title_keys = Work.objects.values_list('title', 'category__slug')
title_counts = Counter(title_keys)
for title_key, nb_works in title_counts.most_common():
    # most_common() is sorted by descending count: nothing left to show
    # once a pair used by a single work is reached.
    if nb_works < 2:
        break
    print(title_key, nb_works)

('The Asterisk War: The Academy City on the Water', 'anime') 9
('Zatch Bell!', 'anime') 4
('Sorcerer Hunters', 'anime') 3
('Those Who Hunt Elves', 'anime') 3
('Yu-Gi-Oh!', 'anime') 3
('Desert Punk', 'anime') 3
('Dimension W', 'anime') 3
('Dororo', 'anime') 2
('Gestalt', 'anime') 2
('Pumpkin Scissors', 'anime') 2
('Yozakura Quartet', 'anime') 2
('Oh! My Goddess', 'anime') 2
('The Seven Deadly Sins', 'anime') 2
("Ghost Talker's Daydream", 'anime') 2
('Guin Saga', 'anime') 2
('Digimon: The Movie', 'anime') 2
('Samurai Deeper Kyo', 'anime') 2
('Hero Tales', 'anime') 2
('Samurai Champloo', 'anime') 2
('Lunar Legend Tsukihime', 'anime') 2
('Akikan!', 'manga') 2
('Berserk', 'anime') 2
('Zipang', 'anime') 2
('Sailor Moon', 'manga') 2
('This Ugly Yet Beautiful World', 'anime') 2
('Scrapped Princess', 'anime') 2
('Oh! My Goddess', 'manga') 2
('Seraph of the End: Vampire Reign', 'anime') 2
('Togainu no Chi', 'anime') 2
('Twin Star Exorcists', 'manga') 2
('Bartender', 'manga') 2
('School-Live!', '

### What about alternative titles?

In [3]:
# Map each alternative title to ALL the works that declare it. Building a
# plain dict from the (title, work_id) rows would keep only the last work
# for a title shared by several works and silently miss some pairs.
alt_titles = {}
for alt_title, alt_work_id in WorkTitle.objects.values_list('title', 'work_id'):
    alt_titles.setdefault(alt_title, set()).add(alt_work_id)

# Collect (work id, other work id) pairs where the main title of the first
# work is an alternative title of a different work.
pairs = []
for work_id, title in Work.objects.filter(title__in=alt_titles.keys()).values_list('id', 'title'):
    # .get() guards against the database matching a title that is not an
    # exact key here (e.g. a case-insensitive collation).
    for alt_work_id in sorted(alt_titles.get(title, ())):
        if work_id != alt_work_id:
            pairs.append((work_id, alt_work_id))

In [4]:
import numpy as np

# Load every work mentioned in a pair with a single query. The ids are
# flattened with a set comprehension so that the ORM receives plain,
# de-duplicated Python ints rather than numpy scalars.
work_ids = sorted({work_id for pair in pairs for work_id in pair})
works = Work.objects.in_bulk(work_ids)

In [5]:
# Show only the pairs whose main titles differ: identical titles are
# skipped by this listing.
for first_id, second_id in pairs:
    first_title = works[first_id].title
    second_title = works[second_id].title
    if first_title == second_title:
        continue
    print(first_id, first_title, '/', second_title, second_id)

1683 Sakura Diaries / Sakura Tsuushin 20925
3048 Accel World Specials / Accel World: Acchel World. 15820
9864 The Heroic Legend of Arslan / Arslan Senki 20830
4265 Anedoki / Ane Doki 15702
10010 Sound! Euphonium / Hibike! Euphonium: Kitauji Koukou Suisougaku-bu e Youkoso 20057
10598 Sekirei Special / Sekirei: Hajimete no Otsukai 15654
6595 Nausicaa / Nausicaä de la vallée du vent 1289
6193 Love & Collage / Ai Kora 15700
128 My Teen Romantic Comedy SNAFU / My Youth Romantic Comedy Is Wrong, As I Expected 18554
2036 Plastic Elder Sister / Plastic Neesan 15951
9 Elfen Lied / Lynn Okamoto: Short Story Collection 15729
2022 Nekomonogatari Black: Tsubasa Family / Nekomonogatari Black 14666
11362 Angel Links / Seihou Tenshi Angel Links 21059
788 Sheep's Song / Hitsuji no Uta 15749
15090 Zatch Bell! / Zatchbell 8030
9953 La croisée dans un labyrinthe étranger Special / Croisée in a Foreign Labyrinth The Animation 16017
10012 Punchline / Punch Line 15555
289 Linebarrels of Iron / Kurogane no Li

## How many works share a reference?

In [6]:
from django.db.models import Count

def describe(queryset):
    """Print how many works share a value of the grouped field.

    ``queryset`` must carry an ``nb`` annotation holding the number of
    works per distinct value. Only groups with ``nb >= 2`` are considered.
    One line is printed with the total number of works in those groups
    followed by the number of such groups. Nothing is returned.
    """
    # Fetch the per-group counts once and derive both figures from them,
    # instead of running the same filtered query twice.
    shared_counts = list(queryset.filter(nb__gte=2).values_list('nb', flat=True))
    nb_distinct_fields = len(shared_counts)
    nb_duplicates = sum(shared_counts)
    print('{:d} oeuvres partagent {:d} champs'.format(nb_duplicates, nb_distinct_fields))

In [7]:
# One row per reference URL with the number of works pointing to it,
# most shared URLs first.
queryset = (
    Work.objects
    .values('reference__url')
    .annotate(nb=Count('reference__url'))
    .order_by('-nb')
)
describe(queryset)

577 oeuvres partagent 282 champs


So, as of September 2018, 577 works share 282 references. There may still be duplicates to weed out.

## How many works have the same poster?

In [8]:
# One row per external poster URL ending in ".jpg" with the number of
# works using it, most shared posters first.
queryset = (
    Work.objects
    .filter(ext_poster__endswith='.jpg')
    .values('ext_poster')
    .annotate(nb=Count('ext_poster'))
    .order_by('-nb')
)
queryset[:5]

<WorkQuerySet [{'ext_poster': 'https://myanimelist.cdn-dena.com/images/anime/11/79107.jpg', 'nb': 2}, {'ext_poster': 'http://www.manga-news.com/public/images/series/adieu_midori.jpg', 'nb': 2}, {'ext_poster': 'http://www.manga-news.com/public/images/series/dragonhuntercoffret1gd.jpg', 'nb': 2}, {'ext_poster': 'http://www.manga-news.com/public/images/series/lodoss_coffret.jpg', 'nb': 2}, {'ext_poster': 'http://www.manga-news.com/public/images/series/fleur_coffret.jpg', 'nb': 2}]>

In [9]:
# Summarise the poster duplicates counted by the queryset built in the previous cell.
describe(queryset)

58 oeuvres partagent 29 champs


## How many works have the same AniDB ID?

In [10]:
# One row per AniDB ID (0 is excluded) with the number of works carrying
# it, most shared IDs first.
queryset = (
    Work.objects
    .exclude(anidb_aid=0)
    .values('anidb_aid')
    .annotate(nb=Count('anidb_aid'))
    .order_by('-nb')
)
queryset[:5]

<WorkQuerySet [{'anidb_aid': 4932, 'nb': 6}, {'anidb_aid': 8778, 'nb': 3}, {'anidb_aid': 4897, 'nb': 2}, {'anidb_aid': 9977, 'nb': 2}, {'anidb_aid': 6784, 'nb': 2}]>

In [11]:
# Summarise the AniDB ID duplicates counted by the queryset built in the previous cell.
describe(queryset)

35 oeuvres partagent 15 champs


In [12]:
# Look at the works behind the most shared AniDB ID (6 works in the listing above).
Work.objects.filter(anidb_aid=4932)  # Not a duplicate after all

<WorkQuerySet [<Work: Kara no Kyoukai 1: Fukan Fuukei>, <Work: Kara no Kyoukai 3: Tsuukaku Zanryuu>, <Work: Kara no Kyoukai 5: Mujun Rasen>, <Work: Kara no Kyoukai 2: Satsujin Kousatsu (Part 1)>, <Work: Kara no Kyoukai 4: Garan no Dou>, <Work: Kara no Kyoukai 7: Satsujin Kousatsu (Part 2)>]>

In [13]:
# Same check for the second most shared AniDB ID (3 works in the listing above).
Work.objects.filter(anidb_aid=8778)  # Not a duplicate either

<WorkQuerySet [<Work: Puella Magi Madoka Magica the Movie Part 3: Rebellion>, <Work: Puella Magi Madoka Magica the Movie Part 1: Beginnings>, <Work: Puella Magi Madoka Magica the Movie Part 2: Eternal>]>

## How many anime have an AniDB ID?

In [14]:
# Count only anime: the section asks about anime, and the next cell divides
# this figure by the number of anime, so works from other categories must
# not leak into the numerator.
nb_anidb_works = (
    Work.objects
    .filter(category__slug='anime')
    .exclude(anidb_aid=0)
    .count()
)
nb_anidb_works

305

In [15]:
# Ratio of works with an AniDB ID to the number of anime.
nb_anime = Work.objects.filter(category__slug='anime').count()
nb_anidb_works / nb_anime

0.027767662053896577

## How many works have at least one Reference?

In [16]:
# Works annotated with their number of references, keeping those with at
# least one; the cell displays their share among all works.
works_with_reference = (
    Work.objects
    .annotate(nb=Count('reference'))
    .values('id')
    .filter(nb__gte=1)
)
works_with_reference.count() / Work.objects.count()

0.7519369834710744

In [17]:
# For each category, report how many of its works have at least one reference.
for category in Category.objects.all():
    print(category.slug)
    category_works = Work.objects.filter(category=category)
    # Count the category once and reuse the figure instead of querying twice.
    nb_works = category_works.count()
    nb = category_works.annotate(nb=Count('reference')).values('id').filter(nb__gte=1).count()
    # A category without any work would otherwise raise ZeroDivisionError.
    percentage = 100 * nb / nb_works if nb_works else 0.0
    print('{:d}/{:d} = {:.1f} %'.format(nb, nb_works, percentage))

anime
10969/10984 = 99.9 %
manga
677/4503 = 15.0 %
album
0/1 = 0.0 %
