## MAL API Setup

In [1]:
import requests

import os

# Base URL of the MyAnimeList API; endpoints are appended to it by get_data.
api_url = 'https://api.myanimelist.net/v2'

# A Client ID is needed (https://myanimelist.net/apiconfig).
# It is read from the MAL_CLIENT_ID environment variable when that is set, so
# the credential does not have to live in the notebook.
# NOTE(review): the literal fallback below is a credential committed to the
# source. It is kept only so existing runs keep working; consider revoking it
# and removing the fallback.
CLIENT_ID = (os.environ.get('MAL_CLIENT_ID') or "ec1c4b7aa5f59e854390b2259667340c").strip()

# Header sent with every request made through get_data.
headers = {'X-MAL-CLIENT-ID': CLIENT_ID}

def get_data(endpoint, params=None, timeout=30):
    """Send a GET request to the MyAnimeList API and return the decoded JSON.

    Args:
        endpoint: Path appended verbatim to ``api_url`` (e.g. ``'/anime/ranking'``).
        params: Optional dict of query-string parameters.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely, which would freeze a
            long scraping loop on a single stalled connection.

    Returns:
        The response body parsed from JSON.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If the server does not answer within ``timeout``.
    """
    url = api_url + endpoint
    response = requests.get(url, headers=headers, params=params, timeout=timeout)
    # Turn error statuses into exceptions so callers never parse an error page.
    response.raise_for_status()
    return response.json()



## Scrape Anime

In [2]:
import json

def scrape_ranking_page(database, ranking_type, page, fields, save_directory, length):
    """Download one ranking page (500 entries) and save its nodes as JSON.

    Args:
        database: API database segment of the endpoint (``/{database}/ranking``).
        ranking_type: Value sent as the ``ranking_type`` query parameter.
        page: Zero-based page number; the request offset is ``page * 500``.
        fields: Comma-separated field list sent as the ``fields`` parameter.
        save_directory: Directory that receives ``page<NN>.json``.
        length: Width the page number is zero-padded to in the file name.
    """
    params = {'ranking_type': ranking_type, 'limit': 500, 'offset': page*500, 'fields': fields}
    try:
        data = get_data(f'/{database}/ranking', params)
    except Exception:
        # Any request failure falls back to manga_crash, whatever the database.
        # ``Exception`` (rather than a bare ``except``) keeps Ctrl-C and
        # SystemExit able to stop a long scrape.
        data = manga_crash(f'/{database}/ranking', params)

    # Each ranking entry wraps the actual record in a 'node' key; keep only that.
    useful = [anime['node'] for anime in data['data']]
    with open(save_directory + f'/page{str(page).zfill(length)}.json', 'w') as f:
        json.dump(useful, f, indent=4)

In [3]:
import datetime
import tqdm
import time
import os

def scrape_ranking(database='anime', ranking_type='favorite'):
    """Scrape a whole ranking page by page and merge it into one JSON file.

    Pages are written to ``data/raw/tmp_{database}_mal`` and then merged by
    ``merge_anime`` into ``data/raw/{database}_mal.json``.

    Args:
        database: Key of ``keys`` and API database segment.
        ranking_type: Ranking to walk through.

    Raises:
        RuntimeError: If ``get_last_page`` reports a negative last page.
            Continuing would scrape nothing and overwrite the previous output
            with an empty list.
    """
    import shutil

    base_directory = 'data/raw'
    save_file_path = base_directory + f'/{database}_mal.json'
    tmp_directory = base_directory + f'/tmp_{database}_mal'

    fields = ','.join(keys[database])

    # Resolve the page count BEFORE touching the filesystem, so a failed check
    # does not leave an empty temporary directory behind.
    last_page = get_last_page(database, ranking_type)
    if last_page < 0:
        raise RuntimeError(
            f"Could not determine the last page for {database} ranking_type: {ranking_type}; "
            "nothing was scraped and the existing output was left untouched"
        )
    length = len(str(last_page))

    # A directory left over from an interrupted run would make os.makedirs
    # fail and could leak stale pages into the merge; start from a clean one.
    if os.path.isdir(tmp_directory):
        print('Removing stale temporary directory:', tmp_directory)
        shutil.rmtree(tmp_directory)
    os.makedirs(tmp_directory)

    start = datetime.datetime.now().strftime("%Y-%m-%d %H.%M.%S")
    print('Scraped at:', start)
    for page in tqdm.trange(last_page+1):
        scrape_ranking_page(database, ranking_type, page, fields, tmp_directory, length)
        # Pause between requests to stay gentle on the API.
        time.sleep(1)

    merge_anime(tmp_directory, save_file_path)

In [73]:
import math

def get_last_page(database, ranking_type):
    """Return the zero-based index of the last 500-entry ranking page.

    The page count comes from hard-coded entry counts and is then checked
    against the API: the last page must contain data and have no ``next`` link.

    NOTE: this file defines ``get_last_page`` twice; whichever definition is
    executed last is the one in effect.

    Args:
        database: ``'anime'`` or ``'manga'``.
        ranking_type: Ranking name; only the combinations listed below are known.

    Returns:
        The last page index, or ``-1`` when the verification request fails or
        the hard-coded count no longer matches the API.

    Raises:
        ValueError: If no entry count is known for the given combination.
    """
    known_entries = {
        ('anime', 'favorite'): 24_162,
        ('manga', 'bypopularity'): 59_000,
        ('manga', 'favorite'): 67_000,
    }
    number_entries = known_entries.get((database, ranking_type))
    if number_entries is None:
        # Previously this surfaced as an UnboundLocalError further down.
        raise ValueError(
            f"No known number of entries for {database} ranking_type: {ranking_type}"
        )

    last_page = math.ceil(number_entries / 500) - 1

    try:
        params = {'ranking_type': ranking_type, 'limit': 500, 'offset': last_page*500}
        data = get_data(f'/{database}/ranking', params)

        if 'next' in data['paging']:
            print(f"Warning: There are more pages available for {database} ranking_type: {ranking_type}, offset: {last_page*500}")

        # Explicit raises instead of ``assert`` so the checks survive ``python -O``.
        if len(data['data']) == 0:
            raise ValueError(f"No data found for {database} ranking_type: {ranking_type}, offset: {last_page*500}")
        if 'next' in data['paging']:
            raise ValueError(f"Unexpected next page for {database} ranking_type: {ranking_type}, offset: {last_page*500}")

    except Exception as e:
        # Any failure (network, HTTP status, stale count) is reported and
        # signalled to the caller with the -1 sentinel.
        print(f"Error while getting data for {database} ranking_type: {ranking_type}, offset: {last_page*500}")
        print(f"Error message: {str(e)}")
        last_page = -1

    return last_page

In [61]:
import math

def get_last_page(database, ranking_type):
    """Return the zero-based index of the last 500-entry ranking page.

    The page count comes from hard-coded entry counts and is then verified
    with one request: the last page must contain data and have no ``next`` link.

    NOTE: this file defines ``get_last_page`` twice; whichever definition is
    executed last is the one in effect.

    Args:
        database: ``'anime'`` or ``'manga'``.
        ranking_type: Ranking name; only the combinations listed below are known.

    Returns:
        The last page index.

    Raises:
        ValueError: If no entry count is known for the given combination.
        AssertionError: If the hard-coded count no longer matches the API.
    """
    known_entries = {
        ('anime', 'favorite'): 24_162,
        ('manga', 'bypopularity'): 59_950,
        ('manga', 'favorite'): 67_338,
    }
    number_entries = known_entries.get((database, ranking_type))
    if number_entries is None:
        # Previously this surfaced as an UnboundLocalError further down.
        raise ValueError(
            f"No known number of entries for {database} ranking_type: {ranking_type}"
        )

    last_page = math.ceil(number_entries / 500) - 1

    # Test that it's still correct
    params = {'ranking_type': ranking_type, 'limit': 500, 'offset': last_page*500}
    data = get_data(f'/{database}/ranking', params)

    # Raised explicitly (same exception type as before) so the checks are not
    # stripped by ``python -O`` and the failure says what went wrong.
    if len(data['data']) == 0:
        raise AssertionError(
            f"No data found for {database} ranking_type: {ranking_type}, offset: {last_page*500}"
        )
    if 'next' in data['paging']:
        raise AssertionError(
            f"Unexpected next page for {database} ranking_type: {ranking_type}, offset: {last_page*500}"
        )

    return last_page

In [62]:
# Fields requested for both databases.
common_keys = [
    'id',
    'title',
    'main_picture',
    'alternative_titles',
    'start_date',
    'end_date',
    'synopsis',
    'mean',
    'rank',
    'popularity',
    'num_list_users',
    'num_scoring_users',
    'num_favorites',
    'nsfw',
    'genres',
    'created_at',
    'updated_at',
    'media_type',
    'status',
]

# Anime-only fields, appended after the shared ones.
anime_keys = common_keys + [
    'num_episodes',
    'start_season',
    'broadcast',
    'source',
    'average_episode_duration',
    'rating',
    'studios',
]

# Manga-only fields; the braces select sub-fields of each author.
manga_keys = common_keys + [
    'num_volumes',
    'num_chapters',
    'authors{id,first_name,last_name}',
]

# Lookup used by scrape_ranking to build the `fields` query parameter.
keys = dict(anime=anime_keys, manga=manga_keys)

## Merge Files

In [63]:
import shutil

def merge_anime(tmp_directory, save_file_path):
    """Concatenate every page file of ``tmp_directory`` into one JSON file.

    Args:
        tmp_directory: Directory holding the per-page JSON lists; it is
            deleted once the merged file has been written.
        save_file_path: Destination of the merged JSON list (overwritten).
    """
    data = []
    # os.listdir returns names in arbitrary order; the page files are
    # zero-padded, so sorting the names restores ranking order.
    for file_name in sorted(os.listdir(tmp_directory)):
        file_path = os.path.join(tmp_directory, file_name)
        with open(file_path, 'r') as f:
            data.extend(json.load(f))

    # Mode 'w' truncates an existing file, so no prior removal is needed.
    with open(save_file_path, 'w') as f:
        json.dump(data, f, indent=4)

    if os.path.exists(tmp_directory):
        shutil.rmtree(tmp_directory)

In [64]:
# Ids of entries known to break a request that asks for 'alternative_titles'.
known_fails = [116770, 144472, 115838, 143751, 146583, 148716]

def manga_crash(endpoint, params):
    """Recover a ranking page whose full request failed.

    The page is requested again without ``alternative_titles``. If it contains
    ids from ``known_fails``, the alternative titles are fetched separately
    for the ranges between those entries and merged back, so only the failing
    entries end up without that field.

    Args:
        endpoint: Endpoint path passed to ``get_data``.
        params: Query parameters of the failed request; needs ``offset``,
            ``limit`` and ``fields``. The dict is not modified.

    Returns:
        The page payload in the same shape ``get_data`` returns.
    """
    page = params["offset"]//params["limit"]
    print(f'Crashed at page {page}')

    # Work on a copy so the caller's dict keeps its original values.
    query = dict(params)
    query['fields'] = query['fields'].replace('alternative_titles,', '')
    data = get_data(endpoint, query)

    ids = [manga['node']['id'] for manga in data['data']]

    # First position of every id on the page (same result as list.index,
    # without rescanning the list on each lookup).
    position = {}
    for index, entry_id in enumerate(ids):
        position.setdefault(entry_id, index)

    failing = set(known_fails)
    present_fails = [entry_id for entry_id in ids if entry_id in failing]

    if not present_fails:
        print('Fail unknown...')
        return data

    print('Fails:', present_fails)

    offset = params['offset']
    limit = params['limit']
    # Absolute offsets that must be skipped, in page order and framed by the
    # positions just before and just after the page. Sorting matters: the
    # order of known_fails is unrelated to where the entries sit on the page.
    boundaries = (
        [offset - 1]
        + sorted({offset + position[entry_id] for entry_id in present_fails})
        + [offset + limit]
    )

    alternative_titles = []
    query['fields'] = 'alternative_titles'
    for before, after in zip(boundaries, boundaries[1:]):
        span = after - before - 1
        if span <= 0:
            # Adjacent failing entries (or one at the page edge) leave
            # nothing to request in between.
            continue
        query['offset'] = before + 1
        query['limit'] = span
        data_short = get_data(endpoint, query)
        alternative_titles.extend((manga['node']['id'], manga['node']['alternative_titles']) for manga in data_short['data'])
        time.sleep(1)

    for entry_id, alt_tit in alternative_titles:
        # The ranking may shift between requests; ignore ids that are not on
        # the page fetched above.
        if entry_id in position:
            data['data'][position[entry_id]]['node']['alternative_titles'] = alt_tit

    return data

In [74]:
# Scrape the anime 'favorite' ranking; scrape_ranking writes it to data/raw/anime_mal.json.
scrape_ranking('anime', 'favorite')


Scraped at: 2023-04-23 20.25.54


 43%|████▎     | 21/49 [00:55<01:13,  2.63s/it]

Crashed at page 21
Fail unknown...


 47%|████▋     | 23/49 [01:01<01:12,  2.81s/it]

In [66]:
# Scrape the manga 'bypopularity' ranking; scrape_ranking writes it to data/raw/manga_mal.json.
# NOTE: the recorded run of this cell stopped with an AssertionError.
scrape_ranking('manga', 'bypopularity')

AssertionError: 

In [68]:
# Scrape the manga 'favorite' ranking. The output path depends only on the database, so this
# writes the same data/raw/manga_mal.json as the 'bypopularity' call: whichever runs last wins.
# NOTE: the recorded run of this cell stopped with an AssertionError.
scrape_ranking('manga', 'favorite')

AssertionError: 

## Anime Cleaning

In [11]:

import pandas as pd
import numpy as np

# Load the merged scrape written by scrape_ranking('anime', ...).
anime = pd.read_json('data/raw/anime_mal.json')

# Usually there are no duplicates, but they can happen (they even appear on the website).
# Keep the first row per id and report how many rows were dropped.
old_size = anime.shape[0]
anime = anime.drop_duplicates(subset=['id']).reset_index(drop=True)
number_duplicates = old_size - anime.shape[0]
if number_duplicates:
    print('Duplicates:', number_duplicates)

# Shorter and better names, like the website
anime.rename(columns={'id': 'anime_id', 'media_type': 'type', 'mean': 'score', 'num_list_users': 'members', 'num_scoring_users': 'scored_by', \
    'num_favorites': 'favorites', 'average_episode_duration': 'episode_duration', 'num_episodes': 'episodes'}, inplace=True)

# Treat the 'unknown' type string as a missing value
anime['type'] = anime['type'].replace('unknown', np.nan)

# Avoid false zeroes (0 episodes means "not known") and unnecessary floats (nullable Int64)
anime['episodes'] = anime['episodes'].replace(0, np.nan).astype('Int64')

# Keep the original date strings as well: parsing a partial date adds a false
# day 1 or a false month January (i.e. 2005 -> 2005-1-1)
anime['real_start_date'] = anime['start_date']
anime['real_end_date'] = anime['end_date']

# Use Timestamps
anime['start_date'] = pd.to_datetime(anime['start_date'])
anime['end_date'] = pd.to_datetime(anime['end_date'])

# Use Timedelta: the duration is read as seconds, with 0 treated as missing.
anime['episode_duration'] = pd.to_timedelta(anime['episode_duration'].replace(0, np.nan), unit='s')
# Total running time = per-episode duration * episode count (NaN when the count is missing)
anime['total_duration'] = anime.apply(lambda x: x['episode_duration'] * x['episodes'] if not pd.isna(x['episodes']) else np.nan, axis=1)

# Use popularity=0 to detect 'pending approval' animes
anime['approved'] = anime['popularity'] != 0

# Drop rank and popularity, as they sort equal score / members alphabetically...
anime.drop(columns=['rank', 'popularity'], inplace=True)

# MyAnimeList edit timestamps
anime['created_at'] = pd.to_datetime(anime['created_at'])
anime['updated_at'] = pd.to_datetime(anime['updated_at'])

# Normalize start season: split the {'year', 'season'} record into two columns.
# start_year must be extracted first, because the next line overwrites start_season.
anime['start_year'] = anime['start_season'].str['year'].astype('Int64')
anime['start_season'] = anime['start_season'].str['season']

# Avoid empty synopsis: both the empty string and the old placeholder text become NaN
old_default_synopsis = 'No synopsis has been added for this series yet.\n\nClick here to update this information.'
anime['synopsis'] = anime['synopsis'].replace('', np.nan).replace(old_default_synopsis, np.nan)

# Simplify main picture: keep only the 'large' URL and strip the 'api-' substring from it
anime['main_picture'] = anime['main_picture'].str['large'].str.replace('api-', '')

# Normalize broadcast: split the record into a weekday column and a time-of-day column
anime['broadcast_day'] = anime['broadcast'].str['day_of_the_week']
anime['broadcast_time'] = pd.to_datetime(anime['broadcast'].str['start_time']).dt.time
anime.drop(columns=['broadcast'], inplace=True)

# Only keep names: each entry is a list of dicts with a 'name' key; missing values become []
anime['genres'] = anime['genres'].apply(lambda x: [dic['name'] for dic in x] if not x is np.nan else [])
anime['studios'] = anime['studios'].apply(lambda x: [dic['name'] for dic in x] if not x is np.nan else [])

# The API returns genres, themes and demographics together in one 'genres' list;
# these three sets are used below to tell them apart by name.
genres = {'Action', 'Adventure', 'Avant Garde', 'Award Winning', 'Boys Love',  'Comedy', 'Drama', 'Ecchi', 'Erotica', 'Fantasy',
'Girls Love', 'Gourmet', 'Hentai', 'Horror', 'Mystery', 'Romance', 'Sci-Fi', 'Slice of Life', 'Sports', 'Supernatural', 'Suspense'}

themes = {'Adult Cast', 'Anthropomorphic', 'CGDCT', 'Childcare', 'Combat Sports', 'Crossdressing', 'Delinquents', 'Detective', 'Educational',
'Gag Humor', 'Gore', 'Harem', 'High Stakes Game', 'Historical', 'Idols (Female)', 'Idols (Male)', 'Isekai', 'Iyashikei', 'Love Polygon',
'Magical Sex Shift', 'Mahou Shoujo', 'Martial Arts', 'Mecha', 'Medical', 'Military', 'Music', 'Mythology', 'Organized Crime', 'Otaku Culture',
'Parody', 'Performing Arts', 'Pets', 'Psychological', 'Racing', 'Reincarnation', 'Reverse Harem', 'Romantic Subtext', 'Samurai', 'School',
'Showbiz', 'Space', 'Strategy Game', 'Super Power', 'Survival', 'Team Sports', 'Time Travel', 'Vampire', 'Video Game', 'Visual Arts', 'Workplace'}

demographics = {'Josei', 'Kids', 'Seinen', 'Shoujo', 'Shounen'}

# Split genres, themes and demographics.
# 'genres' is overwritten last because the other two columns are derived from its original content.
# Names that belong to none of the three sets are dropped.
anime['themes'] = anime['genres'].apply(lambda x: [t for t in x if t in themes])
anime['demographics'] = anime['genres'].apply(lambda x: [t for t in x if t in demographics])
anime['genres'] = anime['genres'].apply(lambda x: [t for t in x if t in genres])

# Mark R18+ titles (not ranked): sfw is False when the genres include 'Hentai' or 'Erotica'
anime['sfw'] = anime['genres'].apply(lambda x: 'Hentai' not in x and 'Erotica' not in x)

# The API's nsfw field is much more restrictive, but on 2022-9-22 it was deprecated and it is not used anymore.
# It has a lot of false positives and is no longer updated, so the sfw definition above is simply better
# (nudity is already marked with the r+ rating). Only remember to mark it when requesting lists.
anime.drop(columns=['nsfw'], inplace=True)

# Alternative titles: split the record into English, Japanese and synonyms; empty strings become NaN
anime['title_english'] = anime['alternative_titles'].str['en'].replace('', np.nan)
anime['title_japanese'] = anime['alternative_titles'].str['ja'].replace('', np.nan)
anime['title_synonyms'] = anime['alternative_titles'].str['synonyms']
anime.drop(columns=['alternative_titles'], inplace=True)

# Avoid double spaces, which don't appear on the website
for col in ['title', 'title_english', 'title_japanese']:
    anime[col] = anime[col].str.replace('  ', ' ')
# NOTE(review): this iterates every synonyms value, so it assumes no row has a missing
# 'synonyms' list -- the manga cell guards this with fillna(''); confirm for anime.
anime['title_synonyms'] = anime['title_synonyms'].apply(lambda x: [t.replace('  ', ' ') for t in x])

# Better order: the columns kept in the final table, in output order
order = ['anime_id', 'title', 'type', 'score', 'scored_by', 'status', 'episodes', 'start_date', 'end_date', 'source',
        'members', 'favorites', 'episode_duration', 'total_duration', 'rating', 'sfw', 'approved', 'created_at', 'updated_at',
        'start_year', 'start_season', 'real_start_date', 'real_end_date', 'broadcast_day', 'broadcast_time',
        'genres', 'themes', 'demographics', 'studios', 'synopsis', 'main_picture', 'title_english', 'title_japanese', 'title_synonyms']

# Informational only (not referenced by the code in this cell): API fields that were dropped above
deleted = ['rank', 'popularity', 'nsfw']

# Informational only (not referenced by the code in this cell): fields that are not part of this table
missing = ['producers', 'licensors', 'background', 'url', 'trailer_url']

# Select and reorder the columns; anything not listed in `order` is discarded
anime = anime[order]

# Sort by Top Anime: the helper key adds the descending rank of the score and the descending rank
# of the number of scorers (lower is better); ties are broken by members, favorites and then id.
anime['tmp'] = anime['score'].rank(ascending=False) + anime['scored_by'].rank(ascending=False)
anime = anime.sort_values(['tmp', 'members', 'favorites', 'anime_id'], \
    ascending=[True, False, False, True]).reset_index(drop=True)
anime.drop(columns=['tmp'], inplace=True)

# Save to csv (without the positional index)
anime.to_csv('data/anime_mal.csv', index=False)

print(anime.shape)

# Show every column when displaying the preview row
pd.options.display.max_columns = None
anime.head(1)

  base = data.astype(np.int64)
  data = (base * m + (frac * m).astype(np.int64)).view("timedelta64[ns]")


(24447, 34)


Unnamed: 0,anime_id,title,type,score,scored_by,status,episodes,start_date,end_date,source,members,favorites,episode_duration,total_duration,rating,sfw,approved,created_at,updated_at,start_year,start_season,real_start_date,real_end_date,broadcast_day,broadcast_time,genres,themes,demographics,studios,synopsis,main_picture,title_english,title_japanese,title_synonyms
0,5114,Fullmetal Alchemist: Brotherhood,tv,9.1,1997132,finished_airing,64,2009-04-05,2010-07-04,manga,3140335,215733,0 days 00:24:20,1 days 01:57:20,r,True,True,2008-08-21 03:35:22+00:00,2023-04-02 18:07:03+00:00,2009,spring,2009-04-05,2010-07-04,sunday,17:00:00,"[Action, Adventure, Drama, Fantasy]",[Military],[Shounen],[Bones],After a horrific alchemy experiment goes wrong...,https://cdn.myanimelist.net/images/anime/1208/...,Fullmetal Alchemist: Brotherhood,鋼の錬金術師 FULLMETAL ALCHEMIST,[Hagane no Renkinjutsushi: Fullmetal Alchemist...


### Load Anime

In [14]:
import pandas as pd
import ast

# Reload the cleaned anime table and restore the dtypes that a CSV
# round-trip flattens into strings / floats.
anime = pd.read_csv('data/anime_mal.csv')

# Timestamps
for col in ['start_date', 'end_date', 'created_at', 'updated_at']:
    anime[col] = pd.to_datetime(anime[col])

# Nullable integers (plain int columns cannot hold missing values)
for col in ['episodes', 'start_year']:
    anime[col] = anime[col].astype('Int64')

# List columns are stored as their Python repr; parse them back into lists
for col in ['genres', 'themes', 'demographics', 'studios', 'title_synonyms']:
    anime[col] = anime[col].apply(ast.literal_eval)

anime['broadcast_time'] = pd.to_datetime(anime['broadcast_time']).dt.time

# Durations: the cleaning cell writes both columns as timedeltas, so both are
# restored here (total_duration used to be left as a plain string).
for col in ['episode_duration', 'total_duration']:
    anime[col] = pd.to_timedelta(anime[col])

# Show every column when displaying the sample
pd.options.display.max_columns = None
anime.sample(30)


Unnamed: 0,anime_id,title,type,score,scored_by,status,episodes,start_date,end_date,source,members,favorites,episode_duration,total_duration,rating,sfw,approved,created_at,updated_at,start_year,start_season,real_start_date,real_end_date,broadcast_day,broadcast_time,genres,themes,demographics,studios,synopsis,main_picture,title_english,title_japanese,title_synonyms
16427,26135,Afghanistan Paghman-mura no Monogatari: Sekaii...,ova,,61,finished_airing,1,2002-01-01,2002-01-01,,461,1,0 days 00:16:00,0 days 00:16:00,g,True,True,2014-08-24 08:15:23+00:00,2022-04-02 02:38:31+00:00,,,2002,2002,,NaT,[],[Historical],[Kids],[Toei Animation],"An educational film about Paghman, Afghanistan...",https://cdn.myanimelist.net/images/anime/4/660...,,アフガニスタン・パグマン村の物語 せかいいち　うつくしい　ぼくの村,"[Story of Afghanistan Paghman Village, Afghani..."
322,32188,Steins;Gate: Kyoukaimenjou no Missing Link - D...,special,8.25,136800,finished_airing,1,2015-12-03,2015-12-03,visual_novel,245753,391,0 days 00:24:00,0 days 00:24:00,pg_13,True,True,2015-12-02 02:33:02+00:00,2023-01-18 03:56:17+00:00,2015.0,fall,2015-12-03,2015-12-03,,NaT,"[Sci-Fi, Suspense]",[Time Travel],[],[White Fox],"Having reached his emotional breaking point, R...",https://cdn.myanimelist.net/images/anime/7/773...,Steins;Gate: Open the Missing Link - Divide By...,シュタインズ・ゲート境界面上のミッシングリンク-Divide By Zero-,"[Steins Gate: Episode 23 (β), Open the Missing..."
6481,28835,Tesagure! Bukatsumono Spin-off Purupurun Sharu...,tv,6.89,1338,finished_airing,12,2015-04-05,2015-06-28,original,4097,15,0 days 00:23:00,0 days 04:36:00,pg_13,True,True,2014-12-13 22:12:38+00:00,2021-12-17 23:16:05+00:00,2015.0,spring,2015-04-05,2015-06-28,sunday,02:20:00,[Slice of Life],"[Parody, School]",[],[Yaoyorozu],A spin-off of the Tesagure! Bukatsumono series.,https://cdn.myanimelist.net/images/anime/9/722...,,てさぐれ！部活もの すぴんおふ プルプルんシャルムと遊ぼう,[Tesagure! Bukatsu-mono Spin-off Purupurun Sha...
17799,39906,Megido 72: Nagaki Sen Tabi no Katawara de,ona,,40,finished_airing,5,2019-07-27,2019-08-23,game,265,0,0 days 00:04:03,0 days 00:20:15,pg_13,True,True,2019-06-03 11:16:19+00:00,2020-03-12 21:04:12+00:00,2019.0,summer,2019-07-27,2019-08-23,,NaT,"[Action, Adventure, Fantasy]",[],[],[V-sign],,https://cdn.myanimelist.net/images/anime/1519/...,,メギド７２ 長き戦旅の傍らで,[]
2014,50203,Love Live! Superstar!! 2nd Season,tv,7.74,11449,finished_airing,12,2022-07-17,2022-10-09,other,28758,319,0 days 00:23:45,0 days 04:45:00,pg_13,True,True,2021-10-24 07:14:04+00:00,2022-09-15 16:31:58+00:00,2022.0,summer,2022-07-17,2022-10-09,sunday,19:00:00,[Slice of Life],"[Idols (Female), Music, School]",[],[Sunrise],As another year begins at Yuigaoka Girls' High...,https://cdn.myanimelist.net/images/anime/1238/...,,ラブライブ！スーパースター!!,[]
11888,41033,Umiyuri Kaiteitan,music,6.12,192,finished_airing,1,2017-04-21,2017-04-21,original,364,1,0 days 00:03:58,0 days 00:03:58,pg_13,True,True,2020-01-10 09:32:01+00:00,2022-03-17 06:14:12+00:00,2017.0,spring,2017-04-21,2017-04-21,,NaT,[],[Music],[],[],,https://cdn.myanimelist.net/images/anime/1105/...,Tale of the Deep-sea Lily,ウミユリ海底譚,"[Sea Lily Deep Sea Tale, Deep Sea Lily Tale]"
8079,4443,Duel Masters Charge,tv,6.16,2520,finished_airing,52,2004-04-19,2006-03-27,original,5214,3,0 days 00:21:00,0 days 18:12:00,pg_13,True,True,2008-05-01 13:35:11+00:00,2021-05-16 22:28:34+00:00,2004.0,spring,2004-04-19,2006-03-27,,NaT,"[Action, Adventure, Comedy]",[],[Kids],[Studio Hibari],Shobu has returned after his 3 years training ...,https://cdn.myanimelist.net/images/anime/1914/...,,デュエル・マスターズ　チャージ,[]
23084,46039,Leyuan Shuang Bao,tv,,4,finished_airing,26,2005-01-01,NaT,,42,0,0 days 00:12:20,0 days 05:20:40,pg,True,True,2021-01-13 08:35:42+00:00,2022-09-05 22:03:14+00:00,,,2005,,,NaT,[Fantasy],[Anthropomorphic],[Kids],[],,https://cdn.myanimelist.net/images/anime/1251/...,,乐园双宝,[Skipper & Skeeto]
4634,17821,Stella Jogakuin Koutou-ka C³-bu,tv,6.53,21172,finished_airing,13,2013-07-05,2013-09-27,manga,53307,75,0 days 00:24:00,0 days 05:12:00,pg_13,True,True,2013-03-14 03:58:09+00:00,2022-06-15 19:59:09+00:00,2013.0,summer,2013-07-05,2013-09-27,,NaT,[Sports],"[Military, School]",[],[Gainax],"Command, control, and communication: the three...",https://cdn.myanimelist.net/images/anime/13/52...,"Stella Women's Academy, High School Division C...",ステラ女学院高等科C³部（しーきゅーぶ）,"[Stella Jogakuin Koutouka C3-bu, Stella Jogaku..."
3985,21033,Seikoku no Dragonar,tv,6.43,105088,finished_airing,12,2014-04-05,2014-06-21,light_novel,214315,277,0 days 00:24:00,0 days 04:48:00,r+,True,True,2013-10-26 20:42:59+00:00,2022-04-16 15:31:05+00:00,2014.0,spring,2014-04-05,2014-06-21,,NaT,"[Comedy, Ecchi, Fantasy]",[School],[],[C-Station],Learning to ride and tame dragons comes easy t...,https://cdn.myanimelist.net/images/anime/13/56...,Dragonar Academy,星刻の竜騎士,[Seikoku no Ryuukishi]


In [15]:
anime.shape

(24447, 34)

## Manga Cleaning

In [None]:
import pandas as pd
import numpy as np
import datetime

# Load the merged scrape written by scrape_ranking('manga', ...).
manga = pd.read_json('data/raw/manga_mal.json')

# Usually there are no duplicates, but they can happen (they even appear on the website)
# NOTE (original author): but here they are real losses!
old_size = manga.shape[0]
manga = manga.drop_duplicates(subset=['id']).reset_index(drop=True)
number_duplicates = old_size - manga.shape[0]
if number_duplicates:
    print('Duplicates:', number_duplicates)

# Shorter and better names, like Jikan API
manga.rename(columns={'id': 'manga_id', 'media_type': 'type', 'mean': 'score', 'num_list_users': 'members', 'num_scoring_users': 'scored_by', \
    'num_favorites': 'favorites', 'num_volumes': 'volumes', 'num_chapters': 'chapters'}, inplace=True)

# Avoid false zeroes (0 means "not known") and unnecessary floats (nullable Int64)
manga['volumes'] = manga['volumes'].replace(0, np.nan).astype('Int64')
manga['chapters'] = manga['chapters'].replace(0, np.nan).astype('Int64')

# Keep the original date strings as well: parsing a partial date adds a false
# day 1 or a false month January (i.e. 2005 -> 2005-1-1)
manga['real_start_date'] = manga['start_date']
manga['real_end_date'] = manga['end_date']

# Use Timestamps
manga['start_date'] = pd.to_datetime(manga['start_date'])
manga['end_date'] = pd.to_datetime(manga['end_date'])

# Use popularity=0 to detect 'pending approval' mangas
manga['approved'] = manga['popularity'] != 0

# Only keep names: each entry is a list of dicts with a 'name' key; missing values become []
manga['genres'] = manga['genres'].apply(lambda x: [dic['name'] for dic in x] if not x is np.nan else [])

# The API returns genres, themes and demographics together in one 'genres' list;
# these three sets are used below to tell them apart by name.
genres = {'Action', 'Adventure', 'Avant Garde', 'Award Winning', 'Boys Love',  'Comedy', 'Drama', 'Ecchi', 'Erotica', 'Fantasy',
'Girls Love', 'Gourmet', 'Hentai', 'Horror', 'Mystery', 'Romance', 'Sci-Fi', 'Slice of Life', 'Sports', 'Supernatural', 'Suspense'}

# Same theme list as the anime cell, plus two themes added for manga ('Memoir', 'Villainess')
themes = {'Adult Cast', 'Anthropomorphic', 'CGDCT', 'Childcare', 'Combat Sports', 'Crossdressing', 'Delinquents', 'Detective', 'Educational',
'Gag Humor', 'Gore', 'Harem', 'High Stakes Game', 'Historical', 'Idols (Female)', 'Idols (Male)', 'Isekai', 'Iyashikei', 'Love Polygon',
'Magical Sex Shift', 'Mahou Shoujo', 'Martial Arts', 'Mecha', 'Medical', 'Military', 'Music', 'Mythology', 'Organized Crime', 'Otaku Culture',
'Parody', 'Performing Arts', 'Pets', 'Psychological', 'Racing', 'Reincarnation', 'Reverse Harem', 'Romantic Subtext', 'Samurai', 'School',
'Showbiz', 'Space', 'Strategy Game', 'Super Power', 'Survival', 'Team Sports', 'Time Travel', 'Vampire', 'Video Game', 'Visual Arts',
'Workplace'} | {'Memoir', 'Villainess'}

demographics = {'Josei', 'Kids', 'Seinen', 'Shoujo', 'Shounen'}

# Split genres, themes and demographics.
# 'genres' is overwritten last because the other two columns are derived from its original content.
# Names that belong to none of the three sets are dropped.
manga['themes'] = manga['genres'].apply(lambda x: [t for t in x if t in themes])
manga['demographics'] = manga['genres'].apply(lambda x: [t for t in x if t in demographics])
manga['genres'] = manga['genres'].apply(lambda x: [t for t in x if t in genres])
# Authors
def author_format(authors):
    """Flatten raw author entries into plain dicts.

    Each raw entry is expected to carry a 'node' dict (id, first_name,
    last_name) and a 'role'. A missing value (np.nan) yields an empty list.
    """
    if authors is np.nan:
        return []
    return [
        {
            'id': entry['node']['id'],
            'first_name': entry['node']['first_name'],
            'last_name': entry['node']['last_name'],
            'role': entry['role'],
        }
        for entry in authors
    ]
# Replace the raw author records with the flattened form produced by author_format
manga['authors']  = manga['authors'].apply(author_format)

# Mark R18+ titles (not ranked): sfw is False when the genres include 'Hentai' or 'Erotica'
manga['sfw'] = manga['genres'].apply(lambda x: 'Hentai' not in x and 'Erotica' not in x)

# Similar to the anime version: a lot of entries are wrongly labeled
manga.drop(columns=['nsfw'], inplace=True)

# MyAnimeList edit timestamps; the Unix epoch is treated as "missing" and mapped to NaT
for col in ['created_at', 'updated_at']:
    manga[col] = pd.to_datetime(manga[col])
    manga.loc[manga[col]=='1970-01-01 00:00:00+0000', col] = pd.NaT

# created_at does not appear to be populated for manga: after the epoch replacement above every
# value is expected to be missing. The assert guards that assumption before the column is dropped.
assert all(manga['created_at'].isna())
manga.drop(columns=['created_at'], inplace=True)

# Make it manually: derive a substitute for created_at from updated_at.
# Rows with a known updated_at, ordered from the earliest edit to the latest.
m = manga[manga['updated_at'].notna()].sort_values('updated_at')[['manga_id', 'updated_at']]
# Walk them in edit order and keep a row only when its manga_id exceeds the last kept one,
# so `data` has strictly increasing ids (and non-decreasing updated_at).
data = [m.iloc[0]]
for _, row in m.iterrows():
    if row['manga_id'] > data[-1]['manga_id']:
        data.append(row)
# Sentinel with the largest int64 id and the current UTC time, so every id finds an entry below
data.append({'manga_id': 2**63-1, 'updated_at': datetime.datetime.utcnow()})

# Assign to every manga the updated_at of the first kept row whose id is >= its own id.
# The frame is sorted by id in place so that `created_at` lines up with it row by row.
# NOTE(review): presumably ids grow with creation time, which would make this value an upper
# bound for the real creation date (hence the column name) -- confirm.
created_at = []
manga.sort_values('manga_id', inplace=True)
pos = 0
for id in manga.manga_id:
    if id > data[pos]['manga_id']:
        pos += 1
    created_at.append(data[pos]['updated_at'])

manga['created_at_before'] = pd.to_datetime(created_at, utc=True)

# Avoid empty strings: blank and 'N/A'-style synopses become NaN
manga.loc[manga['synopsis'].isin(['', ' ', 'N/A', 'n/a']), 'synopsis'] = np.nan

# Simplify main picture: keep only the 'large' URL and strip the 'api-' substring from it
manga['main_picture'] = manga['main_picture'].str['large'].str.replace('api-', '')

# Normalize alternative titles: split the record into English, Japanese and synonyms.
# Empty strings become NaN; a missing synonyms value becomes an empty list.
manga['title_english'] = manga['alternative_titles'].str['en'].replace('', np.nan)
manga['title_japanese'] = manga['alternative_titles'].str['ja'].replace('', np.nan)
manga['title_synonyms'] = manga['alternative_titles'].str['synonyms'].fillna('').apply(list)
manga.drop(columns=['alternative_titles'], inplace=True)

# Clean some string errors: surrounding whitespace and double spaces
for col in ['title', 'title_english', 'title_japanese']:
    manga[col] = manga[col].str.strip().str.replace('  ', ' ')
manga['title_synonyms'] = manga['title_synonyms'].apply(lambda x: [t.replace('  ', ' ') for t in x])

# Better order: the columns kept in the final table, in output order
order = ['manga_id', 'title', 'type', 'score', 'scored_by', 'status', 'volumes', 'chapters', 'start_date', 'end_date',
         'members', 'favorites', 'sfw', 'approved', 'created_at_before', 'updated_at', 'real_start_date', 'real_end_date',
         'genres', 'themes', 'demographics', 'authors', 'synopsis', 'main_picture', 'title_english', 'title_japanese', 'title_synonyms']

# Informational only (not referenced by the code in this cell): API fields left out of the final table
deleted = ['rank', 'popularity', 'nsfw']

# Informational only (not referenced by the code in this cell): fields that are not part of this table
missing = ['background', 'serializations', 'url']

# Select and reorder the columns; anything not listed in `order` is discarded
manga = manga[order]

# Sort by Top Manga: the helper key adds the descending rank of the score and the descending rank
# of the number of scorers (lower is better); ties are broken by members, favorites and then id.
manga['tmp'] = manga['score'].rank(ascending=False) + manga['scored_by'].rank(ascending=False)
manga = manga.sort_values(['tmp', 'members', 'favorites', 'manga_id'], \
    ascending=[True, False, False, True]).reset_index(drop=True)
manga.drop(columns=['tmp'], inplace=True)

# Save to csv (without the positional index)
manga.to_csv('data/manga_mal.csv', index=False)

print(manga.shape)

# Show every column when displaying the preview row
pd.options.display.max_columns = None
manga.head(1)

### Load Manga

In [None]:
import pandas as pd
import ast

# Reload the cleaned manga table and restore the dtypes that a CSV
# round-trip flattens into strings / floats.
manga = pd.read_csv('data/manga_mal.csv')

# One restore rule per column family: (columns, conversion applied to each).
restore_rules = (
    # Timestamps
    (('start_date', 'end_date', 'created_at_before', 'updated_at'),
     lambda series: pd.to_datetime(series)),
    # Nullable integers
    (('volumes', 'chapters'),
     lambda series: series.astype('Int64')),
    # Lists stored as their Python repr
    (('genres', 'themes', 'demographics', 'authors', 'title_synonyms'),
     lambda series: series.apply(ast.literal_eval)),
)

for column_group, restore in restore_rules:
    for col in column_group:
        manga[col] = restore(manga[col])

# Show every column when displaying the preview row
pd.options.display.max_columns = None
manga.head(1)

In [None]:
# Authors
def author_format(authors):
    """Render raw author entries as display strings.

    Args:
        authors: List of raw author entries, each a dict with a 'node' dict
            (first_name, last_name) and a 'role'; or a missing value
            (NaN / None).

    Returns:
        A list of strings: ``"Last, First (role)"``, or ``"Last (role)"``
        when the first name is empty. A missing value yields ``[]``.
    """
    # Missing values show up as a float NaN (or None); checking the type
    # avoids relying on the value being the very same object as np.nan.
    if authors is None or isinstance(authors, float):
        return []
    output = []
    for author in authors:
        node = author['node']
        if node['first_name']:
            output.append(f"{node['last_name']}, {node['first_name']} ({author['role']})")
        else:
            # The closing parenthesis was missing in this branch before.
            output.append(f"{node['last_name']} ({author['role']})")
    return output
# NOTE(review): the author_format defined in this cell reads author['node'], i.e. the raw API shape,
# while the author_format in the Manga Cleaning cell stores flat dicts (id/first_name/last_name/role).
# Confirm which shape manga['authors'] holds when this line runs.
manga['authors']  = manga['authors'].apply(author_format)