## MAL API Setup

In [1]:
import requests

api_url = 'https://api.myanimelist.net/v2'

# A Client ID is needed (https://myanimelist.net/apiconfig)
#with open('client_id.txt', 'r') as f:
#    CLIENT_ID = f.read()

headers = {'X-MAL-CLIENT-ID': "(Put your client ID here)"}

def get_data(endpoint, params=None):
    url = api_url + endpoint
    if params:
        url += '?' + '&'.join(f'{key}={value}' for key, value in params.items())
    response = requests.get(url, headers=headers)
    response.raise_for_status()
    return response.json()


## Scrap Anime

In [22]:
import json
import os

scraping_save_pages = '../My Anime List Scrapping/data/data_tmp/anime_pages'

if not os.path.exists(scraping_save_pages): # Create saving directory if it doesn't exist
  os.makedirs(scraping_save_pages)

endpoint = f'/anime/ranking'
limit = 500

anime_keys = ['id', 'title', 'main_picture', 'alternative_titles', 'start_date', 'end_date', 'synopsis', 'mean', 'rank', 'popularity',
              'num_list_users', 'num_scoring_users', 'num_favorites', 'nsfw', 'genres', 'created_at', 'updated_at', 'media_type', 'status',
              'num_episodes', 'start_season', 'broadcast', 'source', 'average_episode_duration', 'rating','studios','recommendation''node']

def scrape_page(page):
    params = {'ranking_type': 'bypopularity', 'limit': limit, 'offset': page*limit, 'fields': ','.join(anime_keys)}
    data = get_data(endpoint, params)
    useful = [anime['node'] for anime in data['data']]
    with open(scraping_save_pages + f'/page{str(page).zfill(2)}.json', 'w') as f:
        json.dump(useful, f, indent=4)

In [23]:
import math

# 12 July 2022
previous_total_anime = 20_741
previous_last_page = math.ceil(previous_total_anime / limit) - 1

data = get_data(endpoint, {'ranking_type': 'bypopularity', 'limit': limit, 'offset': previous_last_page*limit, 'fields': ','.join(anime_keys)})
#data = get_data(endpoint,{'ranking_type': 'bypopularity', 'limit': limit, 'offset': previous_last_page*limit, 'fields': ','.join(str(v) for v in 'recommendations')})
assert len(data['data']) > 0
assert 'next' not in data['paging']


last_page = previous_last_page

last_page

41

In [24]:
import tqdm
import time

for page in tqdm.trange(last_page+1):
    scrape_page(page)
    time.sleep(1)

100%|██████████| 42/42 [02:03<00:00,  2.95s/it]


## Merge Files

In [15]:
import os

scraping_save_pages = 'data/data_tmp/anime_pages'

data = []
for file_name in os.listdir(scraping_save_pages):
    file_path = os.path.join(scraping_save_pages, file_name)
    with open(file_path, 'r') as f:
        file = json.load(f)
    data.extend(file)

len(data),type(data)

(23861, list)

In [17]:
len(data),len(data[1])
data[1]

{'id': 1535,
 'title': 'Death Note',
 'main_picture': {'medium': 'https://api-cdn.myanimelist.net/images/anime/9/9453.jpg',
  'large': 'https://api-cdn.myanimelist.net/images/anime/9/9453l.jpg'}}

In [9]:
with open('data/data_tmp/anime_raw.json', 'w') as f:
    json.dump(data, f, indent=4)

## Anime Cleaning

In [10]:

import pandas as pd
import numpy as np
import json

with open('data/data_tmp/anime_raw.json', 'r') as f:
    data = json.load(f)

anime = pd.json_normalize(data, sep='_')

# Use Timestamps
anime['start_date'] = pd.to_datetime(anime['start_date'])
anime['end_date'] = pd.to_datetime(anime['end_date'])

# Avoid floats and zeroes marking nsfw
anime['num_episodes'] = anime['num_episodes'].replace(0, np.nan).astype('Int64')
anime['popularity'] = anime['popularity'].replace(0, np.nan).astype('Int64')
anime['rank'] = anime['rank'].replace(0, np.nan).astype('Int64')
anime['mean'] = anime['mean'].replace(0, np.nan).astype('float64')
anime['num_favorites'] = anime['num_favorites'].replace(0, np.nan).astype('Int64')

# Use Timedelta
anime['average_episode_duration'] = pd.to_timedelta(anime['average_episode_duration'].replace(0, np.nan), unit='s')

# Avoid floats, as time
anime['start_season_year'] = anime['start_season_year'].astype('Int64')
anime['broadcast_start_time'] = pd.to_datetime(anime['broadcast_start_time']).dt.time

# Only keep names
anime['genres'] = anime['genres'].apply(lambda x: [dic['name'] for dic in x] if not x is np.nan else [])
anime['recommendations'] = anime['recommendations'].apply(lambda x: [dic['title'] for dic in x] if not x is np.nan else [])
anime['studios'] = anime['studios'].apply(lambda x: [dic['name'] for dic in x] if not x is np.nan else [])
anime['recommendations'] = anime['recommendations'].apply(lambda x: [dic['name'] for dic in x] if not x is np.nan else [])
# MyAnimeList edits
anime['created_at'] = pd.to_datetime(anime['created_at']).dt.tz_convert(None)
anime['updated_at'] = pd.to_datetime(anime['updated_at']).dt.tz_convert(None)

# Avoid empty string
anime['synopsis'] = anime['synopsis'].replace('', np.nan)
anime['recommendations'] = anime['recommendations'].replace('', np.nan)
anime['alternative_titles_en'] = anime['alternative_titles_en'].replace('', np.nan)
anime['alternative_titles_ja'] = anime['alternative_titles_ja'].replace('', np.nan)
                

order = ['id', 'title', 'media_type', 'mean', 'num_scoring_users',                          # 10 Most important attributes, 
        'status', 'num_episodes', 'start_date', 'end_date', 'source',                      # appearing first on kaggle

        'num_list_users', 'popularity', 'num_favorites', 'rank',                           # Other important
        'average_episode_duration', 'rating', 'start_season_year',                         # attributes
        'start_season_season', 'broadcast_day_of_the_week', 'broadcast_start_time',   

        'genres', 'studios',                                                               # Multivalued attributes
        'synopsis', 'nsfw', 'created_at', 'updated_at','recommendations'                                   # Description, MyAnimeList edits
        
        'main_picture_medium', 'main_picture_large',                                       # Media data
        'alternative_titles_en', 'alternative_titles_ja', 'alternative_titles_synonyms']   # Other titles


anime = anime[order]

# Save to csv
anime.to_csv('data/anime.csv', index=False)

pd.options.display.max_columns = None
anime.head(1)

KeyError: 'recommendations'

## Load Anime

In [16]:
from ast import literal_eval
import pandas as pd

data= pd.read_csv('data/anime.csv')

columns_dtype_datetime = ['start_date', 'end_date', 'created_at', 'updated_at']
for col in columns_dtype_datetime:
    data[col] = pd.to_datetime(data[col])

columns_dtype_Int64 = ['num_episodes', 'popularity', 'rank', 'start_season_year']
for col in columns_dtype_Int64:
    data[col] = data[col].astype('Int64')

columns_dtype_list = ['genres', 'studios', 'alternative_titles_synonyms']
for col in columns_dtype_list:
    data[col] = data[col].apply(literal_eval)

data['broadcast_start_time'] = pd.to_datetime(data['broadcast_start_time']).dt.time   # Time of day

data['average_episode_duration'] = pd.to_timedelta(data['average_episode_duration'])  # Duration

data.head(1)

Unnamed: 0,id,title,media_type,mean,num_scoring_users,status,num_episodes,start_date,end_date,source,num_list_users,popularity,num_favorites,rank,average_episode_duration,rating,start_season_year,start_season_season,broadcast_day_of_the_week,broadcast_start_time,genres,studios,synopsis,nsfw,created_at,updated_at,main_picture_medium,main_picture_large,alternative_titles_en,alternative_titles_ja,alternative_titles_synonyms
0,5114,Fullmetal Alchemist: Brotherhood,tv,9.13,1866190,finished_airing,64,2009-04-05,2010-07-04,manga,2923909,3,204136,2,0 days 00:24:20,r,2009,spring,sunday,17:00:00,"[Action, Adventure, Drama, Fantasy, Military, ...",[Bones],After a horrific alchemy experiment goes wrong...,white,2008-08-21 03:35:22,2022-04-18 05:06:13,https://api-cdn.myanimelist.net/images/anime/1...,https://api-cdn.myanimelist.net/images/anime/1...,Fullmetal Alchemist: Brotherhood,鋼の錬金術師 FULLMETAL ALCHEMIST,[Hagane no Renkinjutsushi: Fullmetal Alchemist...


In [17]:
del data