In [20]:
import numpy as np
import requests
from bs4 import BeautifulSoup
import pandas as pd
from tqdm import tqdm
import glob
import os
from PIL import Image
import re

In [67]:
BASE_URL = "https://naruto.fandom.com"
volume_list_url = "/wiki/List_of_Volumes"

# Fetch the volume index page. The timeout stops a stalled connection from
# hanging the notebook, and raise_for_status() prevents an HTTP error page
# from being parsed as if it were the volume list.
res = requests.get(BASE_URL + volume_list_url, timeout=30)
res.raise_for_status()
soup = BeautifulSoup(res.text, "html.parser")
tables = soup.find_all("table", {"class": "wikitable"})

# The main Naruto series has 72 volumes, published in two parts of 27 and 45
# volumes, respectively. These volumes are in the first two tables on the
# website. Indexing tables[0] / tables[1] explicitly keeps the failure loud
# (IndexError) if the page layout changes and fewer tables are found.
volumes = [
    div
    for table in (tables[0], tables[1])
    for div in table.find_all("div", class_="floatright")
]

# Each "floatright" div wraps a link to the volume's own wiki page; the hrefs
# are appended to BASE_URL when the pages are fetched.
volume_urls = [volume.find("a").get("href") for volume in volumes]

In [77]:
def _infobox_field(infobox, label):
    """Return the stripped first text node of the cell next to a labelled header.

    Finds the class-less ``<th>`` in ``infobox`` whose string matches the
    regular expression ``label``, moves to its following ``<td>`` sibling and
    returns that cell's first text node with surrounding whitespace removed.

    Raises AttributeError if the header or its sibling cell is missing.
    """
    header = infobox.find("th", class_="", string=re.compile(label))
    return header.find_next_sibling("td").find(string=True).strip()


# Scrape one record per volume page, keyed by the page's position in volume_urls.
volumes = {}

# enumerate(tqdm(...)) rather than tqdm(enumerate(...)): tqdm can then read the
# list length and render a real progress bar with a total and ETA.
for volume, volume_url in enumerate(tqdm(volume_urls)):
    # Time out instead of hanging, and fail on HTTP errors instead of trying
    # to parse an error page as a volume article.
    res = requests.get(BASE_URL + volume_url, timeout=30)
    res.raise_for_status()
    soup = BeautifulSoup(res.text, "html.parser")
    infobox = soup.find("table", class_="infobox")
    volumes[volume] = {
        "name": infobox.find("th", class_="mainheader").find("i").text,
        "number": int(_infobox_field(infobox, r"Volume")),
        # Kept as the raw string here; parsed to a datetime when the
        # DataFrame is built.
        "release_date": _infobox_field(infobox, r"English"),
        "cover_url": infobox.find("a", class_="image").get("href"),
    }

72it [00:21,  3.31it/s]


In [80]:
# Build one row per scraped volume, indexed by volume number, with the
# release date parsed into a datetime column.
volumes_df = (
    pd.DataFrame.from_dict(volumes, orient="index")
    .set_index("number")
    .assign(release_date=lambda df: pd.to_datetime(df["release_date"]))
)

# Create the output directory first so the export also works on a fresh checkout.
os.makedirs("data", exist_ok=True)
volumes_df.to_csv("data/volumes.csv")

Download all the covers

In [91]:
# Make sure the target directory exists before writing any covers.
os.makedirs("./data/covers", exist_ok=True)

# enumerate(tqdm(...)) lets tqdm see the number of covers and show a total.
# Files are named by 1-based row position in volumes_df.
for i, cover_url in enumerate(tqdm(volumes_df["cover_url"])):
    res = requests.get(cover_url, timeout=30)
    # Fail here rather than saving an HTTP error body as an image file.
    res.raise_for_status()
    with open(f"./data/covers/{i + 1}.webp", "wb") as f:
        f.write(res.content)

72it [00:20,  3.43it/s]


In [89]:
# Count how many covers share each (width, height).
# The pattern matches the ".webp" files written by the download cell; the
# previous "*.jpg" pattern finds nothing after a fresh Restart & Run All.
size_counts = {}
for file in tqdm(glob.glob("./data/covers/*.webp")):
    # Context manager closes the underlying file handle after reading the size.
    with Image.open(file) as img:
        size_counts[img.size] = size_counts.get(img.size, 0) + 1
# Sort by most common size
sizes = sorted(size_counts.items(), key=lambda x: x[1], reverse=True)
sizes

100%|██████████| 72/72 [00:00<00:00, 7089.30it/s]


[((761, 1200), 10),
 ((762, 1200), 8),
 ((760, 1200), 5),
 ((764, 1200), 5),
 ((758, 1200), 4),
 ((763, 1200), 4),
 ((767, 1200), 3),
 ((765, 1200), 3),
 ((756, 1194), 3),
 ((759, 1200), 2),
 ((768, 1200), 2),
 ((757, 1200), 2),
 ((744, 1194), 2),
 ((750, 1194), 2),
 ((752, 1200), 1),
 ((753, 1200), 1),
 ((754, 1200), 1),
 ((782, 1200), 1),
 ((1041, 1600), 1),
 ((777, 1200), 1),
 ((768, 1194), 1),
 ((769, 1200), 1),
 ((770, 1200), 1),
 ((766, 1200), 1),
 ((1189, 1800), 1),
 ((1379, 2049), 1),
 ((683, 1037), 1),
 ((686, 1040), 1),
 ((1341, 2060), 1),
 ((786, 1188), 1),
 ((768, 1206), 1)]

Most of the images are around (760, 1200) in size, so let's resize all of them to that size.

In [92]:
# Target (width, height) for every cover, chosen from the size tally.
COVER_SIZE = (760, 1200)
SMALL_COVERS_DIR = "data/small_covers"

# Create the output directory so saving works on a fresh checkout.
os.makedirs(SMALL_COVERS_DIR, exist_ok=True)

images = glob.glob("data/covers/*")
for image in tqdm(images):
    # Close each source file once it has been resized; the resized copy is a
    # new in-memory image, so it can be saved after the source is closed.
    with Image.open(image) as original:
        img = original.resize(COVER_SIZE)
    # Keep the original file name (and therefore format) in the new directory.
    img.save(os.path.join(SMALL_COVERS_DIR, os.path.basename(image)))

100%|██████████| 72/72 [00:06<00:00, 11.37it/s]
