## Zenn Articles

Website: https://zenn.dev/


## Setup


In [1]:
import json
import time
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
from datasets import Dataset
from tqdm import tqdm

In [2]:
# Site root; every listing-page and article URL built below is derived from it.
BASE_URL = "https://zenn.dev"

## Cache URLs

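Fetch the listing pages one by one, starting at `https://zenn.dev/articles?page=1` and incrementing `page`. The number of the last page is not known in advance, so the loop stops at the first page that returns HTTP 404.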


In [None]:
# Crawl settings.
REQUEST_TIMEOUT = 30  # seconds; without a timeout a stalled connection blocks the crawl forever
REQUEST_INTERVAL = 0.01  # seconds to pause between two consecutive page requests

page = 1
articles_pages = []

# One session for the whole crawl so every request goes through the same client.
client = requests.session()

print("Fetching articles pages...")

while True:
    # Lightweight progress log: one line per 100 pages.
    if page % 100 == 0:
        print("page:", page)

    url = f"{BASE_URL}/articles?page={page}"
    res = client.get(url, timeout=REQUEST_TIMEOUT)

    if res.status_code == 404:
        # A 404 is treated as "past the last listing page" and ends the crawl.
        break
    if res.status_code != 200:
        # Any other non-200 status is unexpected: stop instead of silently skipping a page.
        raise Exception(f"page {page} got {res.status_code}!")

    # Parse from the raw bytes. The response's text encoding is deliberately not
    # touched: it only affects `res.text`, which is never read here, and guessing
    # it would run a charset detection over every page body for nothing.
    articles_pages.append(BeautifulSoup(res.content, "lxml"))

    page += 1

    time.sleep(REQUEST_INTERVAL)

# Show how many pages were collected rather than dumping every parsed document.
len(articles_pages)

## Parse URLs


In [13]:
def parse_urls(soup: BeautifulSoup):
    """Extract the article URLs from one parsed listing page.

    Args:
        soup: Parsed HTML of an article listing page.

    Returns:
        A list with one absolute URL per element matched by the
        ``article > a`` selector, in document order.

    Raises:
        AssertionError: If a matched link has no ``href`` attribute.
    """
    hrefs = [link.get("href") for link in soup.select("article > a")]
    assert all(href is not None for href in hrefs), "article link without href"

    # For root-relative hrefs ("/user/articles/slug") urljoin gives exactly what
    # plain concatenation with BASE_URL gives; unlike concatenation it also
    # leaves an href that is already absolute intact.
    return [urljoin(BASE_URL, href) for href in hrefs]

In [14]:
# Flatten the per-page URL lists into a single list, keeping listing order.
article_urls = []

# `page_soup` (not `page`) so the integer page counter of the fetch cell is not
# rebound to a parsed document.
for page_soup in tqdm(articles_pages):
    article_urls.extend(parse_urls(page_soup))

print(f"Total: {len(article_urls)} urls")

100%|██████████| 2119/2119 [00:08<00:00, 263.57it/s]

Total: 87045 urls





### Save Article URLs


In [15]:
# Persist the collected URLs as plain text, one URL per line.
urls_text = "\n".join(article_urls)
with open("urls.txt", mode="w", encoding="utf-8") as file:
    file.write(urls_text)

## ...or use pre-collected URLs


In [3]:
from datasets import load_dataset

# Identifier of the dataset that holds the pre-collected URL list.
URLS_DATASET_ID = "p1atdev/zenn-articles-20240115"

urls_ds = load_dataset(URLS_DATASET_ID, split="train")
# Fail early if the loader returned anything other than a plain `Dataset`.
assert isinstance(urls_ds, Dataset)

urls_ds

Dataset({
    features: ['text'],
    num_rows: 87045
})

In [4]:
# Read the whole "text" column at once instead of materialising one dict per
# row; `list(...)` guarantees a plain Python list whatever container the
# library returns for a column.
article_urls = list(urls_ds["text"])
len(article_urls)

87045