## Zenn Articles

Website: https://zenn.dev/


## Setup


In [48]:
import requests
from bs4 import BeautifulSoup
import time
import json
from datasets import Dataset, load_dataset, load_from_disk, concatenate_datasets
from tqdm import tqdm
import numpy as np

In [2]:
BASE_URL = "https://zenn.dev"

## Cache URLs

GET `https://zenn.dev/articles?page=1` ~ `https://zenn.dev/articles?page=????`


In [None]:
page = 1
articles_pages = []

client = requests.session()

print("Fetching articles pages...")

while True:
    if page % 100 == 0:
        print("page:", page)

    url = f"{BASE_URL}/articles?page={page}"
    res = client.get(url)
    res.encoding = res.apparent_encoding
    if res.status_code != 200:
        if res.status_code == 404:
            # end of pages
            break
        raise Exception(f"page {page} got {res.status_code}!")

    articles_pages.append(BeautifulSoup(res.content, "lxml"))

    page += 1

    time.sleep(0.01)

articles_pages

## Parse URLs


In [13]:
def parse_urls(soup: BeautifulSoup):
    article_link_els = soup.select("article > a")

    urls = [a.get("href") for a in article_link_els]
    assert all([url is not None for url in urls])

    urls = [f"{BASE_URL}{url}" for url in urls]

    return urls

In [14]:
article_urls = []

for page in tqdm(articles_pages):
    article_urls.extend(parse_urls(page))

print(f"Total: {len(article_urls)} urls")

100%|██████████| 2119/2119 [00:08<00:00, 263.57it/s]

Total: 87045 urls





### Save Article URLs


In [15]:
with open("urls.txt", "w", encoding="utf-8") as file:
    file.write("\n".join(article_urls))

## ...or use pre-collected URLs


In [6]:
from datasets import load_dataset, Dataset

urls_ds = load_dataset("p1atdev/zenn-articles-20240115", split="train")
assert isinstance(urls_ds, Dataset)

urls_ds

Dataset({
    features: ['text'],
    num_rows: 87045
})

In [7]:
article_urls = [row["text"] for row in urls_ds]
len(article_urls)

87045

## Get all HTMLs

current_chunk_idx を 0~7 で繰り返す


In [14]:
collected_urls = []

In [15]:
chunks = np.array_split(article_urls, 8)

In [42]:
current_chunk_idx = 7

In [43]:
chunk_articles = []

In [44]:
client = requests.Session()

for url in tqdm(chunks[current_chunk_idx]):
    if url in collected_urls:
        continue

    res = client.get(url, timeout=3000)

    if res.status_code != 200:
        if res.status_code in [400, 401, 403, 404]:
            continue

        raise Exception(f"{url} got {res.status_code}!")

    soup = BeautifulSoup(res.content, "lxml", from_encoding="utf-8")

    item = {
        "url": url,
        "html": str(soup),
        "timestamp": time.time(),
    }
    chunk_articles.append(item)

    collected_urls.append(url)

    time.sleep(0.05)  # 申し訳程度の sleep

100%|██████████| 10880/10880 [33:14<00:00,  5.46it/s] 


In [45]:
chunk_ds = Dataset.from_list(chunk_articles)
chunk_ds

Dataset({
    features: ['url', 'html', 'timestamp'],
    num_rows: 10878
})

In [46]:
chunk_ds.save_to_disk(f"./article-chunk{current_chunk_idx}")

Saving the dataset (0/3 shards):   0%|          | 0/10878 [00:00<?, ? examples/s]

## Concat all chunks and push to huggingface


In [49]:
all_chunks = []

for i in range(0, 8):
    all_chunks.append(load_from_disk(f"./article-chunk{current_chunk_idx}"))

articles_ds = concatenate_datasets(all_chunks)
articles_ds

Dataset({
    features: ['url', 'html', 'timestamp'],
    num_rows: 87024
})

In [50]:
articles_ds.push_to_hub("zenn-articles-20240116-html", private=True)

Uploading the dataset shards:   0%|          | 0/17 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/6 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/6 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/6 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/6 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/6 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/6 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/6 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/6 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/6 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/6 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/6 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/6 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/6 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/6 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/6 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/6 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/6 [00:00<?, ?ba/s]