In [2]:
import asyncio
from playwright.async_api import async_playwright
import pandas as pd
import string

async def scrape_page(page, url):
    """Load *url* in *page* and return the names listed on it.

    Args:
        page: A Playwright async ``Page`` used for navigation and queries.
        url: Address of the listing page to load.

    Returns:
        The text of every ``ul.name-list li h3`` element whose text is not
        blank, in document order. The text is returned as reported by the
        element (it is not stripped).

    Raises:
        Whatever ``page.goto`` / ``page.wait_for_selector`` raise; in
        particular the selector wait fails if no name element shows up
        within 5 seconds.
    """
    # timeout=0 disables Playwright's navigation timeout, so a slow page load
    # never aborts here; the selector wait below is the effective time limit
    # once navigation has finished.
    await page.goto(url, timeout=0)
    # Wait for at least one name to appear
    await page.wait_for_selector("ul.name-list li h3", timeout=5000)
    elements = await page.query_selector_all("ul.name-list li h3")

    names = []
    for element in elements:
        # Read the text once per element: every inner_text() call is a round
        # trip to the browser, and reusing the value guarantees that the text
        # we filter on is the text we return.
        text = await element.inner_text()
        if text.strip():
            names.append(text)
    return names

async def scrape_letter(playwright, gender, letter):
    """Collect every name starting with *letter* for one gender.

    Launches a dedicated headless Chromium instance, walks the paginated
    listing for the letter starting at page 1, and stops at the first page
    that yields no names or fails to load.

    Args:
        playwright: A started async Playwright object (as yielded by
            ``async_playwright()``).
        gender: ``"girl"`` selects the girl-name listing; any other value
            selects the boy-name listing.
        letter: Initial letter inserted into the listing URL.

    Returns:
        A list with the names gathered from all pages that loaded
        successfully (possibly empty).
    """
    browser = await playwright.chromium.launch(headless=True)
    try:
        context = await browser.new_context(
            user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                       "AppleWebKit/537.36 (KHTML, like Gecko) "
                       "Chrome/118.0.5993.118 Safari/537.36"
        )
        page = await context.new_page()

        results = []
        page_number = 1
        while True:
            # The girl listing always carries an explicit /page/N suffix,
            # while the first page of the boy listing is requested without it.
            if gender == "girl":
                url = f"https://japanese-names.info/first-names/gender/girl-name/start-with-{letter}-girl/page/{page_number}"
            elif page_number == 1:
                url = f"https://japanese-names.info/first-names/gender/boy-name/start-with-{letter}"
            else:
                url = f"https://japanese-names.info/first-names/gender/boy-name/start-with-{letter}/page/{page_number}"

            try:
                names = await scrape_page(page, url)
            except Exception:
                # Best effort: a missing page or any scraping error ends the
                # pagination for this letter. Exception (rather than a bare
                # except) lets task cancellation and KeyboardInterrupt
                # propagate instead of being mistaken for "no more pages".
                break
            if not names:
                break
            results.extend(names)
            page_number += 1

        return results
    finally:
        # Always release the browser process, including when context/page
        # creation fails or the task is cancelled mid-scrape.
        await browser.close()

async def scrape_all(gender):
    """Scrape the listings for all 26 initial letters of one gender.

    One ``scrape_letter`` coroutine is created per lowercase ASCII letter and
    they are awaited together under a single Playwright session.

    Args:
        gender: Passed through to ``scrape_letter``; also used to name the
            resulting column.

    Returns:
        A DataFrame with a single column ``"<gender>_names"`` holding the
        names of every letter, in alphabetical order of the letters.
    """
    async with async_playwright() as playwright:
        per_letter = await asyncio.gather(
            *(scrape_letter(playwright, gender, ch) for ch in string.ascii_lowercase)
        )

    # gather() keeps the order of its arguments, so the batches are a..z.
    collected = []
    for batch in per_letter:
        collected.extend(batch)

    return pd.DataFrame(collected, columns=[f"{gender}_names"])

async def main():
    """Scrape girl and boy names concurrently, report counts, and save CSVs.

    Writes ``japanese_girls.csv`` and ``japanese_boys.csv`` (without the
    index column) into the current working directory.
    """
    girls_df, boys_df = await asyncio.gather(
        scrape_all("girl"),
        scrape_all("boy"),
    )

    print(f"Found {len(girls_df)} girl names ✅")
    print(f"Found {len(boys_df)} boy names ✅")

    outputs = (
        (girls_df, "japanese_girls.csv"),
        (boys_df, "japanese_boys.csv"),
    )
    for frame, destination in outputs:
        frame.to_csv(destination, index=False)

# For Jupyter: the notebook kernel already runs an event loop and accepts a
# top-level `await`. In a plain Python script this statement is a syntax
# error; use `asyncio.run(main())` there instead.
await main()


Found 826 girl names ✅
Found 1030 boy names ✅
