In [10]:
import asyncio
from playwright.async_api import async_playwright
import pandas as pd
import string

async def scrape_page(page, url):
    """Load ``url`` in ``page`` and return the names listed on it.

    Args:
        page: Playwright async ``Page`` (any object exposing the same
            ``goto`` / ``wait_for_selector`` / ``evaluate`` /
            ``wait_for_timeout`` / ``query_selector_all`` coroutines works).
        url: Address of the listing page to scrape.

    Returns:
        list[str]: Non-empty, trimmed text of every ``ul.name-list li h3``
        element, in the order ``query_selector_all`` returns them. An empty
        list is returned when waiting for the first such element fails
        (10 second limit).
    """
    # timeout=0 turns Playwright's navigation timeout off for this call.
    await page.goto(url, timeout=0)
    try:
        await page.wait_for_selector("ul.name-list li h3", timeout=10000)
    except Exception:
        # Best effort: a page without names is reported as "no results".
        # Catching Exception (rather than a bare ``except``) lets task
        # cancellation and KeyboardInterrupt propagate to the caller.
        return []

    # Scroll to the bottom until the document height stops changing,
    # giving up after 10 rounds so the loop always terminates.
    previous_height = 0
    for _ in range(10):
        current_height = await page.evaluate("document.body.scrollHeight")
        if current_height == previous_height:
            break
        previous_height = current_height
        await page.evaluate("window.scrollTo(0, document.body.scrollHeight);")
        await page.wait_for_timeout(1000)

    elements = await page.query_selector_all("ul.name-list li h3")
    names = []
    for el in elements:
        # Read textContent (as the other scraping cells in this notebook do):
        # unlike innerText it does not depend on the element being rendered.
        text = await el.evaluate("(node) => node.textContent.trim()")
        if text:
            names.append(text)
    return names



async def scrape_letter(playwright, gender, letter):
    """Collect the names for one gender/initial pair across its paginated listing.

    A dedicated headless Chromium is launched, pages 1, 2, 3, ... are
    requested in turn, and the walk stops at the first page that yields no
    names or raises an error.

    Args:
        playwright: Started async Playwright driver (the object yielded by
            ``async_playwright()``).
        gender: Gender slug interpolated into the URL, e.g. ``"girl"``.
        letter: Initial letter interpolated into the URL.

    Returns:
        list[str]: Names from every page visited, in visit order.
    """
    browser = await playwright.chromium.launch(headless=True)
    results = []
    try:
        context = await browser.new_context(
            user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                       "AppleWebKit/537.36 (KHTML, like Gecko) "
                       "Chrome/118.0.5993.118 Safari/537.36"
        )

        page_number = 1
        while True:
            # NOTE(review): the recorded run of this cell found 0 names, and
            # this "-{gender}/page/N" form differs from the URL patterns used
            # by the cells that did return results - confirm against the site.
            url = f"https://japanese-names.info/first-names/gender/{gender}-name/start-with-{letter}-{gender}/page/{page_number}"

            try:
                # One fresh tab per page; the finally closes it even when
                # scraping fails so tabs never pile up.
                page = await context.new_page()
                try:
                    names = await scrape_page(page, url)
                finally:
                    await page.close()
            except Exception as e:
                print(f"❌ Error on {url}: {e}")
                break  # stop if page doesn't exist or any error occurs

            if not names:
                break
            results.extend(names)
            page_number += 1
    finally:
        # Always release the browser process, including on cancellation or
        # when creating the context fails.
        await browser.close()
    return results



async def scrape_all(gender, max_concurrency=4):
    """Scrape the names for every initial a-z of one gender into a DataFrame.

    Args:
        gender: Gender slug forwarded to ``scrape_letter``, e.g. ``"girl"``.
        max_concurrency: Upper bound on how many letters are scraped at the
            same time. Every ``scrape_letter`` call launches its own Chromium,
            so this caps the number of simultaneously running browsers.
            Values below 1 are treated as 1.

    Returns:
        pandas.DataFrame: One row per name in a single ``"{gender}_names"``
        column, ordered by letter (a-z) and then by scrape order.
    """
    # At least one slot: a zero-sized semaphore would block forever.
    limiter = asyncio.Semaphore(max(1, int(max_concurrency)))

    async def _scrape_bounded(playwright, letter):
        # Hold a slot for the whole lifetime of the per-letter browser.
        async with limiter:
            return await scrape_letter(playwright, gender, letter)

    async with async_playwright() as playwright:
        tasks = [_scrape_bounded(playwright, letter) for letter in string.ascii_lowercase]
        # gather() keeps the results in task order, i.e. a..z.
        all_names = await asyncio.gather(*tasks)

    flat_list = [name for sublist in all_names for name in sublist]
    df = pd.DataFrame(flat_list, columns=[f"{gender}_names"])
    return df

async def main():
    """Scrape all girl names, print how many were found and return them.

    Returns:
        pandas.DataFrame: The frame produced by ``scrape_all("girl")``
        (a single ``"girl_names"`` column), so the caller can bind and
        reuse it instead of receiving ``None``.
    """
    # A single coroutine needs no asyncio.gather(); await it directly.
    girls_df = await scrape_all("girl")
    #boys_df = await scrape_all("boy")

    print(f"Found {len(girls_df)} girl names ✅")
    #print(f"Found {len(boys_df)} boy names ✅")

    #girls_df.to_csv("japanese_girls.csv", index=False)
    #boys_df.to_csv("japanese_boys.csv", index=False)

    return girls_df

# For Jupyter: the notebook kernel accepts `await` at the top level of a cell,
# so main() is awaited directly and whatever it returns is bound to girls_df.
girls_df = await main()


Found 0 girl names ✅


In [9]:
import asyncio
from playwright.async_api import async_playwright

# Listing page that main() below passes to scrape_names().
URL = "https://japanese-names.info/first-names/gender/girl-name/start-with-a-girl"

async def scrape_names(url):
    """Open ``url`` in headless Chromium and return the names listed there.

    The page is scrolled to the bottom repeatedly until its height stops
    changing; afterwards the trimmed ``textContent`` of every
    ``ul.name-list li h3`` element is collected.

    Args:
        url: Address of the listing page.

    Returns:
        list[str]: The non-empty heading texts, in the order
        ``query_selector_all`` returns them.
    """
    heading_selector = "ul.name-list li h3"
    desktop_agent = (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/118.0.5993.118 Safari/537.36"
    )

    async with async_playwright() as driver:
        chromium = await driver.chromium.launch(headless=True)
        session = await chromium.new_context(user_agent=desktop_agent)
        tab = await session.new_page()
        await tab.goto(url, timeout=0)

        # Block (for up to 10 s) until at least one name heading exists.
        await tab.wait_for_selector(heading_selector, timeout=10000)

        # Keep jumping to the bottom while the document keeps growing.
        last_height = await tab.evaluate("document.body.scrollHeight")
        still_growing = True
        while still_growing:
            await tab.evaluate("window.scrollTo(0, document.body.scrollHeight);")
            await tab.wait_for_timeout(1500)  # give new content time to load
            new_height = await tab.evaluate("document.body.scrollHeight")
            still_growing = new_height != last_height
            last_height = new_height

        # Read each heading's text inside the browser, then drop blanks.
        headings = await tab.query_selector_all(heading_selector)
        trimmed = [
            await heading.evaluate("(node) => node.textContent.trim()")
            for heading in headings
        ]
        found = [text for text in trimmed if text]

        await chromium.close()
        return found

async def main():
    """Scrape the page at ``URL`` and print a count line followed by each name."""
    scraped = await scrape_names(URL)
    total = len(scraped)
    print(f"Found {total} names ✅")
    # One name per output line, in scrape order.
    for entry in scraped:
        print(entry)

# In Jupyter: the notebook kernel accepts `await` at the top level of a cell,
# so main() is awaited directly; it prints its results itself.
await main()


Found 60 names ✅
Akari
Ayaka
Arisa
Asana
Akina
Aki
Aika
Airi
Ayami
Akiri
Aoi
Ayana
Asuka
Ayuka
Ami
Akiho
Ayumi
Asaka
Akie
Akiko
Akane
Ayu
Aina
Arisu
Asami
Akika
Amika
Ayame
Ayuna
Aya
Aria
Ai
Arika
Amiri
Aimi
Ako
Akira
Akihi
Ayane
Asahi
Ayako
Asune
Anna
Arina
Asako
Ayano
Asa
Asuna
Akana
Ayuri
Aiko
Akiha
Ayae
Ayari
Azusa
Akino
Azumi
Amina
Akiyo
Aisa


In [12]:
import asyncio
import string
import pandas as pd
from playwright.async_api import async_playwright

# URL template for one gender/initial listing; scrape_gender() fills in the
# placeholders and appends "/page/<n>/" for pagination.
BASE_URL = "https://japanese-names.info/first-names/gender/{gender}-name/start-with-{letter}"
# Gender slugs substituted into BASE_URL.
GIRL = "girl"
BOY = "boy"

async def scrape_page(page, url):
    """Scrape one listing page and return the names found on it.

    Args:
        page: Playwright async ``Page`` (any object exposing the same
            ``goto`` / ``wait_for_selector`` / ``evaluate`` /
            ``wait_for_timeout`` / ``query_selector_all`` coroutines works).
        url: Address of the listing page.

    Returns:
        list[str]: Non-empty, trimmed ``textContent`` of every
        ``ul.name-list li h3`` element, in the order ``query_selector_all``
        returns them. An empty list is returned when waiting for the first
        such element fails (5 second limit).
    """
    # timeout=0 turns Playwright's navigation timeout off for this call.
    await page.goto(url, timeout=0)
    # Wait for names to load
    try:
        await page.wait_for_selector("ul.name-list li h3", timeout=5000)
    except Exception:
        # Catching Exception (rather than a bare ``except``) lets task
        # cancellation and KeyboardInterrupt propagate to the caller.
        return []  # no names on this page

    # Scroll to the bottom until the document height stops changing.
    previous_height = await page.evaluate("document.body.scrollHeight")
    while True:
        await page.evaluate("window.scrollTo(0, document.body.scrollHeight);")
        await page.wait_for_timeout(1000)
        current_height = await page.evaluate("document.body.scrollHeight")
        if current_height == previous_height:
            break
        previous_height = current_height

    # Extract names: one browser round-trip per element (the text is read
    # once and reused for both the emptiness check and the result).
    name_elements = await page.query_selector_all("ul.name-list li h3")
    names = []
    for el in name_elements:
        text = await el.evaluate("(node) => node.textContent.trim()")
        if text:
            names.append(text)
    return names

async def scrape_gender(gender, letters=("a",)):
    """Scrape the paginated name listings of one gender into a DataFrame.

    For every requested initial, pages 1, 2, 3, ... are fetched through a
    single reused browser tab until a page yields no names.

    Args:
        gender: Gender slug substituted into ``BASE_URL`` (``GIRL`` or ``BOY``).
        letters: Iterable of initials to scrape. Defaults to ``("a",)``, the
            value previously hard-coded in the loop; pass
            ``string.ascii_lowercase`` to cover the whole alphabet.

    Returns:
        pandas.DataFrame: One row per name with the columns ``letter``,
        ``page`` and ``name`` (an empty frame when nothing was found).
    """
    all_data = []
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        try:
            context = await browser.new_context(
                user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                           "AppleWebKit/537.36 (KHTML, like Gecko) "
                           "Chrome/118.0.5993.118 Safari/537.36"
            )
            page = await context.new_page()

            for letter in letters:
                listing_url = BASE_URL.format(gender=gender, letter=letter)
                page_num = 1
                while True:
                    # Girl listings always carry a page suffix; for any other
                    # gender the first page is requested without one (per the
                    # original author's note that boys have no page number on
                    # a single-page listing).
                    if gender != GIRL and page_num == 1:
                        url = listing_url
                    else:
                        url = f"{listing_url}/page/{page_num}/"

                    names = await scrape_page(page, url)
                    if not names:
                        break  # stop if page has no names

                    all_data.extend(
                        {"letter": letter, "page": page_num, "name": name}
                        for name in names
                    )
                    page_num += 1
        finally:
            # Release the browser even when a page fails to scrape.
            await browser.close()

    return pd.DataFrame(all_data)

async def main():
    """Scrape the girl names, print how many were found and return them.

    Boy scraping is currently disabled; its lines (including the progress
    message, which used to be printed even though nothing was scraped) are
    kept commented out so they can be re-enabled together.

    Returns:
        pandas.DataFrame: The frame produced by ``scrape_gender(GIRL)``.
    """
    print("Scraping girl names...")
    girls_df = await scrape_gender(GIRL)
    print(f"Found {len(girls_df)} girl names ✅")

    #print("Scraping boy names...")
    #boys_df = await scrape_gender(BOY)
    #print(f"Found {len(boys_df)} boy names ✅")

    # Save to CSV if needed
    #girls_df.to_csv("japanese_girl_names.csv", index=False)
    #boys_df.to_csv("japanese_boy_names.csv", index=False)

    return girls_df#, boys_df

# In Jupyter, use top-level `await` (accepted by the notebook kernel) and
# bind the DataFrame returned by main() to girls_df:
girls_df = await main()


Scraping girl names...
Found 748 girl names ✅
Scraping boy names...
