In [1]:
import asyncio, nest_asyncio, re, datetime
from pyppeteer import launch
import pandas as pd

nest_asyncio.apply()

In [76]:
import asyncio
import nest_asyncio
from pyppeteer import launch

def extract_ptcg_rarity_and_card_name(content):
    pattern = r'(UR|AR|SR|SAR)\s+([^\s\n]+)'
    matches = re.findall(pattern, content)
    return matches[0][0], matches[0][1]

def extract_opcg_rarity_and_card_name(content):
    pattern = r'(?:P-(?:SEC|R)|SEC|P-L|L|SR|P-SR)\s+[^\s\n]+(?:\([^\)]+\))?'
    matches = re.search(pattern, content).group()
    return matches.split(' ')[0], matches.split(' ')[1]

def extract_ptcg_card_index(content):
    pattern = r'\d{3}/\d{3}'
    match = re.search(pattern, content)
    if match:
        return match.group()
    return None

def extract_opcg_card_index(content):
    pattern = r'OP\d{2}-\d{3}'
    match = re.search(pattern, content)
    if match:
        return match.group()
    return None

def extract_card_price(content):
    pattern = r'\d{1,3}(?:,\d{3})* 円'
    match = re.search(pattern, content)
    if match:
        price_str = match.group()
        return int(price_str.replace(',', '').replace(' 円', ''))
    return None

async def extract_content(tcg_type, card_set, i):
    browser = await launch(headless=True)
    try:
        page = await browser.newPage()
        await page.goto(f'https://yuyu-tei.jp/sell/{tcg_type}/card/{card_set}/{i}', timeout=60000)
        await page.waitForSelector('.fw-bold', timeout=60000)
        fw_bold_texts = await page.evaluate('''() => {
            const boldElements = document.querySelectorAll('.fw-bold');
            return Array.from(boldElements).map(element => element.innerText).join('\\n');
        }''')
        return fw_bold_texts
    except Exception as e:
        print(f"Error for {card_set}/{i}: {e}")
        return None
    finally:
        await browser.close()



In [18]:
pkm_df = pd.DataFrame(columns=['card_set','card_rarity', 'card_name', 'card_index', 'card_price', 'created_time'])

tcg_type = 'poc'

card_set = 'sv09a'

for i in range(10064,10093):
    content = asyncio.run(extract_content(tcg_type, card_set, i))
    card_rarity, card_name = extract_ptcg_rarity_and_card_name(content)
    card_index = extract_ptcg_card_index(content)
    card_price = extract_card_price(content)
    created_time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    pkm_df.loc[len(pkm_df)] = [card_set, card_rarity, card_name, card_index, card_price, created_time]

card_set = 'sv02a'

for i in range(10472,10517):
    content = asyncio.run(extract_content(tcg_type, card_set, i))
    card_rarity, card_name = extract_ptcg_rarity_and_card_name(content)
    card_index = extract_ptcg_card_index(content)
    card_price = extract_card_price(content)
    created_time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    pkm_df.loc[len(pkm_df)] = [card_set, card_rarity, card_name, card_index, card_price, created_time]


In [19]:
pkm_df.to_csv(f'../data/ptcg/{datetime.datetime.now().strftime("%Y%m%d")}.csv', index=False)

In [16]:
pkm_df

Unnamed: 0,card_set,card_rarity,card_name,card_index,card_price,created_time
0,sv09a,AR,ヤンヤンマ,064/063,420,2025-03-19 02:36:10
1,sv09a,AR,シロナのロズレイド,065/063,1980,2025-03-19 02:36:14
2,sv09a,AR,シェイミ,066/063,1280,2025-03-19 02:36:18
3,sv09a,AR,イワパレス,067/063,680,2025-03-19 02:36:23
4,sv09a,AR,カミツオロチ,068/063,420,2025-03-19 02:36:26


In [79]:
from pyppeteer import launch
import asyncio

async def get_links():
    browser = await launch(headless=True)
    try:
        page = await browser.newPage()
        await page.goto('https://yuyu-tei.jp/sell/opc/s/search?search_word=&rare=P-SEC&type=&kizu=0', timeout=60000)
        hyperlinks = await page.evaluate('''() => {
            const links = document.querySelectorAll('a');
            return Array.from(links).map(link => link.href);
        }''')
        return hyperlinks
    except Exception as e:
        print(f"Error: {e}")
        return None
    finally:
        await browser.close()

links = asyncio.run(get_links())
links = set(sorted([l for l in links if 'card' in l]))

In [80]:
op_df = pd.DataFrame(columns=['card_set','card_rarity', 'card_name', 'card_index', 'card_price', 'created_time'])

for link in links:
    tcg_type = link.split('/')[-4]
    card_set = link.split('/')[-2]
    i = link.split('/')[-1]
    print(tcg_type, card_set, i, link)
    content = asyncio.run(extract_content(tcg_type, card_set, i))
    try:
        card_rarity, card_name = extract_opcg_rarity_and_card_name(content)
    except:
        continue
    card_index = extract_opcg_card_index(content)
    card_price = extract_card_price(content)
    created_time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    op_df.loc[len(op_df)] = [card_set, card_rarity, card_name, card_index, card_price, created_time]


opc prb01 10135 https://yuyu-tei.jp/sell/opc/card/prb01/10135
opc op03 10147 https://yuyu-tei.jp/sell/opc/card/op03/10147
opc eb02 10090 https://yuyu-tei.jp/sell/opc/card/eb02/10090
opc eb02 10070 https://yuyu-tei.jp/sell/opc/card/eb02/10070
opc op10 10058 https://yuyu-tei.jp/sell/opc/card/op10/10058
opc eb02 10091 https://yuyu-tei.jp/sell/opc/card/eb02/10091
opc op11 10144 https://yuyu-tei.jp/sell/opc/card/op11/10144
opc op11 10145 https://yuyu-tei.jp/sell/opc/card/op11/10145
opc prb01 10026 https://yuyu-tei.jp/sell/opc/card/prb01/10026
opc eb02 10006 https://yuyu-tei.jp/sell/opc/card/eb02/10006
opc op10 10141 https://yuyu-tei.jp/sell/opc/card/op10/10141
opc op10 10046 https://yuyu-tei.jp/sell/opc/card/op10/10046
opc op03 10148 https://yuyu-tei.jp/sell/opc/card/op03/10148
opc eb02 10002 https://yuyu-tei.jp/sell/opc/card/eb02/10002
opc op10 10037 https://yuyu-tei.jp/sell/opc/card/op10/10037
opc eb02 10025 https://yuyu-tei.jp/sell/opc/card/eb02/10025
opc prb01 10089 https://yuyu-tei.jp/

In [81]:
op_df.to_csv(f'../data/opcg/{datetime.datetime.now().strftime("%Y%m%d")}.csv', index=False)

In [86]:
def extract_opcg_card_index(content):
    pattern = r'(?:eb|st|op)\d{2,}-\d{2,}'
    match = re.search(pattern, content)
    if match:
        return match.group()
    return None

In [88]:
extract_opcg_card_index(w)