## 試験ドットコム系

Websites:

- https://www.sc-siken.com/
- https://www.ap-siken.com/
- https://denkou2-siken.com/
- https://www.db-siken.com/
- https://www.pm-siken.com/
- https://www.nw-siken.com/


In [None]:
%pip install html5lib

## Setup


In [1]:
import copy
import time
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
from datasets import Dataset
from tqdm import tqdm

In [2]:
# Base URLs (scheme + host, no trailing slash) of the sites to scrape.
# The hrefs found on each site's top page are combined with these to build
# the past-exam page URLs.
TARGET_SITES = [
    "https://www.sc-siken.com",
    "https://www.ap-siken.com",
    "https://denkou2-siken.com",
    "https://www.db-siken.com",
    "https://www.pm-siken.com",
    "https://www.nw-siken.com",
]

## Get past exams


In [None]:
# Download and parse the top page of every target site.
top_pages: list[BeautifulSoup] = []

client = requests.Session()

for url in TARGET_SITES:
    # Always bound the wait; without a timeout a stalled server hangs the cell.
    res = client.get(url, timeout=30)
    if res.status_code != 200:
        raise Exception(f"{url} got {res.status_code}!!")

    # Hand the raw bytes to BeautifulSoup and let it detect the character
    # encoding. Setting `res.encoding` would only affect `res.text`, which is
    # not used here, so no charset detection is run on the response.
    top_pages.append(BeautifulSoup(res.content, "lxml"))

# Show only the count: echoing the list would dump every page's full HTML.
len(top_pages)

In [None]:
# Collect the URL of every past-exam index page linked from each top page.
past_exam_urls: list[dict[str, str]] = []

for url, page in zip(TARGET_SITES, top_pages):
    # Menu markup differs between sites: try `ul#testMenu` first, then fall
    # back to the id `test_menu`. The fallback matches by id alone so it does
    # not depend on the tag name of the menu container.
    link_els = page.select("ul#testMenu > li > a[href]")
    if not link_els:
        link_els = page.select("#test_menu > li > a[href]")
    if not link_els:
        # Make a site that contributes nothing visible instead of silently
        # shrinking the dataset.
        print(f"WARNING: no past-exam links found on {url}")

    past_exam_urls.extend(
        {
            "base_url": url,
            # urljoin resolves root-relative, relative and absolute hrefs alike.
            "url": urljoin(url, el["href"]),
        }
        for el in link_els
    )

past_exam_urls

In [5]:
# Sanity check: number of past-exam index pages discovered across all sites.
len(past_exam_urls)

57

## Cache past-exam index pages


In [None]:
# Download and parse every past-exam index page.
past_exam_pages: list[dict] = []

client = requests.Session()

for url_data in tqdm(past_exam_urls):
    # Always bound the wait; without a timeout a stalled server hangs the cell.
    res = client.get(url_data["url"], timeout=30)
    if res.status_code != 200:
        # Report the URL that was actually requested.
        raise Exception(f"{url_data['url']} got {res.status_code}!!")

    past_exam_pages.append(
        {
            "base_url": url_data["base_url"],
            "url": url_data["url"],
            # The HTML of these pages is invalid and lxml fails on it, so the
            # lenient html5lib parser is used. The raw bytes are passed so the
            # parser detects the character encoding itself.
            "soup": BeautifulSoup(res.content, "html5lib"),
        }
    )

# Show only the count: echoing the records would dump whole parsed pages.
len(past_exam_pages)

## Parse HTML and get questions


In [12]:
# "<category>.<title>" keys of the questions collected so far; get_questions
# uses it to skip a question that was already recorded.
collected_questions: set[str] = set()

# One dict per collected question (base_url, url, title, category), appended
# by get_questions.
questions_data: list[dict] = []


def get_questions(base_url: str, url: str, soup: BeautifulSoup):
    """Record the explained questions listed on one past-exam index page.

    Every row of every ``table.qtable`` is inspected. Rows containing a
    ``<th>`` (headings) and rows without an ``i.ok`` element (no explanation)
    are skipped, as are questions whose "<category>.<title>" key is already in
    the module-level ``collected_questions`` set. Each remaining row is
    appended to the module-level ``questions_data`` list.

    Args:
        base_url: Base URL of the site the page belongs to.
        url: URL of the index page; the row's link href is appended to it.
        soup: Parsed HTML of the index page.

    Raises:
        AssertionError: If the page contains no ``table.qtable``.
    """
    question_tables = soup.select("table.qtable")
    assert len(question_tables) > 0, soup

    # Gather the rows of all tables, going through <tbody> when there is one.
    rows = []
    for table in question_tables:
        row_selector = "tr" if table.select_one("tbody") is None else "tbody > tr"
        rows += table.select(row_selector)

    for row in rows:
        # Heading rows are skipped.
        if row.select_one("th") is not None:
            continue

        # Rows without an explanation are skipped.
        if row.select_one("i.ok") is None:
            continue

        cells = row.select("td")
        href = cells[0].select_one("a").get("href")
        title = cells[1].text
        category = cells[2].text

        # Questions that were already collected are skipped.
        dedup_key = f"{category}.{title}"
        if dedup_key in collected_questions:
            continue

        record = {
            "base_url": base_url,
            "url": url + href,
            "title": title,
            "category": category,
        }
        questions_data.append(record)
        collected_questions.add(dedup_key)


# Walk every cached index page; get_questions appends to questions_data.
for exam_page in tqdm(past_exam_pages):
    get_questions(
        base_url=exam_page["base_url"],
        url=exam_page["url"],
        soup=exam_page["soup"],
    )

len(questions_data)

100%|██████████| 57/57 [00:00<00:00, 79.55it/s]


2271

In [13]:
# Preview the first two collected question records.
questions_data[:2]

[{'base_url': 'https://www.ap-siken.com',
  'url': 'https://www.ap-siken.com/kakomon/05_aki/q1.html',
  'title': '2進数2けたを表す式',
  'category': '応用数学'},
 {'base_url': 'https://www.ap-siken.com',
  'url': 'https://www.ap-siken.com/kakomon/05_aki/q2.html',
  'title': '主成分分析',
  'category': '応用数学'}]

## Cache question pages


In [28]:
# Download and parse the page of every collected question, storing the parsed
# document on its record under the "html" key.
client = requests.Session()

# Wrap the list itself rather than enumerate(...): tqdm can then read its
# length and show a real progress bar with an ETA.
for data in tqdm(questions_data):
    # Always bound the wait; without a timeout a stalled server hangs the cell.
    res = client.get(data["url"], timeout=30)

    if res.status_code != 200:
        # Report the URL that was actually requested.
        raise Exception(f"{data['url']} got {res.status_code}!!")

    # The HTML is invalid, so the lenient html5lib parser is used. The raw
    # bytes are passed so the parser detects the character encoding itself.
    data["html"] = BeautifulSoup(res.content, "html5lib")

    time.sleep(0.1)  # avoid 429

print("Done!")

2271it [08:42,  4.35it/s]

Done!





## Get QA details


In [35]:
# Maps the katakana label of an answer choice to its zero-based index.
answer_char_map = {label: index for index, label in enumerate("アイウエ")}

In [51]:
def get_question_details(soup: BeautifulSoup):
    """Extract the question, choices, answer and explanations from one page.

    The passed-in document is left unmodified, so calling the function again
    on the same cached page yields the same result.

    Args:
        soup: Parsed HTML of a single question page.

    Returns:
        A dict with the keys ``question_body``, ``choice_0``..``choice_3``,
        ``answer_num``, ``explanation`` and
        ``explanation_choice_0``..``explanation_choice_3``; entries that the
        page does not provide stay ``None``. Returns ``None`` when the page
        has an ``a#splitWindowBtn`` element or lists no choices.

    Raises:
        AssertionError: If the question body, the answer label or the
            explanation element is missing.
        KeyError: If the answer label is not a key of ``answer_char_map``.
    """
    # Possibly an afternoon-exam question; those are not handled here.
    if soup.select_one("a#splitWindowBtn") is not None:
        return None

    question: dict = {
        "question_body": None,
        "choice_0": None,
        "choice_1": None,
        "choice_2": None,
        "choice_3": None,
        "answer_num": None,
        "explanation": None,
        "explanation_choice_0": None,
        "explanation_choice_1": None,
        "explanation_choice_2": None,
        "explanation_choice_3": None,
    }

    # The question body sits in an element with id "mondai" whose tag name
    # varies between pages; try the known variants in order.
    mondai = None
    for selector in ("section#mondai", "div#mondai", "article#mondai"):
        mondai = soup.select_one(selector)
        if mondai is not None:
            break
    assert mondai is not None, soup

    question["question_body"] = mondai.text

    choices = soup.select("div.ansbg > ul > li > span")
    if len(choices) == 0:
        return None

    for i, choice in enumerate(choices):
        question[f"choice_{i}"] = choice.text

    answer_char_el = soup.select_one("span#answerChar")
    assert answer_char_el is not None

    # Surrounding whitespace must not turn a valid label into a KeyError.
    question["answer_num"] = answer_char_map[answer_char_el.text.strip()]

    kaisetsu = soup.select_one("div#kaisetsu")
    assert kaisetsu is not None

    # Work on a detached copy: the per-choice items are removed below so that
    # they do not appear in the general explanation text, and doing that on
    # the original would strip them from the cached page for any later call.
    kaisetsu = copy.copy(kaisetsu)

    if kaisetsu.select_one("ul > li.lia") is not None:
        kaisetsu_choice_els = kaisetsu.select("ul > li")
        assert len(kaisetsu_choice_els) > 0

        for i, el in enumerate(kaisetsu_choice_els):
            question[f"explanation_choice_{i}"] = el.text
            el.decompose()

    question["explanation"] = kaisetsu.text

    return question


# Parse every cached question page; pages for which get_question_details
# returns None are left out.
all_questions = []

for record in tqdm(questions_data):
    details = get_question_details(record["html"])
    if details is not None:
        # The source URLs are added after the parsed fields.
        all_questions.append(
            {**details, "base_url": record["base_url"], "url": record["url"]}
        )

len(all_questions)

100%|██████████| 2271/2271 [00:08<00:00, 265.30it/s]


1879

In [53]:
# Preview the first two parsed questions.
all_questions[:2]

[{'question_body': '2けたの2進数x1x2が表す整数をxとする。2進数x2x1が表す整数を，xの式で表したものはどれか。ここで，int(r)は非負の実数rの小数点以下を切り捨てた整数を表す。',
  'choice_0': '2x＋4int(x2)',
  'choice_1': '2x＋5int(x2)',
  'choice_2': '2x−3int(x2)',
  'choice_3': '2x−4int(x2)',
  'answer_num': 2,
  'explanation': 'x(エックス)と×(かける)が紛らわしいので、解説中では乗算の演算子を * としています。整数xは10進数で x1*2 + x2 なので、選択肢中の2xは10進数で以下のように示すことができます。\u30002x = x1*4 + x2*2\u3000…①次に、選択肢中の int(x2) について考えます。x2は、xを右に1ビットシフト（12）させたものなので、\u3000x1x2→(右へ1ビットシフト)→x1.x2 （"."は小数点）さらに、int()は整数部を取り出す操作なので、\u3000int(x1.x2) = x1つまり、int(x2) は x1 と同じ値ということになります。\u3000int(x2) = x1\u3000…②①と同様に、2進数x2x1を10進数で表すと x2*2 + x1 です。これを先程の①と比べると、両者の差分は x1 が3つ分となっています。\u3000x1*4 + x2*2 - x1*3 = x2*2 + x1\u3000…③①②より 2x = x1*4＋x2*2、x1 = int(x2) なので、③の左辺の該当部分を置き換えると、以下のように表すことができます。\u30002x−3int(x2)＝x2*2＋x1したがって「ウ」の式が適切です。【別解】ここまでがこの設問の正しい理解ですが、実際の試験本番では x1＝1，x2＝1 として、\u3000x1x2＝11(2)＝3(10)\u3000x2x1＝11(2)＝3(10)\u3000int(x2)＝1\u30003＝2*3−a\u3000a＝3で「ウ」が正解としたり、x1＝1，x2＝0 として、\u3000x1x2＝10(2)＝2(10)\u3000x2x1＝01(

In [54]:
# Build a `datasets.Dataset` from the list of question dicts and display its
# summary (column names and row count).
all_questions_ds = Dataset.from_list(all_questions)
all_questions_ds

Dataset({
    features: ['question_body', 'choice_0', 'choice_1', 'choice_2', 'choice_3', 'answer_num', 'explanation', 'explanation_choice_0', 'explanation_choice_1', 'explanation_choice_2', 'explanation_choice_3', 'base_url', 'url'],
    num_rows: 1879
})

## Push to huggingface


In [55]:
# Upload the dataset to the Hugging Face Hub under the given repository name,
# requesting a private repository.
# NOTE(review): presumably requires prior Hub authentication (e.g. a cached
# login token) — confirm before running.
all_questions_ds.push_to_hub("siken-dot-com-20240117", private=True)

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]