In [22]:
from datasets import load_dataset

# Load the repository's metadata index (metadata.jsonl) as the "train" split.
ds = load_dataset(
    path="nakasyou/eiken-pdf",
    data_files=["metadata.jsonl"],
    split="train",
)


def _is_2020_or_later(row):
    """Return True for metadata rows whose ``year`` is 2020 or later."""
    return row["year"] >= 2020


# Subset of the metadata restricted to exams from 2020 onwards.
ds_2020 = ds.filter(_is_2020_or_later)


Generating train split: 630 examples [00:00, 36747.11 examples/s]
Filter: 100%|██████████| 630/630 [00:00<00:00, 47430.70 examples/s]


In [13]:
from huggingface_hub import hf_hub_download
from tqdm import tqdm

def download_pdf_by_row(row, repo_id="nakasyou/eiken-pdf"):
    """Download the file referenced by a metadata row and return its local path.

    Args:
        row: Mapping with a ``"file_name"`` key naming the file inside the
            dataset repository.
        repo_id: Hugging Face dataset repository to download from. Defaults
            to the repository this notebook loads its metadata from, so
            existing single-argument calls behave as before.

    Returns:
        Whatever ``hf_hub_download`` returns for the requested file (the
        local path of the downloaded file).
    """
    return hf_hub_download(
        repo_id=repo_id,
        repo_type="dataset",
        filename=row["file_name"],
    )


In [4]:
# Smoke test: fetch the first answer PDF from the 2020+ subset and print its
# answers in the text form produced by llmify_qas.
from format_answer import get_qas_by_pdf, llmify_qas

answer_rows_2020 = ds_2020.filter(lambda row: row["filetype"] == "answer")
first_answer_row = next(iter(answer_rows_2020))

# Local path of the downloaded answer PDF.
first_answer = download_pdf_by_row(first_answer_row)

first_answer_qas = get_qas_by_pdf(first_answer)
print(llmify_qas(first_answer_qas))

<section1>
Q1:4
Q2:4
Q3:2
Q4:3
Q5:3
Q6:4
Q7:3
Q8:1
Q9:2
Q10:1
Q11:3
Q12:1
Q13:1
Q14:1
Q15:2
Q16:3
Q17:4
Q18:2
Q19:1
Q20:4
Q21:2
Q22:4
Q23:1
Q24:2
Q25:1
</section1>
<section2>
Q26:4
Q27:3
Q28:2
Q29:1
Q30:4
Q31:2
</section2>
<section3>
Q32:2
Q33:4
Q34:3
Q35:2
Q36:1
Q37:4
Q38:3
Q39:2
Q40:4
Q41:3
</section3>
<section4>
<example>Technological advancements have undoubtedly brought convenience to our
lives. However, the commodification of personal data, digitization of society, and
scarcity of laws have eroded our ability to protect individual privacy in the modern
world.
The Internet has paved the way for the accumulation and trading of data
regarding users' online habits. Scandals involving the sale of users' social media data,
for example, are proof that our digital footprints are now lucrative commodities.
Moreover, penalties for selling this data are a fraction of the revenues gained from
the activity, which may incentivize corporations to continue violating people's
privacy.
Additionall

In [5]:
# Smoke test: fetch the first grade-"1" question PDF from the 2020+ subset
# and print the result of format_question for it.
from format_question import format_question


def _is_grade1_question(row):
    """Return True for question-PDF rows whose grade is the string "1"."""
    return row["filetype"] == "question" and row["grade"] == "1"


grade1_question_rows = ds_2020.filter(_is_grade1_question)
first_question_pdf = download_pdf_by_row(next(iter(grade1_question_rows)))

question = format_question(first_question_pdf)
print(question)


To complete each item, choose the best word or phrase from among
1
the four choices. Then, on your answer sheet, find the number of the
question and mark your answer.

(1) The ancient document was written in a script that for years no one could
( ). Then, finally, a brilliant young scholar worked out its meaning.
1 slander 2 dawdle 3 pledge 4 decipher

(2) After the referee made several serious mistakes during the game, fans showed
their ( ) by booing and shouting at him.
1 infamy 2 clatter 3 splendor 4 disdain

(3) Gold is one of the most ( ) metals. This quality allows it to be shaped
into many different forms and is one reason it is in such high demand.
1 bombastic 2 malleable 3 parched 4 sordid

(4) The CEO said his company’s success throughout the years was a clear
( ) to the wisdom of the policies of the company’s past leaders.
1 prospectus 2 abrasion 3 testament 4 reprisal

(5) During his first term, Governor Smith made many ( ). When he tried to
get reelected, they supported h

In [29]:
from typing import Dict, List, Optional, Tuple

# Identifies one exam sitting: (grade, year, administration).
# grade is compared against the string "1" and year against the int 2020
# elsewhere in this notebook; administration is assumed to be a str — TODO confirm.
ExamKey = Tuple[str, int, str]


def _exam_key(row) -> ExamKey:
    """Return the (grade, year, administration) triple of a metadata row."""
    return (row["grade"], row["year"], row["administration"])


# Index every answer/script row once, keyed by (filetype, exam key), instead of
# rescanning the whole dataset twice for each question row. ``setdefault``
# keeps the first row seen for a key, matching a first-match linear search.
companion_rows: Dict[Tuple[str, ExamKey], dict] = {}
for row in ds:
    if row["filetype"] in ("answer", "script"):
        companion_rows.setdefault((row["filetype"], _exam_key(row)), row)

# One entry per 2020+ question PDF:
#   ((grade, year, administration), (question_pdf, answer_pdf, script_pdf))
# answer_pdf / script_pdf are None when the metadata has no matching row.
pdfs: List[
    Tuple[
        ExamKey, Tuple[str, Optional[str], Optional[str]]
    ]
] = []
# The loop variable is deliberately not called ``question`` so it does not
# overwrite the formatted question text assigned by an earlier cell.
for question_row in tqdm(
    ds_2020.filter(lambda x: x["filetype"] == "question")
):
    key = _exam_key(question_row)
    answer_row = companion_rows.get(("answer", key))
    script_row = companion_rows.get(("script", key))

    question_pdf = download_pdf_by_row(question_row)
    answer_pdf = download_pdf_by_row(answer_row) if answer_row is not None else None
    script_pdf = download_pdf_by_row(script_row) if script_row is not None else None

    pdfs.append((key, (question_pdf, answer_pdf, script_pdf)))

100%|██████████| 107/107 [05:27<00:00,  3.06s/it]


In [32]:
# combine all
# Merge each exam's question/answer/script PDFs into a single file under
# ./eiken_combined, named "<grade>_<year>_<administration>.pdf".
import os, sys, importlib
import combine_pdf
# Re-import so edits made to combine_pdf.py after its first import take effect.
importlib.reload(combine_pdf)

# The output directory is the same for every file, so create it once up front.
output_dir = "./eiken_combined"
os.makedirs(output_dir, exist_ok=True)

for (grade, year, administration), (question_pdf, answer_pdf, script_pdf) in tqdm(pdfs):
    output_path = f"{output_dir}/{grade}_{year}_{administration}.pdf"
    # NOTE(review): answer_pdf / script_pdf are None when no matching metadata
    # row was found — confirm merge_pdfs accepts None inputs.
    pdf = combine_pdf.merge_pdfs(question_pdf, answer_pdf, script_pdf)
    try:
        pdf.write(output_path)
    finally:
        # Always release the merged document, even if writing fails.
        pdf.close()

100%|██████████| 107/107 [01:41<00:00,  1.05it/s]
