In [18]:
import os
import re
import json

from tqdm import tqdm
from openai import OpenAI
from collections import defaultdict

from dotenv import load_dotenv
load_dotenv("../keys.env")

# Read the OpenAI key that load_dotenv("../keys.env") is expected to have placed
# in the environment, and fail early with a readable message when it is absent.
# Without this guard a missing key makes the assignment below raise
# "TypeError: str expected, not NoneType", which hides the real cause.
openai_api_key = os.getenv('OPENAI_API_KEY')
if not openai_api_key:
    raise RuntimeError(
        "OPENAI_API_KEY is not set; define it in ../keys.env or in the environment "
        "before running this notebook."
    )
# Kept from the original cell: re-export the key into the process environment.
os.environ['OPENAI_API_KEY'] = openai_api_key

In [21]:
# Input corpus and the root directory that receives every file this notebook writes.
document_path = "../dataset/processed_documents.jsonl"
output_path = "../dataset/custom"

# One sub-directory per benchmark for the per-domain files; exist_ok=True lets
# the cell be re-run when the directories are already there.
for subdir in ("arc", "mmlu"):
    os.makedirs(f"{output_path}/{subdir}", exist_ok=True)

In [3]:
# Containers for the two document groups and the paths of their combined JSONL dumps.
# NOTE(review): nothing visible in this notebook fills these lists or reads
# `document_path`, so a fresh Restart & Run All would write empty files and label
# nothing — presumably a loading cell was removed; confirm and restore it.
arc_documents, mmlu_documents = [], []
mmlu_file_path = output_path + '/mmlu_documents.jsonl'
arc_file_path = output_path + '/arc_documents.jsonl'

In [26]:
def extract_domain(src_value):
    """Return the domain segment of a ``src`` identifier.

    The value is split on double underscores (``'__'``). When that yields at
    least three segments, the second one (the text between the first and the
    second ``'__'``) is returned; otherwise ``None`` is returned.

    Args:
        src_value: String to split.

    Returns:
        The second ``'__'``-separated segment (possibly an empty string), or
        ``None`` when there are fewer than three segments.
    """
    segments = src_value.split('__')
    return segments[1] if len(segments) >= 3 else None


# Domain name -> list of MMLU documents whose 'src' carries that domain.
domains_documents = defaultdict(list)

# Single pass over the MMLU documents: append each one to the combined JSONL
# file (one JSON object per line) and bucket it under its extracted domain.
# Documents for which no domain is extracted are still written to the combined
# file but are left out of the per-domain buckets.
with open(mmlu_file_path, 'w', encoding='utf-8') as mmlu_file:
    for doc in mmlu_documents:
        mmlu_file.write(json.dumps(doc, ensure_ascii=False) + '\n')

        domain = extract_domain(doc['src'])
        if not domain:
            continue
        domains_documents[domain].append(doc)

# Write each bucket to its own JSONL file under {output_path}/mmlu/.
for domain in domains_documents:
    docs = domains_documents[domain]
    domain_file_path = f'{output_path}/mmlu/{domain}_documents.jsonl'
    with open(domain_file_path, 'w', encoding='utf-8') as domain_file:
        for doc in docs:
            domain_file.write(json.dumps(doc, ensure_ascii=False) + '\n')

In [13]:
# Shared OpenAI client and chat model name passed to generate_domain() further down.
# NOTE(review): OpenAI() receives no explicit key here, so it presumably picks up
# OPENAI_API_KEY from the environment set in the first cell — confirm.
client = OpenAI()
model = "gpt-4o"

def clean_json_response(response):
    """Strip Markdown code-fence markers from a model reply.

    Every occurrence of three backticks, optionally followed directly by the
    lowercase tag ``json``, is removed; the remaining text is returned with
    leading and trailing whitespace stripped.

    Args:
        response: Reply text to clean.

    Returns:
        The reply without fence markers and without surrounding whitespace.
    """
    without_fences = re.sub(r'```(?:json)?', '', response)
    return without_fences.strip()

def generate_domain(document, model:str, client:OpenAI):
    """Ask the chat model for a single English domain label for ``document``.

    Args:
        document: Text of the document to classify; sent as the user message.
        model: Name of the chat-completion model to call.
        client: Client object exposing ``chat.completions.create``.

    Returns:
        On success, the parsed JSON object: a dict holding a non-empty string
        under ``"domain"``. On failure, a dict of the form
        ``{"error": <reason>, "response": <cleaned reply text>}`` that has no
        ``"domain"`` key, so callers should check before indexing.
    """
    # System prompt (Korean). It tells the model to act as a language expert,
    # pick exactly one domain in English for the document, join multi-word
    # domains with '_', and answer strictly as JSON with double-quoted strings
    # in the form { "domain" : ... }.
    prompt = (
        "당신은 주어진 문서를 읽고 이해하여 핵심을 파악하는 언어 전문가입니다. "
        " 해당 문서가 어떤 분야에 해당하는지 영어로 하나의 도메인을 정해야합니다. "
        " 만약 두 개 이상의 단어로 도메인을 만드는 경우 '_'로 단어들을 연결하세요."
        " 반환하는 형식은 반드시 JSON 포맷이어야 하며, 모든 문자열은 쌍따옴표로 감싸야 합니다. "
        " 형식은 다음과 같아야 합니다: "
        '{ "domain" : "생성한 도메인"}. '
    )

    completion = client.chat.completions.create(
        model=model,
        messages=[
            {"role" : "system", "content" : prompt},
            {"role" : "user", "content" : document}
        ],
    )

    # The message content can be None; treat that as an empty reply so it
    # flows into the normal "invalid JSON" path instead of crashing the
    # regex-based cleanup with a TypeError.
    raw_reply = completion.choices[0].message.content
    response = clean_json_response(raw_reply if raw_reply is not None else "")

    try:
        json_response = json.loads(response)
    except json.JSONDecodeError:
        return {"error": "Invalid JSON response", "response": response}

    # Syntactically valid JSON is not enough: callers index the result with
    # "domain", so anything that is not an object with a usable label is
    # reported through the same error shape.
    if not isinstance(json_response, dict):
        return {"error": "Response is not a JSON object", "response": response}
    domain = json_response.get("domain")
    if not isinstance(domain, str) or not domain.strip():
        return {"error": "Missing domain in response", "response": response}

    return json_response

In [14]:
# Dump the ARC documents to the combined JSONL file, one JSON object per line.
# ensure_ascii=False keeps non-ASCII text readable instead of \u-escaping it.
with open(arc_file_path, 'w', encoding='utf-8') as arc_file:
    for doc in arc_documents:
        arc_file.write(json.dumps(doc, ensure_ascii=False) + '\n')

In [19]:
# Label every ARC document with a model-generated domain (one API call each).
#
# * generate_domain() returns an error dict without a "domain" key when the
#   reply cannot be parsed. Indexing it directly would raise KeyError and abort
#   the whole run, so such documents get the placeholder label below and are
#   collected in `failed_domain_docs` for inspection.
# * Documents that already carry a real label are skipped, so re-running the
#   cell after an interruption resumes instead of re-querying everything.
#   Delete a document's "domain" key to force it to be labelled again.
UNLABELLED_DOMAIN = "Unknown"
failed_domain_docs = []

for doc in tqdm(arc_documents):
    if doc.get("domain") not in (None, "", UNLABELLED_DOMAIN):
        continue

    document_content = doc["content"]
    domain_info = generate_domain(document_content, model, client)

    domain_label = domain_info.get("domain") if isinstance(domain_info, dict) else None
    if isinstance(domain_label, str) and domain_label.strip():
        doc["domain"] = domain_label
    else:
        # Every document still gets a "domain" so the grouping cell keeps working.
        doc["domain"] = UNLABELLED_DOMAIN
        failed_domain_docs.append((doc, domain_info))

if failed_domain_docs:
    print(f"{len(failed_domain_docs)} document(s) could not be labelled; inspect failed_domain_docs")

100%|██████████| 2047/2047 [23:52<00:00,  1.43it/s] 


In [20]:
# Group the labelled ARC documents: domain label -> list of documents,
# in the order the documents appear in arc_documents.
documents_by_domain = {}
for doc in arc_documents:
    domain = doc["domain"]
    documents_by_domain.setdefault(domain, []).append(doc)

In [27]:
# Show how many distinct domain labels were produced, then list them in sorted
# order so that near-duplicate labels end up next to each other.
domain_count = len(documents_by_domain)
print(domain_count)

for domain in sorted(documents_by_domain.keys()):
    print(domain)

257
Acoustic_Technology
Acoustics
Aerospace_Engineering
Agricultural_Science
Agriculture
Agriculture_Environmental_Science
Agriculture_and_Fisheries
Anatomy
Anatomy_Education
Animal_Behavior
Animal_Behavior_Research
Animal_Nutrition
Animal_Training
Aquaculture
Aquarium_horticulture
Architecture
Art_and_Design
Astrobiology
Astronomy
Astrophysics
Atmospheric_Science
Automotive
Automotive_Engineering
Automotive_Industry
Automotive_Racing
Automotive_Safety
Aviation
Behavioral_Science
Biochemistry
Biology
Biomedical_Research
Biomedical_Science
Biophysics
Biotechnology
Botany
Broadcasting
Cardiology
Carpentry
Cell_Biology
Chemistry
Chemistry_Education
Chemistry_Safety
Civil_Engineering
Climate_Change
Climate_Science
Climatology
Communication_Technology
Construction
Consumer_Electronics
Cooking
Cosmology
Culinary
Culinary_Arts
Cultural_Anthropology
Data_Analysis
Data_Visualization
Dermatology
Developmental_Biology
Earth_Science
Earth_science
Ecology
Education
Education_Safety
Education_Techno

In [22]:
# Write one JSONL file per ARC domain under {output_path}/arc/.
#
# The labels come from the language model, so they are not used verbatim as
# file names:
# * Labels that differ only in letter case (for example "Earth_Science" and
#   "Earth_science") name the same file on a case-insensitive filesystem, where
#   the second open(..., 'w') would truncate what the first one wrote. Such
#   labels are merged into a single file named after the first spelling seen.
# * Any run of characters other than word characters and '-' (path separators,
#   spaces, dots, ...) is replaced by '_' so a label cannot leave the arc/
#   directory or produce an invalid path.
# Labels made only of letters, digits, '_' and '-' keep their original file name.
arc_docs_by_file = {}
for domain, docs in documents_by_domain.items():
    safe_domain = re.sub(r'[^\w-]+', '_', str(domain)) or 'unknown'
    file_key = safe_domain.casefold()
    if file_key not in arc_docs_by_file:
        arc_docs_by_file[file_key] = (f'{output_path}/arc/{safe_domain}_documents.jsonl', [])
    # Extend a fresh list so the lists inside documents_by_domain stay untouched.
    arc_docs_by_file[file_key][1].extend(docs)

for domain_file_path, docs in arc_docs_by_file.values():
    with open(domain_file_path, 'w', encoding='utf-8') as domain_file:
        for doc in docs:
            json.dump(doc, domain_file, ensure_ascii=False)
            domain_file.write('\n')