In [None]:
# Hold pip below 24.1: the install output below warns that pytorch-lightning 1.6.1
# declares the non-standard specifier "torch>=1.8.*", which pip 24.1 will start enforcing.
!python3 -m pip install --upgrade "pip<24.1"

Collecting pip<24.1
  Using cached pip-24.0-py3-none-any.whl.metadata (3.6 kB)
Using cached pip-24.0-py3-none-any.whl (2.1 MB)
    torch (>=1.8.*)
           ~~~~~~^[0m[33m
[0mInstalling collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 25.1.1
    Uninstalling pip-25.1.1:
      Successfully uninstalled pip-25.1.1
Successfully installed pip-24.0


In [None]:
!pip install ratsnlp

[33mDEPRECATION: pytorch-lightning 1.6.1 has a non-standard dependency specifier torch>=1.8.*. pip 24.1 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of pytorch-lightning or contact the author to suggest that they release a version with a conforming dependency specifiers. Discussion can be found at https://github.com/pypa/pip/issues/12063[0m[33m
[0m

In [None]:
# Mount Google Drive at /gdrive.
from google.colab import drive
drive.mount('/gdrive')

import os
# Switch the working directory to "My Drive/nlpbook" (created if missing).
# Relative paths used by later cells (data.py, data/, ./checkpoint-recipe-qa)
# therefore resolve inside that Drive folder.
os.chdir('/gdrive/My Drive/')
os.makedirs('nlpbook', exist_ok=True)
os.chdir('nlpbook')

Drive already mounted at /gdrive; to attempt to forcibly remount, call drive.mount("/gdrive", force_remount=True).


In [None]:
%%writefile data.py
import json
import os
import random
import time
from datetime import datetime

import requests

# NOTE(review): a live API key is committed here in plain text. Supply it through
# the MAFRA_API_KEY environment variable instead; the literal is kept only as a
# backward-compatible fallback and should be rotated and then removed.
MAFRA_API_KEY = (
    os.environ.get("MAFRA_API_KEY")
    or "c43f9e43df898ac83c17fecf1abcd3e0af0bf29087be02128cf82a9e8679c90c"
)
MAFRA_BASE_URL = "http://211.237.50.150:7080/openapi"

# Service identifiers: each one is a path segment of the request URL and also the
# top-level key under which the JSON response carries its rows.
RECIPE_BASIC_SERVICE = "Grid_20150827000000000226_1"       # recipe basic info
RECIPE_INGREDIENT_SERVICE = "Grid_20150827000000000227_1"  # recipe ingredients
RECIPE_PROCESS_SERVICE = "Grid_20150827000000000228_1"     # recipe cooking steps

class RecipeDataCollector:
    """Fetch recipe tables from the MAFRA open API and merge them into recipe records.

    Three services are queried (basic info, ingredients, cooking steps) and the
    rows are joined on ``RECIPE_ID``.
    """

    def __init__(self):
        # Credentials and endpoint come from the module-level constants.
        self.api_key = MAFRA_API_KEY
        self.base_url = MAFRA_BASE_URL

    def build_url(self, service_id, start_idx=1, end_idx=1000):
        """Return the JSON endpoint URL for rows ``start_idx``..``end_idx`` of ``service_id``."""
        return f"{self.base_url}/{self.api_key}/json/{service_id}/{start_idx}/{end_idx}"

    def fetch_data(self, service_id, max_records=1000):
        """Download up to ``max_records`` rows of one service, 1000 rows per request.

        Paging stops at the first API error payload, the first response that
        carries no rows, or the first exception (network, HTTP status, JSON
        decoding). This is best-effort: whatever was collected up to that point
        is returned rather than raising.

        Args:
            service_id: Service identifier; also the key holding the rows in the response.
            max_records: Upper bound on the row index requested.

        Returns:
            list: The collected row dicts, in request order.
        """
        all_data = []

        # Request the rows in pages of at most 1000.
        for start_idx in range(1, max_records + 1, 1000):
            end_idx = min(start_idx + 999, max_records)

            try:
                url = self.build_url(service_id, start_idx, end_idx)
                response = requests.get(url, timeout=30)
                response.raise_for_status()
                data = response.json()

                if service_id in data and 'row' in data[service_id]:
                    rows = data[service_id]['row']
                    # A single row may arrive as a bare object instead of a list.
                    batch_data = rows if isinstance(rows, list) else [rows]
                    all_data.extend(batch_data)
                    print(f"  📦 {start_idx}-{end_idx}: {len(batch_data)}개 수집")
                elif 'result' in data and data['result'].get('code'):
                    print(f"  ❌ API 오류: {data['result']['message']}")
                    break
                else:
                    break  # no more data

                time.sleep(0.5)  # throttle to avoid hammering the API

            except Exception as e:
                print(f"  배치 {start_idx}-{end_idx} 실패: {e}")
                break

        return all_data

    def collect_recipe_data(self):
        """Fetch the three recipe tables and return the merged, validated recipes."""
        print("농림축산식품 레시피 데이터 수집 시작...")

        basic_data = self.fetch_data(RECIPE_BASIC_SERVICE, 1000)
        ingredient_data = self.fetch_data(RECIPE_INGREDIENT_SERVICE, 5000)
        process_data = self.fetch_data(RECIPE_PROCESS_SERVICE, 3000)

        print(f"기본정보: {len(basic_data)}개")
        print(f"재료정보: {len(ingredient_data)}개")
        print(f"과정정보: {len(process_data)}개")

        # Merge the three tables into one record per recipe.
        recipes = self.integrate_data(basic_data, ingredient_data, process_data)

        print(f"통합된 레시피: {len(recipes)}개")
        return recipes

    @staticmethod
    def _parse_step_no(raw_value):
        """Return ``raw_value`` as an int step number, or ``None`` if it is not parseable."""
        try:
            return int(raw_value)
        except (TypeError, ValueError):
            return None

    def integrate_data(self, basic_data, ingredient_data, process_data):
        """Join basic, ingredient and step rows on ``RECIPE_ID``.

        Steps are ordered by ``COOKING_NO``; when a step number repeats for a
        recipe the later row wins. Rows whose step number is null or
        non-numeric no longer abort the merge: they are kept and appended after
        the numbered steps in arrival order.

        Args:
            basic_data: Rows of the basic-info service.
            ingredient_data: Rows of the ingredient service.
            process_data: Rows of the cooking-step service.

        Returns:
            list: Recipe dicts (``id``, ``name``, ``category``, ``cooking_method``,
            ``ingredients``, ``steps``) that have a name, at least two
            ingredients and at least two steps.
        """
        recipes = {}

        # NOTE(review): in the recorded run every generated context had an empty
        # category, which suggests 'RECIPE_TY_NM' / 'COOKING_MTH_NM' may not match
        # the field names the API actually returns — verify against the API schema.
        for item in basic_data:
            recipe_id = item.get('RECIPE_ID', '')
            if recipe_id:
                recipes[recipe_id] = {
                    'id': recipe_id,
                    'name': item.get('RECIPE_NM_KO', ''),
                    'category': item.get('RECIPE_TY_NM', ''),
                    'cooking_method': item.get('COOKING_MTH_NM', ''),
                    'ingredients': [],
                    'steps': []
                }

        for item in ingredient_data:
            recipe_id = item.get('RECIPE_ID', '')
            if recipe_id in recipes:
                ingredient = item.get('IRDNT_NM', '')
                if ingredient:
                    recipes[recipe_id]['ingredients'].append(ingredient)

        numbered_steps = {}    # recipe_id -> {step_no: description}
        unnumbered_steps = {}  # recipe_id -> [description, ...] in arrival order
        for item in process_data:
            recipe_id = item.get('RECIPE_ID', '')
            if recipe_id not in recipes:
                continue
            step_desc = item.get('COOKING_DC', '')
            if not step_desc:
                continue
            step_no = self._parse_step_no(item.get('COOKING_NO', 0))
            if step_no is None:
                unnumbered_steps.setdefault(recipe_id, []).append(step_desc)
            else:
                numbered_steps.setdefault(recipe_id, {})[step_no] = step_desc

        for recipe_id, recipe in recipes.items():
            by_number = numbered_steps.get(recipe_id, {})
            ordered = [by_number[step_no] for step_no in sorted(by_number)]
            recipe['steps'] = ordered + unnumbered_steps.get(recipe_id, [])

        # Keep only recipes complete enough to produce useful QA pairs.
        return [
            recipe for recipe in recipes.values()
            if recipe['name']
            and len(recipe['ingredients']) >= 2
            and len(recipe['steps']) >= 2
        ]

class QADatasetGenerator:
    """Generate a KorQuAD-style QA dataset from merged recipe records."""

    def __init__(self):
        # Question templates per answer type; ``{recipe_name}`` is filled in per recipe.
        self.question_templates = {
            'ingredients': [
                "{recipe_name}에 들어가는 재료는 무엇인가요?",
                "{recipe_name}의 재료를 알려주세요.",
                "{recipe_name} 만들 때 필요한 재료는?",
                "{recipe_name}에 무엇이 들어가나요?",
                "{recipe_name} 재료가 궁금해요."
            ],
            'cooking_steps': [
                "{recipe_name}은 어떻게 만드나요?",
                "{recipe_name} 만드는 방법을 알려주세요.",
                "{recipe_name} 조리법이 궁금해요.",
                "{recipe_name}을 요리하는 순서는?",
                "{recipe_name} 만드는 과정을 설명해주세요."
            ],
            'category': [
                "{recipe_name}은 어떤 종류의 음식인가요?",
                "{recipe_name}의 분류는 무엇인가요?",
                "{recipe_name}은 어떤 카테고리에 속하나요?"
            ],
            'cooking_method': [
                "{recipe_name}은 어떤 조리법으로 만드나요?",
                "{recipe_name}의 조리 방법은 무엇인가요?"
            ]
        }

    def _build_context(self, recipe):
        """Build the context paragraph and record where each answer was placed.

        Returns:
            tuple: ``(context, answers)`` where ``answers`` maps a question type
            (a key of ``question_templates``) to ``(answer_text, answer_start)``.
            Only question types whose answer text is non-empty are present.
        """
        sentences = []
        answers = {}
        cursor = 0  # index in the joined context where the next sentence starts

        def add_sentence(prefix, answer="", suffix="", key=None):
            nonlocal cursor
            if sentences:
                cursor += 1  # the single space " ".join() puts between sentences
            if key is not None and answer:
                # The answer sits right after the prefix, so its offset is known
                # exactly and cannot be confused with an earlier identical substring.
                answers[key] = (answer, cursor + len(prefix))
            sentence = f"{prefix}{answer}{suffix}"
            sentences.append(sentence)
            cursor += len(sentence)

        # Basic info. With no category, skip the slot instead of leaving a doubled space.
        name = recipe['name']
        category = recipe['category']
        if category:
            add_sentence(f"{name}은 ", category, " 요리입니다.", key='category')
        else:
            add_sentence(f"{name}은 요리입니다.")

        cooking_method = recipe.get('cooking_method')
        if cooking_method:
            add_sentence("조리법은 ", cooking_method, "입니다.", key='cooking_method')

        # Ingredients
        if recipe['ingredients']:
            ingredients_text = ", ".join(recipe['ingredients'])
            add_sentence("필요한 재료는 ", ingredients_text, "입니다.", key='ingredients')

        # Cooking steps
        if recipe['steps']:
            steps_text = " ".join(
                f"{i+1}단계: {step}" for i, step in enumerate(recipe['steps'])
            )
            add_sentence("만드는 방법은 다음과 같습니다. ", steps_text, key='cooking_steps')

        return " ".join(sentences), answers

    def create_context(self, recipe):
        """Return the context paragraph describing ``recipe``.

        Sentences appear in this order: category, cooking method (if any),
        ingredients (if any), numbered cooking steps (if any).
        """
        context, _ = self._build_context(recipe)
        return context

    def find_answer_span(self, context, answer_text):
        """Locate ``answer_text`` in ``context`` by plain substring search.

        Returns ``(start, end)`` of the first exact occurrence. If there is none,
        falls back to the first whitespace-separated word of the answer longer
        than two characters that occurs in the context, and finally to ``(0, 0)``.
        The fallback span does not cover the whole answer, so callers needing
        exact offsets should not rely on it; ``generate_qa_pairs`` uses offsets
        recorded while the context is built instead.
        """
        start_idx = context.find(answer_text)
        if start_idx != -1:
            return start_idx, start_idx + len(answer_text)

        # No exact match: try a partial match on individual words.
        words = answer_text.split()
        for word in words:
            if len(word) > 2:  # only reasonably distinctive words
                start_idx = context.find(word)
                if start_idx != -1:
                    return start_idx, start_idx + len(word)

        return 0, 0

    def generate_qa_pairs(self, recipe):
        """Create one QA pair per question template for every answer present.

        Returns:
            list: Dicts with ``question``, ``context``, ``answer`` and
            ``answer_start``, where ``context[answer_start:]`` starts with ``answer``.
        """
        context, answers = self._build_context(recipe)
        qa_pairs = []

        # Fixed order: ingredients, cooking steps, category, cooking method.
        for kind in ('ingredients', 'cooking_steps', 'category', 'cooking_method'):
            if kind not in answers:
                continue
            answer, start_idx = answers[kind]
            for template in self.question_templates[kind]:
                qa_pairs.append({
                    'question': template.format(recipe_name=recipe['name']),
                    'context': context,
                    'answer': answer,
                    'answer_start': start_idx
                })

        return qa_pairs

    def create_korquad_format(self, recipes):
        """Wrap the QA pairs of all ``recipes`` in the KorQuAD v1.0 JSON layout.

        Each recipe with at least one QA pair becomes one document containing one
        paragraph per distinct context. Question ids are
        ``recipe_<recipe id>_<index within the paragraph>``.
        """
        data = {
            "version": "1.0",
            "data": []
        }

        for recipe in recipes:
            qa_pairs = self.generate_qa_pairs(recipe)

            if qa_pairs:  # only add recipes that produced QA pairs
                # Group QA pairs that share the same context.
                context_qa_map = {}
                for qa in qa_pairs:
                    context_qa_map.setdefault(qa['context'], []).append(qa)

                paragraphs = []
                for context, qas in context_qa_map.items():
                    qas_list = [
                        {
                            "question": qa['question'],
                            "id": f"recipe_{recipe['id']}_{idx}",
                            "answers": [{
                                "answer_start": qa['answer_start'],
                                "text": qa['answer']
                            }]
                        }
                        for idx, qa in enumerate(qas)
                    ]
                    paragraphs.append({
                        "context": context,
                        "qas": qas_list
                    })

                data["data"].append({
                    "title": f"레시피: {recipe['name']}",
                    "paragraphs": paragraphs
                })

        print(f"총 {len(data['data'])}개 레시피에서 QA 데이터 생성 완료")
        return data

def main():
    """Collect recipes, build the KorQuAD-style QA dataset, save both, and print samples.

    Writes ``data/raw_recipes.json`` and ``data/recipe_qa_dataset.json`` relative
    to the current working directory. Returns early, writing nothing, when no
    recipe could be collected.
    """
    print("=" * 60)
    print("BERT 기반 레시피 QA 데이터셋 생성기")
    print("=" * 60)

    # 1. Collect recipe data
    collector = RecipeDataCollector()
    recipes = collector.collect_recipe_data()

    if not recipes:
        print("❌ 레시피 데이터 수집 실패")
        return

    # 2. Build the QA dataset
    qa_generator = QADatasetGenerator()
    qa_dataset = qa_generator.create_korquad_format(recipes)

    # 3. Save. Create the output directory here so main() does not depend on
    #    its caller having done so.
    os.makedirs('data', exist_ok=True)

    # Raw merged recipes
    with open('data/raw_recipes.json', 'w', encoding='utf-8') as f:
        json.dump(recipes, f, ensure_ascii=False, indent=2)

    # QA dataset in KorQuAD layout
    with open('data/recipe_qa_dataset.json', 'w', encoding='utf-8') as f:
        json.dump(qa_dataset, f, ensure_ascii=False, indent=2)

    print(f"✅ 데이터 저장 완료:")
    print(f"  - 원본 레시피: data/raw_recipes.json ({len(recipes)}개)")
    print(f"  - QA 데이터셋: data/recipe_qa_dataset.json ({len(qa_dataset['data'])}개 문서)")

    # Print the first QA pair of up to 10 documents. Slicing rather than indexing
    # 0..9 avoids an IndexError when fewer than 10 documents exist.
    print("\n📝 샘플 QA 데이터:")
    print("-" * 40)
    for sample in qa_dataset['data'][:10]:
        if not sample['paragraphs']:
            continue
        para = sample['paragraphs'][0]
        if not para['qas']:
            continue
        qa = para['qas'][0]
        print(f"제목: {sample['title']}")
        print(f"질문: {qa['question']}")
        print(f"답변: {qa['answers'][0]['text']}")
        print(f"지문: {para['context'][:200]}...")

if __name__ == "__main__":
    # Make sure the output directory that main() writes its JSON files into exists.
    import os
    os.makedirs('data', exist_ok=True)
    main()

Writing data.py


In [None]:
!python data.py

BERT 기반 레시피 QA 데이터셋 생성기
농림축산식품 레시피 데이터 수집 시작...
  📦 1-1000: 537개 수집
  📦 1-1000: 1000개 수집
  📦 1001-2000: 1000개 수집
  📦 2001-3000: 1000개 수집
  📦 3001-4000: 1000개 수집
  📦 4001-5000: 1000개 수집
  📦 1-1000: 1000개 수집
  📦 1001-2000: 1000개 수집
  📦 2001-3000: 1000개 수집
기본정보: 537개
재료정보: 5000개
과정정보: 3000개
통합된 레시피: 449개
총 449개 레시피에서 QA 데이터 생성 완료
✅ 데이터 저장 완료:
  - 원본 레시피: data/raw_recipes.json (449개)
  - QA 데이터셋: data/recipe_qa_dataset.json (449개 문서)

📝 샘플 QA 데이터:
----------------------------------------
제목: 레시피: 나물비빔밥
질문: 나물비빔밥에 들어가는 재료는 무엇인가요?
답변: 쌀, 안심, 콩나물, 청포묵, 미나리, 소금, 국간장, 다진파, 다진마늘, 참기름, 고추장, 설탕, 숙주, 도라지, 고사리, 계란, 양지머리
지문: 나물비빔밥은  요리입니다. 필요한 재료는 쌀, 안심, 콩나물, 청포묵, 미나리, 소금, 국간장, 다진파, 다진마늘, 참기름, 고추장, 설탕, 숙주, 도라지, 고사리, 계란, 양지머리입니다. 만드는 방법은 다음과 같습니다. 1단계: 양지머리로 육수를 낸 후 식혀 기름을 걷어낸 후, 불린 쌀을 넣어 고슬고슬하게 밥을 짓는다. 2단계: 안심은 불고기 양념하여 3...
제목: 레시피: 오곡밥
질문: 오곡밥에 들어가는 재료는 무엇인가요?
답변: 멥쌀, 찹쌀, 수수, 차조, 콩, 팥, 소금
지문: 오곡밥은  요리입니다. 필요한 재료는 멥쌀, 찹쌀, 수수, 차조, 콩, 팥, 소금입니다. 만드는 방법은 다음과 같습니다. 1단계: 찹쌀과 멥쌀은 깨끗이 씻어 건져 놓는다. 2단계: 차수수는 붉

In [None]:
import shutil
# Target directory is <downstream_corpus_root_dir>/<downstream_corpus_name>, i.e.
# /content/Korpora/recipe-qa as shown in the logged QATrainArguments.
os.makedirs('/content/Korpora/recipe-qa', exist_ok=True)

# NOTE(review): the identical file is installed as both the train and the dev split,
# so validation runs on training data and val_loss cannot indicate generalization
# (the saved checkpoint is named "...val_loss=0.00"). Consider a held-out split.
# The KorQuAD_v1.0_* file names are presumably what KorQuADV1Corpus looks for — confirm.
shutil.copy('data/recipe_qa_dataset.json', '/content/Korpora/recipe-qa/KorQuAD_v1.0_train.json')
shutil.copy('data/recipe_qa_dataset.json', '/content/Korpora/recipe-qa/KorQuAD_v1.0_dev.json')


'/content/Korpora/recipe-qa/KorQuAD_v1.0_dev.json'

In [None]:
from ratsnlp.nlpbook.qa import QATrainArguments
import torch
# Fine-tuning configuration for the recipe QA model.
args = QATrainArguments(
    pretrained_model_name="beomi/kcbert-base",
    downstream_corpus_name="recipe-qa",  # sub-directory of the corpus root holding the KorQuAD-style files
    downstream_model_dir="./checkpoint-recipe-qa",  # relative to the current working directory
    max_seq_length=256,
    max_query_length=32,
    doc_stride=64,
    batch_size=32 if torch.cuda.is_available() else 4,  # smaller batch when no CUDA device is present
    learning_rate=5e-5,
    epochs=7,
    tpu_cores=0 if torch.cuda.is_available() else 8,  # NOTE(review): assumes "no GPU" means a TPU runtime — confirm
    seed=7,
)


In [None]:
from ratsnlp import nlpbook

# Apply args.seed (the call logs "set seed: 7"); presumably seeds the RNGs used in training.
nlpbook.set_seed(args)

set seed: 7


In [None]:
# Set up ratsnlp logging; the call also logs the full argument set.
# NOTE(review): the recorded output shows each log line repeated, which suggests every
# call attaches another handler — run this cell once per kernel session (confirm).
nlpbook.set_logger(args)

INFO:ratsnlp:Training/evaluation parameters QATrainArguments(pretrained_model_name='beomi/kcbert-base', downstream_corpus_name='recipe-qa', downstream_corpus_root_dir='/content/Korpora', downstream_model_dir='./checkpoint-recipe-qa', max_seq_length=256, doc_stride=64, max_query_length=32, threads=4, cpu_workers=12, save_top_k=1, monitor='min val_loss', seed=7, overwrite_cache=False, force_download=False, test_mode=False, learning_rate=5e-05, epochs=7, batch_size=32, fp16=False, tpu_cores=0, tqdm_enabled=True)
INFO:ratsnlp:Training/evaluation parameters QATrainArguments(pretrained_model_name='beomi/kcbert-base', downstream_corpus_name='recipe-qa', downstream_corpus_root_dir='/content/Korpora', downstream_model_dir='./checkpoint-recipe-qa', max_seq_length=256, doc_stride=64, max_query_length=32, threads=4, cpu_workers=12, save_top_k=1, monitor='min val_loss', seed=7, overwrite_cache=False, force_download=False, test_mode=False, learning_rate=5e-05, epochs=7, batch_size=32, fp16=False, tp

In [None]:
import json

with open("data/recipe_qa_dataset.json", encoding="utf-8") as f:
    data = json.load(f)

# Spot-check the first QA pair of up to 20 documents: the context slice at
# answer_start must reproduce the answer text exactly. Slicing the document list
# (instead of indexing 0..19) keeps the check working on smaller datasets.
for sample in data["data"][:20]:
    paragraph = sample["paragraphs"][0]
    context = paragraph["context"]
    qa = paragraph["qas"][0]
    question = qa["question"]
    answer = qa["answers"][0]["text"]
    answer_start = qa["answers"][0]["answer_start"]
    span_text = context[answer_start:answer_start + len(answer)]

    print("검사:")
    print(f"질문: {question}")
    print(f"답변: {answer}")
    print(f"위치 매치: {span_text == answer}")
    print(f"정답 위치 텍스트: {span_text}")


검사:
질문: 나물비빔밥에 들어가는 재료는 무엇인가요?
답변: 쌀, 안심, 콩나물, 청포묵, 미나리, 소금, 국간장, 다진파, 다진마늘, 참기름, 고추장, 설탕, 숙주, 도라지, 고사리, 계란, 양지머리
위치 매치: True
정답 위치 텍스트: 쌀, 안심, 콩나물, 청포묵, 미나리, 소금, 국간장, 다진파, 다진마늘, 참기름, 고추장, 설탕, 숙주, 도라지, 고사리, 계란, 양지머리
검사:
질문: 오곡밥에 들어가는 재료는 무엇인가요?
답변: 멥쌀, 찹쌀, 수수, 차조, 콩, 팥, 소금
위치 매치: True
정답 위치 텍스트: 멥쌀, 찹쌀, 수수, 차조, 콩, 팥, 소금
검사:
질문: 잡채밥에 들어가는 재료는 무엇인가요?
답변: 당면, 돼지고기, 표고버섯, 호박, 당근, 부추, 청고추, 홍고추, 다진파, 다진마늘, 진간장, 참기름, 소금, 밥, 통후추
위치 매치: True
정답 위치 텍스트: 당면, 돼지고기, 표고버섯, 호박, 당근, 부추, 청고추, 홍고추, 다진파, 다진마늘, 진간장, 참기름, 소금, 밥, 통후추
검사:
질문: 콩나물밥에 들어가는 재료는 무엇인가요?
답변: 쌀, 콩나물, 쇠고기, 파, 마늘, 참기름, 깨소금, 고춧가루, 진간장, 소금, 통후추
위치 매치: True
정답 위치 텍스트: 쌀, 콩나물, 쇠고기, 파, 마늘, 참기름, 깨소금, 고춧가루, 진간장, 소금, 통후추
검사:
질문: 약식에 들어가는 재료는 무엇인가요?
답변: 찹쌀, 흑설탕, 계핏가루, 깐밤, 대추, 잣, 물엿, 식물성기름, 흰설탕, 간장, 물
위치 매치: True
정답 위치 텍스트: 찹쌀, 흑설탕, 계핏가루, 깐밤, 대추, 잣, 물엿, 식물성기름, 흰설탕, 간장, 물
검사:
질문: 호박죽에 들어가는 재료는 무엇인가요?
답변: 청동호박, 팥, 찹쌀, 물, 설탕, 소금
위치 매치: True
정답 위치 텍스트: 청동호박, 팥, 찹쌀, 물, 설탕, 소금
검사:
질문: 흑임자죽에 들어가는 재료는 무엇인가요?
답변: 쌀, 흑임자, 물, 소금, 설탕
위치 매치: True
정답 위치 텍스트:

In [None]:
# Tokenizer matching the pretrained checkpoint; casing is preserved.
from transformers import BertTokenizer
tokenizer = BertTokenizer.from_pretrained(args.pretrained_model_name, do_lower_case=False)

from ratsnlp.nlpbook.qa import QADataset, KorQuADV1Corpus
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

corpus = KorQuADV1Corpus()
# Presumably forces features to be rebuilt instead of reusing a cached copy — confirm.
args.overwrite_cache = True

# Training features, drawn in random order.
train_dataset = QADataset(args=args, corpus=corpus, tokenizer=tokenizer, mode='train')
train_loader = DataLoader(train_dataset, sampler=RandomSampler(train_dataset), batch_size=args.batch_size, collate_fn=nlpbook.data_collator)

# Validation features, iterated in dataset order.
val_dataset = QADataset(args=args, corpus=corpus, tokenizer=tokenizer, mode='val')
val_loader = DataLoader(val_dataset, sampler=SequentialSampler(val_dataset), batch_size=args.batch_size, collate_fn=nlpbook.data_collator)


INFO:ratsnlp:Creating features from train dataset file at /content/Korpora/recipe-qa
INFO:ratsnlp:Creating features from train dataset file at /content/Korpora/recipe-qa
INFO:ratsnlp:Creating features from train dataset file at /content/Korpora/recipe-qa
INFO:ratsnlp:Creating features from train dataset file at /content/Korpora/recipe-qa
INFO:ratsnlp:Creating features from train dataset file at /content/Korpora/recipe-qa
INFO:ratsnlp:Creating features from train dataset file at /content/Korpora/recipe-qa
INFO:ratsnlp:Creating features from train dataset file at /content/Korpora/recipe-qa
100%|██████████| 449/449 [00:00<00:00, 71073.80it/s]
convert squad examples to features: 100%|██████████| 4490/4490 [00:04<00:00, 962.40it/s] 
INFO:ratsnlp:*** Example ***
INFO:ratsnlp:*** Example ***
INFO:ratsnlp:*** Example ***
INFO:ratsnlp:*** Example ***
INFO:ratsnlp:*** Example ***
INFO:ratsnlp:*** Example ***
INFO:ratsnlp:*** Example ***
INFO:ratsnlp:question & context: [CLS] 나 ##물 ##비 ##빔 ##밥 ##

In [None]:
# Inspect the answer token positions of the first (up to) 50 training features.
# Bounding the range by the dataset size avoids an IndexError on small datasets.
for i in range(min(50, len(train_dataset))):
    item = train_dataset[i]
    start_pos = item.start_positions
    end_pos = item.end_positions

    print(f"Sample {i}: start_pos={start_pos}, end_pos={end_pos}")

    # start_pos == end_pos == 0 may mean this feature window holds no answer
    # (the position of [CLS]).
    if start_pos == 0 and end_pos == 0:
        # Show context / answer only if the feature object carries them.
        context = getattr(item, 'context', None)
        answer_text = getattr(item, 'answer_text', None)

        if context and answer_text:
            print(f"  Possible missing answer in context")
            print(f"  Answer: {answer_text}")
            print(f"  Context snippet: {context[:200]}...")
        else:
            print("  No answer or context info available in this item.")


Sample 0: start_pos=26, end_pos=80
Sample 1: start_pos=0, end_pos=0
  No answer or context info available in this item.
Sample 2: start_pos=24, end_pos=78
Sample 3: start_pos=25, end_pos=79
Sample 4: start_pos=0, end_pos=0
  No answer or context info available in this item.
Sample 5: start_pos=24, end_pos=78
Sample 6: start_pos=24, end_pos=78
Sample 7: start_pos=87, end_pos=107
Sample 8: start_pos=86, end_pos=106
Sample 9: start_pos=87, end_pos=107
Sample 10: start_pos=88, end_pos=108
Sample 11: start_pos=24, end_pos=44
Sample 12: start_pos=88, end_pos=108
Sample 13: start_pos=24, end_pos=44
Sample 14: start_pos=22, end_pos=36
Sample 15: start_pos=20, end_pos=34
Sample 16: start_pos=21, end_pos=35
Sample 17: start_pos=20, end_pos=34
Sample 18: start_pos=20, end_pos=34
Sample 19: start_pos=43, end_pos=94
Sample 20: start_pos=42, end_pos=93
Sample 21: start_pos=43, end_pos=94
Sample 22: start_pos=44, end_pos=95
Sample 23: start_pos=44, end_pos=95
Sample 24: start_pos=22, end_pos=70
Sampl

In [None]:
from transformers import BertForQuestionAnswering, BertConfig
from ratsnlp.nlpbook.qa import QATask

# Start from the pretrained encoder. The load warning in the output reports that
# some weights of the QA model are newly initialized rather than loaded.
config = BertConfig.from_pretrained(args.pretrained_model_name)
model = BertForQuestionAnswering.from_pretrained(args.pretrained_model_name, config=config)

# Wrap the model in the ratsnlp task and build the trainer from args.
task = QATask(model, args)
trainer = nlpbook.get_trainer(args)

# Fine-tune with the loaders built in the previous cell.
trainer.fit(
    task,
    train_dataloaders=train_loader,
    val_dataloaders=val_loader,
)

Some weights of the model checkpoint at beomi/kcbert-base were not used when initializing BertForQuestionAnswering: ['cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at beomi/kcbert-base and are newly initialized: ['qa_out

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

-----------

In [None]:
from ratsnlp.nlpbook.qa import QADeployArguments

# NOTE: this rebinds `args`, replacing the training arguments; inference_fn reads
# max_query_length / max_seq_length from this object.
# NOTE(review): the checkpoint file name is hard-coded to one specific run
# (epoch=6, val_loss=0.00) and must be updated if training is re-run.
args = QADeployArguments(
    pretrained_model_name="beomi/kcbert-base",
    downstream_model_dir="/gdrive/My Drive/nlpbook/checkpoint-recipe-qa",
    downstream_model_checkpoint_fpath="/gdrive/My Drive/nlpbook/checkpoint-recipe-qa/epoch=6-val_loss=0.00.ckpt",
    max_seq_length=256,
    max_query_length=32,
)


downstream_model_checkpoint_fpath: /gdrive/My Drive/nlpbook/checkpoint-recipe-qa/epoch=6-val_loss=0.00.ckpt


In [None]:
import torch
from transformers import BertConfig, BertForQuestionAnswering

# Load the fine-tuned checkpoint on CPU.
# NOTE(review): torch.load unpickles the file — only load checkpoints you produced.
fine_tuned_model_ckpt = torch.load(
    args.downstream_model_checkpoint_fpath,
    map_location=torch.device("cpu")
)
pretrained_model_config = BertConfig.from_pretrained(
    args.pretrained_model_name,
)
model = BertForQuestionAnswering(pretrained_model_config)

# The checkpoint keys carry a leading "model." (presumably the attribute name the
# training task stores the network under — confirm). Strip it only where it is a
# prefix, so a key that happens to contain "model." elsewhere is left intact.
ckpt_key_prefix = "model."
fine_tuned_state_dict = {
    (key[len(ckpt_key_prefix):] if key.startswith(ckpt_key_prefix) else key): value
    for key, value in fine_tuned_model_ckpt['state_dict'].items()
}
model.load_state_dict(fine_tuned_state_dict)
model.eval()

BertForQuestionAnswering(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30000, 768, padding_idx=0)
      (position_embeddings): Embedding(300, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elem

In [None]:
def inference_fn(question, context):
    """Answer ``question`` from ``context`` with the fine-tuned QA model.

    The question is tokenized and truncated to ``args.max_query_length`` tokens,
    paired with the context (only the context is truncated, the pair is padded
    to ``args.max_seq_length``), and the span between the arg-max start and end
    logits is decoded as the answer. If either input is empty the model is not
    run and the answer is an empty string.

    Returns:
        dict: ``{'question', 'context', 'answer'}``.
    """
    answer_text = ""

    if question and context:
        query_ids = tokenizer.encode(
            question,
            add_special_tokens=False,
            truncation=True,
            max_length=args.max_query_length,
        )
        encoded = tokenizer.encode_plus(
            text=query_ids,
            text_pair=context,
            truncation="only_second",
            padding="max_length",
            max_length=args.max_seq_length,
            return_token_type_ids=True,
        )
        # Add a batch dimension of size 1 to every model input.
        batch = {name: torch.tensor([values]) for name, values in encoded.items()}
        with torch.no_grad():
            logits = model(**batch)
        span_start = logits.start_logits.argmax(dim=-1).item()
        span_end = logits.end_logits.argmax(dim=-1).item()
        answer_text = tokenizer.decode(encoded['input_ids'][span_start:span_end + 1])

    return {
        'question': question,
        'context': context,
        'answer': answer_text,
    }

In [None]:
# Download the ngrok v3 Linux binary, put it on PATH, and register the account token.
!wget -q https://bin.equinox.io/c/bNyj1mQVY4c/ngrok-v3-stable-linux-amd64.tgz
!tar -xzf ngrok-v3-stable-linux-amd64.tgz
!chmod +x ngrok
!mv ngrok /usr/local/bin/ngrok
!ngrok version
# NOTE(review): the authtoken below is committed in plain text. Revoke it and pass
# the token in from a secret / environment variable instead of storing it in the notebook.
!ngrok config add-authtoken 2z5xDrfxApZ1X3FHos9LYJn5mgc_5YqZgf85yKBd6mVmY2bZR

ngrok version 3.23.3
Authtoken saved to configuration file: /root/.ngrok2/ngrok.yml


In [None]:
'''
  ratsnlp version
'''

from ratsnlp.nlpbook.qa import get_web_service_app

app = get_web_service_app(inference_fn)

import subprocess
import time
import requests

# Expose local port 5000 (where the app is served) through an ngrok tunnel.
ngrok_process = subprocess.Popen(['ngrok', 'http', '5000'])

# Poll ngrok's local API until a tunnel is listed. A single fixed sleep followed
# by indexing tunnels[0] fails whenever the tunnel needs a little longer to start.
public_url = None
for _ in range(10):
    time.sleep(1)
    try:
        tunnels = requests.get('http://localhost:4040/api/tunnels', timeout=5).json().get('tunnels', [])
    except (requests.RequestException, ValueError):
        continue  # local API not up yet, or it returned a non-JSON body
    if tunnels:
        public_url = tunnels[0]['public_url']
        break

if public_url is None:
    # Do not leave an orphaned ngrok process behind when start-up failed.
    ngrok_process.terminate()
    raise RuntimeError("ngrok tunnel did not become available within 10 seconds")

print(f'ngrok public URL: {public_url}')

# Blocks the cell while the app is serving.
app.run()


ngrok public URL: https://8def-35-184-77-40.ngrok-free.app
 * Serving Flask app 'ratsnlp.nlpbook.qa.deploy'
 * Debug mode: off


 * Running on http://127.0.0.1:5000
INFO:werkzeug:[33mPress CTRL+C to quit[0m


 * Running on http://8def-35-184-77-40.ngrok-free.app
 * Traffic stats available on http://127.0.0.1:4040


INFO:werkzeug:127.0.0.1 - - [27/Jun/2025 18:39:30] "GET / HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [27/Jun/2025 18:39:31] "[33mGET /favicon.ico HTTP/1.1[0m" 404 -
INFO:werkzeug:127.0.0.1 - - [27/Jun/2025 18:39:42] "POST /api HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [27/Jun/2025 18:39:57] "POST /api HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [27/Jun/2025 18:42:28] "POST /api HTTP/1.1" 200 -


In [None]:
'''
  handmade version
'''


import json
import threading
from flask import Flask, render_template_string, request, jsonify
import subprocess
import time
import requests

# Load the QA dataset that was installed as the training corpus.
print("📋 데이터를 로드하는 중...")
with open("/content/Korpora/recipe-qa/KorQuAD_v1.0_train.json", encoding="utf-8") as f:
    qa_data = json.load(f)

# One entry per document: the title without its "레시피: " prefix, plus the
# context of the document's first paragraph.
recipes = [
    {
        "title": document["title"].replace("레시피: ", ""),
        "context": document["paragraphs"][0]["context"],
    }
    for document in qa_data["data"]
]

print(f"✅ 총 {len(recipes)}개의 레시피 데이터를 로드했습니다.")

# Create the Flask app
app = Flask(__name__)

# HTML 템플릿
HTML_TEMPLATE = """
<!DOCTYPE html>
<html lang="ko">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>🍳 레시피 Q&A</title>
    <style>
        * { margin: 0; padding: 0; box-sizing: border-box; }
        body {
            font-family: 'Segoe UI', sans-serif;
            background: #ffffff;
            min-height: 100vh;
            position: relative;
        }
        .food-bg {
            position: fixed; top: 0; left: 0; width: 100%; height: 100%;
            pointer-events: none; z-index: 1; opacity: 0.5;
        }
        .container {
            max-width: 800px; margin: 0 auto; padding: 20px;
            position: relative; z-index: 10; background: rgba(255,255,255,0.95);
            min-height: 100vh;
        }
        h1 { text-align: center; color: #333; margin-bottom: 30px; font-size: 2.2em; }
        .section {
            background: white; margin-bottom: 20px; padding: 20px;
            border-radius: 10px; box-shadow: 0 2px 8px rgba(0,0,0,0.1);
            border: 1px solid #eee;
        }
        .section h3 { color: #555; margin-bottom: 15px; font-size: 1.1em; }
        select, input, button {
            width: 100%; padding: 12px; border: 2px solid #ddd;
            border-radius: 8px; font-size: 16px; font-family: inherit;
        }
        select:focus, input:focus { outline: none; border-color: #666; }
        button {
            background: #555; color: white; border: none; cursor: pointer; margin-top: 10px;
        }
        button:hover { background: #444; }
        button:disabled { background: #ccc; cursor: not-allowed; }
        .context-box {
            background: #f9f9f9; border: 2px solid #eee; border-radius: 8px;
            padding: 15px; max-height: 200px; overflow-y: auto; font-size: 15px; color: #333;
        }
        .answer-box {
            background: #f0f8ff; border: 2px solid #ddd; border-radius: 8px;
            padding: 15px; font-size: 16px; color: #333; min-height: 60px;
        }
        .placeholder { color: #999; font-style: italic; text-align: center; padding: 20px; }
    </style>
</head>
<body>
    <div class="food-bg">
        <div style="position: absolute; top: 10%; left: 10%; font-size: 8em;">🍚</div>
        <div style="position: absolute; top: 20%; right: 15%; font-size: 10em;">🥘</div>
        <div style="position: absolute; top: 30%; left: 20%; font-size: 6em;">🍜</div>
        <div style="position: absolute; top: 40%; right: 10%; font-size: 14em;">🥗</div>
        <div style="position: absolute; top: 50%; left: 5%; font-size: 29em;">🍲</div>
        <div style="position: absolute; top: 60%; right: 20%; font-size: 19em;">🥙</div>
        <div style="position: absolute; top: 70%; left: 15%; font-size: 29em;">🍱</div>
        <div style="position: absolute; top: 80%; right: 5%; font-size: 54em;">🍛</div>
    </div>

    <div class="container">
        <h1>🍳 레시피 Q&A 시스템</h1>

        <div class="section">
            <h3>1️⃣ 레시피 선택</h3>
            <select id="recipe-select">
                <option value="">레시피를 선택하세요</option>
                {% for recipe in recipes %}
                <option value="{{ loop.index0 }}">{{ recipe.title }}</option>
                {% endfor %}
            </select>
        </div>

        <div class="section">
            <h3>2️⃣ 레시피 정보</h3>
            <div id="context-display" class="context-box">
                <div class="placeholder">위에서 레시피를 선택해주세요</div>
            </div>
        </div>

        <div class="section">
            <h3>3️⃣ 질문하기</h3>
            <input type="text" id="question-input" placeholder="레시피를 먼저 선택하세요" disabled>
            <button id="ask-btn" disabled>질문하기</button>
        </div>

        <div class="section">
            <h3>4️⃣ AI 답변</h3>
            <div id="answer-display" class="answer-box">
                <div class="placeholder">질문을 입력하고 버튼을 눌러보세요</div>
            </div>
        </div>
    </div>

    <script>
        const recipes = {{ recipes | tojson }};

        // Recipe dropdown: show the chosen recipe's context and enable the question
        // form, or reset everything when the empty option is selected.
        document.getElementById('recipe-select').addEventListener('change', function() {
            const index = this.value;
            const contextDiv = document.getElementById('context-display');
            const questionInput = document.getElementById('question-input');
            const askBtn = document.getElementById('ask-btn');
            const answerDiv = document.getElementById('answer-display');

            if (index !== '') {
                const recipe = recipes[parseInt(index, 10)];
                // textContent instead of innerHTML: the context originates from an
                // external API and must be rendered as text, never parsed as markup.
                contextDiv.textContent = recipe.context;
                questionInput.disabled = false;
                askBtn.disabled = false;
                questionInput.placeholder = recipe.title + '에 대해 질문하세요';
                answerDiv.innerHTML = '<div class="placeholder">질문을 입력하고 버튼을 눌러보세요</div>';
            } else {
                contextDiv.innerHTML = '<div class="placeholder">위에서 레시피를 선택해주세요</div>';
                questionInput.disabled = true;
                askBtn.disabled = true;
                questionInput.value = '';
                questionInput.placeholder = '레시피를 먼저 선택하세요';
                answerDiv.innerHTML = '<div class="placeholder">질문을 입력하고 버튼을 눌러보세요</div>';
            }
        });

        // Ask button: POST the question and the selected recipe's context to /ask
        // and display the returned answer.
        document.getElementById('ask-btn').addEventListener('click', async function() {
            const select = document.getElementById('recipe-select');
            const questionInput = document.getElementById('question-input');
            const answerDiv = document.getElementById('answer-display');

            const question = questionInput.value.trim();
            if (!question) {
                alert('질문을 입력해주세요');
                return;
            }

            const recipe = recipes[parseInt(select.value, 10)];
            answerDiv.textContent = '🤔 AI가 생각하고 있습니다...';

            try {
                const response = await fetch('/ask', {
                    method: 'POST',
                    headers: { 'Content-Type': 'application/json' },
                    body: JSON.stringify({
                        question: question,
                        context: recipe.context
                    })
                });

                const result = await response.json();
                // Build the answer with DOM nodes rather than concatenating into
                // innerHTML, so the model output is shown as text and any markup
                // inside it is not interpreted by the browser.
                const label = document.createElement('strong');
                label.textContent = '답변:';
                answerDiv.textContent = '';
                answerDiv.appendChild(label);
                answerDiv.appendChild(document.createTextNode(' ' + result.answer + '...'));
            } catch (error) {
                answerDiv.textContent = '오류가 발생했습니다. 다시 시도해주세요.';
            }
        });

        document.getElementById('question-input').addEventListener('keypress', function(e) {
            if (e.key === 'Enter' && !this.disabled) {
                document.getElementById('ask-btn').click();
            }
        });
    </script>
</body>
</html>
"""

@app.route('/')
def index():
    """Serve the single-page UI, rendering HTML_TEMPLATE with the loaded recipes."""
    print("🌐 메인 페이지 접속")
    page = render_template_string(HTML_TEMPLATE, recipes=recipes)
    return page

@app.route('/ask', methods=['POST'])
def ask():
    """Answer a question about a recipe context.

    Expects a JSON object with ``question`` and ``context``. Responds with
    ``{'answer': ...}``: status 400 when the body is not a JSON object or either
    field is empty, status 500 when inference fails.
    """
    try:
        # silent=True returns None for a missing, non-JSON or malformed body
        # instead of raising, so bad client input becomes a 400 rather than a 500.
        data = request.get_json(silent=True)
        if not isinstance(data, dict):
            return jsonify({'answer': '질문이나 지문이 비어있습니다.'}), 400

        # "or ''" keeps a JSON null from being turned into the literal string "None".
        question = str(data.get('question') or '').strip()
        context = str(data.get('context') or '').strip()

        print(f"\n🤖 질문: {question}")
        print(f"📄 지문: {len(context)}자")

        if not question or not context:
            return jsonify({'answer': '질문이나 지문이 비어있습니다.'}), 400

        # Delegate to the inference_fn defined earlier in the notebook.
        result = inference_fn(question, context)
        answer = result.get('answer', '').strip()

        if not answer:
            answer = '답변을 찾을 수 없습니다.'

        print(f"💡 답변: {answer}\n")
        return jsonify({'answer': answer})

    except Exception as e:
        print(f"❌ 오류: {e}")
        return jsonify({'answer': '서버 오류가 발생했습니다.'}), 500

# ngrok 설정
def run_ngrok():
    """Run ``ngrok http 5000`` and block until it exits, discarding its output."""
    ngrok_command = ['ngrok', 'http', '5000']
    subprocess.run(
        ngrok_command,
        stdout=subprocess.DEVNULL,
        stderr=subprocess.DEVNULL,
    )

def get_ngrok_url():
    """Return the public URL of the ngrok tunnel forwarding to localhost:5000.

    Queries ngrok's local API on port 4040. Returns ``None`` when the API is
    unreachable, answers with an unexpected payload, or lists no tunnel for
    ``http://localhost:5000``; the reason is printed instead of being silently
    discarded.
    """
    try:
        # The timeout keeps the cell from hanging if the local API does not answer.
        response = requests.get('http://localhost:4040/api/tunnels', timeout=5)
        tunnels = response.json()['tunnels']
        for tunnel in tunnels:
            if tunnel['config']['addr'] == 'http://localhost:5000':
                return tunnel['public_url']
    except (requests.RequestException, ValueError, KeyError, TypeError) as e:
        # Still best-effort, but no longer swallows KeyboardInterrupt / SystemExit.
        print(f"ngrok URL 조회 실패: {e}")
    return None

# run_ngrok() blocks until ngrok exits, so it is started on a daemon thread.
print("🚀 ngrok 터널을 시작합니다...")
ngrok_thread = threading.Thread(target=run_ngrok, daemon=True)
ngrok_thread.start()

# Fixed wait for the tunnel to come up before asking ngrok's local API for its URL.
time.sleep(5)
public_url = get_ngrok_url()

if public_url:
    print(f"✅ 웹서비스 시작!")
    print(f"🌐 URL: {public_url}")
else:
    print("❌ ngrok URL 확인 불가")

# Short usage instructions for the web UI.
print("\n" + "="*40)
print("🍳 사용법:")
print("1. 레시피 선택")
print("2. 질문 입력")
print("3. 질문하기 클릭")
print("="*40 + "\n")

# Blocks the cell while Flask serves on port 5000, the port ngrok forwards to.
app.run(host='0.0.0.0', port=5000, debug=False)