# fine-tuning

In [None]:
# NOTE: "!export" runs in a throwaway subshell, so the key would not persist; use %env instead.
# %env OPENAI_API_KEY=your_api_key
# !openai api files.create -f preprocessing/gap/gap-development_finetune.jsonl -p fine-tune
# !openai api files.list #업로드된 파일

In [None]:
from openai import OpenAI
import os

# Prefer a key supplied through the OPENAI_API_KEY environment variable so the
# real secret never has to be typed into the notebook; the placeholder literal
# is only the fallback used when the variable is unset.
api_key = os.environ.get("OPENAI_API_KEY", "your_api_key")
client = OpenAI(api_key=api_key)

# Fetch up to 10 fine-tuning jobs and show each job's ID and status.
jobs = client.fine_tuning.jobs.list(limit=10)
for job in jobs:
    print(job.id, job.status)  # job ID and status

In [None]:
# !openai api files.list
# response = client.files.delete("file-ID") #업로드된 파일 지우기
# print(response)

In [None]:
# ✅ Helper that creates a fine-tuning job
def create_fine_tuning_job(training_file_id, validation_file_id, model_name, n_epochs):
    """Submit a fine-tuning job through the module-level ``client`` and return it.

    Args:
        training_file_id: Passed as ``training_file`` to the API.
        validation_file_id: Passed as ``validation_file`` to the API.
        model_name: Passed as ``model`` to the API.
        n_epochs: Sent as the ``n_epochs`` entry of ``hyperparameters``.

    Returns:
        The object returned by ``client.fine_tuning.jobs.create``; its ``id``
        is printed before returning.
    """
    job_request = {
        "training_file": training_file_id,
        "validation_file": validation_file_id,
        "model": model_name,
        "hyperparameters": {"n_epochs": n_epochs},
    }
    created_job = client.fine_tuning.jobs.create(**job_request)
    print("🔥 Fine-tuning 작업 생성 완료:", created_job.id)
    return created_job

# ✅ Helper that looks up the status of a fine-tuning job
def get_fine_tuning_status(job_id):
    """Retrieve a fine-tuning job via the module-level ``client`` and report on it.

    Prints the fine-tuned model ID when the job exposes one; otherwise prints
    the job's current status.

    Args:
        job_id: Identifier handed to ``client.fine_tuning.jobs.retrieve``.

    Returns:
        The retrieved job object.
    """
    fetched_job = client.fine_tuning.jobs.retrieve(job_id)
    # Guard clause: no model ID yet, so only the status can be reported.
    if not fetched_job.fine_tuned_model:
        print("⏳ 아직 완료되지 않음. 현재 상태:", fetched_job.status)
        return fetched_job
    print("🎉 Fine-tuned 모델 ID:", fetched_job.fine_tuned_model)  # model ID
    return fetched_job

학습코드

In [None]:
# File IDs and settings handed to the fine-tuning job.
training_file_id = "file-WtPHNZMAsF6ejUA6vGMjfT"  # gap-dev_npe_finetune.jsonl
validation_file_id = "file-BtBFBVctPbQHzjtqYmEVzh"  # gap-validation_finetune.jsonl
model_name = "gpt-4o-mini-2024-07-18"
n_epochs = 10

# NOTE: running this call is billed.
job_response = create_fine_tuning_job(
    training_file_id=training_file_id,
    validation_file_id=validation_file_id,
    model_name=model_name,
    n_epochs=n_epochs,
)
job_id = job_response.id


In [None]:
# job_id="ftjob-qt49bcqfHI6RyUJijj6oZUjN"
# status = get_fine_tuning_status(job_id)

(학습완료) train: gap-development_finetune(file-SmkAaMsznDukRteEbgmhXb) / val: gap-validation_finetune(file-BtBFBVctPbQHzjtqYmEVzh)
    epochs=10 / job_id ftjob-Z0GkBZJgNcwHLwzpWbSYpcEF / ft:gpt-4o-mini-2024-07-18:skku::BAXp8p7n


(학습완료) train: gap-dev_one_finetune(file-SxrjhFV1QbQr79CitTrXye) / val: gap-validation_finetune
    epochs=10 / job_id ftjob-u7xkEgw7DOopYjrWM4JzABIi / ft:gpt-4o-mini-2024-07-18:skku::BAa7KB6E


(학습완료) train: gap-dev_npe_finetune(file-WtPHNZMAsF6ejUA6vGMjfT) / val: gap-validation_finetune
    epochs=10 / job_id ftjob-qt49bcqfHI6RyUJijj6oZUjN / ft:gpt-4o-mini-2024-07-18:skku::BAbIEGYf

# inference

In [None]:
from openai import OpenAI
import json
import pandas as pd
import os
import time

# Prefer a key supplied through the OPENAI_API_KEY environment variable so the
# real secret never has to be typed into the notebook; the placeholder literal
# is only the fallback used when the variable is unset.
api_key = os.environ.get("OPENAI_API_KEY", "your_api_key")
client = OpenAI(api_key=api_key)

Fine-tuning 모델 적용 시

In [None]:
# Model name sent with every chat-completion request in this notebook.
# To evaluate a fine-tuned model instead, swap in its ID, e.g.:
# model_id = "ft:gpt-4o-mini-2024-07-18:skku::BAa7KB6E"  # fine-tuned model ID
model_id = "gpt-4o-mini"

In [None]:
# ✅ Chat-completion call against the configured model
def query_gpt_mini(prompt):
    """Send ``prompt`` as a single user message and return the reply text.

    Relies on the module-level ``client`` and ``model_id``.

    Args:
        prompt: Text used as the content of the only (user) message.

    Returns:
        The ``message.content`` of the first choice in the response.
    """
    user_message = {"role": "user", "content": prompt}
    completion = client.chat.completions.create(model=model_id, messages=[user_message])
    first_choice = completion.choices[0]
    return first_choice.message.content


# ✅ API call with retry handling
def query_with_retry(prompt, max_retries=5, wait_time=5):
    """Call ``query_gpt_mini`` up to ``max_retries`` times and return its reply.

    A reply is accepted unless it is the sentinel text "API FAILED"
    (compared after stripping and upper-casing). When the call raises, the
    error is printed; if its message contains "rate_limit_exceeded" the
    function waits ``wait_time`` seconds and tries again, any other error
    stops the retries immediately.

    Args:
        prompt: Prompt forwarded unchanged to ``query_gpt_mini``.
        max_retries: Maximum number of attempts.
        wait_time: Seconds to sleep after a rate-limit error.

    Returns:
        The first accepted reply, or the string "API FAILED" when no attempt
        produced one.
    """
    for attempt in range(1, max_retries + 1):
        try:
            response = query_gpt_mini(prompt)
            if response.strip().upper() != "API FAILED":
                return response  # valid reply
        except Exception as e:
            print(f"❌ API 호출 실패: {e}")
            if "rate_limit_exceeded" not in str(e):
                break  # non-rate-limit errors are not retried here
            print(f"⚠️ [RateLimitError] 요청 제한 초과. {wait_time}초 대기 후 재시도... ({attempt}/{max_retries})")
            # Actually back off: without the pause every retry fires
            # immediately and hits the same rate limit again.
            time.sleep(wait_time)
    return "API FAILED"


In [None]:
name_list = ["wsc"]  # dataset names to evaluate

# Outer retry policy for one item. Each attempt already includes the retries
# done inside query_with_retry; bounding the attempts keeps a permanent failure
# (e.g. an invalid API key) from looping forever.
MAX_API_ATTEMPTS = 5
RETRY_WAIT_SECONDS = 20

# ✅ Main run
for name in name_list:
    json_file_path = os.path.join(os.getcwd(), "preprocessing", "test", "wsc", f"{name}.json")
    csv_file_path = os.path.join(os.getcwd(), "output", "zero", "GPT-zero", f"{name}.csv")  # output CSV path

    # Create the output directory up front so the first append cannot fail on a
    # missing folder.
    os.makedirs(os.path.dirname(csv_file_path), exist_ok=True)

    # Load the JSON test items
    with open(json_file_path, "r", encoding="utf-8") as json_file:
        test_data = json.load(json_file)

    # Load previously saved rows so already-answered items are skipped on resume
    if os.path.exists(csv_file_path):
        df_existing = pd.read_csv(csv_file_path, encoding="utf-8")
        # Compare IDs as strings: read_csv may parse numeric-looking IDs as
        # integers, which would never equal string IDs coming from the JSON.
        processed_ids = set(df_existing["text_id"].astype(str))
        print(f"🔄 기존 데이터 {len(processed_ids)}개 로드 완료. 이어서 진행.")
    else:
        df_existing = pd.DataFrame()
        processed_ids = set()

    # Rows produced during this run
    results = []

    for data in test_data:
        if str(data["text_id"]) in processed_ids:
            continue  # already processed in an earlier run

        # ✅ Build the prompt
        prompt = f'''Question: In the sentence "{data["text"]}", what does "{data["target"]}" refer to?
Options:
(A) {data["options"]["A"]}
(B) {data["options"]["B"]}

Answer only with "A" if (A) is correct, "B" if (B) is correct, or "Neither" if none of them are correct. Do not provide explanations.
Answer:'''

        # ✅ WSC prompt (alternative, currently disabled)
#         prompt = f'''Question: In the sentence "{data["text"]}", what should replace "{data["target"]}"?
# Options:
# (A) {data["options"]["A"]}
# (B) {data["options"]["B"]}
# Answer only with "A" if (A) is correct, "B" if (B) is correct, or "Neither" if none of them are correct. Do not provide explanations.
# Answer:'''


        # ✅ API call: bounded number of attempts with a pause between them
        for attempt in range(1, MAX_API_ATTEMPTS + 1):
            gpt_response = query_with_retry(prompt)
            if gpt_response.strip().upper() != "API FAILED":
                break  # usable reply
            if attempt < MAX_API_ATTEMPTS:
                print(f"⚠️ [API FAILED] text_id: {data['text_id']} - {RETRY_WAIT_SECONDS}초 후 재시도")
                time.sleep(RETRY_WAIT_SECONDS)
        else:
            # Every attempt failed. Stop instead of spinning; rows finished so
            # far are already in the CSV, so re-running the cell resumes here.
            raise RuntimeError(
                f"[{name}] text_id {data['text_id']}: API still failing after "
                f"{MAX_API_ATTEMPTS} attempts; completed rows are kept in "
                f"{csv_file_path}. Fix the cause and re-run to resume."
            )

        # ✅ Compare with the expected answer (case-insensitive, trimmed)
        correct = (gpt_response.strip().upper() == data["answer"].strip().upper())

        # ✅ Collect the result row
        result = {
            "text_id": data["text_id"],
            "text": data["text"],
            "target": data["target"],
            "expected_answer": data["answer"].strip().upper(),
            "gpt_answer": gpt_response.strip().upper(),
            "correct": correct
        }
        results.append(result)

        # ✅ Console output
        print(f"[{name}] text_id: {data['text_id']}, gpt_answer: {gpt_response.strip().upper()}, True answer: {data['answer'].strip().upper()}")

        # ✅ Append the row to the CSV right away (protects against interruption);
        # the header is written only when the file does not exist yet.
        df_temp = pd.DataFrame([result])
        df_temp.to_csv(csv_file_path, mode="a", index=False, header=not os.path.exists(csv_file_path), encoding="utf-8")

        # Optional throttle between requests (disabled; adjust as needed)
        #time.sleep(5)

    print(f"✅ [{name}] 모든 데이터 처리 완료: {csv_file_path}")