# Preparing the dataset

In [None]:
import pandas as pd

# Read the dataset from the CSV file in the current working directory.
df = pd.read_csv(filepath_or_buffer='./s_data.csv')
# Bare expression so the notebook cell renders the loaded frame.
df

In [None]:
# Remove the `dialect_list` and `standard_list` columns; this mutates `df` in place.
df.drop(['dialect_list', 'standard_list'], axis='columns', inplace=True)

In [None]:
# Ask the user how many rows to evaluate, then draw that many rows with a
# fixed seed so repeated runs pick the same subset.
num = int(input('하고 싶은 만큼'))

# DataFrame.sample raises ValueError when n exceeds the number of rows
# (sampling without replacement), so cap the request at the dataset size
# and tell the user that the count was reduced.
if num > len(df):
    print(f"요청한 {num}개가 전체 {len(df)}개보다 많아 전체 데이터를 사용합니다.")
    num = len(df)

df = df.sample(n=num, random_state=13)

# Load the model and translate

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import PeftModel
import torch

# Identifier of the base causal LM and the path of the PEFT adapter checkpoint
# that is loaded on top of it.
base_model = 'beomi/Llama-3-Open-ko-8B-Instruct-preview'
adapter_path = './model/model1_0724_ver1/checkpoint-2700'  # path of the checkpoint to evaluate

# Compute in bfloat16 when a CUDA device with compute capability >= 8 is
# present; otherwise fall back to float16.
torch_dtype = (
    torch.bfloat16
    if torch.cuda.is_available() and torch.cuda.get_device_capability()[0] >= 8
    else torch.float16
)

# 4-bit NF4 quantization without double quantization; matmuls run in the
# dtype chosen above.
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=False,
    bnb_4bit_compute_dtype=torch_dtype,
)

# Quantized base model with the eager attention implementation requested.
base = AutoModelForCausalLM.from_pretrained(
    base_model,
    quantization_config=quant_config,
    attn_implementation="eager",
    trust_remote_code=True,
)

# Attach the adapter weights and switch to evaluation mode
# (Module.eval() returns the module itself).
model = PeftModel.from_pretrained(base, adapter_path).eval()

# The tokenizer comes from the base model; the EOS token doubles as padding.
tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token


In [None]:
def translate_to_jejueo(sentence: str) -> str:
    """Translate a sentence into Jeju dialect with the fine-tuned model.

    The sentence is embedded in the instruction prompt (ending in ``"=> "``),
    a continuation is sampled from the module-level ``model``, and only the
    newly generated tokens are decoded.

    Args:
        sentence: Source sentence to translate.

    Returns:
        The generated continuation with special tokens removed and
        surrounding whitespace stripped.
    """
    prompt = f"다음 문장을 제주 방언으로 번역해줘.\n{sentence}\n=> "
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    # Number of prompt tokens; used below to separate prompt from continuation.
    prompt_len = inputs["input_ids"].shape[1]

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=50,
            temperature=0.7,
            top_p=0.85,
            do_sample=True,
            repetition_penalty=1.3,
            pad_token_id=tokenizer.eos_token_id,
        )

    # For a causal LM, generate() returns the prompt tokens followed by the
    # continuation. Slice the prompt off by length instead of splitting the
    # decoded text on "=>": a text split keeps only the trailing fragment
    # whenever the continuation itself contains "=>".
    generated_ids = outputs[0][prompt_len:]
    return tokenizer.decode(generated_ids, skip_special_tokens=True).strip()

# Translate every source sentence, one translate_to_jejueo() call per row,
# and store the output in a new column.
df['제주방언'] = df['standard_form'].map(translate_to_jejueo)

# Bare expression so the notebook cell renders the updated frame.
df

In [None]:
# Keep only the text before the first '<' in each prediction (presumably
# leftover markup-like tokens follow it; confirm against sample outputs),
# then strip the whitespace the cut leaves behind. Without the strip, a
# prediction such as "text <..." becomes "text " and can never equal its
# reference in the exact-match metric.
df['제주방언'] = df['제주방언'].str.split('<', n=1).str[0].str.strip()

# BLEU & BERTScore

In [None]:
import pandas as pd
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

# Per-row sentence-level BLEU (NLTK, smoothing method 4) plus an exact
# string-match flag between the reference and the model output.
smoothie = SmoothingFunction().method4
bleu_scores = []
exact_matches = []

# Walk the reference and predicted columns in lockstep; both sides are
# tokenized on whitespace before scoring.
for gold, predicted in zip(df['dialect_form'], df['제주방언']):
    gold_tokens = [gold.split()]  # sentence_bleu expects a list of references
    predicted_tokens = predicted.split()
    bleu_scores.append(
        sentence_bleu(gold_tokens, predicted_tokens, smoothing_function=smoothie)
    )
    exact_matches.append(gold == predicted)

df['BLEU'] = bleu_scores
df['정확히_일치'] = exact_matches

# Report the mean BLEU and the share of rows that match exactly.
mean_bleu = sum(bleu_scores) / len(bleu_scores)
exact_rate = sum(exact_matches) / len(exact_matches)
print(f"평균 BLEU: {mean_bleu:.4f}")
print(f"정답률 (정확히 일치): {exact_rate:.2%}")

In [None]:
from bert_score import score

# Model outputs and reference sentences as plain Python lists, in row order.
candidates = df['제주방언'].to_list()
references = df['dialect_form'].to_list()

# Score each candidate/reference pair with the xlm-roberta-base encoder.
# The three results are unpacked as P, R and F1; only F1 is used below.
P, R, F1 = score(candidates, references, model_type='xlm-roberta-base', lang='ko')

# Store the per-row F1 values and report their mean.
df['BERTScore_F1'] = F1.tolist()
mean_f1 = F1.mean().item()
print(f"평균 BERTScore F1: {mean_f1:.4f}")

In [None]:
# df.to_excel('score_1000.xlsx',index=False)