# 0. Setup

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive (uses the google.colab helper imported above).
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Locations of the datastore files; the test data is configured in the Main section.
path = {
    "chinese_datastore": "馬蘭阿美族語 那些詞的整句(包括翻譯).json", # JSON of whole sentences (with their translations)
    "words_datastore": "new馬蘭阿美語  每個詞的翻譯.json" # JSON with the translation of each word
}

# 1. Methods

## i. Settings

### Packages

In [None]:
# Install/upgrade the notebook's dependencies in the kernel environment (only openai is version-pinned).
%pip install -U torch openai==0.28.1 nltk transformers faiss-cpu sentence-transformers

Collecting torch
  Downloading torch-2.2.1-cp310-cp310-manylinux1_x86_64.whl (755.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m755.5/755.5 MB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting openai==0.28.1
  Downloading openai-0.28.1-py3-none-any.whl (76 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m77.0/77.0 kB[0m [31m12.6 MB/s[0m eta [36m0:00:00[0m
Collecting transformers
  Downloading transformers-4.38.2-py3-none-any.whl (8.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.5/8.5 MB[0m [31m89.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting faiss-cpu
  Downloading faiss_cpu-1.8.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (27.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.0/27.0 MB[0m [31m50.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting sentence-transformers
  Downloading sentence_transformers-2.5.1-py3-none-any.whl (156 kB)
[2K     [90m━━━━━━━━━━━━━

### Datastore

In [None]:
import json
import string

def _load_json(file_path):
  """Read a JSON file as UTF-8 and close the file handle afterwards."""
  with open(file_path, "r", encoding="utf-8") as fh:
    return json.load(fh)

# Sentence datastore, inverted so that the file's values become the lookup keys
# (other cells use it as Chinese sentence -> Amis sentence).
all_ch2amis = {value: key for key, value in _load_json(path["chinese_datastore"]).items()}
# Word datastore as a list of (key, value) pairs in file order.
words = list(_load_json(path["words_datastore"]).items())
# The value of every word entry, index-aligned with `words`.
word_list = [w[1] for w in words]

## ii. Find knn examples

### Setup BERT model

In [None]:
from transformers import AutoTokenizer, AutoModel
from sklearn.metrics.pairwise import cosine_similarity

# Load the BERT model
model_name = 'DMetaSoul/sbert-chinese-general-v2'
#model_name = 'bert-base-chinese'
# Both objects are loaded from `model_name`, so switching checkpoints is a one-line change
# and tokenizer and model can never get out of sync.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

### 建立faiss資料集

In [None]:
import jieba
import torch
import faiss
import numpy as np

In [None]:
# Convert a sentence into an embedding vector
def get_single_embedding(sentence): # input type: string
  """Embed `sentence` with the module-level `tokenizer` and `model`.

  The last hidden state is averaged over the token axis (mean pooling) and the
  first row of the pooled result is returned.

  Args:
    sentence: text to embed. If a list is passed it is tokenized as one padded
      batch, but only the first item's pooled vector is returned.

  Returns:
    A 1-D numpy array with the pooled embedding.
  """
  # `truncation=True` is a no-op unless a maximum length is known; this tokenizer
  # reports none, so over-long inputs would overflow the model's position
  # embeddings ("IndexError: index out of range in self"). Bound the length
  # explicitly from the model config (512 as a fallback).
  max_len = getattr(model.config, "max_position_embeddings", 512)
  encoded = tokenizer(sentence, return_tensors='pt', truncation=True, padding=True, max_length=max_len)
  # Inference only: skip building the autograd graph.
  with torch.no_grad():
    output = model(**encoded)
  embedding = output.last_hidden_state.mean(dim=1).detach().numpy()[0]
  return embedding

# Batch conversion
def get_multiple_embeddings(sentences): # input type: list
  """Embed each sentence and return a dict keyed by the sentence text.

  Sentences are processed one at a time with `get_single_embedding`; a
  repeated sentence simply overwrites its earlier entry.
  """
  return {text: get_single_embedding(text) for text in sentences} # return type: dict

In [None]:
# Turn a list of embedding vectors into a Faiss index
def embeddings2faiss(embeddings): # input type: list
  """Stack `embeddings` into one matrix and load it into a flat L2 Faiss index.

  Returns the populated `faiss.IndexFlatL2`; row i of the index corresponds to
  `embeddings[i]`.
  """
  # One row per embedding.
  matrix = np.vstack(embeddings)

  # The index dimensionality is the length of a single row.
  dimension = matrix.shape[1]
  index = faiss.IndexFlatL2(dimension)

  # Load all rows into the index.
  index.add(matrix)
  return index

In [None]:
# Build the embeddings for every datastore sentence.
# They are not turned into a faiss index yet: the later sampling and learn-from-mistakes steps
# need to delete entries from the datastore, which is harder once the vectors live inside an index.
# embeddings2faiss is fast, so building the index on demand is not a concern.
all_datastore_embeddings = get_multiple_embeddings(list(all_ch2amis.keys()))

In [None]:
# Build the faiss index over the word datastore (one vector per word_list entry).
word_embeddings = []
for word in word_list:
  # NOTE(review): get_single_embedding returns only row 0 of the pooled output, so if an
  # entry is a list of several strings only its first item is represented — confirm this is intended.
  word_embeddings.append(get_single_embedding(word))
index_word = embeddings2faiss(word_embeddings)

### Knn functions

In [None]:
# Flatten the Chinese entries: give every (entry, item) pair a running index.
word_dic = {} # {flat index: [index in word_list, index inside that entry]}
# one_vec_word_list = []
index_cnt = 0
for e1, w1 in enumerate(word_list):
  for e2, w2 in enumerate(w1):
    # one_vec_word_list.append(w2)
    word_dic[index_cnt] = [e1, e2] # e1: position in the original list, e2: position of the word inside that entry
    index_cnt+=1

In [None]:
def find_similar(find_word):
    """Return the `word_dic` entry of the vocabulary row closest to `find_word`.

    The word is embedded, the single nearest row of `index_word` is looked up,
    and the first `word_dic` value whose entry index equals that row is
    returned as `[entry index, item index]`.

    Raises:
        LookupError: if the index returned no usable row or `word_dic` has no
            pair for it (previously this surfaced as a bare StopIteration).
    """
    query = np.array([get_single_embedding(find_word)])

    _, indices = index_word.search(query, k=1)
    # `indices` has shape (1, 1); pull out a plain int instead of comparing
    # against a NumPy array.
    nearest = int(indices[0][0])

    for pair in word_dic.values():
        if pair[0] == nearest:
            return pair # index

    raise LookupError("no word_dic entry for nearest index %d (word: %r)" % (nearest, find_word))

In [None]:
# Find the longest vocabulary item(s) that the sentence starts with
def find_longest_same_voc(sentance, vocab=None):
    """Return the positions of the longest vocabulary items prefixing `sentance`.

    Args:
        sentance: the text whose beginning is matched.
        vocab: sequence of entries, each an iterable of candidate strings.
            Defaults to the module-level `word_list`.

    Returns:
        A list of `[entry index, item index]` pairs, in vocabulary order, for
        every candidate of maximal length that `sentance` starts with; an empty
        list when nothing matches.
    """
    if vocab is None:
        vocab = word_list

    best_len = 0
    matches = []
    for entry_idx, entry in enumerate(vocab):
        for item_idx, candidate in enumerate(entry):
            size = len(candidate)
            # An empty candidate is a prefix of every string; accepting it
            # yields a zero-length "match" that callers can never consume.
            if size == 0 or not sentance.startswith(candidate):
                continue
            if size > best_len:
                best_len = size
                matches = [[entry_idx, item_idx]]
            elif size == best_len:
                matches.append([entry_idx, item_idx])

    return matches


In [None]:
def check_not_found_sentance(sentence):
    """Describe the closest dictionary entry for every jieba token of `sentence`.

    `sentence` holds text that had no exact dictionary match. Each token other
    than a single space is looked up with `find_similar`, and one block with
    the token ([*zh]), the closest dictionary word ([zh]) and its Amis
    counterpart ([amis]) is produced per token. Returns "" for empty input.
    """
    if not sentence:
        return ""

    blocks = []
    for token in jieba.cut(sentence, cut_all=False):
        if token == ' ':
            continue
        entry_idx, item_idx = find_similar(token)
        blocks.append(
            "[*zh]: " + token + "\n"
            + "[zh]: " + word_list[entry_idx][item_idx] + "\n"
            + "[amis]: " + words[entry_idx][0] + "\n\n"
        )
    return "".join(blocks)


In [None]:
# Main function of the dictionary lookup
def trans(sentance):
    """Build the word-level [zh]/[amis] hints for `sentance`.

    The sentence is consumed left to right. Whenever it starts with one or more
    dictionary words (see `find_longest_same_voc`), the longest match is
    consumed and every entry sharing that longest word is emitted. Characters
    with no match are buffered and handed to `check_not_found_sentance`, which
    emits the closest dictionary entries instead.

    Returns:
        The concatenated hint blocks as one string.
    """
    pending = ""  # characters seen since the last exact dictionary match
    parts = []
    while sentance:
        index_list = find_longest_same_voc(sentance)
        # `find_longest_same_voc` only yields indices produced by enumerate(),
        # so the item index is never -1 and the old "-1" branches were dead code.
        matched = word_list[index_list[0][0]][index_list[0][1]] if index_list else ""

        if not matched:
            # No match, or a zero-length match that would never shrink the
            # sentence (infinite loop): treat the first character as unknown.
            pending += sentance[0]
            sentance = sentance[1:]
            continue

        # Flush the unknown text collected before this match.
        parts.append(check_not_found_sentance(pending))
        pending = ""

        # All returned matches have the same (maximal) length.
        sentance = sentance[len(matched):]
        for entry_idx, item_idx in index_list:
            parts.append("[zh]: " + word_list[entry_idx][item_idx] + "\n" + "[amis]: " + words[entry_idx][0] + "\n\n")

    parts.append(check_not_found_sentance(pending))
    return "".join(parts)
# print(trans("野地裡有花"))

In [None]:
def find_knn_examples_topN_sentence(sentence, datastore_embeddings, k):
  """Return up to `k` datastore sentences closest to `sentence`.

  Args:
    sentence: the query sentence; it is excluded from the candidates so a
      sentence never retrieves itself.
    datastore_embeddings: dict mapping sentence -> embedding vector.
    k: number of neighbours wanted.

  Returns:
    A list of `[zh sentence, all_ch2amis[zh sentence]]` pairs, nearest first.
    Fewer than `k` pairs are returned when the datastore is smaller than `k`.
  """
  if k <= 0:
    return []

  candidates = {zh: vec for zh, vec in datastore_embeddings.items() if zh != sentence}
  if not candidates:
    return []

  # Build the key list once; row i of the index corresponds to zh_sentences[i].
  zh_sentences = list(candidates.keys())
  datastore_index = embeddings2faiss(list(candidates.values()))

  # Faiss pads missing neighbours with -1, which would silently pick the LAST
  # sentence via negative indexing; never ask for more rows than exist.
  k = min(k, len(zh_sentences))
  _, indices = datastore_index.search(np.array([get_single_embedding(sentence)]), k)

  examples = []
  for index in indices[0]:
    if index < 0:
      continue
    zh_example = zh_sentences[index]
    examples.append([zh_example, all_ch2amis[zh_example]])

  return examples

In [None]:
import re

def find_knn_examples(sentence, datastore_embeddings, k, findwords=True):
  """Build the example text that is placed in the translation prompt.

  The text starts with the `k` nearest datastore sentences as [zh]/[amis]
  pairs. When `findwords` is true it is followed by the word-level hints from
  `trans` for the non-English part of `sentence`, and by one identity pair per
  run of ASCII letters (such words are passed through untranslated).

  Returns:
    The assembled example text as a single string.
  """
  parts = []
  for zh_example, amis_example in find_knn_examples_topN_sentence(sentence, datastore_embeddings, k):
    parts.append("[zh]:" + zh_example + "\n")
    parts.append("[amis]:" + amis_example + "\n\n")

  if findwords:
    # Strip punctuation: full-width marks become spaces, ASCII punctuation is
    # deleted. Besides "，。！" the common marks "？、；：" and brackets/quotes
    # are covered, otherwise they reach the dictionary lookup as unknown "words".
    full_width_marks = "，。！？、；：「」『』（）"
    translator = str.maketrans(full_width_marks, " " * len(full_width_marks), string.punctuation)
    tr_sentence = sentence.translate(translator)

    # Drop the English words before the dictionary lookup
    sentence_without_english = re.sub(r'[a-zA-Z]+', '', tr_sentence)

    # Look up the Chinese word list
    parts.append(trans(sentence_without_english))

    # English words are carried over unchanged
    for word in re.findall(r'[a-zA-Z]+', tr_sentence):
      parts.append("[zh]:"+word+"\n")
      parts.append("[amis]:"+word+"\n\n")

  return "".join(parts)

## iii. Translation

### OpenAI prompting

In [None]:
%pip install request

[31mERROR: Could not find a version that satisfies the requirement request (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for request[0m[31m
[0m

In [None]:
# import requests
# import json
# from transformers import AutoTokenizer

# url = ""
# #tokenizer = AutoTokenizer.from_pretrained("MediaTek-Research/Breeze-7B-Instruct-v0.1")

# def Breeze_st(messages):
#     prompt = tokenizer.apply_chat_template(messages, tokenize=False)

#     payload = json.dumps({
#         "inputs": prompt,
#         "parameters": {
#             "do_sample": True,
#             "temperature": 0.01,
#             "top_p": 0.95
#         }
#     })
#     headers = {
#         'Content-Type': 'application/json',
#         'accept': 'application/json'
#     }

#     response = requests.request("POST", url, headers=headers, data=payload)

#     return response.text

# def translate_ch2amis(sentence, datastore_embeddings, knn_k=10, cot_num=2, findwords=True):
#     examples = find_knn_examples(sentence, datastore_embeddings, knn_k, findwords)
#     cot_examples = find_knn_examples_topN_sentence(sentence, datastore_embeddings, cot_num)

#     messages = []
#     for i in cot_examples:
#         messages.append({"role": "user", "content":"You are an amis language translator. The followings some [zh] to [amis] examples. \n" + find_knn_examples(i[0], datastore_embeddings, knn_k, findwords) + "If you see [*zh], it means that I couldn't find it in my dictionary, so I have identified the term closest in meaning. Please help me determine whether it has the same meaning as the next [zh] (the one I identified as the closest). If it does, please refer to it , If they are not similar, ignore it. If you see English, it means a proper noun, Please ignore what it brings back , and please put it directly into the translated sentence when translating, don't ignore it. cloud you help to translate[zh]:" + i[0]})
#         messages.append({"role": "assistant", "content": i[1]})

#     messages.append({"role": "user", "content":"You are an amis language translator. The followings some [zh] to [amis] examples. \n" + examples + "If you see [*zh], it means that I couldn't find it in my dictionary, so I have identified the term closest in meaning. Please help me determine whether it has the same meaning as the next [zh] (the one I identified as the closest). If it does, please refer to it , If they are not similar, ignore it. cloud you help to translate[zh]:" + sentence})

#     return Breeze_st(messages)


In [None]:
# from transformers import AutoTokenizer
# #tokenizer = AutoTokenizer.from_pretrained("MediaTek-Research/Breeze-7B-Instruct-v0.1")
# chat = [
#   {"role": "user", "content": "你好，請問你可以完成什麼任務？"},
#   {"role": "assistant", "content": "你好，我可以幫助您解決各種問題、提供資訊和協助您完成許多不同的任務。例如：回答技術問題、提供建議、翻譯文字、尋找資料或協助您安排行程等。請告訴我如何能幫助您。"},
#   {"role": "user", "content": "太棒了！"},
# ]
# tokenizer.apply_chat_template(chat, tokenize=False)

'<s>You are a helpful AI assistant built by MediaTek Research. The user you are helping speaks Traditional Chinese and comes from Taiwan.   [INST] 你好，請問你可以完成什麼任務？ [/INST] 你好，我可以幫助您解決各種問題、提供資訊和協助您完成許多不同的任務。例如：回答技術問題、提供建議、翻譯文字、尋找資料或協助您安排行程等。請告訴我如何能幫助您。 [INST] 太棒了！ [/INST] '

In [None]:
# import requests
# import json
# from transformers import AutoTokenizer

# url = ""

# tokenizer = AutoTokenizer.from_pretrained("MediaTek-Research/Breeze-7B-Instruct-v0.1")

# def Breeze_st(messages):
#     prompt = tokenizer.apply_chat_template(messages, tokenize=False)

#     payload = json.dumps({
#         "inputs": prompt,
#         "parameters": {
#             "do_sample": True,
#             "temperature": 0.01,
#             "top_p": 0.95
#         }
#     })
#     headers = {
#         'Content-Type': 'application/json',
#         'accept': 'application/json'
#     }

#     response = requests.request("POST", url, headers=headers, data=payload)

#     return_text = response.text

#     return return_text

# def translate_ch2amis(sentence, datastore_embeddings, knn_k=10, cot_num=2, findwords=True):
#     # Get knn examples
#     examples = find_knn_examples(sentence, datastore_embeddings, knn_k, findwords)
#     cot_examples = find_knn_examples_topN_sentence(sentence, datastore_embeddings, cot_num)

#     messages = []
#     for i in cot_examples:
#         messages.append({"role": "user", "content":"You are an amis language translator. The followings some [zh] to [amis] examples. \n" + find_knn_examples(i[0], datastore_embeddings, knn_k, findwords) + "If you see [*zh], it means that I couldn't find it in my dictionary, so I have identified the term closest in meaning. Please help me determine whether it has the same meaning as the next [zh] (the one I identified as the closest). If it does, please refer to it , If they are not similar, ignore it. If you see English, it means a proper noun, Please ignore what it brings back , and please put it directly into the translated sentence when translating, don't ignore it. cloud you help to translate[zh]:" + i[0]})
#         messages.append({"role": "assistant", "content": i[1]})

#     messages.append({"role": "user", "content":"You are an amis language translator. The followings some [zh] to [amis] examples. \n" + examples + "If you see [*zh], it means that I couldn't find it in my dictionary, so I have identified the term closest in meaning. Please help me determine whether it has the same meaning as the next [zh] (the one I identified as the closest). If it does, please refer to it , If they are not similar, ignore it. cloud you help to translate[zh]:" + sentence})

#     return Breeze_st(messages)


In [None]:
import os
import openai
# Read the key from the OPENAI_API_KEY environment variable rather than pasting it
# into the notebook (where it would be saved and shared with the file).
# Falls back to the empty string, the previous placeholder value.
openai.api_key = os.environ.get("OPENAI_API_KEY", "")

In [None]:
import time
def GPT_st(messages, model="gpt-3.5-turbo-0301", max_tokens=512, temperature=0):
  """Send `messages` to the OpenAI chat-completion API and return the reply text.

  Args:
    messages: list of {"role": ..., "content": ...} dicts.
    model: chat model name (e.g. "gpt-4-1106-preview" as an alternative).
    max_tokens: upper bound on the length of the reply.
    temperature: sampling temperature.

  Returns:
    The content string of the first choice.

  Errors raised by the OpenAI client are not caught here.
  """
  # Get response from openai
  response = openai.ChatCompletion.create(
    model=model,
    messages=messages,
    max_tokens=max_tokens,
    temperature=temperature,
  )
  return response['choices'][0]['message']['content']

In [None]:
def translate_ch2amis(sentence, datastore_embeddings, knn_k=10, cot_num=2, findwords=True):
  """Translate one Chinese sentence to Amis with a few-shot chat prompt.

  The `cot_num` nearest datastore sentences become worked user/assistant
  turns, each with its own retrieved examples; the final user turn carries the
  examples retrieved for `sentence` itself. Returns the text from `GPT_st`.
  """
  # Prompt fragments; the wording is sent to the model verbatim.
  intro = "You are an amis language translator. The followings some [zh] to [amis] examples. \n"
  star_hint = "If you see [*zh], it means that I couldn't find it in my dictionary, so I have identified the term closest in meaning. Please help me determine whether it has the same meaning as the next [zh] (the one I identified as the closest). If it does, please refer to it , If they are not similar, ignore it. "
  english_hint = "If you see English, it means a proper noun, Please ignore what it brings back , and please put it directly into the translated sentence when translating, don't ignore it. "
  ask = "cloud you help to translate[zh]:"

  # Get knn examples
  examples = find_knn_examples(sentence, datastore_embeddings, knn_k, findwords)
  cot_examples = find_knn_examples_topN_sentence(sentence, datastore_embeddings, cot_num)

  messages = []
  for shot in cot_examples:
    shot_examples = find_knn_examples(shot[0], datastore_embeddings, knn_k, findwords)
    messages.append({"role": "user", "content": intro + shot_examples + star_hint + english_hint + ask + shot[0]})
    messages.append({"role": "assistant", "content": shot[1]})

  # NOTE(review): the final request omits the English/proper-noun hint that the
  # worked examples include — confirm whether that is deliberate.
  messages.append({"role": "user", "content": intro + examples + star_hint + ask + sentence})

  return GPT_st(messages)

# translate_ch2amis("我是Sawmah。", all_datastore_embeddings)

In [None]:
# Number of datastore sentences that have an embedding.
print(len(all_datastore_embeddings))

380


In [None]:
# Printing the whole dict exceeds the notebook's IOPub data rate limit,
# so preview only a few entries (sentence and embedding shape).
for zh_sentence, embedding in list(all_datastore_embeddings.items())[:3]:
  print(zh_sentence, embedding.shape)

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



### Batch translate

In [None]:
# Translate the source Chinese data into Nanshi Amis
def translate_to_amei(chinese_data, datastore_embeddings, knn_k=10, cot_num=2, findwords=True):
  """Translate every sentence of `chinese_data` and collect the replies.

  Progress (running number, source sentence, reply) is printed per sentence.
  Returns a dict mapping each source sentence to the raw reply of
  `translate_ch2amis`.
  """
  result = {}
  for number, sentence in enumerate(chinese_data, start=1):
    print(f"{number}.")
    print(sentence)
    # The reply is stored as-is; no "[amis]:" prefix is stripped.
    translated_sentence = translate_ch2amis(sentence, datastore_embeddings, knn_k, cot_num, findwords)
    print(translated_sentence + '\n')
    result[sentence] = translated_sentence
  return result

## iv. Learn from mistakes

### Create data

In [None]:
import random
def create_LFM_data(mode, t_data):
  """Prepare the input for the learn-from-mistakes step.

  mode 0 returns `t_data` itself. Any other mode returns a new dict in which
  each value of `t_data` maps to its key with the whitespace-separated words
  in random order.
  """
  if mode == 0:
    return t_data

  shuffled = {}
  for original, target in t_data.items():
    tokens = original.split()
    random.shuffle(tokens)
    shuffled[target] = " ".join(tokens)
  return shuffled

### Find similar in knn and translate it (as wrong answer)

In [None]:
def create_wrong_set(mode, topN_sentence, datastore_embeddings):
  """Pair each reference sentence with a deliberately imperfect answer.

  Args:
    mode: 0 -> the imperfect answer is the pipeline's own translation of the
      Chinese sentence; otherwise it is the correct answer with its words
      shuffled.
    topN_sentence: iterable of (zh sentence, correct answer) pairs.
    datastore_embeddings: datastore passed on to `translate_ch2amis` (mode 0).

  Returns:
    A list of `[zh sentence, imperfect answer, correct answer]` triples.
  """
  wrong_set = []
  for pair in topN_sentence:
    if mode == 0:
      # Translate (the knn lookup already excludes the sentence itself)
      answer = translate_ch2amis(pair[0], datastore_embeddings)
    else:
      tokens = pair[1].split()
      random.shuffle(tokens)
      answer = " ".join(tokens)
    wrong_set.append([pair[0], answer, pair[1]])
  return wrong_set

In [None]:
def find_wrong_example(mode, sentence, datastore_embeddings, k=2):
  """Build the wrong-answer set from the `k` sentences most similar to `sentence`.

  The neighbours come from `find_knn_examples_topN_sentence` (default k=2) and
  are turned into triples by `create_wrong_set` using `mode`.
  """
  return create_wrong_set(
    mode,
    find_knn_examples_topN_sentence(sentence, datastore_embeddings, k),
    datastore_embeddings,
  )

### Error checking func

In [None]:
# import requests
# import json
# from transformers import AutoTokenizer

# url = ""

# #tokenizer = AutoTokenizer.from_pretrained("MediaTek-Research/Breeze-7B-Instruct-v0.1")

# def Breeze_error_checking(messages, cnt):
#     prompt = tokenizer.apply_chat_template(messages, tokenize=False)

#     payload = json.dumps({
#         "inputs": prompt,
#         "parameters": {
#             "do_sample": True,
#             "temperature": 0.01,
#             "top_p": 0.95
#         }
#     })
#     headers = {
#         'Content-Type': 'application/json',
#         'accept': 'application/json'
#     }

#     response = requests.request("POST", url, headers=headers, data=payload)
#     return_text = response.text

#     if return_text.find('\n') != -1 and cnt < 10:
#         print("[re-generating...]")
#         cnt += 1
#         return Breeze_error_checking(messages, cnt)
#     else:
#         return return_text

# def error_checking(sentence, amis, wrong_set, pf, datastore_embeddings, k, findwords):
#     hint = ""
#     for i in wrong_set:
#         if not hint:
#             hint += "[zh]:" + i[0] + " [your answer]:" + i[1] + " [correct answer]:" + i[2]
#         else:
#             hint += " ; [zh]:" + i[0] + " [your answer]:" + i[1] + " [correct answer]:" + i[2]

#     examples = find_knn_examples(sentence, datastore_embeddings, k, findwords)

#     messages = []
#     for i in pf:
#         messages.append({"role": "user", "content": "Confirm whether the following sentence needs revision: [zh]:"+ i[0] + " [your answer]:" + i[1] + " \n" + "Just answer me the [Revision]. Just answer me the [Revision]. Just answer me the [Revision]."})
#         messages.append({"role": "assistant", "content": i[2]})

#     messages.append({"role": "system", "content": "Here are several sets of results you translated before: " + hint + " \n" + \
#               "Please analyze the differences between [your answer] and [Revision] result in contexts. " + \
#               "Mainly focusing on the arrangement of words in sentences, learning their structure and grammar. Just change their order. Just change their order. Just change their order.\n"})

#     messages.append({"role": "user", "content": "These are some tips of generating your case: " + examples + "\n" + \
#             "Confirm whether the following sentence needs revision: [zh]:"+ sentence + "[your answer]:" + amis + "\n" + \
#             "Just answer me the [Revision]. Just answer me the [Revision]. Just answer me the [Revision]."})

#     cnt = 0
#     response = Breeze_error_checking(messages, cnt)
#     print(response)


In [None]:
# import requests
# import json
# from transformers import AutoTokenizer

# url = ""

# tokenizer = AutoTokenizer.from_pretrained("MediaTek-Research/Breeze-7B-Instruct-v0.1")

# def Breeze_error_checking(messages, cnt):
#     prompt = tokenizer.apply_chat_template(messages, tokenize=False)

#     payload = json.dumps({
#         "inputs": prompt,
#         "parameters": {
#             "do_sample": True,
#             "temperature": 0.01,
#             "top_p": 0.95
#         }
#     })
#     headers = {
#         'Content-Type': 'application/json',
#         'accept': 'application/json'
#     }

#     response = requests.request("POST", url, headers=headers, data=payload)
#     return_text = response.text

#     if return_text.find('\n') != -1 and cnt < 10:
#         print("[re-generating...]")
#         cnt += 1
#         return Breeze_error_checking(messages, cnt)
#     else:
#         return return_text

# def error_checking(sentence, amis, wrong_set, pf, datastore_embeddings, k, findwords):
#     hint = ""
#     for i in wrong_set:
#         if not hint:
#             hint += "[zh]:" + i[0] + " [your answer]:" + i[1] + " [correct answer]:" + i[2]
#         else:
#             hint += " ; [zh]:" + i[0] + " [your answer]:" + i[1] + " [correct answer]:" + i[2]

#     examples = find_knn_examples(sentence, datastore_embeddings, k, findwords)

#     messages = []
#     for i in pf:
#         messages.append({"role": "user", "content": "Confirm whether the following sentence needs revision: [zh]:"+ i[0] + " [your answer]:" + i[1] + " \n" + "Just answer me the [Revision]. Just answer me the [Revision]. Just answer me the [Revision]."})
#         messages.append({"role": "assistant", "content": i[2]})

#     messages.append({"role": "system", "content": "Here are several sets of results you translated before: " + hint + " \n" + \
#               "Please analyze the differences between [your answer] and [Revision] result in contexts. " + \
#               "Mainly focusing on the arrangement of words in sentences, learning their structure and grammar. Just change their order. Just change their order. Just change their order.\n"})

#     messages.append({"role": "user", "content": "These are some tips of generating your case: " + examples + "\n" + \
#             "Confirm whether the following sentence needs revision: [zh]:"+ sentence + "[your answer]:" + amis + "\n" + \
#             "Just answer me the [Revision]. Just answer me the [Revision]. Just answer me the [Revision]."})

#     cnt = 0
#     return Breeze_error_checking(messages, cnt)


In [None]:
# def GPT_error_checking(messages, cnt):
#   try:
#     response = openai.ChatCompletion.create(
#     # model="gpt-3.5-turbo-16k-0613",
#     model="gpt-4-1106-preview",
#     messages = messages,
#     max_tokens = 512,
#     temperature = 0
#     )['choices'][0]['message']['content']
#     if response.find('\n') != -1 and cnt < 10:
#       #生成不只一行的話，代表除了Revision外，還生了其他東西
#       #給他10次機會簡單的過濾，不然就直接輸出
#       #當然只有一行也可能有其他東西，生完還是要人工審核跟改正一下
#       print("[re-generating...]")
#       cnt += 1
#       return GPT_error_checking(messages, cnt)
#     else:
#       return response
#   except:
#      time.sleep(0.5)
#      return GPT_error_checking(messages, cnt)

In [None]:
# import time
# def error_checking(sentence, amis, wrong_set, pf, datastore_embeddings, k, findwords):

#   hint = ""
#   for i in wrong_set:
#     if not hint:
#       hint += "[zh]:" + i[0] + " [your answer]:" + i[1] + " [correct answer]:" + i[2]
#     else:
#       hint += " ; [zh]:" + i[0] + " [your answer]:" + i[1] + " [correct answer]:" + i[2]
#   # print(hint)

#   examples = find_knn_examples(sentence, datastore_embeddings, k, findwords)

#   messages = []
#   for i in pf:
#     messages.append({"role": "user", "content": "Confirm whether the following sentence needs revision: [zh]:"+ i[0] + " [your answer]:" + i[1] + " \n" + "Just answer me the [Revision]. Just answer me the [Revision]. Just answer me the [Revision]."})
#     messages.append({"role": "assistant", "content": i[2]})

#   messages.append({"role": "system", "content": "Here are several sets of results you translated before: " + hint + " \n" + \
#               "Please analyze the differences between [your answer] and [Revision] result in contexts. " + \
#               "Mainly focusing on the arrangement of words in sentences, learning their structure and grammar. Just change their order. Just change their order. Just change their order.\n"})

#   messages.append({"role": "user", "content": "These are some tips of generating your case: " + examples + "\n" + \
#             "Confirm whether the following sentence needs revision: [zh]:"+ sentence + "[your answer]:" + amis + "\n" + \
#             "Just answer me the [Revision]. Just answer me the [Revision]. Just answer me the [Revision]."})

#   cnt = 0
#   return GPT_error_checking(messages, cnt)

### Batch error checking

In [None]:
def batch_error_checking(mode, translated_data, pf, datastore_embeddings, lfm_k, knn_k, findwords=True):
    """Run the revision step on every translated sentence.

    For each (sentence, translation) pair a wrong-answer set is built with
    `find_wrong_example` and passed, together with `pf`, to `error_checking`.
    Progress is printed per sentence.

    Returns:
        A dict mapping each sentence to the revision returned by `error_checking`.

    NOTE(review): every definition of `error_checking` in this notebook is
    commented out, so one of them must be re-enabled before this runs.
    """
    result = {}
    for number, (sentence, response) in enumerate(translated_data.items(), start=1):
        print(f"{number}.")
        print(sentence)
        wrong_set = find_wrong_example(mode, sentence, datastore_embeddings, lfm_k)
        print(wrong_set)
        print(response)
        revision = error_checking(sentence, response, wrong_set, pf, datastore_embeddings, knn_k, findwords)

        print(revision + "\n")
        result[sentence] = revision
    return result

## v. Scoring

In [None]:
import string
def remove_punctuation(text):
    """Return `text` with every ASCII punctuation character removed.

    The removed set is `string.punctuation`; non-ASCII marks such as the
    typographic apostrophe "’" are kept.
    """
    # Refer to string.punctuation explicitly so the function does not depend on
    # a separate `from string import punctuation` having been executed.
    return text.translate(str.maketrans('', '', string.punctuation))

def replace_quotes(obj):
    """Replace every ASCII apostrophe (') with the typographic one (’).

    Strings yield a new string. Dicts and lists are updated in place,
    recursively, and the same container object is returned. Any other value is
    returned unchanged.
    """
    if isinstance(obj, str):
        return obj.replace("'", "’")

    if isinstance(obj, dict):
        for key in obj:
            obj[key] = replace_quotes(obj[key])
    elif isinstance(obj, list):
        for position, item in enumerate(obj):
            obj[position] = replace_quotes(item)
    return obj

In [None]:
import json
from nltk.translate.bleu_score import sentence_bleu
from string import punctuation

def bleu_scoring(translated_data, chinese_data, scoring_result=0):
  """Score translations against references with sentence-level BLEU.

  Args:
    translated_data: dict whose values are the hypothesis sentences, in the
      same order as the references.
    chinese_data: iterable of reference sentences (iterating a dict uses its
      keys).
    scoring_result: falsy for a first-pass scoring. Otherwise the list returned
      by an earlier call for the same sentences; each entry then reports
      [earlier score, new score] pairs.

  Returns:
    A list with one dict per sentence, followed by a dict holding the mean of
    the default BLEU score ("平均 BLEU 分數") and a dict with the mean
    BLEU-1..4 scores.
  """
  # Weights for BLEU-1 .. BLEU-4.
  ngram_weights = (
      (1, 0, 0, 0),
      (0.5, 0.5, 0, 0),
      (0.33, 0.33, 0.33, 0),
      (0.25, 0.25, 0.25, 0.25),
  )

  bleu_scores = []
  combined_data = []
  ngram_totals = [0, 0, 0, 0]

  for position, (ref, hyp) in enumerate(zip(chinese_data, translated_data.values())):
    hyp = replace_quotes(hyp)
    print(hyp)

    # Normalise apostrophes on BOTH sides before stripping punctuation;
    # otherwise "'" is deleted from the reference but survives (as "’") in the
    # hypothesis, and identical words stop matching.
    reference = [remove_punctuation(replace_quotes(ref)).lower().split()]  # Remove punctuation and convert to lowercase
    hypothesis = remove_punctuation(hyp).lower().split()   # Remove punctuation and convert to lowercase

    bleu_score = sentence_bleu(reference, hypothesis)
    bleu_scores.append(bleu_score)

    ngram_scores = [sentence_bleu(reference, hypothesis, weights=w) for w in ngram_weights]
    for order, score in enumerate(ngram_scores):
      ngram_totals[order] += score

    # Append the results to the list
    if not scoring_result:
      entry = {"參考句子": ref, "翻譯句子": hyp}
      for order, score in enumerate(ngram_scores, start=1):
        entry[f"BLEU-{order}"] = score
    else:
      previous = scoring_result[position]
      entry = {"參考句子": ref, "翻譯句子": previous["翻譯句子"], "錯誤反思": hyp}
      for order, score in enumerate(ngram_scores, start=1):
        entry[f"BLEU-{order}(原,LFM)"] = [previous[f"BLEU-{order}"], score]
    entry["BLEU 分數"] = bleu_score
    combined_data.append(entry)

  # Calculate the average BLEU scores
  count = len(combined_data)
  print(count)
  divisor = count if count else 1  # empty input: report 0 instead of dividing by zero
  avg_bleu_1, avg_bleu_2, avg_bleu_3, avg_bleu_4 = (total / divisor for total in ngram_totals)
  averages = (avg_bleu_1, avg_bleu_2, avg_bleu_3, avg_bleu_4)

  # Add the average BLEU scores as a new entry
  if not scoring_result:
    avg_entry = {f"Average BLEU-{order}": avg for order, avg in enumerate(averages, start=1)}
  else:
    # Layout of an earlier result: `count` sentence entries, then the mean
    # BLEU entry, then the averages entry.
    previous_avg = scoring_result[count + 1]
    avg_entry = {
        f"Average BLEU-{order}(原,LFM)": [previous_avg[f"Average BLEU-{order}"], avg]
        for order, avg in enumerate(averages, start=1)
    }

  print("Average BLEU-1:", avg_bleu_1)
  print("Average BLEU-2:", avg_bleu_2)
  print("Average BLEU-3:", avg_bleu_3)
  print("Average BLEU-4:", avg_bleu_4)

  # Mean of the default-weight BLEU score
  avg_bleu_score = sum(bleu_scores) / divisor
  combined_data.append({"平均 BLEU 分數": avg_bleu_score})

  combined_data.append(avg_entry)

  return combined_data

# 2. Main

### Settings

In [None]:
# Modes
Language = "海岸阿美族語"

Random_select = True # Whether to randomly draw 100 entries from the datastore (if False, Test_data_path must be provided)

Download_random_data = True # No effect if Random_select is False

Test_data_path = "" # No effect if Random_select is True (Only effect when Random_select is False)

#---------------------------------------
# Knn parameters
Knn_k = 10
COT_num = 2
Find_words = True # Whether to provide the word list as a reference
file_title = "(海岸阿美_k=10_COT_2e)"

#---------------------------------------
# Learn from mistakes(or re-ordering) parameters

LFM_test_data_mode = 0 # 0: Data from "Translating Pipeline" 1: Data from "Re-ordering Correct Answer" (Mode 1 only use for testing and observation)

LFM_ref_data_mode = 0 # 0: Reference from "Translating Mistakes" 1: Reference from "Re-ordering Correct Answer"

LFM_k = 2
LFM_find_words = True # Whether to provide the word list as a reference
LFM_file_title = "(海岸阿美_Shuffle_LFM)"

###### main functions

In [None]:
from google.colab import files
def save_and_download(info, data, path):
  """Write `data` as pretty-printed UTF-8 JSON to `path` and trigger a Colab download.

  `info` is only used in the confirmation message printed afterwards.
  """
  serialized = json.dumps(data, ensure_ascii=False, indent=2)
  with open(path, "w", encoding="utf-8") as out_file:
    out_file.write(serialized)
  files.download(path)
  print(f"已將{info}存至: {path}")

In [None]:
import os
import random

def create_test_data(random_select, download_random_data, Test_data_path, version):
  """Split the datastore into a test set and the remaining retrieval datastore.

  Args:
    random_select: if false, the test set is loaded from `Test_data_path`;
      otherwise up to 100 datastore sentences are drawn at random.
    download_random_data: when drawing at random, whether to save and download
      the drawn test set.
    Test_data_path: JSON file with the test set (used when `random_select` is
      false).
    version: run number, used in the saved file name.

  Returns:
    (test_data, test_datastore_embeddings, Test_data_path): the test set, a
    copy of `all_datastore_embeddings` without the test sentences, and the
    path of the test-set file ("" when a random set was not saved).
  """
  test_datastore_embeddings = dict(all_datastore_embeddings)

  if not random_select:
    with open(Test_data_path, "r", encoding="utf-8") as fh:
      test_data = json.load(fh)
    for zh_sentence in test_data.values():
      # A test sentence that is not in the datastore needs no removal.
      test_datastore_embeddings.pop(zh_sentence, None)
    return test_data, test_datastore_embeddings, Test_data_path

  Test_data_path = ""
  test_data = {}
  # Draw without replacement; never ask for more sentences than exist.
  sample_size = min(100, len(test_datastore_embeddings))
  for zh_sentence in random.sample(list(test_datastore_embeddings.keys()), sample_size):
    test_data[all_ch2amis[zh_sentence]] = zh_sentence
    del test_datastore_embeddings[zh_sentence]

  # Use the parameter, not the module-level `Download_random_data` setting.
  if download_random_data:
    Test_data_path = "/content/" + Language + "_隨機抽取100筆_v" + str(version) + ".json"
    save_and_download("test_data", test_data, Test_data_path)
  return test_data, test_datastore_embeddings, Test_data_path

In [None]:
def get_incexp(k):
  """Pick `k` random datastore pairs outside the test set and build a wrong-answer set.

  Pairs are (key, value) items of `all_ch2amis` whose value is not a key of
  `test_data`; they are drawn with replacement and passed to
  `create_wrong_set` with `LFM_ref_data_mode` and `test_datastore_embeddings`.

  Raises:
    ValueError: if `k` > 0 but every datastore pair belongs to the test set
      (the previous retry loop would never terminate in that case).
  """
  # Filter once instead of re-drawing until a non-test pair comes up.
  candidates = [pair for pair in all_ch2amis.items() if pair[1] not in test_data]
  if k > 0 and not candidates:
    raise ValueError("no datastore sentence outside the test set is available")

  pf = [random.choice(candidates) for _ in range(k)]

  return create_wrong_set(LFM_ref_data_mode, pf, test_datastore_embeddings)

### Pipeline

In [None]:
version = 1
print("Running version:", version)

# Build the test dataset (test_data: the held-out sentences with their correct answers,
# test_datastore_embeddings: the remaining datastore sentences)
test_data, test_datastore_embeddings, Test_data_path = create_test_data(Random_select, Download_random_data, Test_data_path, version)

Running version: 1


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

已將test_data存至: /content/海岸阿美族語_隨機抽取100筆_v1.json


In [None]:
import openai
# Show which openai client version is installed in this kernel.
print(openai.__version__)


0.28.1


##### Translate pipeline

In [None]:
if not LFM_test_data_mode:
  # Translate the Chinese sentences into the indigenous language
  translated_data = translate_to_amei(list(test_data.values()), test_datastore_embeddings, Knn_k, COT_num, Find_words)

  # Save the translation file
  save_and_download("翻譯結果", translated_data, "/content/" + file_title + "translated_data_v" + str(version) + ".json")

  # Scoring
  scoring_result = bleu_scoring(translated_data, test_data)
  save_and_download("評分結果", scoring_result, "/content/" + file_title + "BleuScore_v" + str(version) + ".json")

else:
  # Mode 1 skips translation; the placeholder keeps the name defined for the next cell.
  translated_data = 0

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


1.
沒事我還好，謝謝老師。


IndexError: index out of range in self

##### Learn from mistakes pipeline

In [None]:
# Build or load the test dataset for the learn-from-mistakes step.
# Both `translated_data` and `test_data` must already be defined (the list is built before indexing).
LFM_test_data = create_LFM_data(LFM_test_data_mode, list([translated_data, test_data])[LFM_test_data_mode])

# When LFM_test_data_mode=1, score the (shuffled) dataset first
if LFM_test_data_mode:
  scoring_result = bleu_scoring(LFM_test_data, test_data)
  save_and_download("打亂結果", scoring_result, "/content/Shuffle_BleuScore_v" + str(version) + ".json")

# Run the revision
pf = get_incexp(2)
LFM_result = batch_error_checking(LFM_ref_data_mode, LFM_test_data, pf, test_datastore_embeddings, LFM_k, Knn_k, LFM_find_words)

# Save the revision file
save_and_download("Revision結果", LFM_result, "/content/" + LFM_file_title + "revision_data_v" + str(version) + "_gpt4.json")

# Scoring
LFM_scoring_result = bleu_scoring(LFM_result, test_data, scoring_result)
save_and_download("LFM評分結果", LFM_scoring_result, "/content/" + LFM_file_title + "BleuScore_v" + str(version) + "_gpt4.json")

NameError: name 'translated_data' is not defined

# Testing