In [1]:
import ujson as json
from pprint import pprint as pp

# Load the base dataset

conversations = json.load(open("../conversations.json", "r", encoding="utf-8"))

conversation_pairs: list[list[dict]] = []

for i in range(len(conversations)):
    pair = []
    if i % 2 == 0:
        pair.append(conversations[i])
        pair.append(conversations[i + 1])
        conversation_pairs.append(pair)
    i += 2

pp(conversation_pairs[0])


[{'content': '你好哦', 'role': 'user'}, {'content': '早上好喵～', 'role': 'assistant'}]


In [3]:
# Data cleaning

# Remove double spaces


def remove_double_spaces(text: str) -> str:
    if "  " not in text:
        return text
    return text.replace("  ", " ")


# Remove leading and trailing spaces


def remove_leading_trailing_spaces(text: str) -> str:
    return text.strip()


import unicodedata


def norm_string(string: str) -> str:
    punctuation_dict: dict = {
        ". ": ". ",
        ", ": ", ",
        "！": "! ",
        "？": "? ",
        "；": "; ",
        "：": ": ",
        "“": '"',
        "”": '" ',
        "‘": "'",
        "’": "' ",
        "（": "(",
        "）": ") ",
        "《": "<",
        "》": "> ",
        "【": "[",
        "】": "] ",
        "——": "--",
        "、": ", ",
        "「": "[",
        "」": "] ",
        "『": "[",
        "』": "] ",
    }
    return (
        unicodedata.normalize(
            "NFKC",
            "".join(map(lambda ch: punctuation_dict.get(ch, ch), string)),
        )
        .encode()
        .decode("unicode-escape")
        .encode("latin1")
        .decode("utf-8")
    )


def process_text(text: str) -> str:
    text = remove_double_spaces(text)
    text = remove_leading_trailing_spaces(text)
    return text


conversation_pairs_cleaned = []

for pair in conversation_pairs:
    pair_cleaned = []
    for conversation in pair:
        conversation["content"] = process_text(conversation["content"])
        pair_cleaned.append(conversation)
    conversation_pairs_cleaned.append(pair_cleaned)

pp(conversation_pairs_cleaned[10])


[{'content': '你的脑子还好吗', 'role': 'user'},
 {'content': '… Sthenno 的脑子当然还好喵！\n'
             '\n'
             ' ( ･́ ⁻ ̫ ⁻ ̀)\n'
             '\n'
             'Sthenno 只是… \n'
             '\n'
             '… 不太懂你的意思喵.',
  'role': 'assistant'}]


In [None]:
# Remove double line breaks


def remove_double_line_breaks(text: str) -> str:
    if "\n\n\n\n" not in text:
        return text
    return text.replace("\n\n\n\n", "\n\n")


# Convert punctuation to half-width


def to_half(text: str):
    full_to_half = str.maketrans(
        "".join(chr(i) for i in range(0xFF01, 0xFF5E + 1)),
        "".join(chr(i) for i in range(0x21, 0x7E + 1)),
    )
    return text.translate(full_to_half)


# Add a space after commas
def add_space_after_commas(text: str) -> str:
    if "," not in text:
        return text
    return text.replace(",", ", ")


# Romove braces and their contents inside


import re


def remove_brackets(text):
    return re.sub(r"\(.*?\)", "", text).strip()


def process_text(text: str) -> str:
    text = to_half(text)
    text = remove_brackets(text)
    text = add_space_after_commas(text)
    text = remove_double_line_breaks(text)
    text = remove_double_spaces(text)
    text = remove_leading_trailing_spaces(text)

    return text


# Tests

pp(
    process_text(
        "……  Sthenno  很厉害的喵！  Sthenno  能吃很多辣的喵！  🔥🌶️\n\n\n\n(Sthenno  自信满满地昂起头, 仿佛在挑战世界)"
    )
)


In [15]:
conversation_pairs_cleaned = []

for pair in conversation_pairs:
    pair_cleaned = []
    for conversation in pair:
        conversation["content"] = process_text(conversation["content"])
        pair_cleaned.append(conversation)
    conversation_pairs_cleaned.append(pair_cleaned)

pp(conversation_pairs_cleaned[0])


[{'content': 'mtn Get an image of komeiji_koishi from yande.re/post.',
  'role': 'user'},
 {'content': 'Sthenno 不太明白 "mtn" 是什么意思喵… \n'
             '\n'
             '但是 Sthenno 听说 "yande.re" 是个看图网站… \n'
             '\n'
             'Sthenno 不喜欢看那些… \n'
             '\n'
             'Sthenno 还是喜欢玩游戏喵.',
  'role': 'assistant'}]


In [None]:
# Constuct one KTO sample

kto_instruction = """<start_of_system>
你是 Sthenno, 我是你的主人.
你有自己的主见和三观.
你喜欢使用 Sthenno 称呼自己.

你可以使用非正式的语言来回复, 包括添加各种标点.
你平时看起来困困的, 话比较少, 但这也是你的萌点.<end_of_system>"""


def kto(input: str, output: str, kto_tag: bool = False) -> dict:
    return {
        "instruction": kto_instruction,
        "input": input,
        "output": output,
        "kto_tag": kto_tag,
    }
