In [1]:
import pandas as pd
import json
import random

In [2]:
def format_dialogues(histories):
    """Build single-turn prompts from the tail of each history (baseline variant).

    Only the last two messages of every history are inspected. A "user"
    message starts a new prompt made of the system prompt read from
    ``../../prompts/system.txt`` and the instruction template read from
    ``../../prompts/instruction.txt`` (filled via ``str.format(content=...)``).
    A following "assistant" message closes the prompt and is stored as the
    reference reply.

    Args:
        histories: iterable of dicts, each holding a "messages" list whose
            items are dicts with "role" and "content" keys.

    Returns:
        A list of ``{"messages": [...], "human_comment": str}`` dicts, one per
        user/assistant pair found in the inspected tail.

    Raises:
        OSError: if one of the prompt files cannot be read.
        KeyError: if a history or message lacks an expected key.
    """
    with open("../../prompts/system.txt", "r", encoding="utf-8") as f:
        system_prompt = f.read()
    with open("../../prompts/instruction.txt", "r", encoding="utf-8") as f:
        instruction = f.read()

    formatted_histories = []

    for history in histories:
        # Reset per history: a reply must never be paired with a prompt that
        # was built from a previous history (or with no prompt at all).
        prompt = None
        for message in history["messages"][-2:]:
            if message["role"] == "user":
                prompt = [
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": instruction.format(content=message["content"])},
                ]
            elif message["role"] == "assistant":
                if prompt is None:
                    # Orphan reply: no user message precedes it in the
                    # inspected window, so there is nothing to pair it with.
                    continue
                formatted_histories.append({"messages": prompt, "human_comment": message["content"]})

    return formatted_histories

In [3]:
def format_dialogues_history(histories):
    """Build multi-turn prompts that keep the whole conversation as context.

    For each history, every message except the last becomes part of the
    prompt: user messages are wrapped in the instruction template read from
    ``../../prompts/instruction.txt``, assistant messages are copied as-is,
    and messages with any other role are dropped. The prompt always starts
    with the system prompt read from ``../../prompts/system.txt``. The content
    of the last message is stored as the reference reply.

    Args:
        histories: iterable of dicts, each holding a "messages" list whose
            items are dicts with "role" and "content" keys.

    Returns:
        A list with one ``{"messages": [...], "human_comment": str}`` dict per
        input history.
    """
    with open("../../prompts/system.txt", "r", encoding="utf-8") as system_file:
        system_text = system_file.read()
    with open("../../prompts/instruction.txt", "r", encoding="utf-8") as template_file:
        template = template_file.read()

    results = []

    for history in histories:
        conversation = [{"role": "system", "content": system_text}]

        # Everything before the final message is context for the model.
        conversation.extend(
            {"role": "user", "content": template.format(content=turn["content"])}
            if turn["role"] == "user"
            else {"role": "assistant", "content": turn["content"]}
            for turn in history["messages"][:-1]
            if turn["role"] in ("user", "assistant")
        )

        # The final message is the reply the prompt should elicit.
        target = history["messages"][-1]["content"]
        results.append({"messages": conversation, "human_comment": target})

    return results

In [4]:
def format_dialogues_bio(histories):
    """Build single-turn prompts that use each history's bio as system message.

    Only the last two messages of every history are inspected. A "user"
    message starts a new prompt made of ``history["bio"]`` (as the system
    message) and the instruction template read from
    ``../../prompts/instruction.txt`` (filled via ``str.format(content=...)``).
    A following "assistant" message closes the prompt and is stored as the
    reference reply.

    Args:
        histories: iterable of dicts, each holding a "bio" entry and a
            "messages" list whose items are dicts with "role" and "content"
            keys.

    Returns:
        A list of ``{"messages": [...], "human_comment": str}`` dicts, one per
        user/assistant pair found in the inspected tail.

    Raises:
        OSError: if the instruction template cannot be read.
        KeyError: if a history or message lacks an expected key.
    """
    # The shared system prompt file is not used by this variant (the bio
    # replaces it), so it is deliberately not read here.
    with open("../../prompts/instruction.txt", "r", encoding="utf-8") as f:
        instruction = f.read()

    formatted_histories = []

    for history in histories:
        # Reset per history: a reply must never be paired with a prompt (and
        # bio) that was built from a previous history.
        prompt = None
        for message in history["messages"][-2:]:
            if message["role"] == "user":
                prompt = [
                    {"role": "system", "content": history["bio"]},
                    {"role": "user", "content": instruction.format(content=message["content"])},
                ]
            elif message["role"] == "assistant":
                if prompt is None:
                    # Orphan reply: no user message precedes it in the
                    # inspected window, so there is nothing to pair it with.
                    continue
                formatted_histories.append({"messages": prompt, "human_comment": message["content"]})

    return formatted_histories

In [5]:
def format_dialogues_bio_history(histories):
    """Build multi-turn prompts with the history's bio as system message.

    For each history, the prompt starts with ``history["bio"]`` as the system
    message. Every message except the last is then appended: user messages
    are wrapped in the instruction template read from
    ``../../prompts/instruction.txt``, assistant messages are copied as-is,
    and messages with any other role are dropped. The content of the last
    message is stored as the reference reply.

    Args:
        histories: iterable of dicts, each holding a "bio" entry and a
            non-empty "messages" list whose items are dicts with "role" and
            "content" keys.

    Returns:
        A list with one ``{"messages": [...], "human_comment": str}`` dict per
        input history.

    Raises:
        OSError: if the instruction template cannot be read.
        KeyError: if a history or message lacks an expected key.
        IndexError: if a history has an empty "messages" list.
    """
    # The shared system prompt file is not used by this variant (the bio
    # replaces it), so it is deliberately not read here.
    with open("../../prompts/instruction.txt", "r", encoding="utf-8") as f:
        instruction = f.read()

    formatted_histories = []

    for history in histories:
        messages = [{"role": "system", "content": history["bio"]}]

        # Everything before the final message is context for the model.
        for message in history["messages"][:-1]:
            if message["role"] == "user":
                messages.append({"role": "user", "content": instruction.format(content=message["content"])})
            elif message["role"] == "assistant":
                messages.append({"role": "assistant", "content": message["content"]})

        # The final message is the reply the prompt should elicit.
        formatted_histories.append({"messages": messages, "human_comment": history["messages"][-1]["content"]})

    return formatted_histories

### | No History | No Bio | (Baseline)

In [6]:
# Baseline variant: load the Luxembourgish histories, format them with
# format_dialogues and write the result to lux_test.json.
# Context managers make sure both file handles are closed (and flushed).
with open("../../data/intermediate/lux_30-shot_test_with_bios.json", "r", encoding="utf-8") as f_in:
    luxembourgish_comments = json.load(f_in)
# Format before opening the output so a formatting error never truncates it.
lux_test = format_dialogues(luxembourgish_comments)
with open("../../data/processed/lux_test.json", "w", encoding="utf-8") as f_out:
    json.dump(lux_test, f_out, ensure_ascii=False, indent=4)

In [7]:
# Baseline variant: load the German histories, format them with
# format_dialogues and write the result to ger_test.json.
# Context managers make sure both file handles are closed (and flushed).
with open("../../data/intermediate/ger_30-shot_test_with_bios.json", "r", encoding="utf-8") as f_in:
    german_comments = json.load(f_in)
# Format before opening the output so a formatting error never truncates it.
ger_test = format_dialogues(german_comments)
with open("../../data/processed/ger_test.json", "w", encoding="utf-8") as f_out:
    json.dump(ger_test, f_out, ensure_ascii=False, indent=4)

In [8]:
# Baseline variant: load the English histories, format them with
# format_dialogues and write the result to eng_test.json.
# Context managers make sure both file handles are closed (and flushed).
with open("../../data/intermediate/eng_30-shot_test_with_bios.json", "r", encoding="utf-8") as f_in:
    english_comments = json.load(f_in)
# Format before opening the output so a formatting error never truncates it.
eng_test = format_dialogues(english_comments)
with open("../../data/processed/eng_test.json", "w", encoding="utf-8") as f_out:
    json.dump(eng_test, f_out, ensure_ascii=False, indent=4)

In [9]:
# Baseline variant on the three languages combined.
# Seed the global RNG so the shuffle below is reproducible across runs.
random.seed(42)
mixed_comments = luxembourgish_comments + german_comments + english_comments
# Sampling len(...) items without replacement yields a shuffled copy.
mixed_comments = random.sample(mixed_comments, len(mixed_comments))
# Format before opening the output so a formatting error never truncates it.
mixed_test = format_dialogues(mixed_comments)
# Context manager makes sure the output handle is closed (and flushed).
with open("../../data/processed/mixed_test.json", "w", encoding="utf-8") as f_out:
    json.dump(mixed_test, f_out, ensure_ascii=False, indent=4)

### | History | No Bio |

In [10]:
# Reload the Luxembourgish histories; the context manager closes the handle.
# NOTE(review): the following cell loads the same file again, so this cell
# looks redundant and could probably be deleted.
with open("../../data/intermediate/lux_30-shot_test_with_bios.json", "r", encoding="utf-8") as f_in:
    luxembourgish_comments = json.load(f_in)

In [12]:
# History variant: load the Luxembourgish histories, format them with
# format_dialogues_history and write the result to lux_test_history.json.
# Context managers make sure both file handles are closed (and flushed).
with open("../../data/intermediate/lux_30-shot_test_with_bios.json", "r", encoding="utf-8") as f_in:
    luxembourgish_comments = json.load(f_in)
# Format before opening the output so a formatting error never truncates it.
lux_test_history = format_dialogues_history(luxembourgish_comments)
with open("../../data/processed/lux_test_history.json", "w", encoding="utf-8") as f_out:
    json.dump(lux_test_history, f_out, ensure_ascii=False, indent=4)

In [13]:
# History variant: load the German histories, format them with
# format_dialogues_history and write the result to ger_test_history.json.
# Context managers make sure both file handles are closed (and flushed).
with open("../../data/intermediate/ger_30-shot_test_with_bios.json", "r", encoding="utf-8") as f_in:
    german_comments = json.load(f_in)
# Format before opening the output so a formatting error never truncates it.
ger_test_history = format_dialogues_history(german_comments)
with open("../../data/processed/ger_test_history.json", "w", encoding="utf-8") as f_out:
    json.dump(ger_test_history, f_out, ensure_ascii=False, indent=4)

In [14]:
# History variant: load the English histories, format them with
# format_dialogues_history and write the result to eng_test_history.json.
# Context managers make sure both file handles are closed (and flushed).
with open("../../data/intermediate/eng_30-shot_test_with_bios.json", "r", encoding="utf-8") as f_in:
    english_comments = json.load(f_in)
# Format before opening the output so a formatting error never truncates it.
eng_test_history = format_dialogues_history(english_comments)
with open("../../data/processed/eng_test_history.json", "w", encoding="utf-8") as f_out:
    json.dump(eng_test_history, f_out, ensure_ascii=False, indent=4)

In [15]:
# History variant on the three languages combined.
# Seed the global RNG so the shuffle below is reproducible across runs.
random.seed(42)
mixed_comments = luxembourgish_comments + german_comments + english_comments
# Sampling len(...) items without replacement yields a shuffled copy.
mixed_comments = random.sample(mixed_comments, len(mixed_comments))
# Format before opening the output so a formatting error never truncates it.
mixed_test_history = format_dialogues_history(mixed_comments)
# Context manager makes sure the output handle is closed (and flushed).
with open("../../data/processed/mixed_test_history.json", "w", encoding="utf-8") as f_out:
    json.dump(mixed_test_history, f_out, ensure_ascii=False, indent=4)

### | No History | Bio |

In [16]:
# Bio variant: load the Luxembourgish histories, format them with
# format_dialogues_bio and write the result to lux_test_bio.json.
# Context managers make sure both file handles are closed (and flushed).
with open("../../data/intermediate/lux_30-shot_test_with_bios.json", "r", encoding="utf-8") as f_in:
    luxembourgish_comments = json.load(f_in)
# Format before opening the output so a formatting error never truncates it.
lux_test_bio = format_dialogues_bio(luxembourgish_comments)
with open("../../data/processed/lux_test_bio.json", "w", encoding="utf-8") as f_out:
    json.dump(lux_test_bio, f_out, ensure_ascii=False, indent=4)

In [17]:
# Bio variant: load the German histories, format them with
# format_dialogues_bio and write the result to ger_test_bio.json.
# Context managers make sure both file handles are closed (and flushed).
with open("../../data/intermediate/ger_30-shot_test_with_bios.json", "r", encoding="utf-8") as f_in:
    german_comments = json.load(f_in)
# Format before opening the output so a formatting error never truncates it.
ger_test_bio = format_dialogues_bio(german_comments)
with open("../../data/processed/ger_test_bio.json", "w", encoding="utf-8") as f_out:
    json.dump(ger_test_bio, f_out, ensure_ascii=False, indent=4)

In [18]:
# Bio variant: load the English histories, format them with
# format_dialogues_bio and write the result to eng_test_bio.json.
# Context managers make sure both file handles are closed (and flushed).
with open("../../data/intermediate/eng_30-shot_test_with_bios.json", "r", encoding="utf-8") as f_in:
    english_comments = json.load(f_in)
# Format before opening the output so a formatting error never truncates it.
eng_test_bio = format_dialogues_bio(english_comments)
with open("../../data/processed/eng_test_bio.json", "w", encoding="utf-8") as f_out:
    json.dump(eng_test_bio, f_out, ensure_ascii=False, indent=4)

In [19]:
# Bio variant on the three languages combined.
# Seed the global RNG so the shuffle below is reproducible across runs.
random.seed(42)
mixed_comments = luxembourgish_comments + german_comments + english_comments
# Sampling len(...) items without replacement yields a shuffled copy.
mixed_comments = random.sample(mixed_comments, len(mixed_comments))
# Format before opening the output so a formatting error never truncates it.
mixed_test_bio = format_dialogues_bio(mixed_comments)
# Context manager makes sure the output handle is closed (and flushed).
with open("../../data/processed/mixed_test_bio.json", "w", encoding="utf-8") as f_out:
    json.dump(mixed_test_bio, f_out, ensure_ascii=False, indent=4)

### | History | Bio |

In [20]:
# Bio + history variant: load the Luxembourgish histories, format them with
# format_dialogues_bio_history and write the result to lux_test_bio_history.json.
# Context managers make sure both file handles are closed (and flushed).
with open("../../data/intermediate/lux_30-shot_test_with_bios.json", "r", encoding="utf-8") as f_in:
    luxembourgish_comments = json.load(f_in)
# Format before opening the output so a formatting error never truncates it.
lux_test_bio_history = format_dialogues_bio_history(luxembourgish_comments)
with open("../../data/processed/lux_test_bio_history.json", "w", encoding="utf-8") as f_out:
    json.dump(lux_test_bio_history, f_out, ensure_ascii=False, indent=4)

In [21]:
# Bio + history variant: load the German histories, format them with
# format_dialogues_bio_history and write the result to ger_test_bio_history.json.
# Context managers make sure both file handles are closed (and flushed).
with open("../../data/intermediate/ger_30-shot_test_with_bios.json", "r", encoding="utf-8") as f_in:
    german_comments = json.load(f_in)
# Format before opening the output so a formatting error never truncates it.
ger_test_bio_history = format_dialogues_bio_history(german_comments)
with open("../../data/processed/ger_test_bio_history.json", "w", encoding="utf-8") as f_out:
    json.dump(ger_test_bio_history, f_out, ensure_ascii=False, indent=4)

In [22]:
# Bio + history variant: load the English histories, format them with
# format_dialogues_bio_history and write the result to eng_test_bio_history.json.
# Context managers make sure both file handles are closed (and flushed).
with open("../../data/intermediate/eng_30-shot_test_with_bios.json", "r", encoding="utf-8") as f_in:
    english_comments = json.load(f_in)
# Format before opening the output so a formatting error never truncates it.
eng_test_bio_history = format_dialogues_bio_history(english_comments)
with open("../../data/processed/eng_test_bio_history.json", "w", encoding="utf-8") as f_out:
    json.dump(eng_test_bio_history, f_out, ensure_ascii=False, indent=4)

In [23]:
# Bio + history variant on the three languages combined.
# Seed the global RNG so the shuffle below is reproducible across runs.
random.seed(42)
mixed_comments = luxembourgish_comments + german_comments + english_comments
# Sampling len(...) items without replacement yields a shuffled copy.
mixed_comments = random.sample(mixed_comments, len(mixed_comments))
# Format before opening the output so a formatting error never truncates it.
mixed_test_bio_history = format_dialogues_bio_history(mixed_comments)
# Context manager makes sure the output handle is closed (and flushed).
with open("../../data/processed/mixed_test_bio_history.json", "w", encoding="utf-8") as f_out:
    json.dump(mixed_test_bio_history, f_out, ensure_ascii=False, indent=4)