In [15]:
import pandas as pd
import json
import matplotlib.pyplot as plt
import numpy as np
import json
from collections import defaultdict
from tqdm import tqdm
import re
import ftfy


In [16]:
# Load the raw comment and article dumps. Context managers close each file
# handle as soon as it is parsed instead of leaving it open until garbage
# collection.
with open("../../data/raw/rtl/luxembourgish/rtl_comments_clean_2012-2024.json", encoding="utf-8") as comments_file:
    comments = json.load(comments_file)
with open("../../data/raw/rtl/luxembourgish/rtl_news_articles_clean_2012-2024.json", encoding="utf-8") as articles_file:
    articles = json.load(articles_file)

In [17]:
# Run ftfy over every article body first, then every comment body,
# overwriting each record's "text" field in place.
for collection in (articles, comments):
    for record in collection:
        record['text'] = ftfy.fix_text(record['text'])

### Filter

In [19]:
def _keep_comment(text):
    """Return True when a comment body passes all three filters.

    A comment is rejected when it is blank, when it contains "http" or "www"
    anywhere, or when its very first character is "@".
    """
    if text.strip() == "":
        return False
    if "http" in text or "www" in text:
        return False
    # text is non-blank here, so indexing the first character is safe.
    return "@" not in text[0]


length = len(comments)
comments_clean = [comment for comment in comments if _keep_comment(comment["text"])]
length_after = len(comments_clean)

print(f"Filtered out {length - length_after} comments. Remaining comments: {length_after}")

Filtered out 185211 comments. Remaining comments: 842575


In [20]:
# Remove every "[...]" span from each article (written back in place), then
# keep only articles that are non-blank and at most 300 whitespace-separated
# words long.
length = len(articles)
articles_clean = []
for article in articles:
    stripped_text = re.sub(r'\[.*?\]', '', article["text"])
    article["text"] = stripped_text
    is_blank = stripped_text.strip() == ""
    if not is_blank and len(stripped_text.split()) <= 300:
        articles_clean.append(article)

length_after = len(articles_clean)
print(f"Filtered out {length - length_after} articles. Remaining articles: {length_after}")

Filtered out 42094 articles. Remaining articles: 193994


In [21]:
# Turn the filtered record lists into DataFrames (one row per record).
comments_df, articles_df = (
    pd.DataFrame(records) for records in (comments_clean, articles_clean)
)

In [22]:
# The comment's article_id is the last "|"-separated field of its context_id.
comments_df["article_id"] = [
    context_id.split("|")[-1] for context_id in comments_df["context_id"]
]
# Cast the key to str on both frames so the later merge compares equal types.
for frame in (comments_df, articles_df):
    frame["article_id"] = frame["article_id"].astype(str)

In [23]:
# Join each comment to its article; columns present in both frames get the
# "_comment" / "_article" suffixes (e.g. text_comment, text_article).
df = pd.merge(
    comments_df,
    articles_df,
    left_on="article_id",
    right_on="article_id",
    suffixes=("_comment", "_article"),
)

In [24]:
# Order rows by date_created so the later keep="first" de-duplication keeps
# the earliest row of each duplicate group.
df = df.sort_values("date_created")

In [25]:
# Delete multiple replies by the same user, keeping the first row in the
# current row order. Two passes: one row per (user_id, text_id), then one row
# per (user_id, post_id, text_comment).
# NOTE(review): presumably text_id identifies the article — confirm.
length_before = len(df)
for dedup_keys in (["user_id", "text_id"], ["user_id", "post_id", "text_comment"]):
    df = df.drop_duplicates(subset=dedup_keys, keep="first")
length_after = len(df)
print(f"Removed {length_before - length_after} duplicate replies")

Removed 44698 duplicate replies


In [26]:
# Drop users with less than 4 conversations because we require at least
# 3 demonstrations and 1 for actual training/fitting.
# Only the *number* of distinct users is needed for the report, so count them
# directly instead of filtering the whole frame once per user.
users_before = len(df["user_id"].unique())

# Compute the per-user row counts once and reuse them for the threshold.
conversation_counts = df["user_id"].value_counts()
authors_to_keep = conversation_counts[conversation_counts >= 4].index
df = df[df["user_id"].isin(authors_to_keep)]

print(f'Dropped {users_before - len(df["user_id"].unique())} users with less than 4 conversations')

Dropped 7083 users with less than 4 conversations


In [27]:
np.random.seed(42)  # For reproducibility

# Hold out a random 15% of the distinct users (not rows) as test users.
all_users = df["user_id"].unique()
n_test_users = int(len(all_users) * 0.15)
test_users = np.random.choice(all_users, size=n_test_users, replace=False)

# Rows of a test user go to test_df; every other row goes to train_df.
is_test_row = df["user_id"].isin(test_users)
train_df = df[~is_test_row]
test_df = df[is_test_row]

In [28]:
# Create n-shot demonstrations separately for train and test sets
def create_n_shot_examples(source_df, n_shots=3, name="train"):
    """Build one chat-style instance per user from that user's latest replies.

    Rows are grouped by ``user_id``. Each group is ordered by ``date_created``
    and only its last ``n_shots`` entries are kept. Every kept entry becomes a
    ``user`` message (the article text) followed by an ``assistant`` message
    (the comment text).

    Args:
        source_df: Object exposing ``iterrows()`` whose rows provide the keys
            ``user_id``, ``text_article``, ``text_comment`` and
            ``date_created`` (e.g. the merged comment/article DataFrame).
        n_shots: Maximum number of (article, reply) pairs kept per user.
            ``0`` yields instances with an empty ``messages`` list.
        name: Label inserted into the progress-bar description.

    Returns:
        A list with one ``{"messages": [...]}`` dict per user, in order of
        each user's first appearance in ``source_df``.

    Raises:
        ValueError: If ``n_shots`` is negative.
    """
    if n_shots < 0:
        raise ValueError(f"n_shots must be non-negative, got {n_shots}")

    # Organize conversational data by discourse participant
    author_conversations = defaultdict(list)
    for _, row in source_df.iterrows():
        author_conversations[row['user_id']].append({
            "conversation": row['text_article'],
            "reply": row['text_comment'],
            "timestamp": row['date_created'],
        })

    # Generate structured conversational instances with n-shot demonstrations
    training_instances = []

    for author, conversations in tqdm(author_conversations.items(), desc=f"Processing {name} data"):
        ordered = sorted(conversations, key=lambda entry: entry['timestamp'])
        # A bare ordered[-n_shots:] would return *every* entry for
        # n_shots == 0 (because -0 == 0), so handle that case explicitly.
        recent = ordered[-n_shots:] if n_shots > 0 else []

        messages = []
        for entry in recent:
            messages.append({"role": "user", "content": entry['conversation']})
            messages.append({"role": "assistant", "content": entry['reply']})

        training_instances.append({
            "messages": messages,
        })

    return training_instances


for n in [30]:
    # Number of demonstrations per user for this run; extend the list above
    # to produce several dataset variants.
    n_shots = n

    # Build the per-user instances for both splits of the existing
    # train/test partition.
    train_data = create_n_shot_examples(train_df, n_shots=n_shots, name="train")
    print(f"Generated {len(train_data)} training examples")

    test_data = create_n_shot_examples(test_df, n_shots=n_shots, name="evaluation")
    print(f"Generated {len(test_data)} evaluation examples")

    # Write each split to its own JSON file, train first, then test.
    outputs = [
        (f'../../data/intermediate/lux_{n_shots}-shot_train.json', train_data),
        (f'../../data/intermediate/lux_{n_shots}-shot_test.json', test_data),
    ]
    for out_path, payload in outputs:
        with open(out_path, 'w', encoding='utf-8') as f:
            json.dump(payload, f, ensure_ascii=False, indent=2)

Processing train data: 100%|██████████| 3818/3818 [00:00<00:00, 29275.94it/s]


Generated 3818 training examples


Processing evaluation data: 100%|██████████| 673/673 [00:00<00:00, 30174.53it/s]


Generated 673 evaluation examples
