In [None]:
import pickle
import numpy as np
import pandas as pd
import re

In [None]:
# load the data
with open('../../data/intermediate/english_conversations.pkl', 'rb') as f:
    conversations = pickle.load(f)

FileNotFoundError: [Errno 2] No such file or directory: '../data/english/conversations_test.pkl'

### Filter

In [5]:
# strip every conversation to length = 2 to equal the german dataset
conversations = [conversation[:2] for conversation in conversations]

In [6]:
# drop conversations whith less than 2 tweets
lengths = [len(conversation) for conversation in conversations]
conversations = [conversation for conversation in conversations if len(conversation) > 1]
print(f'Dropped {len(lengths) - len(conversations)} conversations with less than 2 tweets')

Dropped 0 conversations with less than 2 tweets


In [7]:
# drop conversations with less than 2 participants
length = [len(conversation) for conversation in conversations]
conversations = [conversation for conversation in conversations if len(set([tweet['screen_name'] for tweet in conversation])) > 1]
print(f'Dropped {len(length) - len(conversations)} conversations with less than 2 participants')

Dropped 114334 conversations with less than 2 participants


In [8]:
# drop conversations where the first tweet is also a reply to another tweet
length = [len(conversation) for conversation in conversations]
conversations = [conversation for conversation in conversations if str(conversation[0]['reply_to_id']) == "nan"]
print(f'Dropped {len(length) - len(conversations)} conversations where the first tweet is also a reply to another tweet')

Dropped 0 conversations where the first tweet is also a reply to another tweet


In [9]:
# drop conversations where a tweet contains a link
length = [len(conversation) for conversation in conversations]
conversations = [conversation for conversation in conversations if not any("http" in tweet["full_text"] or "www" in tweet["full_text"] or "https" in tweet["full_text"] for tweet in conversation)]
print(f'Dropped {len(length) - len(conversations)} conversations where a tweet contains a link')

Dropped 4106155 conversations where a tweet contains a link


In [10]:
# correct all html artifacts (e.g. &gt;)
import html

for conversation in conversations:
    for tweet in conversation:
        tweet["full_text"] = html.unescape(tweet["full_text"])

### Generate Train & Testdata

In [11]:
# generate onstring conversations
df = pd.DataFrame()
conversation_list = []
reponse_list = []
author_list = []
timestamp_list = []

for conversation in conversations:
    out_string = ""
    
    for tweet in conversation[:-1]:

        if str(tweet["screen_name"]) == "nan":
            continue

        out_string += ">"+ tweet["screen_name"] + ": " + tweet["full_text"]

    if out_string == "":
        continue
    conversation_list.append(out_string.strip("\n"))
    reponse_list.append(conversation[-1]["full_text"])
    author_list.append(conversation[-1]["screen_name"])
    timestamp_list.append(conversation[-1]["created_at"])

df["conversation"] = conversation_list
df["response"] = reponse_list
df["author"] = author_list
df["created_at"] = timestamp_list

In [12]:
# drop duplicates
length = len(df)
df = df.drop_duplicates(subset=["conversation", "response"])
print(f'Dropped {length - len(df)} duplicates')

Dropped 786 duplicates


In [13]:
# remove users in the top 5% of reply counts
length_before = len(df)
top_1_percent_users = df["author"].value_counts().quantile(0.99)
df = df[~df["author"].isin(df["author"].value_counts()[df["author"].value_counts() >= top_1_percent_users].index)]
length_after = len(df)
print(f"Removed {length_before - length_after} replies from users in the top 5% of reply counts")

Removed 2195 replies from users in the top 5% of reply counts


In [14]:
# drop users with less than 4 conversations because we require at least 3 demonstrations and 1 for actual training/fitting
length = [len(df[df["author"] == author]) for author in df["author"].unique()]

for author in df["author"].unique():
    if len(df[df["author"] == author]) < 3:
        df = df[df["author"] != author]

print(f'Dropped {len(length) - len(df["author"].unique())} users with less than 3 conversations')

Dropped 4282 users with less than 3 conversations


In [15]:
df = df.sort_values(by=["author", "created_at"]).reset_index(drop=True)

In [16]:
df["author"].value_counts()

author
ResistNonsense    21
Zetetic3          21
ishotasegod       21
CrichtonCathy     21
birrion           21
                  ..
DBV122             3
DA_Memes1          3
calldonnad         3
calirozzy          3
DavidZublick       3
Name: count, Length: 2211, dtype: int64

In [86]:
np.random.seed(42)  # For reproducibility

# randomly sample 15% of the users as test users
test_users = df["author"].unique()
test_users = np.random.choice(test_users, size=int(len(test_users) * 0.15), replace=False)

# create test and train df
train_df = df[~df["author"].isin(test_users)]
test_df = df[df["author"].isin(test_users)]

In [87]:
import json
from collections import defaultdict
from tqdm import tqdm

# Create n-shot demonstrations separately for train and test sets
def create_n_shot_examples(source_df, n_shots=3, name="train"):
    # Organize conversational data by discourse participant
    author_conversations = defaultdict(list)
    for _, row in source_df.iterrows():
        author = row['author']
        conversation = row['conversation']
        response = row['response']
        timestamp = row['created_at']
        author_conversations[author].append({
            "conversation": conversation,
            "response": response,
            "timestamp": timestamp
        })

    # Generate structured conversational instances with n-shot demonstrations
    training_instances = []

    for author, conversations in tqdm(author_conversations.items(), desc=f"Processing {name} data"):
        if len(conversations) >= n_shots + 1:
            conversations.sort(key=lambda x: x['timestamp'])
            
            for i in range(len(conversations) - n_shots):
                messages = []
                
                # Add system message
                messages.append({"role": "system", "content": "You are a social media user responding to tweets. Keep your replies consistent with your previous writing style and the perspectives you've expressed earlier."})
                # Structured demonstration sequences
                for j in range(n_shots):
                    messages.append({"role": "user", "content": conversations[i+j]['conversation']})
                    messages.append({"role": "assistant", "content": conversations[i+j]['response']})
                
                # Target discourse context
                messages.append({"role": "user", "content": conversations[i+n_shots]['conversation']})
                
                training_instances.append({
                    "prompt": messages,
                    "completion":[{"role": "assistant", "content": conversations[i+n_shots]['response']}],
                })
    
    return training_instances


for n in [3]:
    # Generate training data from the pre-existing train/test split
    n_shots = n  # Parameterizable based on experimental requirements
    
    # Generate train data
    train_data = create_n_shot_examples(train_df, n_shots=n_shots, name="train")
    print(f"Generated {len(train_data)} training examples")

    # Generate evaluation data
    test_data = create_n_shot_examples(test_df, n_shots=n_shots, name="evaluation")
    print(f"Generated {len(test_data)} evaluation examples")


    # Save the data to files
    with open(f'../data/english/eng_{n_shots}-shot_train.json', 'w', encoding='utf-8') as f:
        json.dump(train_data, f, ensure_ascii=False, indent=2)

    with open(f'../data/english/eng_{n_shots}-shot_test.json', 'w', encoding='utf-8') as f:
        json.dump(test_data, f, ensure_ascii=False, indent=2)

Processing train data: 100%|██████████| 1880/1880 [00:00<00:00, 27343.22it/s]


Generated 5467 training examples


Processing evaluation data: 100%|██████████| 331/331 [00:00<00:00, 38557.87it/s]


Generated 1030 evaluation examples
