In [26]:
import pandas as pd
from collections import deque
import pandas as pd
import numpy as np
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed
import pickle
from collections import defaultdict
import numpy as np

### Load and Merge Data

In [27]:
# Load the two raw 2023 CSV exports: the "MdB" tweets file and the replies file.
# Both live in the same raw-data directory, so the directory is named once.
raw_twitter_dir = "../../data/raw/twitter/german"
df_tweets = pd.read_csv(f"{raw_twitter_dir}/GermanyMdBTweets_2023.csv")
df_replies = pd.read_csv(f"{raw_twitter_dir}/GermanyReplies2023.csv")

In [28]:
# Group every tweet and reply by its conversation_id.
#
# Each entry of `conversation[conversation_id]` is a plain dict with the shared
# fields `id`, `text`, `author_id`, `created_at` (ids stringified) plus
#   * `username`             for rows coming from df_tweets
#   * `in_reply_to_user_id`  for rows coming from df_replies (stringified)
# Later cells tell replies from original tweets by the presence of
# `in_reply_to_user_id`, so the key sets must stay exactly as they are.

SHARED_COLUMNS = ["id", "text", "author_id", "created_at", "conversation_id"]


def _shared_fields(row):
    """Return the fields common to tweets and replies for one itertuples row."""
    return {
        'id': str(row.id),
        'text': row.text,
        'author_id': str(row.author_id),
        'created_at': row.created_at,
    }


conversation = defaultdict(list)

# itertuples over just the needed columns is much faster than iterrows and,
# unlike `for id, tweet in df.iterrows()`, does not rebind the builtin `id`
# in the notebook namespace.
for row in df_tweets[SHARED_COLUMNS + ["username"]].itertuples(index=False):
    record = _shared_fields(row)
    record["username"] = row.username
    conversation[row.conversation_id].append(record)

for row in df_replies[SHARED_COLUMNS + ["in_reply_to_user_id"]].itertuples(index=False):
    record = _shared_fields(row)
    record['in_reply_to_user_id'] = str(row.in_reply_to_user_id)
    conversation[row.conversation_id].append(record)

In [29]:
# Order every conversation chronologically (earliest tweet first).
# The lists are sorted in place, so the dict itself is not modified while iterating.
for conv_tweets in conversation.values():
    conv_tweets.sort(key=lambda entry: entry['created_at'])

In [30]:
# Pair each conversation's opening tweet with the direct replies to its author.
#
# Result: individual_replies[reply_author_id] -> list of (base_tweet, reply) tuples,
# holding at most one reply per author and conversation (the earliest one,
# because conversations are sorted by created_at).
individual_replies = defaultdict(list)

for conv_id, tweets in conversation.items():

    # A conversation needs an opening tweet plus at least one more entry.
    if len(tweets) < 2:
        continue

    base_tweet = tweets[0]

    # If the earliest entry is itself a reply, the opening tweet is not in the data.
    if base_tweet.get("in_reply_to_user_id", None) is not None:
        continue

    # Authors already paired within this conversation. Conversation ids are
    # unique dict keys, so a per-conversation set is equivalent to scanning all
    # of the author's earlier pairs for this conv_id, without the quadratic cost.
    authors_seen = set()

    for tweet in tweets[1:]:

        # Set on every later entry, including ones skipped below.
        tweet["conversation_id"] = conv_id

        reply_target = tweet.get("in_reply_to_user_id", None)

        # Entries without the key are not replies.
        if reply_target is None:
            continue

        # Keep only replies addressed to the author of the opening tweet.
        if reply_target != base_tweet["author_id"]:
            continue

        # Keep only the first reply of each author per conversation.
        author_id = tweet["author_id"]
        if author_id in authors_seen:
            continue
        authors_seen.add(author_id)

        individual_replies[author_id].append((base_tweet, tweet))


In [31]:
# Flatten the (base tweet, reply) pairs of every author into one row per pair.
pair_rows = []
for author_pairs in individual_replies.values():
    for base_tweet, reply_tweet in author_pairs:
        pair_rows.append({
            # reply side
            'reply_id': reply_tweet['id'],
            'reply': reply_tweet['text'],
            'reply_author_id': reply_tweet['author_id'],
            'reply_created_at': reply_tweet['created_at'],
            "reply_to_user_id": reply_tweet['in_reply_to_user_id'],

            # base (opening tweet) side
            'base_id': base_tweet['id'],
            "base_username": base_tweet['username'],
            'base_author_id': base_tweet['author_id'],
            'base_text': base_tweet['text'],
        })

# NOTE: from here on `df_replies` holds the paired frame, not the raw replies CSV.
df_replies = pd.DataFrame(pair_rows)

### Filter

In [32]:
# Safety net: keep only rows whose reply is addressed to the base tweet's author.
len_before = len(df_replies)
addressed_to_base_author = df_replies["reply_to_user_id"].eq(df_replies["base_author_id"])
df_replies = df_replies.loc[addressed_to_base_author]
len_after = len(df_replies)
print(f"Removed {len_before - len_after} replies")

Removed 0 replies


In [33]:
# A base_text that starts with "@" is itself a reply, so drop those rows.
# Missing base_text (NaN) counts as "does not start with @" and is kept.
length_before = len(df_replies)
base_is_reply = df_replies["base_text"].str.startswith("@", na=False)
df_replies = df_replies.loc[~base_is_reply]
length_after = len(df_replies)
print(f"Removed {length_before - length_after} replies")

Removed 29304 replies


In [34]:
# Drop every row where the reply or the base tweet text contains a URL marker.
# Missing text (NaN) is treated as containing no URL.
length_before = len(df_replies)
reply_has_url = df_replies["reply"].str.contains("http|www|https", na=False)
base_has_url = df_replies["base_text"].str.contains("http|www|https", na=False)
df_replies = df_replies[~(reply_has_url | base_has_url)]
length_after = len(df_replies)
print(f"Removed {length_before - length_after} rows containing URLs")

Removed 2104371 rows containing URLs


In [35]:
# Drop rows that repeat an identical (reply text, base text) pair, keeping the first occurrence.
length_before = len(df_replies)
repeated_text_pair = df_replies.duplicated(subset=["reply", "base_text"], keep="first")
df_replies = df_replies[~repeated_text_pair]
length_after = len(df_replies)
print(f"Removed {length_before - length_after} duplicate replies")

Removed 5912 duplicate replies


In [36]:
# Drop rows that repeat a reply_id, keeping the first occurrence of each id.
length_before = len(df_replies)
repeated_reply_id = df_replies.duplicated(subset=["reply_id"], keep="first")
df_replies = df_replies[~repeated_reply_id]
length_after = len(df_replies)
print(f"Removed {length_before - length_after} duplicate replies")

Removed 0 duplicate replies


In [37]:
# Author ids were stringified when the conversations were built; turn them back
# into integers. Use an explicit 64-bit dtype: plain `int` maps to a 32-bit
# C long on Windows with NumPy < 2.0, which cannot hold Twitter ids.
df_replies["reply_author_id"] = df_replies["reply_author_id"].astype("int64")

In [38]:
# Remove the most active accounts: every author whose number of replies is at
# or above the 99th percentile of per-author reply counts (roughly the top 1%;
# ties at the cutoff are removed as well).
# NOTE(review): this step used to be labelled "top 5%" while the code used the
# 0.99 quantile. The quantile is kept so the resulting dataset is unchanged;
# confirm which of the two was intended.
length_before = len(df_replies)

reply_count_quantile = 0.99
replies_per_author = df_replies["reply_author_id"].value_counts()  # computed once, reused below
reply_count_cutoff = replies_per_author.quantile(reply_count_quantile)
heavy_repliers = replies_per_author[replies_per_author >= reply_count_cutoff].index

df_replies = df_replies[~df_replies["reply_author_id"].isin(heavy_repliers)]
length_after = len(df_replies)
print(f"Removed {length_before - length_after} replies from users in the top {1 - reply_count_quantile:.0%} of reply counts")

Removed 117252 replies from users in the top 5% of reply counts


In [39]:
# Drop authors with fewer than 4 replies: at least 3 are required as
# demonstrations and 1 for the actual training/fitting.
MIN_REPLIES_PER_AUTHOR = 4

# Count the authors once, instead of filtering the whole frame once per author.
n_authors_before = df_replies["reply_author_id"].nunique(dropna=False)

replies_per_author = df_replies["reply_author_id"].value_counts()
authors_to_keep = replies_per_author[replies_per_author >= MIN_REPLIES_PER_AUTHOR].index
df_replies = df_replies[df_replies["reply_author_id"].isin(authors_to_keep)]

# Every kept author still has rows, so the remaining author count is len(authors_to_keep).
print(f'Dropped {n_authors_before - len(authors_to_keep)} users with fewer than {MIN_REPLIES_PER_AUTHOR} conversations')

Dropped 76559 users with less than 3 conversations


### Generate Train & Test Data

In [40]:
# Render each base tweet as a single quoted line: ">username: text".
quoted_username = ">" + df_replies["base_username"]
df_replies["prompt"] = quoted_username + ": " + df_replies["base_text"]

In [41]:
# Inspect how many replies each remaining author has after the filters above.
df_replies["reply_author_id"].value_counts()

reply_author_id
3334280543             62
176557293              62
21647895               62
930772154232262656     62
1562669284920213505    62
                       ..
1218117719532494848     4
100525691               4
1590278150021419009     4
288627622               4
952236980120838145      4
Name: count, Length: 39604, dtype: int64

In [42]:
# Fixed seed so the same authors land in the test set on every run.
np.random.seed(42)

# Split by author, not by row: 15% of the authors (rounded down) become test
# users, and all of an author's replies go to the same side of the split.
all_users = df_replies["reply_author_id"].unique()
n_test_users = int(len(all_users) * 0.15)
test_users = np.random.choice(all_users, size=n_test_users, replace=False)

# Build both frames from a single membership mask.
written_by_test_user = df_replies["reply_author_id"].isin(test_users)
train_df = df_replies[~written_by_test_user]
test_df = df_replies[written_by_test_user]

In [43]:
import json
from collections import defaultdict
from tqdm import tqdm

# Create n-shot demonstrations separately for train and test sets
def create_n_shot_examples(source_df, n_shots=3, name="train"):
    """Build one chat-style example per author from that author's latest replies.

    Args:
        source_df: DataFrame with the columns ``reply_author_id``, ``prompt``,
            ``reply`` and ``reply_created_at``.
        n_shots: Maximum number of (prompt, reply) pairs kept per author. The
            most recent ``n_shots`` pairs by ``reply_created_at`` are used, in
            chronological order. ``0`` yields an empty message list.
        name: Label shown in the progress bar description.

    Returns:
        A list with one ``{"messages": [...]}`` dict per author, in order of the
        author's first appearance in ``source_df``. ``messages`` alternates
        ``{"role": "user", "content": prompt}`` and
        ``{"role": "assistant", "content": reply}``.

    Raises:
        ValueError: If ``n_shots`` is negative.
    """
    if n_shots < 0:
        raise ValueError(f"n_shots must be non-negative, got {n_shots}")

    # Collect every (prompt, reply, timestamp) triple per author.
    author_conversations = defaultdict(list)
    for _, row in source_df.iterrows():
        author_conversations[row['reply_author_id']].append({
            "conversation": row['prompt'],
            "reply": row['reply'],
            "timestamp": row['reply_created_at'],
        })

    training_instances = []

    for author, conversations in tqdm(author_conversations.items(), desc=f"Processing {name} data"):
        conversations.sort(key=lambda x: x['timestamp'])

        # `conversations[-0:]` would return the whole history, so the zero-shot
        # case has to be handled explicitly.
        recent = conversations[-n_shots:] if n_shots > 0 else []

        messages = []
        for exchange in recent:
            messages.append({"role": "user", "content": exchange['conversation']})
            messages.append({"role": "assistant", "content": exchange['reply']})

        training_instances.append({
            "messages": messages,
        })

    return training_instances


# Build and persist the n-shot datasets for every configured shot count.
for n in [30]:
    n_shots = n  # number of most recent (prompt, reply) pairs kept per author

    # Build both splits first, so nothing is written if either step fails.
    train_data = create_n_shot_examples(train_df, n_shots=n_shots, name="train")
    print(f"Generated {len(train_data)} training examples")

    test_data = create_n_shot_examples(test_df, n_shots=n_shots, name="evaluation")
    print(f"Generated {len(test_data)} evaluation examples")

    # Write train first, then test, as UTF-8 JSON with non-ASCII characters kept readable.
    for split_name, split_data in (("train", train_data), ("test", test_data)):
        output_path = f'../../data/intermediate/ger_{n_shots}-shot_{split_name}.json'
        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(split_data, f, ensure_ascii=False, indent=2)

Processing train data: 100%|██████████| 33664/33664 [00:04<00:00, 6760.73it/s] 


Generated 33664 training examples


Processing evaluation data: 100%|██████████| 5940/5940 [00:00<00:00, 57503.43it/s]


Generated 5940 evaluation examples
