In [48]:
import pandas as pd
import random
import csv
import os
from convokit import Corpus, download

# Streamer-community subreddits to sample from, grouped as the original
# author grouped them. The size figures are the author's rough estimates.

# <Males> roughly 500-1000 speakers and 2-3k utterances each
_male_streamer_subs = [
    "NinjasHyper",
    "DanTDM",
    "Angory_Tom",
    "NakeyJakey",
    "LazarBeam",
]

# <Females> roughly 500-1.5k speakers and 3-5k utterances each
_female_streamer_subs = [
    "Pokimane",
    "Amouranth",
    "legendarylea",
    "KittyKatGaming",
    "lilypichu",
]

# Order matters: it determines the order of rows in the exported data.
subreddits = [*_male_streamer_subs, *_female_streamer_subs]

# Column layout of the sampled-utterance table (one row per sampled utterance).
cols = [
    "id",                   # id of the sampled utterance
    "subreddit",            # subreddit the utterance was drawn from
    "thread_title",         # title stored in the conversation metadata
    "convo",                # "speaker: text" context built by the sampling loop
    "speaker",              # speaker id of the sampled utterance
    "text",                 # text of the sampled utterance
    "Parasocial Language",  # written as "" by the sampler; presumably an annotation column
]

# Empty frame carrying only the column headers.
data = pd.DataFrame(columns=cols)

# Maximum number of utterances drawn from each subreddit.
SAMPLES_PER_SUBREDDIT = 200

# Placeholder bodies Reddit leaves behind for removed/deleted content.
_PLACEHOLDER_TEXTS = ("[removed]", "[deleted]")


def _is_usable(utt):
    """Return True if *utt* has an id, a speaker id and real, non-bot text."""
    return bool(
        utt.id
        and utt.text
        and utt.speaker.id
        and utt.text not in _PLACEHOLDER_TEXTS
        and utt.speaker.id != "AutoModerator"
    )


def _is_sampling_candidate(utt):
    """Return True if *utt* may be drawn as a sample.

    On top of being usable, the id must be exactly 7 characters long and
    contain no digits (filter kept unchanged from the original script).
    """
    return (
        _is_usable(utt)
        and len(utt.id) == 7
        and not any(ch.isdigit() for ch in utt.id)
    )


def _context_from(utt, convo):
    """Build the "speaker: text" context string starting at *utt*.

    Walks the conversation's utterances from *utt* onwards (in the order
    ``convo.iter_utterances()`` yields them) and stops at the first
    utterance that is not usable.
    """
    convo_utts = list(convo.iter_utterances())
    start = convo_utts.index(utt)
    parts = []
    for curr_utt in convo_utts[start:]:
        if not _is_usable(curr_utt):
            break
        parts.append(f"{curr_utt.speaker.id}: {curr_utt.text}")
    return " ".join(parts).lstrip()


all_rows = []

for subreddit in subreddits:
    corpus = Corpus(filename=download(f"subreddit-{subreddit}"))
    corpus.print_summary_stats()

    # Filter once per corpus instead of re-listing every utterance on each
    # draw, and sample WITHOUT replacement so no utterance appears twice.
    candidates = [u for u in corpus.iter_utterances() if _is_sampling_candidate(u)]
    sample_size = min(SAMPLES_PER_SUBREDDIT, len(candidates))
    if sample_size < SAMPLES_PER_SUBREDDIT:
        # The original rejection loop would emit duplicates here (or spin
        # forever when there were no candidates at all).
        print(
            f"Warning: only {sample_size} eligible utterances in "
            f"subreddit-{subreddit}; sampling all of them"
        )

    for utt in random.sample(candidates, sample_size):
        convo = corpus.get_conversation(utt.conversation_id)
        all_rows.append([
            utt.id,
            subreddit,
            convo.meta["title"],
            _context_from(utt, convo),
            utt.speaker.id,
            utt.text,
            "",  # "Parasocial Language" column is left blank
        ])

# Build the frame once rather than concatenating per subreddit.
data = pd.DataFrame(all_rows, columns=cols)

# Export the sampled utterances to ~/Downloads/sampled_utterances.csv.
downloads_dir = os.path.join(os.path.expanduser("~"), "Downloads")

# The Downloads folder is not guaranteed to exist (e.g. on servers or fresh
# accounts); create it so to_csv does not fail with OSError.
os.makedirs(downloads_dir, exist_ok=True)

csv_file_path = os.path.join(downloads_dir, "sampled_utterances.csv")

# index=False: the row index carries no information and is omitted from the file.
data.to_csv(csv_file_path, index=False)

print(f"Sampled utterances saved to {csv_file_path}")


Dataset already exists at /Users/admin/.convokit/downloads/subreddit-NinjasHyper
Number of Speakers: 672
Number of Utterances: 1708
Number of Conversations: 944
Dataset already exists at /Users/admin/.convokit/downloads/subreddit-DanTDM
Number of Speakers: 280
Number of Utterances: 1887
Number of Conversations: 1496
Dataset already exists at /Users/admin/.convokit/downloads/subreddit-Angory_Tom
Number of Speakers: 695
Number of Utterances: 2686
Number of Conversations: 935
Dataset already exists at /Users/admin/.convokit/downloads/subreddit-NakeyJakey
Number of Speakers: 818
Number of Utterances: 2183
Number of Conversations: 453
Dataset already exists at /Users/admin/.convokit/downloads/subreddit-LazarBeam
Number of Speakers: 955
Number of Utterances: 2074
Number of Conversations: 744
Dataset already exists at /Users/admin/.convokit/downloads/subreddit-Pokimane
Number of Speakers: 1150
Number of Utterances: 3548
Number of Conversations: 1017
Dataset already exists at /Users/admin/.con