In [2]:
import pandas as pd
import random
import csv
import os
from convokit import Corpus, download

# Subreddits to sample from, grouped the way the author grouped them.
# Every entry listed here is processed; comment an entry out to skip it.
subreddits = [
    # <Males> (author's note: 500-1000 speakers and 2-3k utterances)
    "NinjasHyper", "DanTDM", "Angory_Tom", "NakeyJakey", "LazarBeam",
    # <Females> (author's note: 500-1.5k speakers and 3-5k utterances)
    "Pokimane", "Amouranth", "legendarylea", "KittyKatGaming", "lilypichu",
]

# Column layout of the output table. "Parasocial Language" is written as an
# empty string for every sampled row.
cols = [
    "id", "subreddit", "thread_title", "convo",
    "random_speaker", "text", "Parasocial Language",
]

# Empty frame with the final column layout; rows are added further down.
data = pd.DataFrame(columns=cols)

# --- Sampling configuration --------------------------------------------------
RANDOM_SEED = 42             # fixed so a fresh-kernel re-run draws the same sample
SAMPLES_PER_SUBREDDIT = 200  # target number of rows per subreddit
PLACEHOLDER_TEXTS = {"[removed]", "[deleted]"}  # bodies that carry no content


def _has_visible_text(utt):
    """Return True if `utt` has non-empty, non-placeholder text not written by AutoModerator."""
    return (
        bool(utt.text)
        and utt.text not in PLACEHOLDER_TEXTS
        and utt.speaker.id != "AutoModerator"
    )


def _is_sample_candidate(utt):
    """Return True if `utt` may be drawn as the sampled utterance of a row.

    On top of `_has_visible_text`, the utterance id must be exactly 7
    characters long and contain no digits.
    """
    # NOTE(review): the id filter is carried over unchanged from the original
    # code; presumably the length check separates comments from posts, but the
    # reason for excluding ids with digits is not visible here -- confirm.
    return (
        _has_visible_text(utt)
        and len(utt.id) == 7
        and not any(ch.isdigit() for ch in utt.id)
    )


def _render_conversation(convo_utts):
    """Join a conversation's usable utterances into one "speaker: text ..." string.

    Utterances without visible text and utterances whose text starts with "**"
    are left out. Returns "" when nothing usable remains.
    """
    lines = [
        f"{u.speaker.id}: {u.text}"
        for u in convo_utts
        if _has_visible_text(u) and not u.text.startswith("**")
    ]
    return " ".join(lines).strip()


random.seed(RANDOM_SEED)

sampled_rows = []         # one list per row, in `cols` order
seen_convo_texts = set()  # conversation texts already emitted, across all subreddits

for subreddit in subreddits:
    corpus = Corpus(filename=download(f"subreddit-{subreddit}"))
    corpus.print_summary_stats()

    # List each conversation's utterances once instead of on every draw, and
    # keep only conversations holding at least one candidate utterance: no
    # other conversation can ever produce a row.
    pool = []
    for convo in corpus.iter_conversations():
        convo_utts = list(convo.iter_utterances())
        if any(_is_sample_candidate(u) for u in convo_utts):
            pool.append((convo, convo_utts))

    n_sampled = 0
    while n_sampled < SAMPLES_PER_SUBREDDIT and pool:
        idx = random.randrange(len(pool))
        convo, convo_utts = pool[idx]
        utt = random.choice(convo_utts)
        if not _is_sample_candidate(utt):
            continue  # rejected draw: pick a conversation and utterance again

        # From here on the outcome depends only on the conversation (its
        # rendered text), so it is retired whether or not it yields a row.
        # This is what guarantees the loop ends once the pool is used up.
        pool.pop(idx)

        convo_text = _render_conversation(convo_utts)
        if not convo_text:
            continue
        # The earlier check looked in data["text"], which holds single
        # utterance texts and was only updated after the subreddit finished,
        # so repeats were never caught. Compare conversation texts directly.
        if convo_text in seen_convo_texts:
            print(f"Duplicate text found: '{convo_text}'. Reselecting...")
            continue

        print(f"Selected text: '{convo_text}' from subreddit: {subreddit}")
        seen_convo_texts.add(convo_text)
        sampled_rows.append([utt.id, subreddit, convo.meta["title"], convo_text, utt.speaker.id, utt.text, ""])
        n_sampled += 1

    if n_sampled < SAMPLES_PER_SUBREDDIT:
        print(f"Warning: only {n_sampled} of {SAMPLES_PER_SUBREDDIT} samples available for subreddit: {subreddit}")

# Build the frame once from all collected rows rather than concatenating per subreddit.
data = pd.DataFrame(sampled_rows, columns=cols)

# Save the sample as sampled_utterances.csv in the current user's Downloads folder.
downloads_dir = os.path.join(os.path.expanduser("~"), "Downloads")

# The folder is not guaranteed to exist for every user/home directory; create
# it if needed so that to_csv does not fail on a missing directory.
os.makedirs(downloads_dir, exist_ok=True)

csv_file_path = os.path.join(downloads_dir, "sampled_utterances.csv")

# index=False keeps the DataFrame's positional index out of the file.
data.to_csv(csv_file_path, index=False)

print(f"Sampled utterances saved to {csv_file_path}")


Dataset already exists at /Users/admin/.convokit/downloads/subreddit-NinjasHyper
Number of Speakers: 672
Number of Utterances: 1708
Number of Conversations: 944
Selected text: 'treilly55: Is there a way to turn off donations sounds.. just tired of hearing save that money every 3 seconds  TheMusicFella: Nope, it's all done on the transcoders end (Aka Tyler's end).  proxalfy: What does it say when someone subscribes? I think it’s fetty wap.  treilly55: It’s lil dicky “save that money”' from subreddit: NinjasHyper
Selected text: 'youarebrotherHH: I never wanted that to end' from subreddit: NinjasHyper
Selected text: 'Fortesque-: I'm trying to find a stream moment where I think he boogie bombs someone, dabs on them and does another emote then it backfires and he dies. I also think he falls off his chair yelling. If anyone knows the exact moment I'm talking about I would love a link. Knickerbockerstape: [Pretty sure this is it](https://clips.twitch.tv/LazyAttractiveBobaBleedPurple?tt_medium