In [1]:
import pandas as pd
import random
import csv
import os
from convokit import Corpus, download

# Subreddits to sample from, grouped the same way as the original notes.

# <Males> 500-1000 speakers and 2-3k utterances
_male_streamer_subreddits = ["NinjasHyper", "DanTDM", "Angory_Tom", "NakeyJakey", "LazarBeam"]

# <Females> 500-1.5k speakers and 3-5k utterances
_female_streamer_subreddits = ["Pokimane", "Amouranth", "legendarylea", "KittyKatGaming", "lilypichu"]

# Full list, in sampling order: the first group followed by the second.
subreddits = _male_streamer_subreddits + _female_streamer_subreddits

# Column layout of the output table. The sampling loop writes one row per
# sampled utterance and stores an empty string in "Parasocial Language".
cols = ["id", "subreddit", "thread_title", "convo", "random_speaker", "text", "Parasocial Language"]

# Empty accumulator frame; the per-subreddit samples are appended to it.
data = pd.DataFrame(columns=cols)

# Number of utterances to sample from each subreddit.
SAMPLES_PER_SUBREDDIT = 200

# Texts that disqualify an utterance when they are its entire body.
_PLACEHOLDER_TEXTS = ("[removed]", "[deleted]")
# Substrings whose presence anywhere in the text disqualifies an utterance.
_BLOCKED_SUBSTRINGS = ("I am a bot", "Bot", "^^", "imgur")


def _is_usable(utt):
    """Return True when an utterance passes the sampling filters.

    An utterance is rejected when its text is empty, equals "[removed]" or
    "[deleted]", is shorter than 5 characters, starts with "**", contains any
    of the blocked substrings, or when its speaker id is "AutoModerator".
    The same predicate is applied both to the sampled utterance and to every
    utterance that goes into the conversation transcript.
    """
    text = utt.text
    if not text or text in _PLACEHOLDER_TEXTS or len(text) < 5:
        return False
    if utt.speaker.id == "AutoModerator" or text.startswith("**"):
        return False
    return not any(marker in text for marker in _BLOCKED_SUBSTRINGS)


# Texts already sampled (including anything already in `data`). The original
# check compared the Utterance object itself against the "text" column, which
# never matches, and only looked at rows from previous subreddits.
seen_texts = set(data["text"])

for subreddit in subreddits:
    corpus = Corpus(filename=download(f"subreddit-{subreddit}"))
    # Prints the speaker/utterance/conversation counts; the result is not used.
    corpus.print_summary_stats()

    # Materialise the conversations once instead of on every sampling attempt.
    conversations = list(corpus.iter_conversations())

    # Cap the target at the number of distinct, not-yet-sampled usable texts so
    # the rejection loop below always terminates.
    available_texts = {utt.text for utt in corpus.iter_utterances() if _is_usable(utt)} - seen_texts
    n_target = min(SAMPLES_PER_SUBREDDIT, len(available_texts))
    if n_target < SAMPLES_PER_SUBREDDIT:
        print(f"Only {n_target} unique usable utterances available in subreddit: {subreddit}")

    temp_list = []
    while len(temp_list) < n_target:
        convo = random.choice(conversations)  # Sample a conversation
        convo_utts = list(convo.iter_utterances())
        utt = random.choice(convo_utts)  # Sample an utterance from the conversation
        if not _is_usable(utt):
            continue

        # Transcript of the conversation, restricted to usable utterances.
        convo_texts = [f"{curr_utt.speaker.id}: {curr_utt.text}" for curr_utt in convo_utts if _is_usable(curr_utt)]
        convo_text = " ".join(convo_texts).strip()  # Combine utterances into one text

        if utt.text in seen_texts:  # Check if text already sampled
            print(f"Duplicate text found: '{convo_text}'. Reselecting...")
            continue

        seen_texts.add(utt.text)
        print(f"Selected text: '{convo_text}' from subreddit: {subreddit}")
        temp_list.append([utt.id, subreddit, convo.meta["title"], convo_text, utt.speaker.id, utt.text, ""])

    temp_df = pd.DataFrame(temp_list, columns=cols)
    data = pd.concat([data, temp_df], ignore_index=True)

# Define the path to the Downloads directory of the current user
downloads_dir = os.path.join(os.path.expanduser("~"), "Downloads")

# DataFrame.to_csv does not create missing parent directories, so make sure
# the target folder exists (it may not, e.g. on a fresh or headless machine).
os.makedirs(downloads_dir, exist_ok=True)

# Define the path to the CSV file in the Downloads directory
csv_file_path = os.path.join(downloads_dir, "sampled_utterances.csv")

# Write the DataFrame to CSV without the positional index column
data.to_csv(csv_file_path, index=False)

print(f"Sampled utterances saved to {csv_file_path}")


Dataset already exists at /Users/admin/.convokit/downloads/subreddit-NinjasHyper
Number of Speakers: 672
Number of Utterances: 1708
Number of Conversations: 944
Selected text: 'Dj1312: Hi !

Can someone say me the name of the song at 6s or 7m17s ?

This is direct links :
https://youtu.be/48RivMyClJ0?t=6s
&amp; https://youtu.be/48RivMyClJ0?t=7m17s

Thanks in advance,
Dj1312 Knickerbockerstape: [Here You go DJ](https://www.youtube.com/watch?v=N-6iNzwXEg0) Dj1312: THANKS !! Dj1312: THANKS !! idle-debonair: Also, [this is the song that he uses](https://www.youtube.com/watch?v=2cjbSgy3vSw) for the veteran subscribers that have been with the channel for the longest time.' from subreddit: NinjasHyper
Selected text: 'DestroyerTheGuy: Wait is he dead? mdheintz21: Ya ligma got him so sad DestroyerTheGuy: Who/what is ligma? PM_ME_SOGGY_BREAD: i'm praying for him PM_ME_SOGGY_BREAD: it's a new disease, not too many people knew about his diagnosis since he was keeping it secret like freddie mercury.