In [1]:
import pandas as pd, numpy as np, random, csv, os
from convokit import Corpus, download, Utterance


In [11]:
#list of subreddits - commented out the ones which are ignored
subreddits = [
    # <Males> 500-1000 speakers and 2-3k utterances
    "NinjasHyper", 
    "DanTDM",
    "Angory_Tom",
    "NakeyJakey",
    "LazarBeam",

    # <Females> 500-1.5k speakers and 3-5k utterances
    "Pokimane", 
    "Amouranth",
    "legendarylea", 
    "KittyKatGaming",
    "lilypichu",
    ]

In [12]:
# Create a master corpus with the sampled utterances
sampled_utterances = []

for subreddit in subreddits:
    corpus = Corpus(filename=download(f"subreddit-{subreddit}"))
    stats = corpus.print_summary_stats()
    utterances = [utt for utt in corpus.iter_utterances() if utt.text]  # Filter out utterances with missing text
    sampled_utterances.extend(utterances)

master_corpus = Corpus(utterances=sampled_utterances)
print("Master corpus created successfully.")
master_corpus.print_summary_stats()


Dataset already exists at /Users/admin/.convokit/downloads/subreddit-NinjasHyper
Number of Speakers: 672
Number of Utterances: 1708
Number of Conversations: 944
Dataset already exists at /Users/admin/.convokit/downloads/subreddit-DanTDM
Number of Speakers: 280
Number of Utterances: 1887
Number of Conversations: 1496
Dataset already exists at /Users/admin/.convokit/downloads/subreddit-Angory_Tom
Number of Speakers: 695
Number of Utterances: 2686
Number of Conversations: 935
Dataset already exists at /Users/admin/.convokit/downloads/subreddit-NakeyJakey
Number of Speakers: 818
Number of Utterances: 2183
Number of Conversations: 453
Dataset already exists at /Users/admin/.convokit/downloads/subreddit-LazarBeam
Number of Speakers: 955
Number of Utterances: 2074
Number of Conversations: 744
Dataset already exists at /Users/admin/.convokit/downloads/subreddit-Pokimane
Number of Speakers: 1150
Number of Utterances: 3548
Number of Conversations: 1017
Dataset already exists at /Users/admin/.con

In [13]:
import re
import langdetect
from convokit import Corpus, Utterance

# Define the list of subreddits
subreddits = [
    "NinjasHyper", 
    "DanTDM",
    "Angory_Tom",
    "NakeyJakey",
    "LazarBeam",
    "Pokimane", 
    "Amouranth",
    "legendarylea", 
    "KittyKatGaming",
    "lilypichu",
]

# Define a function to filter utterances based on specified criteria
def filter_utterance(curr_utt):
    banned_words = ["bot", "Bot", "I am a bot", "[removed]", "[deleted]", "**", "^^", "imgur"]
    subreddit_keywords = [subreddit.lower() for subreddit in subreddits]
    banned_words.extend([word.lower() for word in banned_words])  # Include lowercase versions
    
    # Exclude utterances with banned words or from AutoModerator
    if any(word in curr_utt.text for word in banned_words) \
            or curr_utt.speaker.id == "AutoModerator" \
            or len(curr_utt.text) < 5:
        return False

    # Exclude non-English utterances
    try:
        if langdetect.detect(curr_utt.text) != 'en':
            return False
    except:
        # If language detection fails, assume it's not in English
        return False

    # Include utterances that mention Twitch, "merch", "video", "youtube", "clip", "stream", "buy", "music", or "song"
    if any(keyword in curr_utt.text.lower() for keyword in ["twitch", "merch", "video", "youtube", "clip", "stream", "buy", "music", "song"]):
        return True

    # Include utterances containing pronouns or subreddit keywords (including substrings)
    if any(re.search(r'\b{}\b'.format(word), curr_utt.text, re.IGNORECASE) for word in subreddit_keywords):
        return True

    return False

# Apply the filter function to create the filtered sample
filtered_utterances = [utt for utt in master_corpus.iter_utterances() if filter_utterance(utt)]

# Create a new corpus with the filtered utterances
filtered_corpus = Corpus(utterances=filtered_utterances)

# Save the filtered corpus to a new file
filtered_corpus.dump("filtered_corpus")

print("Filtered corpus created and saved successfully.")
filtered_corpus.print_summary_stats()


Filtered corpus created and saved successfully.
Number of Speakers: 1839
Number of Utterances: 2899
Number of Conversations: 1587


In [21]:
import pandas as pd
import random
import csv
import os
from convokit import Corpus, download

# List of subreddits - commented out the ones which are ignored
subreddits = [
    # <Males> 500-1000 speakers and 2-3k utterances
    "NinjasHyper", 
    "DanTDM",
    "Angory_Tom",
    "NakeyJakey",
    "LazarBeam",

    # <Females> 500-1.5k speakers and 3-5k utterances
    "Pokimane",
    "Amouranth",
    "legendarylea",
    "KittyKatGaming",
    "lilypichu",
]

cols = [
    "utterance_id",
    "subreddit",
    "thread_title",
    "convo",
    "random_speaker",
    "utterance",
    "gender",
    "score",
    "num_comments",
    "Parasocial Language"  
]

data = pd.DataFrame(columns=cols)

for subreddit in subreddits:
    temp_list = []
    corpus = Corpus(filename=download(f"subreddit-{subreddit}"))
    stats = corpus.print_summary_stats()

    for _ in range(200):  # Sample 200 utterances per subreddit
        while True:
            convo = random.choice(list(corpus.iter_conversations()))  # Sample a conversation
            utt = random.choice(list(convo.iter_utterances()))  # Sample an utterance from the conversation
            if utt.text and utt.text != "[removed]" and utt.text != "[deleted]" and utt.speaker.id != "AutoModerator" and len(utt.text) >= 5 and "I am a bot" not in utt.text and "Bot" not in utt.text and not utt.text.startswith("**") and "^^" not in utt.text and "imgur" not in utt.text:
                convo_utts = list(convo.iter_utterances())
                convo_texts = []
                for curr_utt in convo_utts:
                    if curr_utt.text and curr_utt.text != "[removed]" and curr_utt.text != "[deleted]" and curr_utt.speaker.id != "AutoModerator" and not curr_utt.text.startswith("**") and "^^" not in curr_utt.text and "imgur" not in curr_utt.text and len(curr_utt.text) >= 5 and "I am a bot" not in curr_utt.text and "Bot" not in curr_utt.text:
                        convo_texts.append(f"{curr_utt.speaker.id}: {curr_utt.text}")
                if not convo_texts:
                    continue
                convo_text = " ".join(convo_texts).strip()  # Combine utterances into one text
                if utt.id in data["utterance_id"].values:  # Check if utterance already sampled
                    print(f"Duplicate utterance found: '{convo_text}'. Reselecting...")
                    continue
                else:
                    print(f"Selected utterance: '{convo_text}' from subreddit: {subreddit}")
                    temp_list.append([utt.id, subreddit, convo.meta["title"], convo_text, utt.speaker.id, utt.text, "", utt.meta["score"], convo.meta["num_comments"], ""])
                    break

    temp_df = pd.DataFrame(temp_list, columns=cols)
    data = pd.concat([data, temp_df], ignore_index=True)

# Define the path to the Downloads directory
downloads_dir = os.path.join(os.path.expanduser("~"), "Downloads")

# Define the path to the CSV file in the Downloads directory
csv_file_path = os.path.join(downloads_dir, "sampled_utterances.csv")

# Write the DataFrame to CSV
data.to_csv(csv_file_path, index=False)

print(f"Sampled utterances saved to {csv_file_path}")


Dataset already exists at /Users/admin/.convokit/downloads/subreddit-NinjasHyper
Number of Speakers: 672
Number of Utterances: 1708
Number of Conversations: 944
Selected utterance: 'Atruen: Hey so I saw a montage of ninja and decided to start watching his stream, he played a match and then just started randomly dancing to a Japanese song? I’m so confused  myphiber: first win of the day Atruen: Lol where did that start tho' from subreddit: NinjasHyper
Selected utterance: 'Knickerbockerstape: Save dat money ft. Fetty Wap by Lil Dicky Dakotamannino: Also, his donation song is I need a dollar by aloe black' from subreddit: NinjasHyper
Selected utterance: 'oplix: you honestly can't be serious wickedc0ntender: What do u mean?' from subreddit: NinjasHyper
Selected utterance: 'RadioFreeZerg_: I know whenever he gets his first win, he often does his pon pon dance. Out of curiosity, does he pay for rights to use the music, or how does that work? gokickrocks-: I’ve been wondering that as well, es