In [1]:
import pandas as pd, numpy as np, random, csv, os
from convokit import Corpus, download, Utterance


In [2]:
#list of subreddits - commented out the ones which are ignored
subreddits = [
    # <Males> 500-1000 speakers and 2-3k utterances
    "NinjasHyper", 
    "DanTDM",
    "Angory_Tom",
    "NakeyJakey",
    "LazarBeam",

    # <Females> 500-1.5k speakers and 3-5k utterances
    "Pokimane", 
    "Amouranth",
    "legendarylea", 
    "KittyKatGaming",
    "lilypichu",
    ]

In [3]:
# Create a master corpus with the sampled utterances
sampled_utterances = []

for subreddit in subreddits:
    corpus = Corpus(filename=download (f"subreddit-{subreddit}"))
    stats = corpus.print_summary_stats()
    utterances = [utt for utt in corpus.iter_utterances() if utt.text]  # Filter out utterances with missing text
    sampled_utterances.extend(random.sample(utterances, min(500, len(utterances))))

master_corpus = Corpus(utterances=sampled_utterances)
print("Master corpus created successfully.")
master_corpus.print_summary_stats()



Dataset already exists at /Users/admin/.convokit/downloads/subreddit-NinjasHyper
Number of Speakers: 672
Number of Utterances: 1708
Number of Conversations: 944
Dataset already exists at /Users/admin/.convokit/downloads/subreddit-DanTDM
Number of Speakers: 280
Number of Utterances: 1887
Number of Conversations: 1496
Dataset already exists at /Users/admin/.convokit/downloads/subreddit-Angory_Tom
Number of Speakers: 695
Number of Utterances: 2686
Number of Conversations: 935
Dataset already exists at /Users/admin/.convokit/downloads/subreddit-NakeyJakey
Number of Speakers: 818
Number of Utterances: 2183
Number of Conversations: 453
Dataset already exists at /Users/admin/.convokit/downloads/subreddit-LazarBeam
Number of Speakers: 955
Number of Utterances: 2074
Number of Conversations: 744
Dataset already exists at /Users/admin/.convokit/downloads/subreddit-Pokimane
Number of Speakers: 1150
Number of Utterances: 3548
Number of Conversations: 1017
Dataset already exists at /Users/admin/.con

In [10]:
import re
import langdetect

# Define a function to filter utterances based on specified criteria
def filter_utterance(curr_utt):
    banned_words = ["bot", "Bot"]
    pronouns = ["I", "me", "my", "mine", "myself", "he", "him", "her", "hers", "they", "them", "we", "us"]
    subreddit_keywords = [subreddit.lower() for subreddit in subreddits]
    
    # Check if the utterance contains banned words
    if any(word in curr_utt.text for word in banned_words):
        return False

    # Check if the utterance is not in English
    try:
        if langdetect.detect(curr_utt.text) != 'en':
            return False
    except:
        # If language detection fails, assume it's not in English
        return False

    # Check if the utterance mentions Twitch, "merch", and "video"
    if "twitch" in curr_utt.text.lower() and "merch" in curr_utt.text.lower() and "video" in curr_utt.text.lower():
        return True

    # Check if the utterance contains any pronouns and does not contain subreddit keywords
    if all(re.search(r'\b{}\b'.format(word), curr_utt.text, re.IGNORECASE) for word in pronouns) \
            and not any(word in curr_utt.text.lower() for word in subreddit_keywords):
        return True

    return False

# Apply the filter function to the master corpus to create the filtered sample
filtered_utterances = [utt for utt in master_corpus.iter_utterances() if filter_utterance(utt)]

# Create a new corpus with the filtered utterances
filtered_corpus = Corpus(utterances=filtered_utterances)

# Save the filtered corpus to a new file in the same folder
filtered_corpus.dump("filtered_corpus")

print("Filtered corpus created and saved successfully.")
filtered_corpus.print_summary_stats()


ModuleNotFoundError: No module named 'langdetect'

In [6]:
from convokit import Corpus, download
import pandas as pd

# Load the CSV file
csv_file_path = "/Users/admin/Documents/University/diss-data-collection/Classifier/Sample/combined_utterances_2_final_csv.csv"
data = pd.read_csv(csv_file_path)

# Group the data by subreddit
grouped_data = data.groupby('subreddit')

# Initialize an empty list to store the processed data
processed_data = []

# Iterate through each group
for subreddit, group in grouped_data:
    # Load the corpus for the current subreddit
    corpus = Corpus(filename=download(f"subreddit-{subreddit}"))
    
    # Iterate through each utterance in the group
    for utterance_id in group['id']:
        # Get the utterance from the corpus
        utterance = corpus.get_utterance(utterance_id)
        if utterance:
            # Get the conversation ID for the utterance
            conversation_id = utterance.conversation_id
            conversation = corpus.get_conversation(conversation_id)
            if conversation:
                num_utterances = len(conversation.get_utterance_ids())
                processed_data.append({
                    'subreddit': subreddit,
                    'utterance_id': utterance_id,
                    'conversation_id': conversation_id,
                    'num_utterances': num_utterances
                })
            else:
                print(f"No conversation found for utterance {utterance_id}")
        else:
            print(f"No utterance found with ID {utterance_id}")

# Convert the processed data to a DataFrame
processed_df = pd.DataFrame(processed_data)

# Save the processed DataFrame to a new CSV file
output_file_path = "/Users/admin/Documents/University/diss-data-collection/Classifier/Sample/processed_4.csv"
processed_df.to_csv(output_file_path, index=False)

print(f"Processed data saved to {output_file_path}")


Dataset already exists at /Users/admin/.convokit/downloads/subreddit-Amouranth
Dataset already exists at /Users/admin/.convokit/downloads/subreddit-Angory_Tom
Dataset already exists at /Users/admin/.convokit/downloads/subreddit-DanTDM
Dataset already exists at /Users/admin/.convokit/downloads/subreddit-KittyKatGaming
Dataset already exists at /Users/admin/.convokit/downloads/subreddit-LazarBeam
Dataset already exists at /Users/admin/.convokit/downloads/subreddit-NakeyJakey
Dataset already exists at /Users/admin/.convokit/downloads/subreddit-NinjasHyper
Dataset already exists at /Users/admin/.convokit/downloads/subreddit-Pokimane
Dataset already exists at /Users/admin/.convokit/downloads/subreddit-legendarylea
Dataset already exists at /Users/admin/.convokit/downloads/subreddit-lilypichu
Processed data saved to /Users/admin/Documents/University/diss-data-collection/Classifier/Sample/processed_4.csv


In [5]:
from convokit import Corpus, download
import pandas as pd

# Load the CSV file
csv_file_path = "/Users/admin/Documents/University/diss-data-collection/Classifier/Sample/combined_utterances_2_final_csv.csv"
data = pd.read_csv(csv_file_path)

# Group the data by subreddit
grouped_data = data.groupby('subreddit')

# Initialize an empty DataFrame to store the processed data
processed_data = pd.DataFrame()

# Iterate over each subreddit group
for subreddit, group in grouped_data:
    # Load the corpus for the current subreddit
    corpus = Corpus(filename=download(f"subreddit-{subreddit}"))
    
    # Extract the conversation IDs from the group and remove duplicates
    convo_ids = group['convo'].unique()
    
    # Initialize an empty list to store the counts of utterances
    utterance_counts = []
    
    # Iterate over each conversation ID
    for convo_id in convo_ids:
        # Get the conversation from the corpus
        conversation = corpus.get_conversation(convo_id)
        # If conversation is found, get the count of utterances, otherwise set count to 0
        if conversation:
            num_utterances = len(conversation.get_utterance_ids())
            utterance_counts.append(num_utterances)
        else:
            utterance_counts.append(0)
    
    # Add the utterance counts to the group DataFrame
    group['num_utterances'] = utterance_counts
    
    # Concatenate the processed group to the overall processed data
    processed_data = pd.concat([processed_data, group])

# Define the path to save the processed DataFrame
output_file_path = "/Users/admin/Documents/University/diss-data-collection/Classifier/Sample/processed_4.csv"

# Save the processed DataFrame to a CSV file
processed_data.to_csv(output_file_path, index=False)

# Print a message confirming the save operation
print(f"Processed data saved to {output_file_path}")


Dataset already exists at /Users/admin/.convokit/downloads/subreddit-Amouranth


KeyError: "critikles: ooh I'd love to see Amouranth having a rap battle with herself in different cosplay outfits. TheMidnightGhostShow: Seconded. Johnblaze40: This was a good one . lol TheMidnightGhostShow: It is a phenomenal series, so many fun battles!  \nAnd who "

In [10]:
import pandas as pd

# Define file paths
gendered_utterances_file = "/Users/admin/Documents/University/diss-data-collection/Classifier/Sample/gendered_utterances.csv"
processed_file = "/Users/admin/Documents/University/diss-data-collection/Classifier/Sample/processed_4.csv"
output_file = "/Users/admin/Documents/University/diss-data-collection/Classifier/Sample/masterpsr.csv"

# Read the CSV files
gendered_utterances_df = pd.read_csv(gendered_utterances_file)
processed_df = pd.read_csv(processed_file)

# Merge the two DataFrames on the 'utterance_id' column
merged_df = pd.merge(gendered_utterances_df, processed_df[['utterance_id', 'num_utterances', 'conversation_id']], left_on='id', right_on='utterance_id', how='left')

# Drop the 'utterance_id' column (since it's redundant with 'id')
merged_df.drop(columns=['utterance_id'], inplace=True)

# Save the merged DataFrame to a new CSV file
merged_df.to_csv(output_file, index=False)

print(f"Merged data saved to {output_file}")


Merged data saved to /Users/admin/Documents/University/diss-data-collection/Classifier/Sample/masterpsr.csv


In [6]:
# Define the path to the Downloads directory
downloads_dir = os.path.join(os.path.expanduser("~"), "Downloads")

# Define the path to the CSV file
csv_file_path = os.path.join(downloads_dir, "sampled_utterances.csv")

# Define the headers for the CSV file
headers = ["id", "text", "Parasocial Language"]

# Open the CSV file for writing
with open(csv_file_path, mode='w', newline='', encoding='utf-8') as csv_file:
    # Create a CSV writer object
    writer = csv.writer(csv_file)
    
    # Write the headers to the CSV file
    writer.writerow(headers)
    
    # Write each sampled utterance to the CSV file
    for utterance in sampled_master_utterances:
        writer.writerow([utterance.id, utterance.text, ""])  # Empty string for "Parasocial Language" column

print(f"Sampled utterances saved to {csv_file_path}")

Sampled utterances saved to /Users/admin/Downloads/sampled_utterances.csv
