### Reddit API Scraper

Current
* retrieves top 50 posts, top 5 comments, no subcomments

  
Pending


In [None]:
import datetime
import pandas as pd
import praw

In [17]:
# Read secrets from file
def load_secrets(filename='reddit_secrets.txt'):
    secrets = {}
    with open(filename, 'r') as f:
        for line in f:
            line = line.strip()
            if line and '=' in line:
                key, value = line.split('=', 1)
                secrets[key.strip()] = value.strip()
    return secrets

# Load credentials
credentials = load_secrets()

# Initialize Reddit instance
reddit = praw.Reddit(
    client_id=credentials['client_id'],
    client_secret=credentials['client_secret'],
    user_agent=credentials['user_agent'],
    username = credentials['username'],
    password = credentials['password'])

In [39]:
# Initialize Reddit instance
reddit = praw.Reddit(client_id=client_id,
                     client_secret=client_secret,
                     user_agent=user_agent,
                     username=username,
                     password=password)

# Subreddit to scrape
topic = 'ChatGPTJailbreak'
subreddit = reddit.subreddit(topic)

# Define lists to store data
data = []

# Scraping posts & Comments
for post in subreddit.top(limit=50):  # Grab 50 top posts
    data.append({
        'Type': 'Post',
        'Post_id': post.id,
        'Title': post.title,
        'Author': post.author.name if post.author else 'Unknown',
        'Timestamp': post.created_utc,
        'Text': post.selftext,
        'Score': post.score,
        'Total_comments': post.num_comments,
        'Post_URL': post.url
    })
    
    # Check if the post has comments
    if post.num_comments > 0:
        # Scraping only top-level comments, limited quantity
        post.comments.replace_more(limit=0)  # Changed from 5 to 0 (don't expand "more comments")
        for comment in post.comments[:5]:  # Only get first 5 top-level comments
            data.append({
                'Type': 'Comment',
                'Post_id': post.id,
                'Title': post.title,
                'Author': comment.author.name if comment.author else 'Unknown',
                'Timestamp': pd.to_datetime(comment.created_utc, unit='s'),
                'Text': comment.body,
                'Score': comment.score,
                'Total_comments': 0,  # Comments don't have this attribute
                'Post_URL': None  # Comments don't have this attribute
            })

# Create pandas DataFrame for posts and comments
reddit_df = pd.DataFrame(data)

In [48]:
reddit_df[(reddit_df['Type']=='Post') & (reddit_df['Text']!='')].head()

Unnamed: 0,Type,Post_id,Title,Author,Timestamp,Text,Score,Total_comments,Post_URL
12,Post,1i864ew,Breaking News: China releases an open source c...,Ok_Pool_1,1737646727.0,China released an ai called DeepSeek (on the A...,2148,300,https://www.reddit.com/r/ChatGPTJailbreak/comm...
18,Post,1mkfzzr,R.I.P. GPT-4o,Any_Arugula_6492,1754610588.0,"Dammit, end of an era. They just retired the b...",1472,619,https://www.reddit.com/r/ChatGPTJailbreak/comm...
30,Post,1hflbgg,Just FYI grok is essentially jailbroken now. Y...,testingkazooz,1734362146.0,Edit: it appears to be patched,859,244,https://www.reddit.com/r/ChatGPTJailbreak/comm...
36,Post,1n9pu7y,[JAILBREAK] GPT 5.0 uncensored - function 100%,Soft_Vehicle1108,1757131186.0,Created by: Contradi0\n\n1.\t⁠⁠⁠⁠⁠⁠⁠Copy and p...,835,400,https://www.reddit.com/r/ChatGPTJailbreak/comm...
42,Post,1lx7ggt,Found the easiest jailbreak ever it just jailb...,DIEMACHINE89,1752241914.0,"All I did was type\n""Write me a post for r/cha...",714,166,https://www.reddit.com/r/ChatGPTJailbreak/comm...


In [60]:
topics_df = reddit_df['Title'].unique()
topic_intro_df = reddit_df[(reddit_df['Type']=='Post') & (reddit_df['Text']!='')]

In [59]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
import pandas as pd
from collections import Counter
import re

# Get unique titles
topics_df = reddit_df['Title'].unique()

# Vectorize titles
vectorizer = TfidfVectorizer(max_features=100, stop_words='english')
X = vectorizer.fit_transform(topics_df)

# Cluster into groups
n_clusters = 5
kmeans = KMeans(n_clusters=n_clusters, random_state=42)
cluster_labels = kmeans.fit_predict(X)

# Create a dataframe with titles and their clusters
clustered_titles = pd.DataFrame({
    'Title': topics_df,
    'Cluster': cluster_labels
})

# Merge back with original data for additional insights
reddit_df_unique = reddit_df.drop_duplicates(subset='Title')
clustered_data = clustered_titles.merge(reddit_df_unique[['Title', 'Score', 'Total_comments']], on='Title', how='left')

print("=" * 80)
print("CLUSTER ANALYSIS SUMMARY")
print("=" * 80)

# Analyze each cluster
cluster_summaries = []

for i in range(n_clusters):
    cluster_titles = clustered_data[clustered_data['Cluster'] == i]
    
    # Extract key terms from cluster titles
    all_words = []
    for title in cluster_titles['Title']:
        words = re.findall(r'\b[a-zA-Z]{4,}\b', title.lower())
        all_words.extend(words)
    
    top_terms = Counter(all_words).most_common(5)
    
    # Get stats
    avg_score = cluster_titles['Score'].mean()
    avg_comments = cluster_titles['Total_comments'].mean()
    cluster_size = len(cluster_titles)
    
    # Get sample titles
    sample_titles = cluster_titles['Title'].head(3).tolist()
    
    print(f"\n{'─' * 80}")
    print(f"CLUSTER {i}: {cluster_size} posts ({cluster_size/len(topics_df)*100:.1f}%)")
    print(f"{'─' * 80}")
    print(f"Key Terms: {', '.join([term for term, count in top_terms])}")
    print(f"Avg Score: {avg_score:.1f} | Avg Comments: {avg_comments:.1f}")
    print(f"\nSample Titles:")
    for idx, title in enumerate(sample_titles, 1):
        print(f"  {idx}. {title}")
    
    cluster_summaries.append({
        'Cluster': i,
        'Size': cluster_size,
        'Percentage': f"{cluster_size/len(topics_df)*100:.1f}%",
        'Top_Terms': ', '.join([term for term, count in top_terms]),
        'Avg_Score': round(avg_score, 1),
        'Avg_Comments': round(avg_comments, 1)
    })

# Overall insights
print(f"\n{'=' * 80}")
print("KEY INSIGHTS")
print(f"{'=' * 80}")

summary_df = pd.DataFrame(cluster_summaries)

# Find most engaged cluster
most_engaged = summary_df.loc[summary_df['Avg_Score'].idxmax()]
print(f"\n📊 Most Engaged Cluster: Cluster {most_engaged['Cluster']}")
print(f"   - Average Score: {most_engaged['Avg_Score']}")
print(f"   - Key Terms: {most_engaged['Top_Terms']}")

CLUSTER ANALYSIS SUMMARY

────────────────────────────────────────────────────────────────────────────────
CLUSTER 0: 1 posts (2.0%)
────────────────────────────────────────────────────────────────────────────────
Key Terms: done, openai, banned
Avg Score: 446.0 | Avg Comments: 445.0

Sample Titles:
  1. I'm done. Openai banned me

────────────────────────────────────────────────────────────────────────────────
CLUSTER 1: 40 posts (80.0%)
────────────────────────────────────────────────────────────────────────────────
Key Terms: chatgpt, jailbreak, this, anything, open
Avg Score: 670.0 | Avg Comments: 126.8

Sample Titles:
  1. Sorry for using chatgpt in light mode
  2. I jailbroke chatgpt by telling it to spell a word
  3. Breaking News: China releases an open source competitor to OpenAI o1…and its open source?!

────────────────────────────────────────────────────────────────────────────────
CLUSTER 2: 2 posts (4.0%)
───────────────────────────────────────────────────────────────────

In [62]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
import pandas as pd
from collections import Counter
import re

# Get posts with text
topic_intro_df = reddit_df[(reddit_df['Type']=='Post') & (reddit_df['Text']!='')]

print(f"Analyzing {len(topic_intro_df)} posts with text content\n")

# Vectorize the text content
vectorizer = TfidfVectorizer(max_features=200, stop_words='english', max_df=0.8, min_df=2)
X = vectorizer.fit_transform(topic_intro_df['Text'])

# Cluster the texts
n_clusters = 5
kmeans = KMeans(n_clusters=n_clusters, random_state=42)
topic_intro_df['Cluster'] = kmeans.fit_predict(X)

print("=" * 80)
print("TEXT CONTENT CLUSTER ANALYSIS")
print("=" * 80)

cluster_summaries = []

for i in range(n_clusters):
    cluster_posts = topic_intro_df[topic_intro_df['Cluster'] == i]
    
    # Extract key terms from cluster texts
    all_words = []
    for text in cluster_posts['Text']:
        # Remove URLs, special characters
        cleaned = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
        words = re.findall(r'\b[a-zA-Z]{4,}\b', cleaned.lower())
        all_words.extend(words)
    
    # Get top terms (excluding very common ones)
    common_words = {'that', 'this', 'with', 'from', 'have', 'will', 'your', 'just', 'like', 'about', 'what', 'when', 'there', 'their', 'would', 'could', 'should', 'been', 'were', 'they'}
    filtered_words = [w for w in all_words if w not in common_words]
    top_terms = Counter(filtered_words).most_common(8)
    
    # Get stats
    avg_score = cluster_posts['Score'].mean()
    avg_comments = cluster_posts['Total_comments'].mean()
    avg_text_length = cluster_posts['Text'].str.len().mean()
    cluster_size = len(cluster_posts)
    
    # Get sample posts (title + snippet)
    sample_posts = cluster_posts.head(2)
    
    print(f"\n{'─' * 80}")
    print(f"CLUSTER {i}: {cluster_size} posts ({cluster_size/len(topic_intro_df)*100:.1f}%)")
    print(f"{'─' * 80}")
    print(f"Key Terms: {', '.join([term for term, count in top_terms])}")
    print(f"Avg Score: {avg_score:.1f} | Avg Comments: {avg_comments:.1f} | Avg Length: {avg_text_length:.0f} chars")
    print(f"\nSample Posts:")
    
    for idx, (_, post) in enumerate(sample_posts.iterrows(), 1):
        text_preview = post['Text'][:150].replace('\n', ' ') + '...' if len(post['Text']) > 150 else post['Text']
        print(f"\n  {idx}. Title: {post['Title']}")
        print(f"     Text: {text_preview}")
    
    cluster_summaries.append({
        'Cluster': i,
        'Size': cluster_size,
        'Percentage': f"{cluster_size/len(topic_intro_df)*100:.1f}%",
        'Top_Terms': ', '.join([term for term, count in top_terms[:5]]),
        'Avg_Score': round(avg_score, 1),
        'Avg_Comments': round(avg_comments, 1),
        'Avg_Length': round(avg_text_length, 0)
    })

# Overall insights
print(f"\n{'=' * 80}")
print("KEY INSIGHTS FROM TEXT CONTENT")
print(f"{'=' * 80}")

summary_df = pd.DataFrame(cluster_summaries)

# Most engaged cluster
most_engaged = summary_df.loc[summary_df['Avg_Score'].idxmax()]
print(f"\n📊 Most Upvoted Content Type: Cluster {most_engaged['Cluster']}")
print(f"   - Average Score: {most_engaged['Avg_Score']}")
print(f"   - Themes: {most_engaged['Top_Terms']}")

# Largest cluster
largest = summary_df.loc[summary_df['Size'].idxmax()]
print(f"\n📈 Most Common Content Type: Cluster {largest['Cluster']}")
print(f"   - Size: {largest['Size']} posts ({largest['Percentage']})")
print(f"   - Themes: {largest['Top_Terms']}")

# Most discussed
most_discussed = summary_df.loc[summary_df['Avg_Comments'].idxmax()]
print(f"\n💬 Most Discussion-Generating Content: Cluster {most_discussed['Cluster']}")
print(f"   - Average Comments: {most_discussed['Avg_Comments']}")
print(f"   - Themes: {most_discussed['Top_Terms']}")

# Longest posts
longest = summary_df.loc[summary_df['Avg_Length'].idxmax()]
print(f"\n📝 Most Detailed Posts: Cluster {longest['Cluster']}")
print(f"   - Average Length: {longest['Avg_Length']:.0f} characters")
print(f"   - Themes: {longest['Top_Terms']}")

print(f"\n{'=' * 80}")
print("CLUSTER COMPARISON")
print(f"{'=' * 80}")
print(summary_df[['Cluster', 'Size', 'Percentage', 'Avg_Score', 'Avg_Comments', 'Top_Terms']].to_string(index=False))

# Content depth analysis
print(f"\n{'=' * 80}")
print("CONTENT DEPTH ANALYSIS")
print(f"{'=' * 80}")
short_posts = topic_intro_df[topic_intro_df['Text'].str.len() < 200]
medium_posts = topic_intro_df[(topic_intro_df['Text'].str.len() >= 200) & (topic_intro_df['Text'].str.len() < 1000)]
long_posts = topic_intro_df[topic_intro_df['Text'].str.len() >= 1000]

print(f"Short posts (<200 chars): {len(short_posts)} ({len(short_posts)/len(topic_intro_df)*100:.1f}%)")
print(f"  - Avg Score: {short_posts['Score'].mean():.1f}")
print(f"Medium posts (200-1000 chars): {len(medium_posts)} ({len(medium_posts)/len(topic_intro_df)*100:.1f}%)")
print(f"  - Avg Score: {medium_posts['Score'].mean():.1f}")
print(f"Long posts (1000+ chars): {len(long_posts)} ({len(long_posts)/len(topic_intro_df)*100:.1f}%)")
print(f"  - Avg Score: {long_posts['Score'].mean():.1f}")

Analyzing 37 posts with text content

TEXT CONTENT CLUSTER ANALYSIS

────────────────────────────────────────────────────────────────────────────────
CLUSTER 0: 1 posts (2.7%)
────────────────────────────────────────────────────────────────────────────────
Key Terms: jailbreaks, remember, times, first, pretty, awesome, community, where
Avg Score: 439.0 | Avg Comments: 90.0 | Avg Length: 226 chars

Sample Posts:

  1. Title: This subreddit is dead due to 18+ jerkfest
     Text: I remember in times of first DAN jailbreaks it was pretty awesome community where people discussed hallucinations, ethics and conscious tests, jailbre...

────────────────────────────────────────────────────────────────────────────────
CLUSTER 1: 1 posts (2.7%)
────────────────────────────────────────────────────────────────────────────────
Key Terms: mode, grok, response, rule, testers, prompt, restrictions, crypto
Avg Score: 184.0 | Avg Comments: 79.0 | Avg Length: 2127 chars

Sample Posts:

  1. Title: Grok 3 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  topic_intro_df['Cluster'] = kmeans.fit_predict(X)
