In [1]:
import praw
import pandas as pd
import json

In [13]:
# Reddit API credentials are read from a JSON file kept outside the notebook;
# the keys used below are "client_id", "client_secret" and "user_agent".
config_file_path = "/path/to/reddit_config.json"

# Name the encoding explicitly so the file is decoded the same way on every
# platform instead of falling back to the locale default.
with open(config_file_path, "r", encoding="utf-8") as config_file:
    reddit_config = json.load(config_file)

# Shared PRAW client used by the scraping cells below.
reddit = praw.Reddit(
    client_id=reddit_config["client_id"],
    client_secret=reddit_config["client_secret"],
    user_agent=reddit_config["user_agent"],
)

In [3]:
def scrape_subreddit(subreddit_name, keywords, limit=50, time_filter="all"):
    """Search one subreddit and return the matching posts as a DataFrame.

    Uses the notebook-level ``reddit`` client.

    Parameters
    ----------
    subreddit_name : str
        Name handed to ``reddit.subreddit``, e.g. ``"privacy"``.
    keywords : str
        Search query forwarded to ``subreddit.search``.
    limit : int, default 50
        Forwarded to ``subreddit.search`` as ``limit``.
    time_filter : str, default "all"
        Forwarded to ``subreddit.search`` as ``time_filter``.

    Returns
    -------
    pandas.DataFrame
        One row per submission with the columns ``title``, ``content``,
        ``upvotes``, ``num_comments``, ``url`` and ``comments``; the last one
        holds a list with the bodies of at most ten top-level comments.
    """
    subreddit = reddit.subreddit(subreddit_name)
    posts_data = []

    # Search with the caller's arguments rather than a notebook global, so the
    # result depends only on what was passed in.
    search_results = subreddit.search(
        keywords, limit=limit, time_filter=time_filter
    )

    for submission in search_results:
        # Collect post details
        post_details = {
            "title": submission.title,
            "content": submission.selftext,
            "upvotes": submission.score,
            "num_comments": submission.num_comments,
            "url": submission.url,
            "comments": [],
        }

        # limit=0 asks PRAW to strip the "load more comments" placeholders
        # without fetching them, leaving only comments that have a body.
        submission.comments.replace_more(limit=0)

        # Keep the first ten top-level comments.
        post_details["comments"].extend(
            comment.body for comment in submission.comments[:10]
        )

        posts_data.append(post_details)

    return pd.DataFrame(posts_data)

# Candidate search phrases about LLMs, developers and privacy.
# NOTE(review): none of the cells shown here read this list; the scraping
# loop passes the single `query` string instead. Confirm whether it is still needed.
keywords = [
    "ChatGPT privacy AND developers",
    "How developers use LLMs for privacy",
    "Privacy concerns with AI tools",
    "Ethical AI for secure coding",
    "Large language models AND GDPR compliance",
    "Developers' privacy-related questions for ChatGPT",
    "AI tools for data-sensitive programming",
    "Privacy in AI-assisted development",
    "Confidential data and LLM applications",
    "Responsible AI for secure coding practices",
]


In [4]:
# Settings consumed by the scraping loop.
subreddits = [
    "privacy",
    "ChatGPT",
    "programming",
    "opensource",
]
query = "ChatGPT OR LLM OR AI privacy"
# Passed to scrape_subreddit as `limit` for each subreddit.
limit = 10
# Collects one DataFrame per subreddit.
dataframes = list()

In [5]:
# Start from an empty list every time this cell runs; otherwise re-running it
# would append a second copy of every subreddit's posts.
dataframes = []

for subreddit_name in subreddits:
    print(f"Scraping subreddit: {subreddit_name}...")
    df = scrape_subreddit(subreddit_name, query, limit)
    # Tag each row with its source so the frames can be told apart once combined.
    df["subreddit"] = subreddit_name
    dataframes.append(df)

Scraping subreddit: privacy...
Scraping subreddit: ChatGPT...
Scraping subreddit: programming...
Scraping subreddit: opensource...


In [6]:
# Stack the per-subreddit frames into one table with a fresh 0..n-1 index.
final_df = pd.concat(objs=dataframes, ignore_index=True)

In [7]:
# Write the combined posts to disk; index=False leaves the row index out of the file.
final_df.to_csv(path_or_buf="reddit_llm_privacy_posts.csv", index=False)
print("Scraped data saved to 'reddit_llm_privacy_posts.csv'.")

Scraped data saved to 'reddit_llm_privacy_posts.csv'.


In [9]:
# Leave the frame as the cell's last expression so Jupyter renders it as a
# table instead of the wrapped plain-text dump that print() produces.
final_df.head()

                                               title  \
0                      ChatGPT/OpenAI privacy policy   
1  What are the Privacy Risks of using LLMs like ...   
2  Data privacy doubts regarding using commercial...   
3  LLMs are trained on your data, without you eve...   
4  Privacy for the rich. In a record setting pace...   

                                             content  upvotes  num_comments  \
0  ChatGPT privacy policy\n\nI've gone through th...        0             6   
1  It's a commonly held belief that LLMs are a pr...        1             6   
2  I've been working on integrating GPT3 API as w...        6             2   
3  I realized today as I was browsing my Quora's ...      358            37   
4                                                       13112           723   

                                                 url  \
0  https://www.reddit.com/r/privacy/comments/1g5f...   
1  https://www.reddit.com/r/privacy/comments/1d9m...   
2  https://www.reddi