<a href="https://colab.research.google.com/github/minyansh7/Terramare-AudienceResearch/blob/main/Insights_listening_Reddit.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# 🛠️ Install dependencies
!pip install praw pandas

import logging
import re
from datetime import datetime, timedelta, timezone

import pandas as pd
import praw



In [None]:
# Read the Reddit API credentials from the Colab secret store.
# Expected secret names: `Reddit_Client_ID` and `Reddit_Secret`.
from google.colab import userdata

reddit_client_id = userdata.get('Reddit_Client_ID')
reddit_secret = userdata.get('Reddit_Secret')
user_agent = 'd-listening/0.1'

# Fail fast with a readable message rather than an opaque auth error on the first API call.
if not reddit_client_id or not reddit_secret:
    raise ValueError("Reddit_Client_ID and Reddit_Secret must both be set in Colab secrets.")

# The scraping cells report progress with logging.info(); without an INFO-level
# handler on the root logger those messages are dropped. `force=True` replaces any
# handlers that were already installed so the level actually takes effect.
logging.basicConfig(level=logging.INFO, format="%(message)s", force=True)

# Suppress PRAW's async environment warning
logging.getLogger("praw").setLevel(logging.ERROR)


In [None]:
# ✅ Setup Reddit connection
# Build the PRAW client from the credentials and user agent loaded earlier.
reddit = praw.Reddit(client_id=reddit_client_id, client_secret=reddit_secret, user_agent=user_agent)
# 🎯 Subreddits to scan
# Every entry needs its own trailing comma: adjacent string literals are silently
# concatenated by Python (a missing comma would merge "Zen" and "biohackers").
aussie_subs = [
    "meditation",       # Core subreddit
    "streamentry",      # Hardcore Buddhist meditation community
    "Buddhism",         # Vipassana and insight posts
    "Zen",              # r/zen — Koans, concentration, awareness
    "biohackers",       # NSDR, nootropics, breathwork
    "flowarts",         # Movement meditation, flow
    "selfimprovement",  # Habit-building overlap
]


# === 🇦🇺 Aussie context keywords (normalized) ===
# Lower-cased once here so they can be compared against lower-cased post/comment text.
aussie_terms = [t.lower() for t in [
    "australia", "aussie", "nsw", "vic", "centrelink", "medicare", "mygov",
    "woolies", "coles", "uni", "headspace", "black dog", "beyond blue",
    "tafe", "smiling mind", "r u ok", "NDIS", "QLD", "ACT", "myki", "VIC Health"
]]

# === 🧘 Meditation term pattern (fuzzy) ===
# Matches any word that starts with the stem "meditat": meditate(s/d), meditating,
# meditation(s), meditative, meditator... A trailing `\b` directly after an optional
# suffix would reject inflections such as "meditated", so the stem is followed by `\w*`.
# The leading `\b` still keeps words like "premeditated" out.
meditation_pattern = re.compile(r"\bmeditat\w*", re.IGNORECASE)

# === 🔎 Scraping setup ===
max_items = 960   # overall cap across posts and comments
scraped = 0       # running count of accepted items
results = []      # accepted items, one dict per post/comment
# Epoch seconds for "two years ago". An aware UTC datetime is used because
# .timestamp() on a naive datetime interprets it in the machine's local timezone.
after = int((datetime.now(timezone.utc) - timedelta(days=730)).timestamp())  # last 2 years

# === 🔁 Scrape posts ===
# Whole-word matchers for the Aussie terms, compiled once. Plain substring checks
# give false positives for short terms ("uni" in "community", "act" in "practice").
# Both the terms and the text they are matched against are lower-case.
aussie_term_patterns = {term: re.compile(rf"\b{re.escape(term)}\b") for term in aussie_terms}

for sub in aussie_subs:
    if scraped >= max_items:
        break
    logging.info(f"🔍 Scanning posts in r/{sub}...")
    # Reddit's `after` request parameter is a pagination cursor, not a timestamp, so it
    # cannot express "last two years"; the window is enforced on `created_utc` instead.
    for post in reddit.subreddit(sub).search("meditat", sort="new", limit=None):
        if scraped >= max_items:
            break
        if post.created_utc < after:
            continue  # older than the two-year window

        content = f"{post.title} {post.selftext}".lower()
        flair = (post.link_flair_text or "").lower()
        user_flair = (post.author_flair_text or "").lower()
        searchable = (content, flair, user_flair)
        matched_terms = [
            term for term, pattern in aussie_term_patterns.items()
            if any(pattern.search(text) for text in searchable)
        ]

        # Keep only posts that mention meditation, carry Aussie context and have score >= 2.
        if meditation_pattern.search(content) and matched_terms and post.score >= 2:
            results.append({
                "type": "post",
                "subreddit": post.subreddit.display_name,
                "author": str(post.author),
                "score": post.score,
                "title": post.title,
                "full_text": post.selftext[:4000],
                "matched_terms": matched_terms,
                "url": f"https://reddit.com{post.permalink}",
                "created_utc": datetime.utcfromtimestamp(post.created_utc).isoformat(),
                "link_flair_text": post.link_flair_text,
                "num_comments": post.num_comments
            })
            scraped += 1

# === 💬 Scrape comments ===
# Whole-word matchers for the Aussie terms, compiled once. Plain substring checks
# give false positives for short terms ("uni" in "community", "act" in "practice").
aussie_term_patterns = {term: re.compile(rf"\b{re.escape(term)}\b") for term in aussie_terms}

for sub in aussie_subs:
    if scraped >= max_items:
        break
    logging.info(f"💬 Scanning comments in r/{sub}...")
    for comment in reddit.subreddit(sub).comments(limit=1000):
        if scraped >= max_items:
            break
        if comment.created_utc < after:
            continue  # older than the two-year window

        body = comment.body.lower()
        matched_terms = [
            term for term, pattern in aussie_term_patterns.items()
            if pattern.search(body)
        ]

        # Keep only comments that mention meditation, carry Aussie context and have score >= 2.
        if meditation_pattern.search(body) and matched_terms and comment.score >= 2:
            results.append({
                "type": "comment",
                # Use the display name, as the post records do, so both record types
                # share the same subreddit casing.
                "subreddit": comment.subreddit.display_name,
                "author": str(comment.author),
                "score": comment.score,
                "title": None,
                "full_text": comment.body[:4000],
                "matched_terms": matched_terms,
                "url": f"https://reddit.com{comment.permalink}",
                "created_utc": datetime.utcfromtimestamp(comment.created_utc).isoformat(),
                "link_flair_text": None,
                "comment_depth": getattr(comment, "depth", None)
            })
            scraped += 1

# === 📊 Output results ===
# Turn the collected records into a frame and show the first ten rows untruncated.
df = pd.DataFrame(data=results)
pd.options.display.max_colwidth = None
summary_line = f"\n✅ Scraped {len(df)} AU-relevant items mentioning meditation:\n"
print(summary_line)
print(df.head(10))


✅ Scraped 193 AU-relevant items mentioning meditation:

   type   subreddit                author  score  \
0  post  Meditation               Anos_17      3   
1  post  Meditation            AhadhXilal      5   
2  post  Meditation     NeitherEitherPuss      2   
3  post  Meditation  TartGroundbreaking38      2   
4  post  Meditation              Firizzer     13   
5  post  Meditation     EitherCartoonist1      7   
6  post  Meditation     TawniTotesSmashes    430   
7  post  Meditation              sickient    742   
8  post  Meditation            byteme8bit      2   
9  post  Meditation               fat5lut      2   

                                                                                                                                                                                                                                                                                                      title  \
0                                                                  

In [None]:
# ✅ Setup Reddit connection
# Build the PRAW client from the credentials and user agent loaded earlier.
reddit = praw.Reddit(client_id=reddit_client_id, client_secret=reddit_secret, user_agent=user_agent)
# 🎯 Subreddits to scan
#   meditation      — Core subreddit
#   streamentry     — Hardcore Buddhist meditation community
#   Buddhism        — Vipassana and insight posts
#   Zen             — Koans, concentration, awareness
#   biohackers      — NSDR, nootropics, breathwork
#   flowarts        — Movement meditation, flow
#   selfimprovement — Habit-building overlap
aussie_subs = [
    "meditation", "streamentry", "Buddhism", "Zen",
    "biohackers", "flowarts", "selfimprovement",
]


# === 🇦🇺 Aussie context keywords (normalized) ===
# Lower-cased once here so they can be compared against lower-cased post/comment text.
aussie_terms = [t.lower() for t in [
    "australia", "aussie", "nsw", "vic", "centrelink", "medicare", "mygov",
    "woolies", "coles", "uni", "headspace", "black dog", "beyond blue",
    "tafe", "smiling mind", "r u ok", "NDIS", "QLD", "ACT", "myki", "VIC Health"
]]

# === 🧘 Meditation term pattern (fuzzy) ===
# Matches any word that starts with the stem "meditat": meditate(s/d), meditating,
# meditation(s), meditative, meditator... A trailing `\b` directly after an optional
# suffix would reject inflections such as "meditated", so the stem is followed by `\w*`.
# The leading `\b` still keeps words like "premeditated" out.
meditation_pattern = re.compile(r"\bmeditat\w*", re.IGNORECASE)

# === 🔎 Scraping setup ===
max_items = 2000    # overall cap across posts and comments
scraped_count = 0   # running count of accepted items
results = []        # accepted items, one dict per post/comment

# Time threshold: items created more than two years ago are skipped by the loops that
# read `two_years_ago_timestamp`.
# `two_years_ago` stays a naive UTC datetime; the epoch value is derived from an
# explicitly UTC-tagged copy because .timestamp() on a naive datetime assumes the
# machine's local timezone.
two_years_ago = datetime.now(timezone.utc).replace(tzinfo=None) - timedelta(days=730)
two_years_ago_timestamp = int(two_years_ago.replace(tzinfo=timezone.utc).timestamp())

# === 🔁 Scrape Posts First (newest-first listing, filtered client-side) ===
logging.info("Phase 1: Scraping Posts...")

# Whole-word matchers for the Aussie terms, compiled once instead of per post.
# Both the terms and the text they are matched against are lower-case.
aussie_term_patterns = {term: re.compile(rf"\b{re.escape(term)}\b") for term in aussie_terms}

for sub in aussie_subs:
    if scraped_count >= max_items:
        break
    logging.info(f"🔍 Scanning recent posts in r/{sub}...")

    try:
        # No time filter is passed to `.new()`, so the two-year window is enforced
        # explicitly on each post's timestamp below.
        # NOTE(review): Reddit listings are believed to stop near 1000 items regardless
        # of `limit` — confirm before relying on 2000 candidates per subreddit.
        for post in reddit.subreddit(sub).new(limit=2000):
            if scraped_count >= max_items:
                break

            # Apply time filter (explicitly check timestamp)
            if post.created_utc < two_years_ago_timestamp:
                continue  # Skip if older than 2 years

            content = f"{post.title} {post.selftext}".lower()
            flair = (post.link_flair_text or "").lower()
            user_flair = (post.author_flair_text or "").lower()
            searchable = (content, flair, user_flair)

            # Filter criteria: meditation keyword, Aussie context, score >= 2.
            # The Aussie terms are matched against this post's own text and flairs.
            has_meditation_term = meditation_pattern.search(content) is not None
            matched_aussie_terms = [
                term for term, pattern in aussie_term_patterns.items()
                if any(pattern.search(text) for text in searchable)
            ]
            meets_score = post.score >= 2

            if has_meditation_term and matched_aussie_terms and meets_score:
                results.append({
                    "type": "post",
                    "subreddit": post.subreddit.display_name,
                    "author": str(post.author),
                    "score": post.score,
                    "title": post.title,
                    "full_text": post.selftext[:4000],
                    "matched_terms": matched_aussie_terms,
                    "url": f"https://reddit.com{post.permalink}",
                    "created_utc": datetime.utcfromtimestamp(post.created_utc).isoformat(),
                    "link_flair_text": post.link_flair_text,
                    "num_comments": post.num_comments,
                    "id": post.id  # Store ID to fetch comments later
                })
                scraped_count += 1

    except Exception as e:
        # Best effort: one failing subreddit must not abort the run. The traceback is
        # logged so programming errors are not mistaken for API failures.
        logging.exception(f"Error scraping r/{sub} posts: {e}")


# === 💬 Scrape Comments from the Scraped Posts ===
# Only the comment threads of posts accepted in phase 1 are inspected, rather than
# every comment in each subreddit.
logging.info("\nPhase 2: Scraping Comments from Found Posts...")
post_ids_to_check_comments = [item['id'] for item in results if item['type'] == 'post']

for post_id in post_ids_to_check_comments:
    if scraped_count >= max_items:
        break

    try:
        submission = reddit.submission(id=post_id)
        # replace_more(limit=0) drops the 'MoreComments' placeholders instead of
        # expanding them; list() then flattens the remaining comment forest.
        submission.comments.replace_more(limit=0)
        all_comments = submission.comments.list()

        logging.info(f"  Checking {len(all_comments)} comments for post: {submission.title[:50]}...")

        for comment in all_comments:
            if scraped_count >= max_items:
                break

            # Comments created before the two-year cutoff are ignored.
            if comment.created_utc < two_years_ago_timestamp:
                continue

            body = comment.body.lower()

            # Acceptance criteria: meditation keyword, Aussie context, score >= 2.
            has_meditation_term = meditation_pattern.search(body) is not None
            matched_aussie_terms = [
                candidate for candidate in aussie_terms
                if re.search(rf"\b{re.escape(candidate)}\b", body, re.IGNORECASE)
            ]
            meets_score = comment.score >= 2

            if not (has_meditation_term and matched_aussie_terms and meets_score):
                continue

            results.append(dict(
                type="comment",
                subreddit=comment.subreddit.display_name,
                author=str(comment.author),
                score=comment.score,
                title=None,  # comments have no title
                full_text=comment.body[:4000],
                matched_terms=matched_aussie_terms,
                url=f"https://reddit.com{comment.permalink}",
                created_utc=datetime.utcfromtimestamp(comment.created_utc).isoformat(),
                link_flair_text=None,  # comments have no link flair
                comment_depth=getattr(comment, "depth", None),
            ))
            scraped_count += 1

    except Exception as e:
        logging.error(f"Error scraping comments for post {post_id}: {e}")


# === 📊 Output results ===
# Build the frame and cap it at max_items in case the loops overshot the target.
df = pd.DataFrame(results).head(max_items)

pd.set_option('display.max_colwidth', None)

if df.empty:
    # A frame built from an empty list has no 'score'/'type'/'subreddit' columns, so
    # the sort and the statistics below would raise KeyError. Report and stop here.
    print(f"\n⚠️ Scraped 0 AU-relevant items mentioning meditation (target: {max_items}).")
else:
    # Highest-scoring items first.
    df = df.sort_values(by='score', ascending=False).reset_index(drop=True)

    print(f"\n✅ Scraped {len(df)} AU-relevant items mentioning meditation (target: {max_items}):\n")
    print(df.head(10))

    # Display some stats
    print(f"\nTotal items scraped: {len(df)}")
    print(f"Number of posts scraped: {len(df[df['type'] == 'post'])}")
    print(f"Number of comments scraped: {len(df[df['type'] == 'comment'])}")
    print(f"Median Score: {df['score'].median()}")
    print(f"Average Score: {df['score'].mean():.2f}")
    print(f"Items by Subreddit:\n{df['subreddit'].value_counts()}")



✅ Scraped 1366 AU-relevant items mentioning meditation (target: 2000):

   type        subreddit               author  score  \
0  post  selfimprovement           TrulyWacky   2831   
1  post       Meditation           Pranavtare   1903   
2  post       Meditation      KongeriketNorge    921   
3  post       Meditation    LilMissSunfloweer    889   
4  post       Meditation  Radiant-Candle-3290    635   
5  post  selfimprovement    Street_Break_2532    615   
6  post       Meditation         Karoliniskis    597   
7  post       Meditation              drabhin    550   
8  post       Meditation        Fly-Astronaut    532   
9  post         Buddhism           DharmaFool    404   

                                                                                                           title  \
0                                                                                                Poop in silence   
1  I finally committed to 30 days of unguided meditation, no apps, no fluff. J

In [None]:
# To optimise and hit the 1000 target:
# - Focus scraping on more targeted subreddits first.
# - Use the `subreddit.new()`, `subreddit.controversial()` or `subreddit.top()` listings instead of `search` for posts, as these can be more effective for recent activity within a time window. Only `top()` and `controversial()` accept a time filter (`time_filter='year'` or `time_filter='all'`); results from `new()` must be filtered by `created_utc` after fetching.
# - For comments, instead of iterating *all* comments, iterate through the comments of the *scraped posts* that met the initial criteria. This keeps the comment search relevant to the found posts.
# - Increase the limits for initial post scraping to allow more potential candidates before filtering.
# - Implement a more dynamic approach that potentially increases the scrape depth or expands search criteria if the initial pass doesn't yield enough results.
# - Prioritize scoring potentially higher based on keywords or subreddits if needed.

In [None]:
import duckdb

# 🦆 Reset the DuckDB table
# Open (or create) the database file and remove any previous copy of the table so the
# next save starts from a clean slate.
db_file = "reddit.duckdb"
conn = duckdb.connect(database=db_file)
drop_statement = "DROP TABLE IF EXISTS reddit_meditation_au"
conn.execute(drop_statement)
print("✅ reddit_meditation_au table dropped if it existed.")

✅ reddit_meditation_au table dropped if it existed.


In [None]:
# 🧠 STEP 10: Save to DuckDB
import duckdb

# 🦆 Save to DuckDB
db_file = "reddit.duckdb"
conn = duckdb.connect(database=db_file)

# Create an empty table with the DataFrame's columns unless it already exists
# (`WHERE FALSE` copies the schema without copying any rows).
create_statement = """
CREATE TABLE IF NOT EXISTS reddit_meditation_au AS
SELECT * FROM df WHERE FALSE
"""
conn.execute(create_statement)

# Expose the DataFrame to SQL under the name "df", then append all of its rows.
conn.register("df", df)
insert_statement = "INSERT INTO reddit_meditation_au SELECT * FROM df"
conn.execute(insert_statement)

print(f"✅ Saved {len(df)} records to DuckDB → reddit.duckdb")


✅ Saved 1366 records to DuckDB → reddit.duckdb


In [None]:
# Read back the five highest-scoring rows to confirm the save worked.
preview_query = "SELECT * FROM reddit_meditation_au ORDER BY score DESC LIMIT 5"
preview = conn.execute(preview_query).fetchdf()
print(preview)


   type        subreddit               author  score  \
0  post  selfimprovement           TrulyWacky   2831   
1  post       Meditation           Pranavtare   1903   
2  post       Meditation      KongeriketNorge    921   
3  post       Meditation    LilMissSunfloweer    889   
4  post       Meditation  Radiant-Candle-3290    635   

                                                                                                           title  \
0                                                                                                Poop in silence   
1  I finally committed to 30 days of unguided meditation, no apps, no fluff. Just silence. Here’s what happened.   
2                                  I switched from mindfulness to humming meditation and now my anxiety vanished   
3                                I started sitting in silence for 10 minutes a day, no phone, no music, no tasks   
4                                                     I noticed I’ve been waiting t

In [None]:
#if you want to later use CrewAI or LangChain to load this data:
import duckdb

# Reopen the database file and load the whole table into a DataFrame.
conn = duckdb.connect(database="reddit.duckdb")
select_all = "SELECT * FROM reddit_meditation_au"
data = conn.execute(select_all).fetchdf()