<a href="https://colab.research.google.com/github/minyansh7/Terramare-AudienceResearch/blob/main/Insights_listening_Reddit.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Section 1. Reddit Data Scrape**



In [None]:
# 🛠️ Install dependencies
!pip install praw pandas

import praw
import re
import pandas as pd
from datetime import datetime, timedelta
import logging



In [None]:
# prompt: set Reddit_Client_ID
# Reddit_Secret in secrets, how to read it in colab

from google.colab import userdata
reddit_client_id = userdata.get('Reddit_Client_ID')
reddit_secret = userdata.get('Reddit_Secret')
user_agent = 'd-listening/0.1'

# Suppress PRAW's async environment warning
logging.getLogger("praw").setLevel(logging.ERROR)


In [None]:
import praw
import re
import logging
import pandas as pd
from datetime import datetime, timedelta

# === ✅ Setup Reddit connection ===
reddit = praw.Reddit(
    client_id=reddit_client_id,
    client_secret=reddit_secret,
    user_agent=user_agent
)

# === 🎯 Subreddits to scan ===
aussie_subs = [
    "meditation", "streamentry", "Buddhism", "Zen",
    "biohackers", "flowarts", "selfimprovement"
]

# === 🇦🇺 Aussie context keywords (normalized) ===
aussie_terms = [t.lower() for t in [
    "australia", "aussie", "nsw", "vic", "centrelink", "medicare", "mygov",
    "woolies", "coles", "uni", "headspace", "black dog", "beyond blue",
    "tafe", "smiling mind", "r u ok", "NDIS", "QLD", "ACT", "myki", "VIC Health"
]]

# === 🧘 Meditation term pattern (fuzzy) ===
meditation_pattern = re.compile(r"\bmeditat(?:ion|ing|e)?\b", re.IGNORECASE)

# === 🔎 Scraping setup ===
max_items = 5000
scraped_count = 0
results = []

# === 🕒 Time filter for last 2 years ===
two_years_ago = datetime.utcnow() - timedelta(days=730)
two_years_ago_timestamp = int(two_years_ago.timestamp())

# === 🔁 Phase 1: Scrape Posts ===
logging.info("Phase 1: Scraping Posts...")
for sub in aussie_subs:
    if scraped_count >= max_items:
        break
    logging.info(f"🔍 Scanning r/{sub}...")

    try:
        for post in reddit.subreddit(sub).new(limit=5000):
            if scraped_count >= max_items:
                break
            if post.created_utc < two_years_ago_timestamp or post.score < 2:
                continue

            content = f"{post.title} {post.selftext}".lower()
            flair = (post.link_flair_text or "").lower()
            user_flair = (post.author_flair_text or "").lower()
            has_meditation_term = meditation_pattern.search(content)
            matched_aussie_terms = [
                term for term in aussie_terms
                if term in content or term in flair or term in user_flair
            ]
            if has_meditation_term and matched_aussie_terms:
                results.append({
                    "type": "post",
                    "subreddit": post.subreddit.display_name,
                    "author": str(post.author),
                    "score": post.score,
                    "title": post.title,
                    "full_text": post.selftext[:4000],
                    "matched_terms": matched_aussie_terms,
                    "url": f"https://reddit.com{post.permalink}",
                    "created_utc": datetime.utcfromtimestamp(post.created_utc).isoformat(),
                    "link_flair_text": post.link_flair_text,
                    "num_comments": post.num_comments,
                    "id": post.id
                })
                scraped_count += 1

    except Exception as e:
        logging.error(f"Error scraping r/{sub} posts: {e}")

# === 💬 Phase 2: Scrape Comments from Found Posts ===
logging.info("\nPhase 2: Scraping Comments from Found Posts...")
post_ids_to_check_comments = [r['id'] for r in results if r['type'] == 'post']

for post_id in post_ids_to_check_comments:
    if scraped_count >= max_items:
        break

    try:
        submission = reddit.submission(id=post_id)
        submission.comments.replace_more(limit=0)
        all_comments = submission.comments.list()
        logging.info(f"  Checking {len(all_comments)} comments in: {submission.title[:50]}...")

        for comment in all_comments:
            if scraped_count >= max_items:
                break
            if comment.created_utc < two_years_ago_timestamp or comment.score < 2:
                continue

            body = comment.body.lower()
            has_meditation_term = meditation_pattern.search(body)
            matched_aussie_terms = [term for term in aussie_terms if term in body]

            if has_meditation_term and matched_aussie_terms:
                results.append({
                    "type": "comment",
                    "subreddit": comment.subreddit.display_name,
                    "author": str(comment.author),
                    "score": comment.score,
                    "title": None,
                    "full_text": comment.body[:4000],
                    "matched_terms": matched_aussie_terms,
                    "url": f"https://reddit.com{comment.permalink}",
                    "created_utc": datetime.utcfromtimestamp(comment.created_utc).isoformat(),
                    "link_flair_text": None,
                    "comment_depth": getattr(comment, "depth", None)
                })
                scraped_count += 1

    except Exception as e:
        logging.error(f"Error scraping comments for post {post_id}: {e}")

# === 📊 Output Results ===
df = pd.DataFrame(results).head(max_items)
pd.set_option('display.max_colwidth', None)

print(f"\n✅ Scraped {len(df)} AU-relevant items mentioning meditation (target: {max_items}):\n")
print(df.head(10))

# Stats
print(f"\nTotal items scraped: {len(df)}")
print(f"Number of posts scraped: {len(df[df['type'] == 'post'])}")
print(f"Number of comments scraped: {len(df[df['type'] == 'comment'])}")
print(f"Median Score: {df['score'].median()}")
print(f"Average Score: {df['score'].mean():.2f}")
print(f"Items by Subreddit:\n{df['subreddit'].value_counts()}")



✅ Scraped 2977 AU-relevant items mentioning meditation (target: 5000):

   type   subreddit                author  score  \
0  post  Meditation          Anima_Monday      2   
1  post  Meditation           FunnyOWL007      7   
2  post  Meditation           kikemeister      2   
3  post  Meditation              TinaTeng      9   
4  post  Meditation                sidgat      8   
5  post  Meditation  Intrepid_Strike_2454      2   
6  post  Meditation    ExperienceMany4417     35   
7  post  Meditation         EthanGisclair      6   
8  post  Meditation        AnnualPath9528      5   
9  post  Meditation         Teastainedeye      8   

                                                                                                       title  \
0                                                 Equanimity and Non-Attachment Meditation, A Novel Approach   
1                                       I didn’t think meditation could help the planet… until I tried this.   
2                 

In [None]:
# ✅ Setup Reddit connection
######
######
######
######
#original script#replaced by above optimised one
reddit = praw.Reddit(
    client_id=reddit_client_id,
    client_secret=reddit_secret,
    user_agent=user_agent
)
# 🎯 Subreddits to scan
aussie_subs = [
    "meditation",       # Core subreddit
    "streamentry",      # Hardcore Buddhist meditation community
    "Buddhism",         # Vipassana and insight posts
    "Zen",               # R/zen — Koans, concentration, awareness
    "biohackers",       # NSDR, nootropics, breathwork
    "flowarts",         # Movement meditation, flow
    "selfimprovement"   # Habit-building overlap
]


# === 🇦🇺 Aussie context keywords (normalized) ===
aussie_terms = [t.lower() for t in [
    "australia", "aussie", "nsw", "vic", "centrelink", "medicare", "mygov",
    "woolies", "coles", "uni", "headspace", "black dog", "beyond blue",
    "tafe", "smiling mind", "r u ok", "NDIS", "QLD", "ACT", "myki", "VIC Health"
]]

# === 🧘 Meditation term pattern (fuzzy) ===
meditation_pattern = re.compile(r"\bmeditat(?:ion|ing|e)?\b", re.IGNORECASE)

# === 🔎 Scraping setup ===
max_items = 2
scraped_count = 0
results = []
# We will handle the date filtering differently depending on the method used

# Set a target timestamp for the last 2 years, but note that methods like .new()
# don't use 'after' with a timestamp like search does. We'll filter later.
# Time threshold
two_years_ago = datetime.utcnow() - timedelta(days=730)
two_years_ago_timestamp = int(two_years_ago.timestamp())

# === 🔁 Scrape Posts First (using .new() with time_filter or .top()) ===
# Trying .new() with 'year' time_filter first
logging.info("Phase 1: Scraping Posts...")
for sub in aussie_subs:
    if scraped_count >= max_items:
        break
    logging.info(f"🔍 Scanning recent posts in r/{sub}...")

    try:
        # Use .new() with time_filter 'year' to get recent posts within the last year
        # Note: This is approximate and doesn't guarantee exactly 2 years. We filter later.
        for post in reddit.subreddit(sub).new(limit=3000): # Increased limit per sub to find more candidates
            if scraped_count >= max_items:
                break

            # Apply time filter (explicitly check timestamp)
            if post.created_utc < two_years_ago_timestamp:
                continue # Skip if older than 2 years


            # Filter criteria: meditation keyword, Aussie context, score >= 2
            content = f"{post.title} {post.selftext}".lower()
            flair = (post.link_flair_text or "").lower()
            user_flair = (post.author_flair_text or "").lower()
            matched_aussie_terms = [term for term in aussie_terms if term in content or term in flair or term in user_flair]
            has_meditation_term = meditation_pattern.search(content) is not None
            meets_score = post.score >= 2

            if has_meditation_term and matched_aussie_terms and meets_score:
                results.append({
                    "type": "post",
                    "subreddit": post.subreddit.display_name,
                    "author": str(post.author),
                    "score": post.score,
                    "title": post.title,
                    "full_text": post.selftext[:4000],
                    "matched_terms": matched_aussie_terms,
                    "url": f"https://reddit.com{post.permalink}",
                    "created_utc": datetime.utcfromtimestamp(post.created_utc).isoformat(),
                    "link_flair_text": post.link_flair_text,
                    "num_comments": post.num_comments,
                    "id": post.id # Store ID to fetch comments later
                })
                scraped_count += 1
                # logging.info(f"  Found post: {post.title} (Score: {post.score}, Scraped: {scraped_count})")

    except Exception as e:
        logging.error(f"Error scraping r/{sub} posts: {e}")


# === 💬 Scrape Comments from the Scraped Posts ===
# This is much more targeted and efficient than scraping all comments in a subreddit.
logging.info("\nPhase 2: Scraping Comments from Found Posts...")
post_ids_to_check_comments = [r['id'] for r in results if r['type'] == 'post'] # Get IDs of found posts

for post_id in post_ids_to_check_comments:
    if scraped_count >= max_items:
        break

    try:
        submission = reddit.submission(id=post_id)
        # Fetch all comments from the submission (Reddit may return a forest, not just top level)
        # Use replace_more(limit=0) to avoid fetching potentially huge amounts of 'MoreComments' objects
        submission.comments.replace_more(limit=0)
        all_comments = submission.comments.list()

        logging.info(f"  Checking {len(all_comments)} comments for post: {submission.title[:50]}...")

        for comment in all_comments:
            if scraped_count >= max_items:
                break

            # Skip if older than 2 years (check comment creation time)
            if comment.created_utc < two_years_ago_timestamp:
                continue # Skip if older than 2 years

            body = comment.body.lower()

            # Filter criteria: meditation keyword, Aussie context, score >= 2
            has_meditation_term = meditation_pattern.search(body) is not None
            matched_aussie_terms = [term for term in aussie_terms if term in body]
            meets_score = comment.score >= 2

            if has_meditation_term and matched_aussie_terms and meets_score:
                results.append({
                    "type": "comment",
                    "subreddit": comment.subreddit.display_name,
                    "author": str(comment.author),
                    "score": comment.score,
                    "title": None, # Comments don't have titles
                    "full_text": comment.body[:4000],
                    "matched_terms": matched_aussie_terms,
                    "url": f"https://reddit.com{comment.permalink}",
                    "created_utc": datetime.utcfromtimestamp(comment.created_utc).isoformat(),
                    "link_flair_text": None, # Comments don't have link flair
                    "comment_depth": getattr(comment, "depth", None)
                })
                scraped_count += 1
                # logging.info(f"    Found comment (Score: {comment.score}, Scraped: {scraped_count})")

    except Exception as e:
        logging.error(f"Error scraping comments for post {post_id}: {e}")

# === 📊 Output results ===
df = pd.DataFrame(results)
# Ensure we only keep the required number of items if we exceeded max_items during the process
df = df.head(max_items)

# Sort by score or another relevant metric to potentially show higher quality results first
#df = df.sort_values(by='score', ascending=False).reset_index(drop=True)


pd.set_option('display.max_colwidth', None)
print(f"\n✅ Scraped {len(df)} AU-relevant items mentioning meditation (target: {max_items}):\n")
print(df.head(10))

# Display some stats
print(f"\nTotal items scraped: {len(df)}")
print(f"Number of posts scraped: {len(df[df['type'] == 'post'])}")
print(f"Number of comments scraped: {len(df[df['type'] == 'comment'])}")
print(f"Median Score: {df['score'].median()}")
print(f"Average Score: {df['score'].mean():.2f}")
print(f"Items by Subreddit:\n{df['subreddit'].value_counts()}")



In [None]:
# To optimise and hit the 1000 target:
# - Focus scraping on more targeted subreddits first.
# - Use the `subreddit.new()` or `subreddit.controversial()` or `subreddit.top()` methods with a time filter (`time_filter='year'` or `time_filter='all'`) instead of `search` for posts, as these can be more effective for recent activity within a time window.
# - For comments, instead of iterating *all* comments, iterate through the comments of the *scraped posts* that met the initial criteria. This keeps the comment search relevant to the found posts.
# - Increase the limits for initial post scraping to allow more potential candidates before filtering.
# - Implement a more dynamic approach that potentially increases the scrape depth or expands search criteria if the initial pass doesn't yield enough results.
# - Prioritize scoring potentially higher based on keywords or subreddits if needed.

In [None]:
import duckdb
from datetime import datetime
import pandas as pd

# Generate timestamped filename
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
db_file = f"reddit_{timestamp}.duckdb"
table_name = "reddit_meditation_au"

# Save table
con = duckdb.connect(db_file)
con.execute(f"DROP TABLE IF EXISTS {table_name}")
con.execute(f"CREATE TABLE {table_name} AS SELECT * FROM df")
con.close()

print(f"✅ Saved table '{table_name}' to {db_file}")


CatalogException: Catalog Error: Table with name df does not exist!
Did you mean "pg_depend"?

LINE 1: CREATE TABLE reddit_meditation_au AS SELECT * FROM df
                                                           ^

In [None]:
con = duckdb.connect(db_file)
tables = con.execute("SHOW TABLES").fetchall()
print("📋 Tables before copy:", tables)
con.close()


📋 Tables before copy: [('reddit_meditation_au',)]


In [5]:
from google.colab import drive
import os
import shutil

# Mount Drive
drive.mount('/content/drive')

# Define paths
drive_path = '/content/drive/MyDrive/reddit_data'
os.makedirs(drive_path, exist_ok=True)

# Copy using same name
dest_file = os.path.join(drive_path, db_file)
shutil.copy(db_file, dest_file)
print(f"✅ Copied '{db_file}' to Drive → {dest_file}")

Mounted at /content/drive
✅ Copied 'reddit_20250606_190637.duckdb' to Drive → /content/drive/MyDrive/reddit_data/reddit_20250606_190637.duckdb


**Section 2. double checl Reload data successfully from Google Drive**

In [14]:
# ✅ Step 1: Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')
# ✅ Step 2: Set full path to your DuckDB file
import duckdb

dest_file = "/content/drive/MyDrive/reddit_data/reddit_20250605_083205.duckdb"

# Connect to the file
con = duckdb.connect(dest_file)
# ✅ Step 3: Check available tables
tables = con.execute("SHOW TABLES").fetchall()
print("📋 Tables in Drive copy:", tables)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
📋 Tables in Drive copy: [('reddit_meditation_au',)]


In [15]:
# Try loading
df_loaded = con.execute("SELECT * FROM reddit_meditation_au").df()
print(f"✅ Loaded {len(df_loaded)} rows from Drive copy.")
con.close()


✅ Loaded 1 rows from Drive copy.


In [16]:
import pandas as pd
# === 📊 STEP 4: Preview & Summary ===
if not df_loaded.empty:
    pd.set_option('display.max_colwidth', None)
    print(df_loaded.head(10))

    # Summary stats
    print("\n📊 Summary:")
    print(f"- Total items: {len(df_loaded)}")
    print(f"- Posts: {len(df_loaded[df_loaded['type'] == 'post'])}")
    print(f"- Comments: {len(df_loaded[df_loaded['type'] == 'comment'])}")
    print(f"- Median Score: {df_loaded['score'].median()}")
    print(f"- Average Score: {df_loaded['score'].mean():.2f}")
    print("\n🔎 Items by Subreddit:")
    print(df_loaded['subreddit'].value_counts())
else:
    print("No data loaded.")


   type   subreddit        author  score  \
0  post  Meditation  Anima_Monday      2   

                                                        title  \
0  Equanimity and Non-Attachment Meditation, A Novel Approach   

                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                            

**Section 3. LLMs: Summary, key themes, pain points, and emotions**

In [34]:
!pip install -q transformers accelerate

In [None]:
import os
os.kill(os.getpid(), 9)


**Breakdown for production**

In [5]:
!pip install tqdm



In [6]:
import json
import pandas as pd
from tqdm import tqdm
import duckdb
from datetime import datetime
import os
import shutil
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoModelForCausalLM
from google.colab import drive


In [7]:
# ✅ Step 1: Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')
# ✅ Step 2: Set full path to your DuckDB file
import duckdb

dest_file = "/content/drive/MyDrive/reddit_data/reddit_20250605_083205.duckdb"

# Connect to the file
con = duckdb.connect(dest_file)
# ✅ Step 3: Check available tables
tables = con.execute("SHOW TABLES").fetchall()
print("📋 Tables in Drive copy:", tables)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
📋 Tables in Drive copy: [('reddit_meditation_au',)]


In [8]:
# Try loading
df_loaded = con.execute("SELECT * FROM reddit_meditation_au").df()
print(f"✅ Loaded {len(df_loaded)} rows from Drive copy.")
con.close()
# --- Validate Input ---
if "full_text" not in df_loaded.columns:
    raise ValueError("❌ 'full_text' column is missing from df_loaded.")

✅ Loaded 2977 rows from Drive copy.


**GoEmotions is a corpus of 58k carefully curated comments extracted from Reddit, with human annotations to 27 emotion categories, along with label Neutral.**

References:
*   GoEmotions Dataset on Hugging Face: https://huggingface.co/datasets/google-research-datasets/go_emotions
*   Google Research Blog: GoEmotions: A Dataset for Fine-Grained Emotion Classification(https://research.google/blog/goemotions-a-dataset-for-fine-grained-emotion-classification/)
*  Original Paper: GoEmotions: A Dataset of Fine-Grained Emotions
(https://arxiv.org/abs/2005.00547)


For a comprehensive list of the emotion labels and additional details, you can refer to the dataset's README on Hugging Face: https://huggingface.co/datasets/google-research-datasets/go_emotions/blob/main/README.md


In [9]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import torch
import os

os.environ["TOKENIZERS_PARALLELISM"] = "true"

emotion_model_id = "joeddav/distilbert-base-uncased-go-emotions-student"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# ✅ Load model and convert to float16
emotion_model = AutoModelForSequenceClassification.from_pretrained(
    emotion_model_id
).to(device).half().eval()  # .half() here

emotion_tokenizer = AutoTokenizer.from_pretrained(emotion_model_id)


emotion_labels = [
    "admiration", "amusement", "anger", "annoyance", "approval", "caring", "confusion", "curiosity",
    "desire", "disappointment", "disapproval", "disgust", "embarrassment", "excitement", "fear",
    "gratitude", "grief", "joy", "love", "nervousness", "optimism", "pride", "realization", "relief",
    "remorse", "sadness", "surprise", "neutral"
]


In [10]:
from transformers import AutoTokenizer, AutoModelForCausalLM

model_id = "teknium/OpenHermes-2.5-Mistral-7B"

llm_model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    torch_dtype=torch.float16  # A100 supports float16
)

llm_tokenizer = AutoTokenizer.from_pretrained(model_id)


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [11]:
TEMPLATE = """
Carefully analyze the following user-generated content. Focus only on language related to meditation, mindfulness, emotional states, or experiences involving flow, awareness, or self-regulation — including any occasions, motivations, or purposes behind engaging in these practices.
Read the text below and complete the 9 tasks that follow.
---
{text}
---

Perform the following tasks with attention to the user’s original language and context:

Tasks:
1. Summarize only the parts related to meditation, mindfulness, flow, or self-regulation.
2. List user’s pain points using their exact words (no rephrasing).
3. List emotional expressions using the user's original words.
4. Give the overall emotion in 1 word (e.g., “anxious”, “calm”).
5. Give a sentiment score from -1 (negative) to +1 (positive).
6. Label sentiment as "positive" or "negative".
7. Identify the single most prominent theme in 1–3 words(e.g., "letting go", "inner peace").
8. Infer a likely demographic identity ("unknown" if unclear).
9. List phrases from the text that led to your demographic guess.


Return the answer in **exactly this JSON format**:

{{
  "summary": "...",
  "pain_points": ["..."],
  "emotion_phrases": ["..."],
  "emotion_label": "...",
  "sentiment_score": 0.42,
  "sentiment": "positive",
  "themes": ["..."],
  "demographic_proximity": "...",
  "demographic_evidence_phrases": ["..."]
}}
"""

In [12]:
import json
import re
import torch
import time
from typing import Optional

# --- Global GPU-safe fallback constants ---
_JSON_FAIL_RESPONSE = {"success": False, "data": None, "error": "JSON parse failed"}
_EMOTION_FAIL_RESPONSE = lambda text: {
    "success": False,
    "data": dict(zip(emotion_labels, [None]*len(emotion_labels))),
    "error": "Emotion model failure",
    "text": text
}

# --- Fix common JSON key issues (e.g., unquoted keys) ---
def fix_json_keys(raw_str):
    return re.sub(r'(?<={|,)\s*(\w+)\s*:', r'"\1":', raw_str)

# --- Extract last JSON block from raw output ---
def extract_last_json_block(output_text):
    matches = list(re.finditer(r'\{[\s\S]*?\}', output_text))
    return matches[-1].group(0) if matches else None

# --- Retry decorator for fault-tolerant inference ---
def retry_on_fail(max_retries=1, wait_seconds=1.0):
    def decorator(func):
        def wrapper(*args, **kwargs):
            for attempt in range(max_retries + 1):
                try:
                    return func(*args, **kwargs)
                except torch.cuda.OutOfMemoryError as e:
                    print(f"🔥 GPU OOM on attempt {attempt + 1}: {e}")
                    torch.cuda.empty_cache()
                    time.sleep(wait_seconds)
                except Exception as e:
                    print(f"❌ {func.__name__} failed on attempt {attempt + 1}: {e}")
                    time.sleep(wait_seconds)
            return None
        return wrapper
    return decorator

# --- JSON parsing after LLM generation ---
@retry_on_fail(max_retries=1)
@torch.inference_mode()
def safe_generate_and_parse(input_text: str, tokenizer, model, template: str, max_tokens=512, row_id: Optional[int] = None):
    prompt = template.replace("{text}", input_text.strip())
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=2048).to(model.device)

    output = model.generate(
        **inputs,
        max_new_tokens=max_tokens,
        pad_token_id=tokenizer.eos_token_id,
        do_sample=False
    )

    decoded = tokenizer.decode(output[0], skip_special_tokens=True)
    json_str = extract_last_json_block(decoded)

    if not json_str:
        print(f"❌ [{row_id}] No JSON found.\nRaw output:\n{decoded[:300]}")
        return _JSON_FAIL_RESPONSE | {"raw": decoded, "row_id": row_id}

    try:
        return {"success": True, "data": json.loads(json_str), "row_id": row_id}
    except json.JSONDecodeError:
        try:
            fixed_str = fix_json_keys(json_str)
            return {"success": True, "data": json.loads(fixed_str), "row_id": row_id}
        except json.JSONDecodeError as e:
            print(f"❌ [{row_id}] JSON error after fix: {e}")
            return _JSON_FAIL_RESPONSE | {"raw": fixed_str, "row_id": row_id}

# --- Emotion classifier (GoEmotions-style multi-label) ---
@retry_on_fail(max_retries=1)
@torch.inference_mode()
def get_emotion_vector(text: str, tokenizer, model, labels, device="cuda"):
    try:
        inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True).to(device)
        logits = model(**inputs).logits
        probs = torch.sigmoid(logits)[0].cpu().numpy()
        return {"success": True, "data": dict(zip(labels, probs)), "text": text}
    except Exception as e:
        print(f"❌ Emotion model error: {e}")
        return _EMOTION_FAIL_RESPONSE(text)

In [13]:
import os
import duckdb
import pandas as pd
from tqdm import tqdm
from datetime import datetime

# --- Config ---
BATCH_SIZE = 100
TABLE_NAME = "reddit_nlp_features"
DRIVE_SUBFOLDER = "/content/drive/MyDrive/reddit_data"
DRIVE_DB_FILE = os.path.join(DRIVE_SUBFOLDER, "reddit_progress.duckdb")
CHECKPOINT_FILE = "checkpoint.txt"

# --- Ensure Drive is mounted ---
assert os.path.exists(DRIVE_SUBFOLDER), "Google Drive folder not found. Please mount Drive first."

# --- Connect to DuckDB directly in Drive ---
conn = duckdb.connect(DRIVE_DB_FILE)

# --- Fetch schema if table exists ---
try:
    table_cols = [col[1] for col in conn.execute(f"PRAGMA table_info({TABLE_NAME})").fetchall()]
except:
    table_cols = []

# --- Resume from checkpoint ---
start_index = 0
if os.path.exists(CHECKPOINT_FILE):
    with open(CHECKPOINT_FILE, "r") as f:
        start_index = int(f.read().strip())
    print(f"⏩ Resuming from checkpoint at row {start_index}")

# --- Track failures ---
failed_rows = []

# --- Main Batch Loop ---
for start_idx in tqdm(range(start_index, len(df_loaded), BATCH_SIZE), desc="🔄 Processing Batches"):
    end_idx = min(start_idx + BATCH_SIZE, len(df_loaded))
    print(f"\n🚀 Batch {start_idx}–{end_idx - 1}")
    batch_start_time = datetime.now()

    batch = df_loaded.iloc[start_idx:end_idx]
    combined_rows = []

    for idx, row in batch.iterrows():
        try:
            text = str(row.get("full_text", ""))[:1800]
            row_meta = row.drop(labels=["full_text"]).to_dict()

            # --- Run models ---
            parsed = safe_generate_and_parse(input_text=text, tokenizer=llm_tokenizer, model=llm_model, template=TEMPLATE, row_id=idx)
            emotion = get_emotion_vector(text=text, tokenizer=emotion_tokenizer, model=emotion_model, labels=emotion_labels, device=device)

            if parsed["success"] and emotion["success"]:
                combined = {
                    **row_meta,
                    **parsed["data"],
                    **emotion["data"],
                    "row_index": idx,
                    "text": text
                }

                # Flatten lists
                for k, v in combined.items():
                    if hasattr(v, "tolist"):
                        v = v.tolist()
                    if isinstance(v, list) and all(isinstance(i, str) for i in v):
                        combined[k] = "; ".join(v)
                    else:
                        combined[k] = v

                combined_rows.append(combined)
            else:
                failed_rows.append({
                    "row_index": idx,
                    "error": parsed.get("error") or emotion.get("error"),
                    "text": text[:300]
                })

        except Exception as e:
            failed_rows.append({
                "row_index": idx,
                "error": str(e),
                "text": text[:300]
            })

    # --- Save batch to DuckDB ---
    if combined_rows:
        df_combined = pd.DataFrame(combined_rows)

        if table_cols:
            df_combined = df_combined[[col for col in df_combined.columns if col in table_cols]]
        else:
            table_cols = df_combined.columns.tolist()
            conn.execute(f"CREATE TABLE IF NOT EXISTS {TABLE_NAME} AS SELECT * FROM df_combined LIMIT 0")

        conn.execute(f"INSERT INTO {TABLE_NAME} SELECT * FROM df_combined")
        print(f"✅ Batch {start_idx}–{end_idx - 1} saved to {TABLE_NAME}")

        # --- Save checkpoint (start of this batch for retry safety) ---
        with open(CHECKPOINT_FILE, "w") as f:
            f.write(str(start_idx))
        print("📝 Checkpoint updated.")

        # --- Duration log ---
        duration = (datetime.now() - batch_start_time).total_seconds()
        print(f"⏱️ Duration: {round(duration, 2)} seconds")

# --- Close DB connection ---
conn.close()

# --- Save failures if any ---
if failed_rows:
    pd.DataFrame(failed_rows).to_csv("failed_rows.csv", index=False)
    print(f"⚠️ {len(failed_rows)} failed rows logged to failed_rows.csv")

print(f"\n🏁 Done! All processed data written to {TABLE_NAME} in Google Drive DuckDB.")

⏩ Resuming from checkpoint at row 0


🔄 Processing Batches:   0%|          | 0/30 [00:00<?, ?it/s]


🚀 Batch 0–99
❌ [62] JSON error after fix: Expecting property name enclosed in double quotes: line 1 column 2 (char 1)


🔄 Processing Batches:   3%|▎         | 1/30 [15:02<7:16:01, 902.10s/it]

✅ Batch 0–99 saved to reddit_nlp_features
📝 Checkpoint updated.
⏱️ Duration: 902.1 seconds

🚀 Batch 100–199
❌ [126] JSON error after fix: Invalid \escape: line 3 column 125 (char 321)


🔄 Processing Batches:   7%|▋         | 2/30 [28:49<6:40:21, 857.90s/it]

✅ Batch 100–199 saved to reddit_nlp_features
📝 Checkpoint updated.
⏱️ Duration: 826.96 seconds

🚀 Batch 200–299


🔄 Processing Batches:  10%|█         | 3/30 [42:20<6:16:25, 836.52s/it]

✅ Batch 200–299 saved to reddit_nlp_features
📝 Checkpoint updated.
⏱️ Duration: 811.07 seconds

🚀 Batch 300–399


🔄 Processing Batches:  10%|█         | 3/30 [44:09<6:37:26, 883.22s/it]
ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/IPython/core/interactiveshell.py", line 3553, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-13-4b44390e20f5>", line 51, in <cell line: 0>
    parsed = safe_generate_and_parse(input_text=text, tokenizer=llm_tokenizer, model=llm_model, template=TEMPLATE, row_id=idx)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "<ipython-input-12-cec7f6b50207>", line 31, in wrapper
    return func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/torch/utils/_contextlib.py", line 116, in decorate_context
    return func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^
  File "<ipython-input-12-cec7f6b50207>", line 50, in safe_generate_and_parse
    output = model.generate(
             ^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packag

TypeError: object of type 'NoneType' has no len()

In [None]:
import os
import duckdb
import pandas as pd
from tqdm import tqdm
from datetime import datetime

# --- Config ---
BATCH_SIZE = 100
TABLE_NAME = "reddit_nlp_features"
DRIVE_SUBFOLDER = "/content/drive/MyDrive/reddit_data"
DRIVE_DB_FILE = os.path.join(DRIVE_SUBFOLDER, "reddit_progress.duckdb")
CHECKPOINT_FILE = "checkpoint.txt"

# --- Ensure Drive is mounted ---
assert os.path.exists(DRIVE_SUBFOLDER), "Google Drive folder not found. Please mount Drive first."

# --- Connect to DuckDB directly in Drive ---
conn = duckdb.connect(DRIVE_DB_FILE)

# --- Fetch schema if table exists ---
try:
    table_cols = [col[1] for col in conn.execute(f"PRAGMA table_info({TABLE_NAME})").fetchall()]
except:
    table_cols = []

# --- Resume from checkpoint or fallback to 300 ---
fallback_start = 300

if os.path.exists(CHECKPOINT_FILE):
    with open(CHECKPOINT_FILE, "r") as f:
        checkpoint_val = int(f.read().strip())
        start_index = max(checkpoint_val, fallback_start)
    print(f"⏩ Resuming from checkpoint at row {start_index}")
else:
    start_index = fallback_start
    print(f"⏩ No checkpoint found. Starting from row {start_index}")



# --- Track failures ---
failed_rows = []

# --- Main Batch Loop ---
for start_idx in tqdm(range(start_index, len(df_loaded), BATCH_SIZE), desc="🔄 Processing Batches"):
    end_idx = min(start_idx + BATCH_SIZE, len(df_loaded))
    print(f"\n🚀 Batch {start_idx}–{end_idx - 1}")
    batch_start_time = datetime.now()

    batch = df_loaded.iloc[start_idx:end_idx]
    combined_rows = []

    for idx, row in batch.iterrows():
        try:
            text = str(row.get("full_text", ""))[:1800]
            row_meta = row.drop(labels=["full_text"]).to_dict()

            # --- Run models ---
            parsed = safe_generate_and_parse(input_text=text, tokenizer=llm_tokenizer, model=llm_model, template=TEMPLATE, row_id=idx)
            emotion = get_emotion_vector(text=text, tokenizer=emotion_tokenizer, model=emotion_model, labels=emotion_labels, device=device)

            if parsed["success"] and emotion["success"]:
                combined = {
                    **row_meta,
                    **parsed["data"],
                    **emotion["data"],
                    "row_index": idx,
                    "text": text
                }

                # Flatten lists
                for k, v in combined.items():
                    if hasattr(v, "tolist"):
                        v = v.tolist()
                    if isinstance(v, list) and all(isinstance(i, str) for i in v):
                        combined[k] = "; ".join(v)
                    else:
                        combined[k] = v

                combined_rows.append(combined)
            else:
                failed_rows.append({
                    "row_index": idx,
                    "error": parsed.get("error") or emotion.get("error"),
                    "text": text[:300]
                })

        except Exception as e:
            failed_rows.append({
                "row_index": idx,
                "error": str(e),
                "text": text[:300]
            })

    # --- Save batch to DuckDB ---
    if combined_rows:
        df_combined = pd.DataFrame(combined_rows)

        if table_cols:
            df_combined = df_combined[[col for col in df_combined.columns if col in table_cols]]
        else:
            table_cols = df_combined.columns.tolist()
            conn.execute(f"CREATE TABLE IF NOT EXISTS {TABLE_NAME} AS SELECT * FROM df_combined LIMIT 0")

        conn.execute(f"INSERT INTO {TABLE_NAME} SELECT * FROM df_combined")
        print(f"✅ Batch {start_idx}–{end_idx - 1} saved to {TABLE_NAME}")

        # --- Save checkpoint (start of this batch for retry safety) ---
        with open(CHECKPOINT_FILE, "w") as f:
            f.write(str(start_idx))
        print("📝 Checkpoint updated.")

        # --- Duration log ---
        duration = (datetime.now() - batch_start_time).total_seconds()
        print(f"⏱️ Duration: {round(duration, 2)} seconds")

# --- Close DB connection ---
conn.close()

# --- Save failures if any ---
if failed_rows:
    pd.DataFrame(failed_rows).to_csv("failed_rows.csv", index=False)
    print(f"⚠️ {len(failed_rows)} failed rows logged to failed_rows.csv")

print(f"\n🏁 Done! All processed data written to {TABLE_NAME} in Google Drive DuckDB.")

⏩ Resuming from checkpoint at row 300


🔄 Processing Batches:   0%|          | 0/27 [00:00<?, ?it/s]


🚀 Batch 300–399
❌ [347] JSON error after fix: Expecting ',' delimiter: line 2 column 161 (char 162)
❌ [390] JSON error after fix: Invalid \escape: line 3 column 144 (char 485)


🔄 Processing Batches:   4%|▎         | 1/27 [14:30<6:17:16, 870.63s/it]

✅ Batch 300–399 saved to reddit_nlp_features
📝 Checkpoint updated.
⏱️ Duration: 870.63 seconds

🚀 Batch 400–499
❌ [428] JSON error after fix: Expecting property name enclosed in double quotes: line 1 column 2 (char 1)
❌ [470] JSON error after fix: Expecting property name enclosed in double quotes: line 11 column 1 (char 961)


🔄 Processing Batches:   7%|▋         | 2/27 [28:40<5:57:43, 858.53s/it]

✅ Batch 400–499 saved to reddit_nlp_features
📝 Checkpoint updated.
⏱️ Duration: 850.06 seconds

🚀 Batch 500–599
❌ [514] JSON error after fix: Expecting ',' delimiter: line 4 column 128 (char 627)


🔄 Processing Batches:  11%|█         | 3/27 [42:03<5:33:10, 832.92s/it]

✅ Batch 500–599 saved to reddit_nlp_features
📝 Checkpoint updated.
⏱️ Duration: 802.46 seconds

🚀 Batch 600–699
❌ [652] JSON error after fix: Expecting property name enclosed in double quotes: line 11 column 1 (char 759)
❌ [657] JSON error after fix: Expecting property name enclosed in double quotes: line 11 column 1 (char 1189)


🔄 Processing Batches:  15%|█▍        | 4/27 [56:17<5:22:36, 841.58s/it]

✅ Batch 600–699 saved to reddit_nlp_features
📝 Checkpoint updated.
⏱️ Duration: 854.84 seconds

🚀 Batch 700–799
❌ [762] JSON error after fix: Expecting ',' delimiter: line 2 column 158 (char 159)


🔄 Processing Batches:  19%|█▊        | 5/27 [1:11:06<5:14:49, 858.62s/it]

✅ Batch 700–799 saved to reddit_nlp_features
📝 Checkpoint updated.
⏱️ Duration: 888.85 seconds

🚀 Batch 800–899
❌ [854] JSON error after fix: Expecting property name enclosed in double quotes: line 1 column 2 (char 1)
❌ [888] JSON error after fix: Expecting property name enclosed in double quotes: line 1 column 2 (char 1)


🔄 Processing Batches:  22%|██▏       | 6/27 [1:25:40<5:02:19, 863.80s/it]

✅ Batch 800–899 saved to reddit_nlp_features
📝 Checkpoint updated.
⏱️ Duration: 873.85 seconds

🚀 Batch 900–999
❌ [943] JSON error after fix: Expecting ',' delimiter: line 10 column 41 (char 682)
❌ [993] JSON error after fix: Expecting value: line 4 column 286 (char 551)


🔄 Processing Batches:  26%|██▌       | 7/27 [1:39:02<4:41:09, 843.45s/it]

✅ Batch 900–999 saved to reddit_nlp_features
📝 Checkpoint updated.
⏱️ Duration: 801.56 seconds

🚀 Batch 1000–1099


🔄 Processing Batches:  30%|██▉       | 8/27 [1:51:40<4:18:30, 816.34s/it]

✅ Batch 1000–1099 saved to reddit_nlp_features
📝 Checkpoint updated.
⏱️ Duration: 758.3 seconds

🚀 Batch 1100–1199
❌ [1159] JSON error after fix: Expecting ',' delimiter: line 4 column 74 (char 294)


🔄 Processing Batches:  33%|███▎      | 9/27 [2:04:32<4:00:45, 802.52s/it]

✅ Batch 1100–1199 saved to reddit_nlp_features
📝 Checkpoint updated.
⏱️ Duration: 772.13 seconds

🚀 Batch 1200–1299
❌ [1278] JSON error after fix: Expecting ',' delimiter: line 3 column 180 (char 274)


🔄 Processing Batches:  37%|███▋      | 10/27 [2:17:24<3:44:40, 792.97s/it]

✅ Batch 1200–1299 saved to reddit_nlp_features
📝 Checkpoint updated.
⏱️ Duration: 771.57 seconds

🚀 Batch 1300–1399
❌ [1348] JSON error after fix: Expecting ',' delimiter: line 2 column 48 (char 49)


🔄 Processing Batches:  41%|████      | 11/27 [2:30:31<3:30:58, 791.18s/it]

✅ Batch 1300–1399 saved to reddit_nlp_features
📝 Checkpoint updated.
⏱️ Duration: 787.12 seconds

🚀 Batch 1400–1499


🔄 Processing Batches:  44%|████▍     | 12/27 [2:44:14<3:20:15, 801.03s/it]

✅ Batch 1400–1499 saved to reddit_nlp_features
📝 Checkpoint updated.
⏱️ Duration: 823.56 seconds

🚀 Batch 1500–1599


🔄 Processing Batches:  48%|████▊     | 13/27 [2:57:39<3:07:10, 802.18s/it]

✅ Batch 1500–1599 saved to reddit_nlp_features
📝 Checkpoint updated.
⏱️ Duration: 804.82 seconds

🚀 Batch 1600–1699
❌ [1653] JSON error after fix: Expecting ',' delimiter: line 3 column 110 (char 428)


🔄 Processing Batches:  52%|█████▏    | 14/27 [3:11:39<2:56:14, 813.43s/it]

✅ Batch 1600–1699 saved to reddit_nlp_features
📝 Checkpoint updated.
⏱️ Duration: 839.44 seconds

🚀 Batch 1700–1799
❌ [1742] JSON error after fix: Invalid \escape: line 10 column 148 (char 849)
❌ [1749] JSON error after fix: Expecting property name enclosed in double quotes: line 1 column 2 (char 1)


🔄 Processing Batches:  56%|█████▌    | 15/27 [3:26:17<2:46:36, 833.00s/it]

✅ Batch 1700–1799 saved to reddit_nlp_features
📝 Checkpoint updated.
⏱️ Duration: 878.35 seconds

🚀 Batch 1800–1899


In [14]:
!ls -lh "/content/drive/MyDrive/reddit_data"


total 5.6M
-rw------- 1 root root 5.1M Jun  5 08:32 reddit_20250605_083205.duckdb
-rw------- 1 root root  12K Jun  8 11:06 reddit_progress.duckdb
-rw------- 1 root root 573K Jun  8 11:48 reddit_progress.duckdb.wal


**Section 4. Reload Reddit Scraped Results from Colab**

In [32]:
from google.colab import drive
import os
import duckdb
import pandas as pd

# --- Config ---
TABLE_NAME = "reddit_nlp_features"
LOCAL_DB_FILE = "reddit_progress.duckdb"
DRIVE_SUBFOLDER = "/content/drive/MyDrive/reddit_data"
DRIVE_DB_FILE = os.path.join(DRIVE_SUBFOLDER, LOCAL_DB_FILE)

DRIVE_DB_FILE = os.path.join(DRIVE_SUBFOLDER, LOCAL_DB_FILE)
# --- Mount Google Drive (if not already) ---
if not os.path.exists('/content/drive'):
    drive.mount('/content/drive')

# --- Connect to DB file in Drive ---
conn = duckdb.connect(DRIVE_DB_FILE)

# --- List tables ---
tables = conn.execute("SHOW TABLES").fetchall()
print("📋 Tables found:", tables)

# --- Check if target table exists ---
if (TABLE_NAME,) in tables:
    df_reddit = conn.execute(f"SELECT * FROM {TABLE_NAME}").df()
    print(f"✅ Loaded {len(df_reddit)} rows from table '{TABLE_NAME}'")
    pd.set_option('display.max_columns', None)
    display(df_reddit.head())
else:
    print(f"❌ Table '{TABLE_NAME}' not found in the database.")

conn.close()

📋 Tables found: [('reddit_nlp_features',)]
✅ Loaded 1 rows from table 'reddit_nlp_features'


Unnamed: 0,type,subreddit,author,score,title,matched_terms,url,created_utc,link_flair_text,num_comments,id,comment_depth,summary,pain_points,emotion_phrases,emotion_label,sentiment_score,sentiment,themes,demographic_proximity,demographic_evidence_phrases,admiration,amusement,anger,annoyance,approval,caring,confusion,curiosity,desire,disappointment,disapproval,disgust,embarrassment,excitement,fear,gratitude,grief,joy,love,nervousness,optimism,pride,realization,relief,remorse,sadness,surprise,neutral,row_index,text
0,post,Meditation,Anima_Monday,2,"Equanimity and Non-Attachment Meditation, A Novel Approach",[act],https://reddit.com/r/Meditation/comments/1l3o2jc/equanimity_and_nonattachment_meditation_a_novel/,2025-06-05T02:27:44,Sharing / Insight 💡,0.0,1l3o2jc,,The user developed a meditation practice using phrases for equanimity and non-attachment. The practice involves mentally saying the phrases while breathing naturally and syncing it to the breathing. The goal is to achieve a natural sense of equanimity and non-attachment.,none,fine,calm,0.57,positive,meditation practice; equanimity; non-attachment,unknown,none,0.430545,0.167192,0.182123,0.278183,0.653991,0.916768,0.486951,0.643949,0.64548,0.236684,0.442498,0.225125,0.265867,0.268055,0.285222,0.455119,0.294263,0.363282,0.502222,0.247318,0.316201,0.255882,0.692624,0.653467,0.410916,0.32566,0.338183,0.759731,0,"This practice uses phrases for equanimity and non-attachment in a similar way to the ones for popular forms of metta bhavana (loving kindness) meditation.\n\nI developed this specific form that I am sharing here myself and am being transparent about this, so you can see it as a novel approach. Popular metta practice has phrases that one repeats as part of the process, such as 'may all beings be well'. Below are phrases for equanimity and non-attachment. This is just a possible way to approach it, one way to put the mind in the mode of equanimity and non-attachment, so that this state can then be abided in directly. You can try it out if you are interested. I am just passing on something which has been helpful for me as perhaps it may help others too.\n\nYou can mentally say the phrases, with the first part of each phrase being on the in-breath, and the second part (the 'fine') being on the out-breath. Doing this while breathing naturally, meaning not deliberately controlling the breath. Try it with eyes closed, and then if you wish, you can experiment with eyes open on some occasions to integrate it into the sense experience and make it gradually more available 'off the mat'.\n\nSo mentally saying these phrases while in the relative stillness and safety of a meditation posture, either sitting or lying down, and syncing it to the breathing:\n\n""If it comes ... fine\n\nIf it doesn't come ... fine\n\nIf it changes ... fine\n\nIf it doesn't change ... fine\n\nIf it goes ... fine\n\nIf it doesn't go ... fine""\n\nYou meditate like this until there is a natural sense of equanimity and non-attachment, a natural letting go into that, which might occur after only doing it once through, or it might take a bit longer with more repetitions. It puts the mind closer in attitude to the nature unconditiona"


In [None]:
pd.set_option('display.max_columns', None)
df_combined.head()

Mounted at /content/drive
📋 Tables found: []
❌ Table 'reddit_nlp_features' not found in the database.


In [None]:
df_reddit.head()

**Section 5. Cluster Analysis**

**BACKUP Section 3. LLMs NLP(Alternative-Gemma 2)**

In [None]:
from huggingface_hub import login
import os


from google.colab import userdata
hf_token = userdata.get('HuggingFace')

if hf_token is None:
    raise ValueError("🔐 Hugging Face token not found. Please add it to Colab secrets under 'HF_TOKEN'.")

# Authenticate with Hugging Face Hub
login(token=hf_token)

!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
!pip install sentence-transformers

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import torch

# === HuggingFace Model ID ===
model_id = "google/gemma-2b-it"  # or your preferred size like "gemma-7b-it"

# === Load Tokenizer ===
tokenizer = AutoTokenizer.from_pretrained(model_id, token=hf_token)
tokenizer.pad_token = tokenizer.eos_token  # ensures padding is valid

# === Load Model (FP16, no quantization) ===
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.float16,
    device_map="auto",
    trust_remote_code=True,
    token=hf_token
)
model.eval()

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

GemmaForCausalLM(
  (model): GemmaModel(
    (embed_tokens): Embedding(256000, 2048, padding_idx=0)
    (layers): ModuleList(
      (0-17): 18 x GemmaDecoderLayer(
        (self_attn): GemmaSdpaAttention(
          (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear(in_features=2048, out_features=256, bias=False)
          (v_proj): Linear(in_features=2048, out_features=256, bias=False)
          (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (rotary_emb): GemmaRotaryEmbedding()
        )
        (mlp): GemmaMLP(
          (gate_proj): Linear(in_features=2048, out_features=16384, bias=False)
          (up_proj): Linear(in_features=2048, out_features=16384, bias=False)
          (down_proj): Linear(in_features=16384, out_features=2048, bias=False)
          (act_fn): PytorchGELUTanh()
        )
        (input_layernorm): GemmaRMSNorm()
        (post_attention_layernorm): GemmaRMSNorm()
      )
    )
    (norm): GemmaR