In [None]:
pip install praw

In [None]:
pip install pandas requests openpyxl # openpyxl is needed for Excel output

In [None]:
import datetime
import os
import re
import time
from urllib.parse import urlparse

import pandas as pd
import praw
import requests

# --- Reddit API Credentials ---
# Credentials are read from environment variables so real secrets never have
# to be saved in (or shared with) this file. The placeholder defaults keep the
# original workflow working: you can still replace them in place.
CLIENT_ID = os.environ.get("REDDIT_CLIENT_ID", "enter_client_id_here")
CLIENT_SECRET = os.environ.get("REDDIT_CLIENT_SECRET", "enter_client_secret_here")
# Replace the /u/... part of the default with your Reddit username.
USER_AGENT = os.environ.get(
    "REDDIT_USER_AGENT", "MyRomemesScraper by /u/Objective_Team621 v1.0"
)
USERNAME = os.environ.get("REDDIT_USERNAME", "enter_reddit_username")  # Your Reddit username
PASSWORD = os.environ.get("REDDIT_PASSWORD", "enter_reddit_pass")  # Your Reddit password

# --- Configuration ---
SUBREDDIT_NAME = "romemes"
# Only posts created strictly after this instant are kept: May 1st, 2025, 00:00:00 UTC.
TARGET_DATE = datetime.datetime(2025, 5, 1, 0, 0, 0, tzinfo=datetime.timezone.utc)
# Maximum number of submissions to request. Set to None for unlimited, or a
# number for testing. Be mindful of rate limits.
FETCH_LIMIT = 1000

OUTPUT_CSV_FILE = "romemes_posts.csv"
OUTPUT_EXCEL_FILE = "romemes_posts.xlsx"
DOWNLOAD_FOLDER = "romemes_downloads"  # Folder to save images/media

def get_posts_after_date(subreddit_name, target_date, fetch_limit=None):
    """
    Fetch a subreddit's newest submissions that were created after target_date.

    Authenticates against Reddit with the module-level credentials, walks the
    subreddit's "new" listing (newest first) and stops at the first submission
    that is not newer than target_date.

    Args:
        subreddit_name: Name of the subreddit, without the "r/" prefix.
        target_date: Cut-off datetime. A naive value is interpreted as UTC;
            an aware value is converted to UTC.
        fetch_limit: Maximum number of submissions to request from the
            listing, or None for no explicit limit.

    Returns:
        A list of dicts (newest first) with the keys "id", "title", "author",
        "score", "num_comments", "created_utc", "url", "permalink", "is_self"
        and "media_url". An empty list is returned when connecting fails.
    """
    # Normalise the cut-off to aware UTC. Comparing a naive datetime with the
    # aware creation timestamps below would raise TypeError, and the messages
    # below label the value as UTC, so it must actually be expressed in UTC.
    if target_date.tzinfo is None:
        target_date = target_date.replace(tzinfo=datetime.timezone.utc)
    else:
        target_date = target_date.astimezone(datetime.timezone.utc)

    try:
        reddit = praw.Reddit(
            client_id=CLIENT_ID,
            client_secret=CLIENT_SECRET,
            user_agent=USER_AGENT,
            username=USERNAME,
            password=PASSWORD
        )
        # reddit.user.me() performs a real request, so bad credentials surface here.
        print(f"Successfully connected to Reddit as {reddit.user.me()} (read_only: {reddit.read_only})")
    except Exception as e:  # top-level boundary: report and return no data
        print(f"Error connecting to Reddit: {e}")
        print("Please check your API credentials and ensure your Reddit account is active.")
        return []

    subreddit = reddit.subreddit(subreddit_name)
    posts_data = []

    print(f"\nFetching posts from r/{subreddit_name} after {target_date.strftime('%Y-%m-%d %H:%M:%S UTC')}...")

    # Iterate through new submissions (most recent first)
    for submission in subreddit.new(limit=fetch_limit):
        post_created_utc = datetime.datetime.fromtimestamp(submission.created_utc, tz=datetime.timezone.utc)

        if post_created_utc <= target_date:
            # Since .new() returns posts by most recent, the first post that
            # is not newer than the target date ends the search.
            print(f"Stopping search: encountered post older than target date (Post ID: {submission.id}, Created: {post_created_utc})")
            break

        posts_data.append({
            # Unique identifier: Unix timestamp + submission ID
            "id": f"{int(submission.created_utc)}_{submission.id}",
            "title": submission.title,
            # submission.author is falsy when the account no longer exists
            "author": submission.author.name if submission.author else "[deleted]",
            "score": submission.score,
            "num_comments": submission.num_comments,
            "created_utc": post_created_utc.strftime('%Y-%m-%d %H:%M:%S UTC'),
            "url": submission.url,
            "permalink": f"https://reddit.com{submission.permalink}",
            "is_self": submission.is_self,  # True if text post, False if link/image/video
            "media_url": submission.url if not submission.is_self else None  # URL to download
        })

        time.sleep(0.1)  # Small delay for politeness

    return posts_data

# Characters that are invalid in file names on at least one major OS:
# path separators, Windows-reserved punctuation and ASCII control characters.
_UNSAFE_FILENAME_CHARS = re.compile(r'[<>:"/\\|?*\x00-\x1f]')

# Image MIME types the HEAD-request fallback knows how to name.
_IMAGE_EXTENSIONS = {
    "image/jpeg": ".jpeg",
    "image/png": ".png",
    "image/gif": ".gif",
    "image/webp": ".webp",
}


def _extension_from_content_type(media_url):
    """Return an extension inferred from a HEAD request, or None if the URL is not an image."""
    head = requests.head(media_url, allow_redirects=True, timeout=5)
    # Drop parameters such as "; charset=..." before matching the MIME type.
    mime = (head.headers.get("content-type") or "").split(";")[0].strip().lower()
    if not mime.startswith("image/"):
        return None
    # Unknown image subtypes are still downloaded, just without an extension.
    return _IMAGE_EXTENSIONS.get(mime, "")


def _save_stream(media_url, filepath):
    """Stream media_url into filepath, removing the file again if the transfer fails."""
    # The context manager releases the connection even when an error occurs.
    with requests.get(media_url, stream=True, timeout=10) as response:
        response.raise_for_status()  # Raise an HTTPError for bad responses (4xx or 5xx)
        try:
            with open(filepath, "wb") as f:
                for chunk in response.iter_content(chunk_size=8192):
                    f.write(chunk)
        except Exception:
            # Do not leave a truncated file that looks like a finished download.
            if os.path.exists(filepath):
                os.remove(filepath)
            raise


def download_media(posts, download_folder):
    """
    Download the media of every link post into download_folder.

    Text posts and posts without a media URL are skipped. Files are named
    "<id>_<title, max 50 chars><extension>"; the extension comes from the
    URL path or, if the path has none, from the Content-Type of a HEAD
    request (only image types are accepted in that case). A failure for one
    post is reported and does not stop the remaining downloads.

    Args:
        posts: Iterable of dicts with the keys "id", "title", "is_self" and
            "media_url".
        download_folder: Target directory; created if it does not exist.

    Returns:
        The number of files downloaded successfully.
    """
    if not os.path.exists(download_folder):
        os.makedirs(download_folder, exist_ok=True)
        print(f"Created download folder: {download_folder}")

    print(f"\nAttempting to download media to '{download_folder}'...")
    downloaded_count = 0

    for post in posts:
        media_url = post.get("media_url")
        post_id = post.get("id")
        # Sanitize title for filename; "or" also covers an explicit None title.
        post_title = _UNSAFE_FILENAME_CHARS.sub("_", post.get("title") or "untitled_post")

        if post.get("is_self"):
            print(f"  Skipping text post: {post_title} (ID: {post_id})")
            continue
        if not media_url:
            print(f"  Skipping: No media URL found for {post_title} (ID: {post_id})")
            continue

        try:
            # Use only the URL *path* so the host name, query string and
            # fragment can never be mistaken for a file extension.
            file_extension = os.path.splitext(urlparse(media_url).path)[1]
            file_extension = _UNSAFE_FILENAME_CHARS.sub("_", file_extension)
            if not file_extension:
                # Fallback for URLs without explicit extensions, e.g. imgur links without .jpg
                file_extension = _extension_from_content_type(media_url)
                if file_extension is None:
                    print(f"  Skipping: Could not determine file type for {media_url}")
                    continue

            # Limit title length for filename to avoid OS issues
            filename = f"{post_id}_{post_title[:50]}{file_extension}"
            filepath = os.path.join(download_folder, filename)

            print(f"  Downloading: {media_url} to {filepath}")
            _save_stream(media_url, filepath)
            downloaded_count += 1
        except requests.exceptions.RequestException as e:
            print(f"  Error downloading {media_url} (Post ID: {post_id}): {e}")
        except Exception as e:  # best effort: keep going with the next post
            print(f"  An unexpected error occurred for {media_url} (Post ID: {post_id}): {e}")

    print(f"\nFinished downloading. {downloaded_count} media files successfully downloaded.")
    return downloaded_count

if __name__ == "__main__":
    # Script entry point: fetch the posts, export them as a table, then grab the media.
    posts_data = get_posts_after_date(SUBREDDIT_NAME, TARGET_DATE, fetch_limit=FETCH_LIMIT)

    if not posts_data:
        # Nothing came back: either no recent activity or the connection failed.
        print(f"No posts found in r/{SUBREDDIT_NAME} after {TARGET_DATE.strftime('%Y-%m-%d %H:%M:%S UTC')} within the fetched limit.")
        print("Please check your API credentials and the subreddit activity for the specified date.")
    else:
        # Step 1: tabular exports (CSV and Excel)
        print(f"\nFound {len(posts_data)} posts. Creating CSV and Excel files...")

        # Columns written to the output files, in this order; the helper
        # fields "is_self" and "media_url" are intentionally left out.
        export_columns = [
            "id",
            "title",
            "author",
            "score",
            "num_comments",
            "created_utc",
            "url",
            "permalink",
        ]
        all_fields_df = pd.DataFrame(posts_data)
        output_df = all_fields_df[export_columns]

        output_df.to_csv(OUTPUT_CSV_FILE, index=False, encoding='utf-8')
        print(f"Data saved to {OUTPUT_CSV_FILE}")

        output_df.to_excel(OUTPUT_EXCEL_FILE, index=False)
        print(f"Data saved to {OUTPUT_EXCEL_FILE}")

        # Step 2: media files for the same posts
        download_media(posts_data, DOWNLOAD_FOLDER)