### Extraction of Posts and Comments from Reddit

In this project, we focus on extracting posts and comments from Reddit using web scraping techniques and interaction with the API. Reddit is a platform rich in user-generated content on various topics, making it a valuable resource for applications such as sentiment analysis, trend tracking, and topic modeling.

#### Objectives
- **Data Collection**: Retrieve posts and comments from specific subreddits based on thematic criteria (e.g., r/mentalhealth, r/fitness).
- **Data Processing**: Clean and preprocess the extracted data to prepare it for analysis.
- **Data Storage**: Store the collected data in a structured format, such as a CSV file or database, for later analysis.

#### Tools and Technologies
- **Python**: The primary programming language for web scraping and interacting with the API.
- **Requests**: A library for making HTTP requests, in case additional scraping is required.
- **Pandas**: A data manipulation library for handling and analyzing the extracted data.

#### Getting Started
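1. **Set Up the Environment**: Install the necessary libraries using pip (`praw`, `requests`, `pandas`), plus the packages imported by the scraping cells below (`beautifulsoup4`, `selenium`, `aiohttp`, `nest_asyncio`).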
2. **Obtain API Credentials**: Create a Reddit account and register an application to get API credentials (client ID, secret, and user agent).
3. **Define the Extraction Logic**: Write functions to extract data from specific subreddits or threads based on keywords or categories.
4. **Run the Scraper**: Launch the script and monitor the data collection process.
5. **Analyze the Data**: Use Pandas to analyze the collected posts and comments for insights.

#### Conclusion
This project provides a hands-on introduction to using Reddit's API and analyzing data with Python, while also allowing manipulation of data from a dynamic online community.


<p style="color:#FBCE60;text-align:center;font-size:30px"> Scraping Reddit's  Posts And Articles </p>

In [None]:
# Installing BeautifulSoup4
!pip install bs4

# Installing Selenium
!pip install selenium





[notice] A new release of pip is available: 24.0 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip





[notice] A new release of pip is available: 24.0 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip





### Scraping Reddit's Health-Related Topics

In [2]:
import asyncio
import aiohttp
from bs4 import BeautifulSoup

# Browser-like User-Agent header; fetch_page sends it with every request.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.5615.137 Safari/537.36'
}

async def fetch_page(session, postUrl):
    """Download ``postUrl`` through ``session`` and return the response body.

    Args:
        session: Object whose ``get`` method returns an async context manager
            yielding a response (an ``aiohttp.ClientSession`` in this notebook).
        postUrl: Address of the page to download.

    Returns:
        The body as text when the server answers with HTTP 200; ``None`` for
        any other status code or when any exception is raised on the way.
    """
    try:
        async with session.get(postUrl, headers=headers, timeout=10) as response:
            # Guard clause: anything other than 200 is reported and dropped.
            if response.status != 200:
                print(f"Failed to fetch {postUrl}: HTTP {response.status}")
                return None
            print(f"Navigating to {postUrl}")
            return await response.text()
    except Exception as error:
        # Best-effort fetch: report the problem and let the caller carry on.
        print(f"Error fetching {postUrl}: {error}")
        return None

async def collect_subreddit_post_text(postUrl):
    """Return the body text of the post at ``postUrl``.

    The page is downloaded with ``fetch_page`` inside a fresh
    ``aiohttp.ClientSession``. The text of every ``<p>`` found inside the
    ``<div slot="text-body">`` element is stripped and joined with single
    spaces.

    Returns:
        The joined paragraph text, or ``""`` when the page could not be
        fetched or contains no ``<div slot="text-body">`` element.
    """
    async with aiohttp.ClientSession() as session:
        html = await fetch_page(session, postUrl)
        if not html:
            return ""

        body = BeautifulSoup(html, 'html.parser').find("div", slot="text-body")
        if not body:
            print(f"No content found at {postUrl}")
            return ""

        return " ".join(
            paragraph.get_text(strip=True) for paragraph in body.find_all("p")
        )

import nest_asyncio
nest_asyncio.apply()
async def getPostText(post_url):
    """Return the body text of the post at ``post_url``.

    Thin alias that awaits ``collect_subreddit_post_text`` and hands back its
    result unchanged.
    """
    post_text = await collect_subreddit_post_text(post_url)
    return post_text



In [1]:
import urllib.request
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from bs4 import BeautifulSoup
import time
from datetime import datetime, timezone
from urllib.request import urlopen, Request
import pandas as pd 
# UTC timestamp captured once, at the moment this cell is executed
# (it is not refreshed afterwards).
now = datetime.now(timezone.utc)

# function to periodically save collected data 
def periodicSave(data,topic):
    """Write everything collected so far for ``topic`` to its CSV file.

    Args:
        data: Collected posts, in any form ``pd.DataFrame`` accepts (the
            scraper passes a list of dicts).
        topic: Topic name; it becomes part of the output file name
            ``redditPosts<topic>.csv``.

    The target file is rewritten in full on every call, and the number of
    saved rows is printed afterwards.
    """
    frame = pd.DataFrame(data)
    output_path = f"../data/healthRedditPosts/redditPosts{topic}.csv"
    frame.to_csv(output_path)
    print("Periodic Save is done , Total of saved posts is  " , len(frame))
# Function to collect health-related subreddits
def collectSubRedditsPosts(url,topic,posts):
    """Scrape every page of a subreddit listing and append its posts to ``posts``.

    Starting at ``url``, each listing page is downloaded and parsed, one dict
    per ``<div class="thing">`` element is appended to ``posts``, and the
    "next" link is followed until there is no further page. Progress is
    written with ``periodicSave`` each time ``len(posts)`` reaches a multiple
    of 100, and once more when pagination ends.

    Args:
        url: URL of the first listing page to fetch.
        topic: Topic name forwarded to ``periodicSave``.
        posts: List that receives the post dicts; it is extended in place.

    Returns:
        The same ``posts`` list, so the result can also be taken by assignment.

    Raises:
        urllib.error.URLError: If a listing page cannot be downloaded.
    """
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.5615.137 Safari/537.36'}

    # Pages are walked in a loop rather than by recursion, so a long listing
    # cannot exhaust the interpreter's recursion limit.
    while url:
        time.sleep(5)  # pause between page requests
        request = Request(url, headers=headers)

        # The timeout keeps one stalled connection from blocking the run forever.
        with urlopen(request, timeout=30) as response:
            page_source = response.read()
        print(f"Navigating to {url}")

        soup = BeautifulSoup(page_source, 'html.parser')

        # Stamp each page when it is actually collected. %f is always six
        # zero-padded digits, unlike right-padding str(microsecond).
        collectedAt = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%S.%f+0000")

        for post_element in soup.find_all("div",class_="thing"):
            titleLink = post_element.find("a",class_="title may-blank")
            if titleLink is None:
                # Entries without the expected title link are skipped.
                continue
            flair = post_element.find("span", class_="flairrichtext")
            post = {
                "authorName": post_element.get("data-author"),
                "authorId": post_element.get("data-author-fullname"),
                "commentCount": post_element.get("data-comments-count"),
                "commentsLink": post_element.get("data-url"),
                "createdAt": post_element.get("data-timestamp"),
                "postId": post_element.get("id"),
                "postTitle": titleLink.get_text(),
                "subredditName": post_element.get("data-subreddit-prefixed"),
                "collectedAt": collectedAt,
                # The first argument of get_text is the separator placed
                # between text fragments, so a space is used here.
                "interactionCategory": flair.get_text(" ", strip=True) if flair is not None else "N/A",
            }
            print(post)
            posts.append(post)
            if len(posts) % 100 == 0:
                periodicSave(posts,topic)

        nextLink = soup.find("a",rel="nofollow next")
        url = nextLink.get("href") if nextLink is not None else None

    print("no next button found . Stoppig Scroll ")
    periodicSave(posts,topic)
    return posts


In [2]:
import pandas as pd 
# Table of subreddits to scrape; the collection loop below reads its
# "topicName" and "topicUrl" columns.
topicsList=pd.read_csv("../data/healthRedditPosts/healthRedditCommunities.csv")


In [None]:
import pandas as pd 
import time
from urllib.error import URLError

# Values used for the `t` (time window) query parameter of ranked listings.
timeWindows = ["all", "month", "year", "week", "hour"]

# Every post gathered in this run, across all topics.
collectedPosts = []

for _, topic in topicsList.iterrows():
    topicName = topic["topicName"]
    baseUrl = topic["topicUrl"]

    # Listings to walk for this subreddit, in their original order: front
    # page, new, rising, then controversial and top, each in its default
    # form followed by one URL per time window.
    extendedUrls = [baseUrl, baseUrl + "/new/", baseUrl + "/rising/"]
    for listing in ("controversial", "top"):
        extendedUrls.append(f"{baseUrl}/{listing}/")
        # The sort parameter now matches the listing it is attached to; the
        # /top/ URLs previously carried sort=controversial.
        extendedUrls.extend(
            f"{baseUrl}/{listing}/?sort={listing}&t={window}" for window in timeWindows
        )

    # One list per topic, shared by all of its listings. periodicSave rewrites
    # the topic's CSV from the list it is given, so a fresh list per listing
    # would leave only the last listing's posts in the file.
    # NOTE(review): the same post can appear in several listings, so the list
    # may contain duplicates.
    topicPosts = []
    for topicUrl in extendedUrls:
        try:
            # The function extends topicPosts in place; its return value is
            # deliberately not relied on.
            collectSubRedditsPosts(topicUrl, topicName, topicPosts)
        except URLError as error:
            # One unavailable listing should not abort the whole collection.
            print(f"Skipping {topicUrl}: {error}")

    collectedPosts.extend(topicPosts)


Navigating to https://old.reddit.com/r/ADHD
{'authorName': 'AutoModerator', 'authorId': 't2_6l4z3', 'commentCount': '18', 'commentsLink': '/r/ADHD/comments/1hdvidz/did_you_do_something_youre_proud_of_something/', 'createdAt': '1734152462000', 'postId': 'thing_t3_1hdvidz', 'postTitle': "Did you do something you're proud of? Something nice happen? Share your good news with us!", 'subredditName': 'r/ADHD', 'collectedAt': '2024-12-19T20:13:17.941059+0000', 'interactionCategory': 'N/A'}
{'authorName': 'Rich-Wolverine8912', 'authorId': 't2_14qqqlmoto', 'commentCount': '512', 'commentsLink': '/r/ADHD/comments/1gw1cs3/what_are_your_adhd_home_hacks/', 'createdAt': '1732143952000', 'postId': 'thing_t3_1gw1cs3', 'postTitle': 'What are your ADHD home hacks?', 'subredditName': 'r/ADHD', 'collectedAt': '2024-12-19T20:13:17.941059+0000', 'interactionCategory': 'Tips/Suggestions'}
{'authorName': '_DeifyTheMachine_', 'authorId': 't2_qyica', 'commentCount': '78', 'commentsLink': '/r/ADHD/comments/1hhyzg

In [1]:
import pandas as pd 
# Load the ADHD posts for a quick inspection.
# NOTE(review): periodicSave writes redditPosts<topic>.csv, so this .json file
# presumably comes from a separate conversion step — confirm.
dx=pd.read_json("../data/healthRedditPosts/redditPostsADHD.json")

In [4]:
dx["commentsLink"].nunique()

5885

#### Collecting Post Texts

In [2]:
import pandas as pd
import asyncio
import aiohttp
import os
import logging

# Configure logging
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")

# File paths
# POSTS_FILE: input CSV of scraped posts; main() reads it and uses its "commentsLink" column.
# COLLECTED_FILE: CSV of posts whose text was already fetched; when it exists, main() skips its links.
# TEMP_FILE: default destination that append_records_to_file appends batches to.
POSTS_FILE = "../data/healthRedditPosts/redditPosts.csv"
COLLECTED_FILE = "../data/healthRedditPosts/redditPosts_with_text.csv"
TEMP_FILE = "../data/healthRedditPosts/redditPostsDataTemp.csv"

# Function to append multiple records to the file
def append_records_to_file(records, file_path=TEMP_FILE):
    """Append a batch of records to a CSV file, writing the header only once.

    Args:
        records: ``pandas.DataFrame`` holding the rows to append.
        file_path: Destination CSV; defaults to ``TEMP_FILE``.

    An empty ``records`` frame is reported with a warning and nothing is
    written. Any error raised while writing is logged and not propagated, so
    one failed batch does not stop the caller.
    """
    if records.empty:
        logging.warning("No records to save.")
        return
    try:
        # The header is needed when the file is missing, and also when it
        # exists but is still empty; otherwise the file would never get
        # column names.
        write_header = not os.path.exists(file_path) or os.path.getsize(file_path) == 0
        records.to_csv(file_path, mode="a", header=write_header, index=False, encoding="utf-8")
        logging.info(f"{len(records)} records appended to {file_path}.")
    except Exception as e:
        logging.error(f"Error saving records to {file_path}: {e}")

# Asynchronous function to fetch post text
async def fetch_post_text(url):
    """Download a post page and return the text of its body.

    The body is extracted the same way ``collect_subreddit_post_text`` does
    it: the stripped text of every ``<p>`` inside the
    ``<div slot="text-body">`` element, joined with single spaces. Previously
    the whole raw HTML page was returned as the "post text".

    Args:
        url: Full URL of the post page.

    Returns:
        The extracted text; ``""`` when the page has no
        ``<div slot="text-body">`` element; ``"Failed to fetch post."`` for a
        non-200 response; ``"An error occurred while scraping the post."``
        when any exception is raised. This function never raises.
    """
    # Imported here because this cell's own import list does not include bs4.
    from bs4 import BeautifulSoup

    try:
        headers = {'User-Agent': 'Mozilla/5.0'}
        async with aiohttp.ClientSession() as session:
            async with session.get(url, headers=headers) as response:
                if response.status != 200:
                    logging.warning(f"Failed to fetch {url}: HTTP {response.status}")
                    return "Failed to fetch post."
                page_source = await response.text()

        body = BeautifulSoup(page_source, 'html.parser').find("div", slot="text-body")
        if body is None:
            # The page was fetched but holds no post body element.
            return ""
        return " ".join(p.get_text(strip=True) for p in body.find_all("p"))
    except Exception as e:
        logging.error(f"Error fetching {url}: {e}")
        return "An error occurred while scraping the post."

# Asynchronous function to process a single post
async def process_post(index, dataset):
    """Fetch the text of one post and return its row with a ``postText`` field.

    Args:
        index: Positional index of the row inside ``dataset``.
        dataset: ``pandas.DataFrame`` with a ``commentsLink`` column.

    Returns:
        A copy of the row as a ``pandas.Series`` with ``postText`` added, or
        ``None`` when anything goes wrong (the error is logged).
    """
    # Imported here because this cell's own import list does not include it.
    from urllib.parse import urljoin

    try:
        # Work on an independent copy: assigning into the object returned by
        # iloc on a filtered frame can emit SettingWithCopyWarning.
        post = dataset.iloc[index].copy()
        # urljoin yields the same URL as plain concatenation for paths that
        # start with "/", and also handles links that are already absolute
        # or that lack the leading slash.
        full_url = urljoin("https://reddit.com", post["commentsLink"])
        post["postText"] = await fetch_post_text(full_url)
        logging.info(f"Processed post {index + 1}/{len(dataset)}")
        return post
    except Exception as e:
        logging.error(f"Error processing post at index {index}: {e}")
        return None

# Asynchronous function to process all posts
async def process_posts(dataset, max_concurrency=10):
    """Fetch the text of every post in ``dataset`` and save results in batches.

    Rows are handled by ``process_post``; successful results are buffered and
    handed to ``append_records_to_file`` 100 at a time, with a final flush for
    the remainder. Rows for which ``process_post`` returns ``None`` are
    dropped.

    Args:
        dataset: ``pandas.DataFrame`` of posts to process.
        max_concurrency: Largest number of posts fetched at the same time.
            Without a bound, every request would be started at once.

    Raises:
        ValueError: If ``max_concurrency`` is smaller than 1.
    """
    if max_concurrency < 1:
        raise ValueError("max_concurrency must be at least 1")

    semaphore = asyncio.Semaphore(max_concurrency)

    async def bounded_process(index):
        # Only max_concurrency of these may run process_post at any moment.
        async with semaphore:
            return await process_post(index, dataset)

    temp_data = []
    tasks = [bounded_process(i) for i in range(len(dataset))]

    for future in asyncio.as_completed(tasks):
        result = await future
        if result is None:
            continue
        temp_data.append(result)

        if len(temp_data) >= 100:  # Save in batches of 100 records
            append_records_to_file(pd.DataFrame(temp_data))
            temp_data = []

    # Save any remaining data
    if temp_data:
        append_records_to_file(pd.DataFrame(temp_data))

# Main function to execute the process
async def main():
    """Fetch the text of every post in ``POSTS_FILE`` not collected yet.

    A post counts as collected when its ``commentsLink`` already appears in
    ``COLLECTED_FILE`` or in ``TEMP_FILE``. ``TEMP_FILE`` is where
    ``append_records_to_file`` writes by default, so including it lets an
    interrupted run resume instead of fetching everything again.

    Raises:
        FileNotFoundError: If ``POSTS_FILE`` does not exist.
    """
    # NOTE(review): the scraping cell saves one redditPosts<topic>.csv per
    # topic; the combined POSTS_FILE presumably has to be assembled
    # beforehand — confirm.
    dataset = pd.read_csv(POSTS_FILE)

    collectedLinks_set = set()
    for path in (COLLECTED_FILE, TEMP_FILE):
        # Skip missing files, and empty ones that read_csv cannot parse.
        if os.path.exists(path) and os.path.getsize(path) > 0:
            # Only the link column is needed for the resume check.
            already_done = pd.read_csv(path, usecols=["commentsLink"])
            collectedLinks_set.update(already_done["commentsLink"])

    toCollectLinks = dataset[~dataset["commentsLink"].isin(collectedLinks_set)]

    await process_posts(toCollectLinks)
    logging.info("All posts processed.")

# Ensure compatibility with Jupyter notebooks
import nest_asyncio
nest_asyncio.apply()

# Run the asynchronous main function
# (presumably this relies on nest_asyncio.apply() above, since a notebook
# kernel normally has an event loop running already — confirm.)
asyncio.run(main())


FileNotFoundError: [Errno 2] No such file or directory: '../data/healthRedditPosts/redditPosts.csv'