# Depression Classifier Dataset
Building a new labeled dataset of Reddit post titles (depression-related vs. not) by scraping subreddits with PRAW.

## 1. Import Libraries

In [29]:
%pip install praw python-dotenv

Note: you may need to restart the kernel to use updated packages.


In [30]:
import os
from dotenv import load_dotenv

load_dotenv()

import pandas as pd
import praw

In [31]:
# Reddit API client. The three settings are read from environment variables
# (populated from a .env file by the load_dotenv() call in the import cell);
# a variable that is not set is passed through as None.
reddit = praw.Reddit(
    client_id=os.environ.get("CLIENT_ID"),
    client_secret=os.environ.get("CLIENT_SECRET"),
    user_agent=os.environ.get("USER_AGENT"),
)

## 2. Scrape Subreddits

In [32]:
import re

# Column-oriented accumulator filled by scrape_many(): two parallel lists,
# one cleaned post title and one 0/1 label per kept post.
dataset = dict(post_title=[], is_depression=[])

def scrape_one(subreddit, limit=10):
    """Return the cleaned titles from one subreddit's "hot" listing.

    Each title is case-folded, every character other than ``a``-``z`` or a
    space is deleted (not replaced, so neighbouring words may join), and
    surrounding whitespace is stripped. Titles that end up empty are dropped.

    Args:
        subreddit: Name of the subreddit, passed to ``reddit.subreddit``.
        limit: Forwarded as the ``limit`` argument of the ``hot`` listing.

    Returns:
        A list of non-empty cleaned title strings, in listing order.
    """
    cleaned_titles = []
    for submission in reddit.subreddit(subreddit).hot(limit=limit):
        letters_only = re.sub(r'[^a-z ]', '', submission.title.casefold())
        title = letters_only.strip()
        # A title made up only of digits/punctuation/emoji cleans to "".
        if title:
            cleaned_titles.append(title)
    return cleaned_titles


def scrape_many(subreddits, label=1, limit=10):
    """Scrape several subreddits and append their titles to ``dataset``.

    For every subreddit the cleaned titles returned by ``scrape_one`` are
    appended to ``dataset["post_title"]`` and the same number of ``label``
    values is appended to ``dataset["is_depression"]``, keeping the two
    lists aligned.

    Args:
        subreddits: Iterable of subreddit names.
        label: Value recorded in ``is_depression`` for every scraped title.
        limit: Per-subreddit limit forwarded to ``scrape_one``.

    Returns:
        None. The module-level ``dataset`` dict is modified in place.
    """
    for name in subreddits:
        titles = scrape_one(name, limit)
        dataset["post_title"].extend(titles)
        # One label per title so both columns stay the same length.
        dataset["is_depression"].extend([label] * len(titles))

In [33]:
# scrape_many() appends to the shared `dataset` lists, so start from empty
# columns here; otherwise re-running this cell would duplicate every row.
dataset["post_title"].clear()
dataset["is_depression"].clear()

# Titles from these subreddits are labelled is_depression = 1 ...
scrape_many(["depression", "socialanxiety"], label=1, limit=1000)
# ... and titles from these are labelled is_depression = 0.
scrape_many(["happy", "goodnews"], label=0, limit=1000)

## 3. Build Dataset

In [36]:
# Turn the accumulated columns into a DataFrame whose row index is
# written out as an "id" column.
dataframe = pd.DataFrame(dataset)
dataframe.index.name = "id"
# to_csv does not create missing parent directories, so make sure the
# output folder exists (e.g. on a fresh checkout) before writing.
os.makedirs("./datasets", exist_ok=True)
dataframe.to_csv("./datasets/depression_reddit_cleaned.csv")