## Data import and cleaning

___

PRAW to extract the top 1k posts for EDA  <br>
Pushshift API (queried directly with `requests`) to extract posts over a range of dates for ML

### Import Libraries

In [1]:
import datetime as dt
import os # to read API credentials from environment variables
import time

import pandas as pd
import praw # interface for reddit API
import requests # to send http requests and handle responses

___

In [2]:
# scraping window as "YYYY-MM-DD" date strings (start first, end second)
start_date_str, end_date_str = "2022-11-01", "2023-01-28"

In [3]:
# convert start and end dates to Unix timestamps
def _utc_timestamp(date_str):
    """Return the Unix timestamp of midnight UTC on a ``YYYY-MM-DD`` date string."""
    parsed = dt.datetime.strptime(date_str, "%Y-%m-%d")
    # A naive datetime's .timestamp() is interpreted in the machine's local
    # timezone, so the epoch value would differ from machine to machine.
    # Pinning the date to UTC makes the scraping window reproducible.
    return int(parsed.replace(tzinfo=dt.timezone.utc).timestamp())


start_date = _utc_timestamp(start_date_str)
end_date = _utc_timestamp(end_date_str)

In [4]:
# fields requested from Pushshift; the same list is passed on as the DataFrame columns
columns_ps = [
    'id',
    'created_utc',
    'title',
    'is_self',
    'selftext',
    'score',
    'upvote_ratio',
    'num_comments',
    'permalink',
    'author',
    'distinguished',
]

In [5]:
def scrape_pushshift(subreddit, start_date_str, end_date_str, columns, timeout=30):
    """Fetch a single page of submissions from the Pushshift search API.

    Parameters
    ----------
    subreddit : str
        Name of the subreddit to search.
    start_date_str, end_date_str
        Bounds sent unchanged as the ``after`` / ``before`` query parameters.
        Despite the ``_str`` suffix, the calls in this notebook pass integer
        Unix timestamps.
    columns : list of str
        Field names requested from the API; also used as the columns of the
        returned DataFrame.
    timeout : float, optional
        Seconds to wait for the server before ``requests`` gives up
        (default 30). Without it a stalled connection would block the
        notebook indefinitely.

    Returns
    -------
    pandas.DataFrame
        One row per post in the response, restricted to ``columns``. Only one
        request is made, so at most ``size`` (1000) posts are requested.

    Raises
    ------
    RuntimeError
        If the response status code is not 200.
    """
    # define base url
    url = "https://api.pushshift.io/reddit/submission/search/"

    # parameters (the subreddit goes here too, so requests URL-encodes it)
    params = {
        "subreddit": subreddit,
        "after": start_date_str,
        "before": end_date_str,
        "size": 1000, # the maximum number of posts to retrieve per request
        "fields": columns
    }

    # send HTTP GET request to retrieve response
    response = requests.get(url, params=params, timeout=timeout)

    # fail loudly if the request was not successful
    if response.status_code != 200:
        raise RuntimeError(f"Request failed with status code {response.status_code}: {response.text}")

    # extract list of posts and convert it to a dataframe
    posts = response.json()["data"]
    return pd.DataFrame(posts, columns=columns)

In [6]:
# pull r/anxiety submissions from Pushshift for the configured date window
df_anxiety_ps = scrape_pushshift(
    subreddit='anxiety',
    start_date_str=start_date,
    end_date_str=end_date,
    columns=columns_ps,
)

In [7]:
# preview the first three r/anxiety rows returned by Pushshift
df_anxiety_ps.head(n=3)

Unnamed: 0,id,created_utc,title,is_self,selftext,score,upvote_ratio,num_comments,permalink,author,distinguished
0,10mo9oe,1674834997,For the First time in YEARS I have found mysel...,True,I felt the need to get this off my chest and f...,1,1.0,0,/r/Anxiety/comments/10mo9oe/for_the_first_time...,asomers44,
1,10mo813,1674834885,How does being breathless feel?,True,For some reason I’m breathing out my nose but ...,1,1.0,0,/r/Anxiety/comments/10mo813/how_does_being_bre...,JamesDudleyv,
2,10mo74n,1674834823,Short Survey for breath work app against anxiety,True,[removed],1,1.0,0,/r/Anxiety/comments/10mo74n/short_survey_for_b...,Apprehensive_Form367,


In [8]:
# pull r/depression submissions from Pushshift for the configured date window
df_depression_ps = scrape_pushshift(
    subreddit='depression',
    start_date_str=start_date,
    end_date_str=end_date,
    columns=columns_ps,
)

In [9]:
# preview the first three r/depression rows returned by Pushshift
df_depression_ps.head(n=3)

Unnamed: 0,id,created_utc,title,is_self,selftext,score,upvote_ratio,num_comments,permalink,author,distinguished
0,10moalc,1674835061,I think I finally broke completely,True,"The worst feeling in the world is hope, it mak...",1,1.0,0,/r/depression/comments/10moalc/i_think_i_final...,Sandersgarbanzo,
1,10mo82z,1674834889,i dont know what to do,True,[removed],1,1.0,0,/r/depression/comments/10mo82z/i_dont_know_wha...,towjdjrfjkfje,
2,10mo7bf,1674834836,Do I have depression?,True,"I have an important question, I don't want an ...",1,1.0,0,/r/depression/comments/10mo7bf/do_i_have_depre...,enart_pl,


In [10]:
# persist the Pushshift r/anxiety posts; index=False keeps the row index out of the file
# (df_anxiety_ps is already a DataFrame, so it is written directly)
df_anxiety_ps.to_csv('../datasets/anxiety_ps.csv', index=False)

In [11]:
# persist the Pushshift r/depression posts; index=False keeps the row index out of the file
# (df_depression_ps is already a DataFrame, so it is written directly)
df_depression_ps.to_csv('../datasets/depression_ps.csv', index=False)

### PRAW

In [12]:
# PRAW setup
# Credentials are read from environment variables so that secrets are never
# stored in (or committed with) the notebook. A missing variable raises
# KeyError immediately rather than failing later with an opaque auth error.
# NOTE(review): any credentials that were previously saved in this notebook
# should be treated as leaked and rotated.
reddit = praw.Reddit(client_id=os.environ['REDDIT_CLIENT_ID'],
                     client_secret=os.environ['REDDIT_CLIENT_SECRET'],
                     username=os.environ['REDDIT_USERNAME'],
                     password=os.environ['REDDIT_PASSWORD'],
                     user_agent='Nicholas_Khoo_DS')

Version 7.6.1 of praw is outdated. Version 7.7.0 was released 7 days ago.


In [13]:
# sanity check: show which account the PRAW client is logged in as
authenticated_user = reddit.user.me()
print(authenticated_user)

Nicholas_Khoo_DS


In [14]:
# submission attributes to read from each PRAW post
columns_praw = [
    'id',
    'created_utc',
    'title',
    'is_self',
    'selftext',
    'score',
    'upvote_ratio',
    'num_comments',
    'permalink',
    'author',
    'distinguished',
]

In [15]:
# count how many submissions the "top" listing of r/depression yields
subreddit = reddit.subreddit('depression')
num_top_posts = subreddit.top(limit=None)  # limit=None: do not cap the listing
top_post_count = sum(1 for _ in num_top_posts)
print(f"The number of top posts in r/depression is {top_post_count}.")

The number of top posts in r/depression is 969.


In [16]:
# count how many submissions the "top" listing of r/anxiety yields
subreddit = reddit.subreddit('anxiety')
num_top_posts = subreddit.top(limit=None)  # None sets no limit
# the message names r/anxiety, the subreddit actually being counted here
print(f"The number of top posts in r/anxiety is {len(list(num_top_posts))}.")

The number of top posts in r/depression is 996.


In [17]:
def scrape_reddit_top(subreddit, columns, limit=1000, delay=2):
    """Collect the top posts of a subreddit into a DataFrame via PRAW.

    Uses the module-level ``reddit`` client.

    Parameters
    ----------
    subreddit : str
        Name of the subreddit whose "top" listing is read.
    columns : list of str
        Submission attribute names to read from every post; they become the
        DataFrame columns, in this order.
    limit : int or None, optional
        Passed to ``.top(limit=...)`` (default 1000).
    delay : float, optional
        Seconds to sleep after each post (default 2, the original behaviour).
        With the defaults this adds roughly 33 minutes of waiting per 1000
        posts; pass 0 to skip the sleep.
        NOTE(review): PRAW is documented to handle API rate limits itself, so
        the extra delay is likely unnecessary — confirm before lowering it.

    Returns
    -------
    pandas.DataFrame
        One row per post with ``columns`` as columns. An empty listing yields
        an empty frame that still carries those columns.

    Raises
    ------
    AttributeError
        If a submission lacks one of the requested attributes.
    """
    # to store posts
    posts = []

    # get top posts and append one record per submission
    for submission in reddit.subreddit(subreddit).top(limit=limit):
        posts.append({col: getattr(submission, col) for col in columns})
        if delay:
            # throttle between posts
            time.sleep(delay)

    # convert list of dictionaries to dataframe; passing columns keeps the
    # schema stable even when no posts were returned
    return pd.DataFrame(posts, columns=columns)

In [18]:
# collect the top r/anxiety posts through PRAW
df_anxiety_praw = scrape_reddit_top(
    subreddit='anxiety',
    columns=columns_praw,
)

In [19]:
# collect the top r/depression posts through PRAW
df_depression_praw = scrape_reddit_top(
    subreddit='depression',
    columns=columns_praw,
)

In [20]:
# preview the first three r/anxiety rows collected via PRAW
df_anxiety_praw.head(n=3)

Unnamed: 0,id,created_utc,title,is_self,selftext,score,upvote_ratio,num_comments,permalink,author,distinguished
0,8zcjh1,1531760000.0,"Despite the anxiety, despite the depression, d...",False,,7836,0.99,293,/r/Anxiety/comments/8zcjh1/despite_the_anxiety...,WoollyNinja,
1,b18zmk,1552615000.0,Today I promised myself bacon and eggs if I co...,False,,6048,0.99,242,/r/Anxiety/comments/b18zmk/today_i_promised_my...,Becky-and-Momo,
2,asl2gr,1550645000.0,"Professor here, if you have missed class, plea...",True,"Hello, I’ve read a lot of posts recently about...",5205,1.0,207,/r/Anxiety/comments/asl2gr/professor_here_if_y...,Lofty_Incantations11,


In [21]:
# preview the first three r/depression rows collected via PRAW
df_depression_praw.head(n=3)

Unnamed: 0,id,created_utc,title,is_self,selftext,score,upvote_ratio,num_comments,permalink,author,distinguished
0,cd0hjp,1563091000.0,Shout out to the particular hell that is funct...,True,"This is me. Don’t get me wrong, it’s better th...",6678,1.0,348,/r/depression/comments/cd0hjp/shout_out_to_the...,fallen-fawn,
1,i3ajk8,1596506000.0,I hate that people don’t understand that i don...,True,,4723,1.0,260,/r/depression/comments/i3ajk8/i_hate_that_peop...,Nathanvic13,
2,ccaxvm,1562939000.0,If 10 years ago someone told me that in 10 yea...,True,"But here I am, 24 years old man and doing exac...",4416,1.0,218,/r/depression/comments/ccaxvm/if_10_years_ago_...,coolmast3r,


In [22]:
# persist the PRAW r/anxiety posts; index=False keeps the row index out of the file
# (df_anxiety_praw is already a DataFrame, so it is written directly)
df_anxiety_praw.to_csv('../datasets/anxiety_praw.csv', index=False)

In [23]:
# persist the PRAW r/depression posts; index=False keeps the row index out of the file
# (df_depression_praw is already a DataFrame, so it is written directly)
df_depression_praw.to_csv('../datasets/depression_praw.csv', index=False)