## Import Relevant Packages / Libraries

In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import pandas as pd
# Use the fully-qualified option name: the short pattern 'max_columns' matches more
# than one option in recent pandas releases and is rejected as ambiguous.
pd.set_option('display.max_columns', 75)
import requests
import json

## Data Acquisition - Depression Subreddit

In [3]:
# first pull from depression subreddit: up to 500 submissions / comments
# created within the last 30 days
base_url_submissions = 'https://api.pushshift.io/reddit/search/submission/?'
base_url_comment = 'https://api.pushshift.io/reddit/search/comment/?'

params_depression = {
    'subreddit': 'depression',
    'is_video': 'false',
    'size': 500, # can set a maximum pull of 500
    'after': '30d',
    'metadata':'false'
}

# a timeout (seconds) keeps the cell from hanging forever if the API stops responding
depression_res_submissions = requests.get(base_url_submissions, params=params_depression, timeout=30)
# NOTE(review): the comment response is not consumed by any cell visible here
depression_res_comment = requests.get(base_url_comment, params=params_depression, timeout=30)

# Confirming that requests are valid
print(depression_res_submissions.status_code)
print(depression_res_comment.status_code)
# fail here, not in a later cell, if the submissions request was rejected
depression_res_submissions.raise_for_status()

200
200


In [4]:
# build a dataframe from the records stored under the 'data' key of the first depression response
depression_df_submissions = pd.DataFrame(
    data=depression_res_submissions.json()["data"],
)

In [5]:
# second pull from the depression subreddit: submissions between 60 and 30 days old
base_url_submissions_2 = 'https://api.pushshift.io/reddit/search/submission/?'
base_url_comment_2 = 'https://api.pushshift.io/reddit/search/comment/?'

params_depression_2 = {
    'subreddit': 'depression',
    'is_video': 'false',
    'size': 500, # can set a maximum pull of 500
    'after': '60d',
    # 'after' is only a lower bound; without an upper bound this request can return
    # the same newest posts as the 30-day pull, so close the window at 30 days
    'before': '30d',
    'metadata':'false'
}

# a timeout (seconds) keeps the cell from hanging forever if the API stops responding
depression_res_submissions_2 = requests.get(base_url_submissions_2, params=params_depression_2, timeout=30)
# NOTE(review): the comment response is not consumed by any cell visible here
depression_res_comment_2 = requests.get(base_url_comment_2, params=params_depression_2, timeout=30)

# Confirming that requests are valid
print(depression_res_submissions_2.status_code)
print()
# fail here, not in a later cell, if the submissions request was rejected
depression_res_submissions_2.raise_for_status()

200



In [6]:
# build a dataframe from the records stored under the 'data' key of the second depression response
depression_df_submissions_2 = pd.DataFrame(
    data=depression_res_submissions_2.json()["data"],
)

In [7]:
# third pull from depression sub-reddit: submissions between 90 and 60 days old
base_url_submissions_3 = 'https://api.pushshift.io/reddit/search/submission/?'
base_url_comment_3 = 'https://api.pushshift.io/reddit/search/comment/?'

params_depression_3 = {
    'subreddit': 'depression',
    'is_video': 'false',
    'size': 500, # can set a maximum pull of 500
    'after': '90d',
    # 'after' is only a lower bound; without an upper bound this request can return
    # the same newest posts as the earlier pulls, so close the window at 60 days
    'before': '60d',
    'metadata':'false'
}

# a timeout (seconds) keeps the cell from hanging forever if the API stops responding
depression_res_submissions_3 = requests.get(base_url_submissions_3, params=params_depression_3, timeout=30)
# NOTE(review): the comment response is not consumed by any cell visible here
depression_res_comment_3 = requests.get(base_url_comment_3, params=params_depression_3, timeout=30)

# confirming that requests are valid
print(depression_res_submissions_3.status_code)
# fail here, not in a later cell, if the submissions request was rejected
depression_res_submissions_3.raise_for_status()

200


In [8]:
# build a dataframe from the records stored under the 'data' key of the third depression response
depression_df_submissions_3 = pd.DataFrame(
    data=depression_res_submissions_3.json()["data"],
)

In [9]:
# fourth pull from depression sub-reddit: submissions between 120 and 90 days old
base_url_submissions_4 = 'https://api.pushshift.io/reddit/search/submission/?'
base_url_comment_4 = 'https://api.pushshift.io/reddit/search/comment/?'

params_depression_4 = {
    'subreddit': 'depression',
    'is_video': 'false',
    'size': 500, # can set a maximum pull of 500
    'after': '120d',
    # 'after' is only a lower bound; without an upper bound this request can return
    # the same newest posts as the earlier pulls, so close the window at 90 days
    'before': '90d',
    'metadata':'false'
}

# a timeout (seconds) keeps the cell from hanging forever if the API stops responding
depression_res_submissions_4 = requests.get(base_url_submissions_4, params=params_depression_4, timeout=30)
# NOTE(review): the comment response is not consumed by any cell visible here
depression_res_comment_4 = requests.get(base_url_comment_4, params=params_depression_4, timeout=30)

# confirming request is valid:
print(depression_res_submissions_4.status_code)
# fail here, not in a later cell, if the submissions request was rejected
depression_res_submissions_4.raise_for_status()

200


In [10]:
# build a dataframe from the records stored under the 'data' key of the fourth depression response
depression_df_submissions_4 = pd.DataFrame(
    data=depression_res_submissions_4.json()["data"],
)

In [11]:
# merging all pulls into one master dataframe 'depression_all'
# - drop_duplicates on 'id': the same submission can be returned by more than one pull,
#   and repeated rows would leak between train and test splits later on
# - reset_index: every pull carries its own 0..n-1 index, so re-number the rows to
#   give each one a unique label
depression_all = (
    pd.concat([depression_df_submissions, depression_df_submissions_2,
               depression_df_submissions_3, depression_df_submissions_4])
    .drop_duplicates(subset='id')
    .reset_index(drop=True)
)

# creating a 'target' column to signify that every row in this dataframe came from the
# 'depression' sub-reddit; it will be used as our target variable later
depression_all['target'] = 'depression'
# inspecting values of the target variable (the count is the number of unique submissions)
depression_all['target'].value_counts()

depression    2000
Name: target, dtype: int64

In [12]:
# inspecting the (rows, columns) shape of the merged depression dataframe
depression_all.shape

(2000, 68)

In [13]:
# listing the column labels of the merged depression dataframe, to decide
# which ones to keep as features later
depression_all.columns

Index(['all_awardings', 'allow_live_comments', 'author', 'author_cakeday',
       'author_flair_background_color', 'author_flair_css_class',
       'author_flair_richtext', 'author_flair_text', 'author_flair_text_color',
       'author_flair_type', 'author_fullname', 'author_patreon_flair',
       'author_premium', 'awarders', 'banned_by', 'can_mod_post',
       'contest_mode', 'created_utc', 'domain', 'full_link', 'gilded',
       'gildings', 'id', 'is_crosspostable', 'is_meta', 'is_original_content',
       'is_reddit_media_domain', 'is_robot_indexable', 'is_self', 'is_video',
       'link_flair_background_color', 'link_flair_richtext',
       'link_flair_text_color', 'link_flair_type', 'locked', 'media_only',
       'no_follow', 'num_comments', 'num_crossposts', 'over_18',
       'parent_whitelist_status', 'permalink', 'pinned', 'post_hint',
       'preview', 'pwls', 'removed_by', 'removed_by_category', 'retrieved_on',
       'score', 'selftext', 'send_replies', 'spoiler', 'steward_

In [14]:
# keep only the label and the two text columns that feed the NLP analysis below
depression_final = depression_all.loc[:, ['target', 'selftext', 'title']]

## Data Acquisition - Anxiety Subreddit

In [15]:
# first pull from the anxiety subreddit: up to 500 submissions / comments
# created within the last 30 days
base_url_submissions = 'https://api.pushshift.io/reddit/search/submission/?'
base_url_comment = 'https://api.pushshift.io/reddit/search/comment/?'

params_anxiety = {
    'subreddit': 'anxiety',
    'is_video': 'false',
    'size': 500, # can set a maximum pull of 500
    'after': '30d',
    'metadata':'false'
}

# a timeout (seconds) keeps the cell from hanging forever if the API stops responding
anxiety_res_submissions = requests.get(base_url_submissions, params=params_anxiety, timeout=30)
# NOTE(review): the comment response is not consumed by any cell visible here
anxiety_res_comment = requests.get(base_url_comment, params=params_anxiety, timeout=30)

# confirming the request is valid
print(anxiety_res_submissions.status_code)
# fail here, not in a later cell, if the submissions request was rejected
anxiety_res_submissions.raise_for_status()

200


In [16]:
# build a dataframe from the records stored under the 'data' key of the first anxiety response
anxiety_df_submissions = pd.DataFrame(
    data=anxiety_res_submissions.json()["data"],
)

In [17]:
# second pull from the anxiety subreddit: submissions between 60 and 30 days old
# (the stray trailing 'q' was removed from the URLs: it sent an empty 'q' search
# parameter and did not match the endpoints used by the other pulls)
base_url_submissions = 'https://api.pushshift.io/reddit/search/submission/?'
base_url_comment = 'https://api.pushshift.io/reddit/search/comment/?'

params_anxiety_2 = {
    'subreddit': 'anxiety',
    'is_video': 'false',
    'size': 500, # can set a maximum pull of 500
    'after': '60d',
    # 'after' is only a lower bound; without an upper bound this request can return
    # the same newest posts as the 30-day pull, so close the window at 30 days
    'before': '30d',
    'metadata':'false'
}

# a timeout (seconds) keeps the cell from hanging forever if the API stops responding
anxiety_res_submissions_2 = requests.get(base_url_submissions, params=params_anxiety_2, timeout=30)
# NOTE(review): the comment response is not consumed by any cell visible here
anxiety_res_comment_2 = requests.get(base_url_comment, params=params_anxiety_2, timeout=30)
# confirming the request is valid
print(anxiety_res_submissions_2.status_code)
# fail here, not in a later cell, if the submissions request was rejected
anxiety_res_submissions_2.raise_for_status()

200


In [18]:
# build a dataframe from the records stored under the 'data' key of the second anxiety response
anxiety_df_submissions_2 = pd.DataFrame(
    data=anxiety_res_submissions_2.json()["data"],
)

In [19]:
# third pull from the anxiety subreddit: submissions between 90 and 60 days old
# (the stray trailing 'q' was removed from the URLs: it sent an empty 'q' search
# parameter and did not match the endpoints used by the other pulls)
base_url_submissions = 'https://api.pushshift.io/reddit/search/submission/?'
base_url_comment = 'https://api.pushshift.io/reddit/search/comment/?'

params_anxiety_3 = {
    'subreddit': 'anxiety',
    'is_video': 'false',
    'size': 500, # can set a maximum pull of 500
    'after': '90d',
    # 'after' is only a lower bound; without an upper bound this request can return
    # the same newest posts as the earlier pulls, so close the window at 60 days
    'before': '60d',
    'metadata':'false'
}

# a timeout (seconds) keeps the cell from hanging forever if the API stops responding
anxiety_res_submissions_3 = requests.get(base_url_submissions, params=params_anxiety_3, timeout=30)
# NOTE(review): the comment response is not consumed by any cell visible here
anxiety_res_comment_3 = requests.get(base_url_comment, params=params_anxiety_3, timeout=30)
# confirming the request is valid
print(anxiety_res_submissions_3.status_code)
# fail here, not in a later cell, if the submissions request was rejected
anxiety_res_submissions_3.raise_for_status()

200


In [20]:
# build a dataframe from the records stored under the 'data' key of the third anxiety response
anxiety_df_submissions_3 = pd.DataFrame(
    data=anxiety_res_submissions_3.json()["data"],
)

In [21]:
# fourth pull from the anxiety subreddit: submissions between 120 and 90 days old
# (the stray trailing 'q' was removed from the URLs: it sent an empty 'q' search
# parameter and did not match the endpoints used by the other pulls)
base_url_submissions = 'https://api.pushshift.io/reddit/search/submission/?'
base_url_comment = 'https://api.pushshift.io/reddit/search/comment/?'

params_anxiety_4 = {
    'subreddit': 'anxiety',
    'is_video': 'false',
    'size': 500, # can set a maximum pull of 500
    'after': '120d',
    # 'after' is only a lower bound; without an upper bound this request can return
    # the same newest posts as the earlier pulls, so close the window at 90 days
    'before': '90d',
    'metadata':'false'
}

# a timeout (seconds) keeps the cell from hanging forever if the API stops responding
anxiety_res_submissions_4 = requests.get(base_url_submissions, params=params_anxiety_4, timeout=30)
# NOTE(review): the comment response is not consumed by any cell visible here
anxiety_res_comment_4 = requests.get(base_url_comment, params=params_anxiety_4, timeout=30)
# confirming the request is valid
print(anxiety_res_submissions_4.status_code)
# fail here, not in a later cell, if the submissions request was rejected
anxiety_res_submissions_4.raise_for_status()

200


In [22]:
# build a dataframe from the records stored under the 'data' key of the fourth anxiety response
anxiety_df_submissions_4 = pd.DataFrame(
    data=anxiety_res_submissions_4.json()["data"],
)

In [23]:
# merging all pulls into one master dataframe 'anxiety_all'
# - drop_duplicates on 'id': the same submission can be returned by more than one pull,
#   and repeated rows would leak between train and test splits later on
# - reset_index: every pull carries its own 0..n-1 index, so re-number the rows to
#   give each one a unique label
anxiety_all = (
    pd.concat([anxiety_df_submissions, anxiety_df_submissions_2,
               anxiety_df_submissions_3, anxiety_df_submissions_4])
    .drop_duplicates(subset='id')
    .reset_index(drop=True)
)
# creating a 'target' column to signify that every row in this dataframe came from the
# 'anxiety' sub-reddit; it will be used as our target variable later
anxiety_all['target'] = 'anxiety'
# inspecting values of the target variable (the count is the number of unique submissions)
anxiety_all['target'].value_counts()

anxiety    2000
Name: target, dtype: int64

In [24]:
# inspecting the (rows, columns) shape of the merged anxiety dataframe
anxiety_all.shape

(2000, 72)

In [25]:
# listing the column labels of the merged anxiety dataframe, to decide
# which ones to keep as features later
anxiety_all.columns

Index(['all_awardings', 'allow_live_comments', 'author', 'author_cakeday',
       'author_flair_background_color', 'author_flair_css_class',
       'author_flair_richtext', 'author_flair_template_id',
       'author_flair_text', 'author_flair_text_color', 'author_flair_type',
       'author_fullname', 'author_patreon_flair', 'author_premium', 'awarders',
       'banned_by', 'can_mod_post', 'contest_mode', 'created_utc', 'domain',
       'edited', 'full_link', 'gilded', 'gildings', 'id', 'is_crosspostable',
       'is_meta', 'is_original_content', 'is_reddit_media_domain',
       'is_robot_indexable', 'is_self', 'is_video',
       'link_flair_background_color', 'link_flair_css_class',
       'link_flair_richtext', 'link_flair_template_id', 'link_flair_text',
       'link_flair_text_color', 'link_flair_type', 'locked', 'media_only',
       'no_follow', 'num_comments', 'num_crossposts', 'over_18',
       'parent_whitelist_status', 'permalink', 'pinned', 'post_hint',
       'preview', 'pwl

In [26]:
# keep only the label and the two text columns that feed the NLP analysis below
anxiety_final = anxiety_all.loc[:, ['target', 'selftext', 'title']]

In [27]:
# combining the two per-subreddit dataframes into one final dataframe;
# ignore_index=True re-numbers the rows so each one has a unique label instead of
# repeating the labels carried over from the two inputs
reddits_all = pd.concat([depression_final, anxiety_final], ignore_index=True)

In [28]:
# inspecting the first 10 rows of the combined dataframe
reddits_all.head(10)

Unnamed: 0,target,selftext,title
0,depression,2019 was probably one of the worst year of my ...,Hello 2020 and fuck you 2019.
1,depression,I saw your post on this subreddit. I saw that ...,To my new hopeful friend on the other side of ...
2,depression,"I feel like I've been hanging on by a thread, ...",can't imagine making it to 2030
3,depression,,Why do I have the strongest urge to jump out o...
4,depression,I've been clean from self harm for over two ye...,Suicidal thoughts are so intense
5,depression,My girlfriend’s parents always have a new year...,Might have to meet my girlfriend’s rapist toni...
6,depression,I don't have the motivation to succeed like ot...,Everyone else has it all and I don't. Am I cra...
7,depression,I just wanna live and see society die\n\n\n\nB...,Lately I decided I'mma stay alive
8,depression,This is going to be long and honestly if I wer...,Insert witty title for post I just want to mak...
9,depression,It's what I keep repeating to myself. \n\nA co...,I was doing so good


In [29]:
# write the combined dataframe to disk (without the index column) so a separate
# notebook can read it back in
reddits_all.to_csv(path_or_buf='./reddits.csv', index=False)