## 1. Data Collection

In [35]:
import requests
import pandas as pd
import time
import random

### 1.1 Extracting data from the r/domesticviolence subreddit page

In [36]:
dom_url = 'https://www.reddit.com/r/domesticviolence.json'

In [37]:
res_dom = requests.get(dom_url, headers={'User-agent': 'Marianne'})

In [38]:
res_dom.status_code

200

In [42]:
# Reddict's data is organised as a dictionary
reddit_dom_dict = res_dom.json()
reddit_dom_dict.keys()

dict_keys(['kind', 'data'])

In [51]:
# Reviewing Reddit's keys
reddit_dom_dict['data'].keys()

dict_keys(['modhash', 'dist', 'children', 'after', 'before'])

In [106]:
reddit_dom_dict['data']['children'][0]['data']

{'approved_at_utc': None,
 'subreddit': 'domesticviolence',
 'selftext': 'We know many of you are struggling to manage with already traumatic events and now are dealing with a global pandemic of COVID-19. Many of you may be quarantined with an abuser or dealing with their ramped up abuse due to their proximity or need for that outlet. Abusers are coming back from years ago to get to you with hoovers. Being isolated is very hard and this is a situation completely unexpected and anxiety driving in and of itself. So we wanted to put together a listing of resources, many of which are listed in our resource listing in the sidebar for you in this difficult time. Stay safe out there, folks. We are right here with you, and we will get through this together. \n\n\nSupport for Domestic Abuse:\n\n* [Thehotline.org]( https://www.thehotline.org/) is available 24/7 for chat and calls (1800-787-3224) during this crisis for women, men as well as LGBTQ folks. Please be sure to use safe electronics to c

In [110]:
# Get subreddit name
reddit_dom_dict['data']['children'][0]['data']['subreddit_name_prefixed']

'r/domesticviolence'

In [125]:
# Get id of post - can be used to view individual posts
# e.g. https://www.reddit.com/r/domesticviolence/comments/fsrd59/
reddit_dom_dict['data']['children'][0]['data']['id']

'fsrd59'

In [122]:
# Get date of post

import datetime
created_utc = reddit_dom_dict['data']['children'][0]['data']['created_utc']
datetime.datetime.utcfromtimestamp(created_utc).strftime('%Y-%m-%d')

'2020-04-01'

In [41]:
# Check first post's title with webpage
reddit_dom_dict['data']['children'][0]['data']['title']

'COVID-19 RESOURCES FOR ABUSE VICTIMS'

In [104]:
# Check first post's content with webpage
reddit_dom_dict['data']['children'][0]['data']['selftext']

'We know many of you are struggling to manage with already traumatic events and now are dealing with a global pandemic of COVID-19. Many of you may be quarantined with an abuser or dealing with their ramped up abuse due to their proximity or need for that outlet. Abusers are coming back from years ago to get to you with hoovers. Being isolated is very hard and this is a situation completely unexpected and anxiety driving in and of itself. So we wanted to put together a listing of resources, many of which are listed in our resource listing in the sidebar for you in this difficult time. Stay safe out there, folks. We are right here with you, and we will get through this together. \n\n\nSupport for Domestic Abuse:\n\n* [Thehotline.org]( https://www.thehotline.org/) is available 24/7 for chat and calls (1800-787-3224) during this crisis for women, men as well as LGBTQ folks. Please be sure to use safe electronics to contact them or any agency if an abuser has access to them. They also offe

In [56]:
# Extract r/domesticviolence posts
dv_posts = [p['data'] for p in reddit_dom_dict['data']['children']]

# Convert to dataframe
pd.DataFrame(dv_posts).head()

Unnamed: 0,approved_at_utc,subreddit,selftext,author_fullname,saved,mod_reason_title,gilded,clicked,title,link_flair_richtext,...,permalink,parent_whitelist_status,stickied,url,subreddit_subscribers,created_utc,num_crossposts,media,is_video,link_flair_template_id
0,,domesticviolence,We know many of you are struggling to manage w...,t2_2egrzrvq,False,,0,False,COVID-19 RESOURCES FOR ABUSE VICTIMS,[],...,/r/domesticviolence/comments/fsrd59/covid19_re...,,True,https://www.reddit.com/r/domesticviolence/comm...,10658,1585710000.0,1,,False,
1,,domesticviolence,Not sure if what I'm experiencing could be ver...,t2_6zd0zopf,False,,0,False,Is this verbal or emotional abuse or am I over...,[],...,/r/domesticviolence/comments/hcbfp4/is_this_ve...,,False,https://www.reddit.com/r/domesticviolence/comm...,10658,1592610000.0,0,,False,
2,,domesticviolence,,t2_3frxp7eu,False,,0,False,I’m very interested in this sub. Is it okay fo...,[],...,/r/domesticviolence/comments/hbq0hr/im_very_in...,,False,https://www.reddit.com/r/domesticviolence/comm...,10658,1592525000.0,0,,False,
3,,domesticviolence,I had a case of sexual assault and domestic v...,t2_5d7wihfd,False,,0,False,Restraining order expired and he is posting vi...,[],...,/r/domesticviolence/comments/hbt42e/restrainin...,,False,https://www.reddit.com/r/domesticviolence/comm...,10658,1592537000.0,0,,False,
4,,domesticviolence,\nThis is my life story.\n\nI am 30 year old f...,t2_1cerrsr4,False,,0,False,8 years of abusive marriage,[],...,/r/domesticviolence/comments/hbjyoc/8_years_of...,,False,https://www.reddit.com/r/domesticviolence/comm...,10658,1592504000.0,0,,False,a7a37eae-5cda-11ea-bcf3-0ec4f896afb1


In [22]:
# Identifying the last post 
reddit_dom_dict['data']['after']

't3_h9ylpi'

In [57]:
# Confirming that the previous output is truly the last post
pd.DataFrame(dv_posts)['name']

0     t3_fsrd59
1     t3_hcbfp4
2     t3_hbq0hr
3     t3_hbt42e
4     t3_hbjyoc
5     t3_hbx63d
6     t3_hbsmtk
7     t3_hbrbls
8     t3_hb9lzk
9     t3_hbktus
10    t3_hbbt8x
11    t3_hb4jml
12    t3_hb80rr
13    t3_hate94
14    t3_hajrlu
15    t3_hatxhi
16    t3_havmd6
17    t3_havx2u
18    t3_haut32
19    t3_ha9gjm
20    t3_ha1hmt
21    t3_haa2a8
22    t3_ha5ken
23    t3_h9vaqj
24    t3_ha1ga6
25    t3_h9ylpi
Name: name, dtype: object

In [58]:
# This is the new URL that gives us the next 25 posts - double checked that it works
dom_url + '?after=' + reddit_dom_dict['data']['after']

'https://www.reddit.com/r/domesticviolence.json?after=t3_h9ylpi'

In [83]:
# Creating a funcition to scrape a Reddit page
# Function loops through 25 posts at a time

def reddit_scrape(url, posts_list, num_scrapes):

    after = None

    for i in range(num_scrapes):
        if i == 0:
            print("SCRAPING {}".format(url))
    
        if after == None:
            current_url = url
        else:
            current_url = url + '?after=' + after

        res = requests.get(current_url, headers={'User-agent': 'Marianne'})
    
        if res.status_code != 200:
            print('Status error', res.status_code)
            break
    
        current_dict = res.json()
        current_posts = [p['data'] for p in current_dict['data']['children']]
        posts_list.extend(current_posts)
        after = current_dict['data']['after']
        
        # Generate a random sleep duration to look more 'natural'
        sleep_duration = random.randint(2,6)
        time.sleep(sleep_duration)

    print("Number of posts downloaded: {}".format(len(posts_list)))

In [84]:
# Calling reddit_scrape function on domestic violence subreddit

dom_posts = []
reddit_scrape(dom_url, dom_posts, 50)

SCRAPING https://www.reddit.com/r/domesticviolence.json
Number of posts downloaded: 1247


Intended to download 1000 posts and there seems to be 1247 posts. I suspect that there might be repeated posts and I'll be checking for duplicate rows.

In [236]:
# Convert list to dataframe

dom_df = pd.DataFrame(dom_posts)
print('Shape of dom_df:', dom_df.shape)
dom_df.head()

Shape of dom_df: (1247, 108)


Unnamed: 0,approved_at_utc,subreddit,selftext,author_fullname,saved,mod_reason_title,gilded,clicked,title,link_flair_richtext,...,url,subreddit_subscribers,created_utc,num_crossposts,media,is_video,link_flair_template_id,author_cakeday,post_hint,preview
0,,domesticviolence,We know many of you are struggling to manage w...,t2_2egrzrvq,False,,0,False,COVID-19 RESOURCES FOR ABUSE VICTIMS,[],...,https://www.reddit.com/r/domesticviolence/comm...,10658,1585710000.0,1,,False,,,,
1,,domesticviolence,Not sure if what I'm experiencing could be ver...,t2_6zd0zopf,False,,0,False,Is this verbal or emotional abuse or am I over...,[],...,https://www.reddit.com/r/domesticviolence/comm...,10658,1592610000.0,0,,False,,,,
2,,domesticviolence,,t2_3frxp7eu,False,,0,False,I’m very interested in this sub. Is it okay fo...,[],...,https://www.reddit.com/r/domesticviolence/comm...,10658,1592525000.0,0,,False,,,,
3,,domesticviolence,I had a case of sexual assault and domestic v...,t2_5d7wihfd,False,,0,False,Restraining order expired and he is posting vi...,[],...,https://www.reddit.com/r/domesticviolence/comm...,10658,1592537000.0,0,,False,,,,
4,,domesticviolence,\nThis is my life story.\n\nI am 30 year old f...,t2_1cerrsr4,False,,0,False,8 years of abusive marriage,[],...,https://www.reddit.com/r/domesticviolence/comm...,10658,1592504000.0,0,,False,a7a37eae-5cda-11ea-bcf3-0ec4f896afb1,,,


In [237]:
# Check for unique entries 
print('There are {} unique posts in dom_df.'.format(len(dom_df['name'].unique())))

There are 996 unique posts in dom_df.


In [238]:
# Drop duplicate rows and reset index

dom_df.drop_duplicates(subset='name', inplace=True)
dom_df.reset_index(drop=True, inplace=True)

print('Shape of dom_df:', dom_df.shape)
dom_df

Shape of dom_df: (996, 108)


Unnamed: 0,approved_at_utc,subreddit,selftext,author_fullname,saved,mod_reason_title,gilded,clicked,title,link_flair_richtext,...,url,subreddit_subscribers,created_utc,num_crossposts,media,is_video,link_flair_template_id,author_cakeday,post_hint,preview
0,,domesticviolence,We know many of you are struggling to manage w...,t2_2egrzrvq,False,,0,False,COVID-19 RESOURCES FOR ABUSE VICTIMS,[],...,https://www.reddit.com/r/domesticviolence/comm...,10658,1.585710e+09,1,,False,,,,
1,,domesticviolence,Not sure if what I'm experiencing could be ver...,t2_6zd0zopf,False,,0,False,Is this verbal or emotional abuse or am I over...,[],...,https://www.reddit.com/r/domesticviolence/comm...,10658,1.592610e+09,0,,False,,,,
2,,domesticviolence,,t2_3frxp7eu,False,,0,False,I’m very interested in this sub. Is it okay fo...,[],...,https://www.reddit.com/r/domesticviolence/comm...,10658,1.592525e+09,0,,False,,,,
3,,domesticviolence,I had a case of sexual assault and domestic v...,t2_5d7wihfd,False,,0,False,Restraining order expired and he is posting vi...,[],...,https://www.reddit.com/r/domesticviolence/comm...,10658,1.592537e+09,0,,False,,,,
4,,domesticviolence,\nThis is my life story.\n\nI am 30 year old f...,t2_1cerrsr4,False,,0,False,8 years of abusive marriage,[],...,https://www.reddit.com/r/domesticviolence/comm...,10658,1.592504e+09,0,,False,a7a37eae-5cda-11ea-bcf3-0ec4f896afb1,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
991,,domesticviolence,This wasn't the first time.\n\nThe first time ...,t2_11e40weh,False,,0,False,I slapped my husband and he punched me in the ...,[],...,https://www.reddit.com/r/domesticviolence/comm...,10658,1.577014e+09,0,,False,,,,
992,,domesticviolence,I know a lot of people will say to instantly b...,t2_2hahift0,False,,0,False,My post was removed from r/relationships and t...,[],...,https://www.reddit.com/r/domesticviolence/comm...,10658,1.576991e+09,0,,False,,,,
993,,domesticviolence,I’m tired. I’m so fucking tired. \n\nI spent 8...,t2_4pwno0oe,False,,0,False,I wish I never left him.,[],...,https://www.reddit.com/r/domesticviolence/comm...,10658,1.576951e+09,0,,False,4ff3f752-54f7-11e3-b490-12313b06ec01,,,
994,,domesticviolence,It's evening today and I took garbage bin to e...,t2_3y1e29t,False,,0,False,I called a police for other people's family vi...,[],...,https://www.reddit.com/r/domesticviolence/comm...,10658,1.576959e+09,0,,False,,,,


### 1.2 Extracting data from the r/depression subreddit page


We will be using the same method to extra data from the r/depression subreddit page.

In [191]:
dep_url = 'https://www.reddit.com/r/depression.json'

In [192]:
# Calling reddit_scrape function on depression subreddit

dep_posts = []
reddit_scrape(dep_url, dep_posts, 50)

SCRAPING https://www.reddit.com/r/depression.json
Number of posts downloaded: 1253


Similar to r/domesticviolence, I will be checking for duplicates in r/depression as well.

In [193]:
# Convert list to dataframe

dep_df = pd.DataFrame(dep_posts)
print('Shape of dep_df:', dep_df.shape)
dep_df.head()

Shape of dep_df: (1253, 103)


Unnamed: 0,approved_at_utc,subreddit,selftext,author_fullname,saved,mod_reason_title,gilded,clicked,title,link_flair_richtext,...,permalink,parent_whitelist_status,stickied,url,subreddit_subscribers,created_utc,num_crossposts,media,is_video,author_cakeday
0,,depression,We understand that most people who reply immed...,t2_1t70,False,,0,False,Our most-broken and least-understood rules is ...,[],...,/r/depression/comments/doqwow/our_mostbroken_a...,no_ads,True,https://www.reddit.com/r/depression/comments/d...,648224,1572361000.0,0,,False,
1,,depression,Welcome to /r/depression's check-in post - a p...,t2_64qjj,False,,0,False,Regular Check-In Post,[],...,/r/depression/comments/exo6f1/regular_checkin_...,no_ads,True,https://www.reddit.com/r/depression/comments/e...,648224,1580649000.0,0,,False,
2,,depression,Even if some posts blow up and have a bit of a...,t2_5xpk5iif,False,,1,False,This sub is counterproductive,[],...,/r/depression/comments/hcco2h/this_sub_is_coun...,no_ads,False,https://www.reddit.com/r/depression/comments/h...,648224,1592614000.0,0,,False,
3,,depression,As i go down the rabbit hole of why any of thi...,t2_564vn2mq,False,,0,False,The more depressed i get the more music i list...,[],...,/r/depression/comments/hca12w/the_more_depress...,no_ads,False,https://www.reddit.com/r/depression/comments/h...,648224,1592605000.0,0,,False,
4,,depression,Idk if that makes sense. But I don't want to k...,t2_49njkrci,False,,0,False,"I don't wanna kill myself, I just don't wanna ...",[],...,/r/depression/comments/hbvhbr/i_dont_wanna_kil...,no_ads,False,https://www.reddit.com/r/depression/comments/h...,648224,1592547000.0,1,,False,


In [211]:
# Check for unique entries 
print('There are {} unique posts in dep_df.'.format(len(dep_df['name'].unique())))

There are 978 unique posts in dep_df.


In [196]:
# Drop duplicate rows and reset index

dep_df.drop_duplicates(subset='name', inplace=True)
dep_df.reset_index(drop=True, inplace=True)

print('Shape of dep_df:', dep_df.shape)
dep_df

Shape of dep_df: (978, 103)


Unnamed: 0,approved_at_utc,subreddit,selftext,author_fullname,saved,mod_reason_title,gilded,clicked,title,link_flair_richtext,...,permalink,parent_whitelist_status,stickied,url,subreddit_subscribers,created_utc,num_crossposts,media,is_video,author_cakeday
0,,depression,We understand that most people who reply immed...,t2_1t70,False,,0,False,Our most-broken and least-understood rules is ...,[],...,/r/depression/comments/doqwow/our_mostbroken_a...,no_ads,True,https://www.reddit.com/r/depression/comments/d...,648224,1.572361e+09,0,,False,
1,,depression,Welcome to /r/depression's check-in post - a p...,t2_64qjj,False,,0,False,Regular Check-In Post,[],...,/r/depression/comments/exo6f1/regular_checkin_...,no_ads,True,https://www.reddit.com/r/depression/comments/e...,648224,1.580649e+09,0,,False,
2,,depression,Even if some posts blow up and have a bit of a...,t2_5xpk5iif,False,,1,False,This sub is counterproductive,[],...,/r/depression/comments/hcco2h/this_sub_is_coun...,no_ads,False,https://www.reddit.com/r/depression/comments/h...,648224,1.592614e+09,0,,False,
3,,depression,As i go down the rabbit hole of why any of thi...,t2_564vn2mq,False,,0,False,The more depressed i get the more music i list...,[],...,/r/depression/comments/hca12w/the_more_depress...,no_ads,False,https://www.reddit.com/r/depression/comments/h...,648224,1.592605e+09,0,,False,
4,,depression,Idk if that makes sense. But I don't want to k...,t2_49njkrci,False,,0,False,"I don't wanna kill myself, I just don't wanna ...",[],...,/r/depression/comments/hbvhbr/i_dont_wanna_kil...,no_ads,False,https://www.reddit.com/r/depression/comments/h...,648224,1.592547e+09,1,,False,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
973,,depression,It's been there since my depression developed ...,t2_12anxi,False,,0,False,"There is a constant warm, pressure feeling bet...",[],...,/r/depression/comments/hbdfq3/there_is_a_const...,no_ads,False,https://www.reddit.com/r/depression/comments/h...,648227,1.592482e+09,0,,False,
974,,depression,"Hey everyone, I'm very interested in giving th...",t2_remklmg,False,,0,False,Affordable Ketamine infusion therapy in US?,[],...,/r/depression/comments/hbap0s/affordable_ketam...,no_ads,False,https://www.reddit.com/r/depression/comments/h...,648227,1.592469e+09,0,,False,
975,,depression,I have been suicidal and depressive for a huge...,t2_676q9zcd,False,,0,False,Starting to believe my own delusions,[],...,/r/depression/comments/hbha82/starting_to_beli...,no_ads,False,https://www.reddit.com/r/depression/comments/h...,648227,1.592496e+09,0,,False,
976,,depression,It’s 1am and I just realized life isn’t even w...,t2_6pcvheta,False,,0,False,its 1am and I don’t even care anymore I’m gonn...,[],...,/r/depression/comments/hcgu1m/its_1am_and_i_do...,no_ads,False,https://www.reddit.com/r/depression/comments/h...,648227,1.592633e+09,0,,False,


There seems to be a difference in the number of columns between `dom_df` and `def_df` which I will be exploring further.

In [214]:
# Identify difference in columns
dom_df.columns.difference(dep_df.columns)

Index(['link_flair_template_id', 'post_hint', 'preview', 'thumbnail_height',
       'thumbnail_width'],
      dtype='object')

In [223]:
# Review the columns listed above
dom_df.loc[:,['link_flair_template_id', 'post_hint', 'preview', \
              'thumbnail_height','thumbnail_width']]

Unnamed: 0,link_flair_template_id,post_hint,preview,thumbnail_height,thumbnail_width
0,,,,,
1,,,,,
2,,,,,
3,,,,,
4,a7a37eae-5cda-11ea-bcf3-0ec4f896afb1,,,,
...,...,...,...,...,...
991,,,,,
992,,,,,
993,4ff3f752-54f7-11e3-b490-12313b06ec01,,,,
994,,,,,


In [226]:
# Print out null value counts for the 5 extra columms

print(dom_df['link_flair_template_id'].isnull().value_counts(),'\n')
print(dom_df['post_hint'].isnull().value_counts(),'\n')
print(dom_df['preview'].isnull().value_counts(),'\n')
print(dom_df['thumbnail_height'].isnull().value_counts(),'\n')
print(dom_df['thumbnail_width'].isnull().value_counts())

True     760
False    236
Name: link_flair_template_id, dtype: int64 

True     990
False      6
Name: post_hint, dtype: int64 

True     990
False      6
Name: preview, dtype: int64 

True    996
Name: thumbnail_height, dtype: int64 

True    996
Name: thumbnail_width, dtype: int64


The extra 5 columns in `dom_df` don't seem to have any relevant information and are mostly nulls. It's likely that they won't be used for the our classifier and I'll be dropping them next.

In [239]:
# Drop columns
dom_df.drop(['link_flair_template_id', 'post_hint', 'preview', \
              'thumbnail_height','thumbnail_width'], axis=1, inplace=True)
dom_df.head()

Unnamed: 0,approved_at_utc,subreddit,selftext,author_fullname,saved,mod_reason_title,gilded,clicked,title,link_flair_richtext,...,permalink,parent_whitelist_status,stickied,url,subreddit_subscribers,created_utc,num_crossposts,media,is_video,author_cakeday
0,,domesticviolence,We know many of you are struggling to manage w...,t2_2egrzrvq,False,,0,False,COVID-19 RESOURCES FOR ABUSE VICTIMS,[],...,/r/domesticviolence/comments/fsrd59/covid19_re...,,True,https://www.reddit.com/r/domesticviolence/comm...,10658,1585710000.0,1,,False,
1,,domesticviolence,Not sure if what I'm experiencing could be ver...,t2_6zd0zopf,False,,0,False,Is this verbal or emotional abuse or am I over...,[],...,/r/domesticviolence/comments/hcbfp4/is_this_ve...,,False,https://www.reddit.com/r/domesticviolence/comm...,10658,1592610000.0,0,,False,
2,,domesticviolence,,t2_3frxp7eu,False,,0,False,I’m very interested in this sub. Is it okay fo...,[],...,/r/domesticviolence/comments/hbq0hr/im_very_in...,,False,https://www.reddit.com/r/domesticviolence/comm...,10658,1592525000.0,0,,False,
3,,domesticviolence,I had a case of sexual assault and domestic v...,t2_5d7wihfd,False,,0,False,Restraining order expired and he is posting vi...,[],...,/r/domesticviolence/comments/hbt42e/restrainin...,,False,https://www.reddit.com/r/domesticviolence/comm...,10658,1592537000.0,0,,False,
4,,domesticviolence,\nThis is my life story.\n\nI am 30 year old f...,t2_1cerrsr4,False,,0,False,8 years of abusive marriage,[],...,/r/domesticviolence/comments/hbjyoc/8_years_of...,,False,https://www.reddit.com/r/domesticviolence/comm...,10658,1592504000.0,0,,False,


### 1.3 Export Data

In [None]:
dom_df.to_csv('../data/suicide_watch.csv', index = False)
dep_df.to_csv('../data/depression.csv', index = False)