# 0. Script for extracting Reddit data

## Set Up

In [7]:
import numpy as np
import pandas as pd
import requests
from time import sleep
import datetime

## Extracting submissions (posts)

To standardise the content that users will be responding to in their comments, I will be extracting posts that have the **same content on both Reddit and Facebook**. These posts must meet the following conditions:
- Contain only **links** to Straits Times and Channel News Asia articles reporting on official Ministry of Health (MOH) announcements of COVID-19 restrictions in Singapore. There should be no further discussion about the article in the post.
- The linked articles should only be about **imposition/tightening** of restrictions and not loosening of the restrictions, and should only report the announcement of the restrictions (i.e. no opinion pieces or follow-up comments by ministers).

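Querying the Pushshift Reddit API, I searched for posts from May 2021 to May 2022 that contained at least one of the following search terms in the title: 'covid', 'phase' (as Singapore's restrictions were announced as part of Phases), 'measures', 'restrictions', 'suspended' (some restrictions involved suspension of visits to medical institutions). Note that the cell below queries a single term ('covid') through the `title` parameter, so it has to be re-run with `title` set to each of the other terms to cover the full list.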

In [None]:
# Collect r/singapore submissions whose title matches the search term by querying
# Pushshift in consecutive 5-day windows, then stack the per-window results into
# a single DataFrame (`res_dfs`) with a fresh 0..n-1 index.
#
# Only the 'covid' term is queried here; to cover another search term the
# 'title' value below has to be changed and the cell re-run.
url = 'https://api.pushshift.io/reddit/search/submission/'
window_length = datetime.timedelta(days=5)
window_start = datetime.datetime(2021, 5, 1, 0, 0, 0)
# The last window *starts* on this date (inclusive), so it ends 5 days later.
last_window_start = datetime.datetime(2022, 5, 1, 0, 0, 0)
page_size = 100

window_frames = []
while window_start <= last_window_start:
    window_end = window_start + window_length

    # requests serialises the datetime bounds with str().
    # NOTE(review): confirm Pushshift accepts that format rather than epoch seconds.
    query = {'subreddit': 'singapore',
             'after': window_start,
             'before': window_end,
             'title': 'covid',
             'num_comments': '>0',
             'fields': ['id', 'author', 'title', 'num_comments', 'score', 'url', 'full_link', 'created_utc'],
             'size': page_size,
             'sort': 'asc',
             'sort_type': 'created_utc'}
    response = requests.get(url=url, params=query, timeout=60)
    # Fail loudly on rate-limit / server errors instead of on a confusing
    # JSON-decode or KeyError further down.
    response.raise_for_status()
    posts = response.json()['data']

    # A full page means the window may hold more posts than were returned.
    if len(posts) >= page_size:
        print(f'Warning: window {window_start} - {window_end} returned {len(posts)} posts; '
              'results may be truncated')

    window_frames.append(pd.DataFrame(posts))
    window_start = window_end

    # Pause between requests to stay within the API's rate limit.
    sleep(10)

res_dfs = pd.concat(window_frames, ignore_index=True)

In [None]:
# Keep only posts that drew at least one comment and whose link points to a
# Channel News Asia or Straits Times article (missing URLs count as no match).
has_comments = res_dfs['num_comments'] > 0
links_to_outlet = res_dfs['url'].str.contains('channelnewsasia|straitstimes|cna', na=False)
res_selected = res_dfs[has_comments & links_to_outlet]

In [None]:
# Persist the filtered posts so they can be screened manually against the selection criteria.
res_selected.to_csv('../data/posts_reddit_covid_all.csv')

## Extracting comments

After extracting all the Reddit posts with the designated search terms, I manually looked through all of the post titles and selected only those which met the aforementioned criteria - posts that only contained links to articles reporting on official MOH announcements of tightening of restrictions.

I then found links to the corresponding Facebook posts from official Straits Times and Channel News Asia Facebook pages that contained the same article as the Reddit posts. The Reddit post data and the corresponding Facebook post links for each Reddit post were saved as `posts_reddit_fb_selected.csv`.

### Import selected reddit posts data

In [None]:
# Load the manually curated posts (Reddit post data plus the matching Facebook post link).
reddit_selected = pd.read_csv('../data/posts_reddit_fb_selected.csv')

In [3]:
# Render the selected posts as the cell output for a visual check.
reddit_selected

Unnamed: 0,author,id,num_comments,score,selftext,title,url,created_sgt,full_link_reddit,full_link_fb
0,chailoren,n4li0v,923,1,,Singapore to cut social gathering size from 8 ...,https://www.straitstimes.com/singapore/health/...,4/5/2021 18:59,https://www.reddit.com/r/singapore/comments/n4...,https://www.facebook.com/TheStraitsTimes/posts...
1,Fawx13x,n4li5g,28,1,,"Cap of 5 people for social gatherings, househo...",https://www.channelnewsasia.com/singapore/cap-...,4/5/2021 18:59,https://www.reddit.com/r/singapore/comments/n4...,https://www.facebook.com/ChannelNewsAsia/posts...
2,485320,n52529,45,1,,Limit on employees who can return to workplace...,https://www.straitstimes.com/singapore/limit-o...,5/5/2021 8:00,https://www.reddit.com/r/singapore/comments/n5...,https://www.facebook.com/TheStraitsTimes/posts...
3,hahohehuhi,n61hcv,22,1,,COVID-19: Indoor sports facilities to close te...,https://www.channelnewsasia.com/news/singapore...,6/5/2021 15:09,https://www.reddit.com/r/singapore/comments/n6...,https://www.facebook.com/ChannelNewsAsia/posts...
4,Fawx13x,nc0vau,11,1,,"Group sizes down from 5 to 2, dining-in suspen...",https://www.channelnewsasia.com/news/singapore...,14/5/2021 13:06,https://www.reddit.com/r/singapore/comments/nc...,https://www.facebook.com/ChannelNewsAsia/posts...
5,sexyhades69,nc0veq,2,1,,"Group sizes down from 5 to 2, dining-in suspen...",https://www.channelnewsasia.com/news/singapore...,14/5/2021 13:07,https://www.reddit.com/r/singapore/comments/nc...,https://www.facebook.com/ChannelNewsAsia/posts...
6,shady-memes_v13,nc0vwe,1684,1,,"No dining in, social gatherings capped at 2 pe...",https://www.straitstimes.com/singapore/health/...,14/5/2021 13:08,https://www.reddit.com/r/singapore/comments/nc...,https://www.facebook.com/TheStraitsTimes/posts...
7,caifanconnoisseur,nc0wni,7,1,,"Only 2 visitors per household per day, no dini...",https://www.straitstimes.com/singapore/health/...,14/5/2021 13:09,https://www.reddit.com/r/singapore/comments/nc...,https://www.facebook.com/TheStraitsTimes/posts...
8,Raphiel_Shiraha_Ains,nc0y1s,1,1,,"Group sizes down from 5 to 2, dining-in suspen...",https://www.channelnewsasia.com/news/singapore...,14/5/2021 13:11,https://www.reddit.com/r/singapore/comments/nc...,https://www.facebook.com/ChannelNewsAsia/posts...
9,SMJLeo,ncctqd,43,1,,Fixed seating with one-metre spacing for reces...,https://www.straitstimes.com/singapore/fixed-s...,15/5/2021 0:34,https://www.reddit.com/r/singapore/comments/nc...,https://www.facebook.com/TheStraitsTimes/posts...


### Get comment IDs for each post

Having obtained the post ID for each post, I then queried the Pushshift API to obtain all the comment IDs across all selected posts, so that I could later use the comment ID to obtain the comment text.

In [None]:
# Build a long-format table pairing each selected post ID with the IDs of all
# of its comments, using one Pushshift request per post.
comment_ids = {'post_id': [], 'comment_id': []}

for post_id in reddit_selected['id']:
    url = 'https://api.pushshift.io/reddit/submission/comment_ids/' + post_id
    response = requests.get(url=url, timeout=60)
    # Stop on rate-limit / server errors rather than failing later on a
    # JSON-decode or KeyError that hides the real cause.
    response.raise_for_status()
    ids_for_post = response.json()['data']

    # Repeat the post ID once per comment so both columns stay the same length.
    comment_ids['post_id'].extend([post_id] * len(ids_for_post))
    comment_ids['comment_id'].extend(ids_for_post)

    # Throttle requests, as the other Pushshift loops in this notebook do.
    sleep(2)

comment_ids_df = pd.DataFrame(comment_ids)

In [None]:
# Render the post ID / comment ID table as the cell output.
comment_ids_df

In [None]:
# Save the comment IDs so the comment text can be fetched later without repeating this crawl.
comment_ids_df.to_csv('../data/comments_reddit_ids.csv')

### Get comments by comment ID

Using the extracted comment IDs, I queried the Pushshift API to obtain the corresponding comment text. Due to query limits, the 9000+ comments were extracted 1000 at a time, then concatenated into a single dataframe at the end.

In [2]:
# Reload the saved comment IDs; index_col=0 restores the index column written by to_csv.
comment_ids_df = pd.read_csv('../data/comments_reddit_ids.csv', index_col=0)
# Preview the first rows as the cell output.
comment_ids_df.head()

Unnamed: 0,post_id,comment_id
0,n4li0v,gww5drd
1,n4li0v,gww5jfz
2,n4li0v,gww68ne
3,n4li0v,gww6icr
4,n4li0v,gww6luf


In [8]:
# Number of (post, comment) pairs; the row count bounds the extraction loop below.
comment_ids_df.shape

(9692, 2)

In [10]:
# Fetch the text of one chunk of comments (rows chunk_start..chunk_stop-1 of
# comment_ids_df), one Pushshift request per comment ID, then save and display
# the chunk. This run covers the final chunk: 9692 is the number of rows in
# comment_ids_df.
#
# NOTE(review): the concatenation cell reads chunk 9 from
# '../data/comments_reddit.csv' (and the earlier chunks from
# 'comments_reddit_<n>.csv'), not from the path written here - confirm the
# file is renamed before that step.
chunk_start = 8000
chunk_stop = 9692

comments = {'comment_id': [], 'text': []}
url = 'https://api.pushshift.io/reddit/search/comment/'

for c_id in comment_ids_df['comment_id'].iloc[chunk_start:chunk_stop]:
    response = requests.get(url=url, params={'ids': c_id}, timeout=60)
    # Surface rate-limit / server errors here instead of as a JSON-decode or
    # KeyError below.
    response.raise_for_status()
    matches = response.json()['data']

    # An empty 'data' list (no match for this ID) is recorded as an empty
    # string so every comment ID keeps a row.
    text = matches[0]['body'] if matches else ''

    comments['comment_id'].append(str(c_id))
    comments['text'].append(str(text))

    # Pause after every request to stay within the API's rate limit.
    sleep(2)

# Build the frame once, after the loop, rather than after every comment.
comments_df_9 = pd.DataFrame(comments)

comments_df_9.to_csv('../data/reddit_comments_9.csv', encoding='utf-8-sig')
display(comments_df_9)

Unnamed: 0,comment_id,text
0,he3f96h,"Look at the fortresses like Australia, NZ, Tai..."
1,he3fesl,[removed]
2,he3fi6x,I'm ready for more shots! Poke me all you want!
3,he3fjy1,Whats the point though? What will this actuall...
4,he3fmz8,“All virus are good virus “
...,...,...
1687,hhck2rw,Are you fucking kidding me
1688,hhck41n,Glad our 2pax dine in restrictions are startin...
1689,hhck45v,Please tell me it is a joke
1690,hhckboo,OYK master🅱️lan


### Concatenating all comments

In [12]:
# Load the nine saved comment chunks; index_col=0 restores the index column written by to_csv.
# Chunks 1-8 are presumably produced by earlier runs of the extraction cell with
# different start offsets (those runs are not shown in this notebook).
# NOTE(review): chunk 9 is read from 'comments_reddit.csv', whereas the extraction
# cell above writes its output to '../data/reddit_comments_9.csv' - confirm the file
# names line up before re-running.
comments_df_1 = pd.read_csv('../data/comments_reddit_1.csv', index_col=0)
comments_df_2 = pd.read_csv('../data/comments_reddit_2.csv', index_col=0)
comments_df_3 = pd.read_csv('../data/comments_reddit_3.csv', index_col=0)
comments_df_4 = pd.read_csv('../data/comments_reddit_4.csv', index_col=0)
comments_df_5 = pd.read_csv('../data/comments_reddit_5.csv', index_col=0)
comments_df_6 = pd.read_csv('../data/comments_reddit_6.csv', index_col=0)
comments_df_7 = pd.read_csv('../data/comments_reddit_7.csv', index_col=0)
comments_df_8 = pd.read_csv('../data/comments_reddit_8.csv', index_col=0)
comments_df_9 = pd.read_csv('../data/comments_reddit.csv', index_col=0)

In [13]:
# Stack the nine chunks row-wise, in chunk order, and renumber the rows 0..n-1.
comments_df_list = [
    comments_df_1,
    comments_df_2,
    comments_df_3,
    comments_df_4,
    comments_df_5,
    comments_df_6,
    comments_df_7,
    comments_df_8,
    comments_df_9,
]
comments_dfs = pd.concat(objs=comments_df_list, ignore_index=True)

In [14]:
# Sanity check: the row count should equal the number of comment IDs (9692).
comments_dfs.shape

(9692, 2)

In [15]:
# Write the combined comments; 'utf-8-sig' adds a BOM to the output file.
comments_dfs.to_csv('../data/comments_reddit_all.csv', encoding='utf-8-sig')