# 1. Script for extracting FB data

## Set Up

In [1]:
from facebook_scraper import get_posts
import numpy as np
import pandas as pd
import requests
from time import sleep
import datetime

In [2]:
# Show text columns in full (no truncation) when DataFrames are displayed.
pd.options.display.max_colwidth = None

## Extracting comments

### Import selected FB posts data

As mentioned in the script to extract Reddit data, I was only interested in extracting posts from Facebook and Reddit that contained the same article from Straits Times/Channel News Asia reporting on Ministry of Health (MOH) announcements about imposition/tightening of restrictions. The links to Facebook posts with matching Reddit posts were manually input into `posts_reddit_fb_selected.csv`.

In [3]:
# Load the manually curated table that pairs Reddit posts with Facebook post links.
fb_selected = pd.read_csv(filepath_or_buffer='../data/posts_reddit_fb_selected.csv')

In [4]:
# Display the loaded table to check its columns and contents.
fb_selected

Unnamed: 0,author,id,num_comments,title,url,created_sgt,full_link_reddit,full_link_fb
0,chailoren,n4li0v,923,Singapore to cut social gathering size from 8 to 5 amid rising Covid-19 cases; effective May 8-30,https://www.straitstimes.com/singapore/health/singapore-to-cut-social-gathering-size-from-8-to-5-amid-rising-covid-19-cases,4/5/2021 18:59,https://www.reddit.com/r/singapore/comments/n4li0v/singapore_to_cut_social_gathering_size_from_8_to/,https://www.facebook.com/TheStraitsTimes/posts/10157877541112115
1,Fawx13x,n4li5g,28,"Cap of 5 people for social gatherings, household visits to return as Singapore tightens COVID-19 measures",https://www.channelnewsasia.com/singapore/cap-of-5-people-social-gatherings-household-visits-covid-19-moh-1344191,4/5/2021 18:59,https://www.reddit.com/r/singapore/comments/n4li5g/cap_of_5_people_for_social_gatherings_household/,https://www.facebook.com/ChannelNewsAsia/posts/10158274255327934
2,485320,n52529,45,Limit on employees who can return to workplace back at 50%; firms urged to adhere to tighter Covid-19 rules,https://www.straitstimes.com/singapore/limit-on-employees-who-can-return-to-workplace-back-at-50-as-covid-19-measures-are,5/5/2021 8:00,https://www.reddit.com/r/singapore/comments/n52529/limit_on_employees_who_can_return_to_workplace/,https://www.facebook.com/TheStraitsTimes/posts/10157878977002115
3,hahohehuhi,n61hcv,22,"COVID-19: Indoor sports facilities to close temporarily, outdoor exercise classes to continue with reduced capacity",https://www.channelnewsasia.com/news/singapore/indoor-sports-facilities-close-outdoor-classes-allowed-covid-19-14754552,6/5/2021 15:09,https://www.reddit.com/r/singapore/comments/n61hcv/covid19_indoor_sports_facilities_to_close/,https://www.facebook.com/ChannelNewsAsia/posts/10158278202967934
4,Fawx13x,nc0vau,11,"Group sizes down from 5 to 2, dining-in suspended as Singapore tightens COVID-19 measures",https://www.channelnewsasia.com/news/singapore/covid-19-phase-2-dining-in-work-from-home-tightened-measures-1365476,14/5/2021 13:06,https://www.reddit.com/r/singapore/comments/nc0vau/group_sizes_down_from_5_to_2_diningin_suspended/,https://www.facebook.com/ChannelNewsAsia/posts/10158295565437934
5,sexyhades69,nc0veq,2,"Group sizes down from 5 to 2, dining-in suspended as Singapore tightens COVID-19 measures",https://www.channelnewsasia.com/news/singapore/covid-19-phase-2-dining-in-work-from-home-tightened-measures-14808382,14/5/2021 13:07,https://www.reddit.com/r/singapore/comments/nc0veq/group_sizes_down_from_5_to_2_diningin_suspended/,https://www.facebook.com/ChannelNewsAsia/posts/10158295565437934
6,shady-memes_v13,nc0vwe,1684,"No dining in, social gatherings capped at 2 people from May 16 as S'pore tightens Covid-19 rules",https://www.straitstimes.com/singapore/health/no-dining-in-social-gatherings-capped-at-2-people-from-may-16-as-spore-tightens,14/5/2021 13:08,https://www.reddit.com/r/singapore/comments/nc0vwe/no_dining_in_social_gatherings_capped_at_2_people/,https://www.facebook.com/TheStraitsTimes/posts/10157899793777115
7,caifanconnoisseur,nc0wni,7,"Only 2 visitors per household per day, no dining-in allowed: Covid-19 rules in S'pore from May 16",https://www.straitstimes.com/singapore/health/only-2-visitors-per-household-per-day-no-dining-in-allowed-covid-19-rules-in-spore,14/5/2021 13:09,https://www.reddit.com/r/singapore/comments/nc0wni/only_2_visitors_per_household_per_day_no_diningin/,https://www.facebook.com/TheStraitsTimes/posts/10157899892712115
8,Raphiel_Shiraha_Ains,nc0y1s,1,"Group sizes down from 5 to 2, dining-in suspended as Singapore tightens COVID-19 measures - CNA",https://www.channelnewsasia.com/news/singapore/covid-19-phase-2-dining-in-work-from-home-tightened-measures-14808382,14/5/2021 13:11,https://www.reddit.com/r/singapore/comments/nc0y1s/group_sizes_down_from_5_to_2_diningin_suspended/,https://www.facebook.com/ChannelNewsAsia/posts/10158295565437934
9,SMJLeo,ncctqd,43,"Fixed seating with one-metre spacing for recess, no intermingling, as MOE tightens measures to fight Covid-19",https://www.straitstimes.com/singapore/fixed-seating-with-one-metre-spacing-for-recess-no-intermingling-as-moe-tightens-measures,15/5/2021 0:34,https://www.reddit.com/r/singapore/comments/ncctqd/fixed_seating_with_onemetre_spacing_for_recess_no/,https://www.facebook.com/TheStraitsTimes/posts/10157900823922115


Since there are repeated links in the `full_link_fb` column, I will get the set of unique links and output it as a list.

In [6]:
# Collapse repeated Facebook URLs (first occurrence wins, original order kept)
# into a plain Python list, then report how many distinct posts remain.
fb_links = fb_selected['full_link_fb'].drop_duplicates().tolist()
print(len(fb_links))

28


### Get comments for each post

I used the [facebook-scraper Python package](https://pypi.org/project/facebook-scraper/) to extract all the comments from each post using the URL for the post.

To better understand how the results were structured, I extracted comments from the first post in the list:

In [None]:
# Optional debugging aid (currently disabled): uncomment the lines below to turn on
# DEBUG-level logging for facebook_scraper and send log records to logs.txt.
# import json
# import logging

# from facebook_scraper import get_posts, enable_logging

# enable_logging(logging.DEBUG)
# logging.basicConfig(filename="logs.txt", filemode='w', level=logging.DEBUG)

In [8]:
# Fetch only the first selected post to see how the scraper structures its result.
preview_options = {'comments': True, 'allow_extra_requests': False, 'posts_per_page': 200}
posts = get_posts(
    post_urls=[fb_links[0]],
    cookies='../data/cookies_fb.txt',
    options=preview_options,
)

for post in posts:
    # Comment count reported for the post
    print(post['comments'])
    # Number of comment records actually returned
    print(len(post['comments_full']))
    # Timestamp attached to the post
    print(post['time'])

502
250
2021-05-04 06:59:47


From the above we can see that:
- According to the `comments` key, there are 502 comments for the first post.
- Data on each comment is stored as nested JSON under the `comments_full` key. There are only 250 comments in `comments_full`, so these are top-level comments; the remaining comments are replies nested under the `replies` key of each comment in `comments_full`.
- Comment replies have to be extracted by iterating through each comment in `comments_full`, but this would be extremely time-consuming and tricky: Facebook has tight restrictions on scraping behaviour that make it necessary to introduce long sleep times between each comment/reply extraction to prevent account banning. As such, I will **not** extract comment replies and will only extract top-level comments.

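The following code extracts comments from the 28 Facebook posts in batches of up to 5 posts, with each batch exported to its own CSV file. The cell as shown below processes the final batch (list indices 25 to 27).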

In [18]:
# Scrape the top-level comments for ONE batch of posts from `fb_links`.
#
# To process another batch, change BATCH_START / BATCH_END and re-run this cell;
# `comments` is reset on every run, so export it before moving to the next batch.
BATCH_START = 25            # index in fb_links of the first post in this batch
BATCH_END = 28              # one past the index of the last post in this batch
COMMENT_PAUSE_SECONDS = 3   # pause after each comment to keep the request rate low
POST_PAUSE_SECONDS = 30     # longer pause after each post before starting the next

# Column-oriented accumulator: one list entry per extracted comment.
comments = {'comment_id': [], 'text': [], 'post_time': []}

# Slicing (rather than indexing with a hand-incremented counter) walks exactly the
# posts in the batch and simply stops early if the list is shorter than BATCH_END.
for post_url in fb_links[BATCH_START:BATCH_END]:
    for post in get_posts(post_urls=[post_url],
                          cookies='../data/cookies_fb.txt',
                          timeout=180,
                          options={'comments': 'generator', 'progress': True, 'allow_extra_requests': False, 'posts_per_page': 200}):

        # Only the entries of `comments_full` are read; nested replies are skipped.
        for comment in post['comments_full']:
            comments['comment_id'].append(str(comment['comment_id']))
            comments['text'].append(str(comment['comment_text']))
            # `post_time` is the timestamp of the parent post, not of the comment.
            comments['post_time'].append(post['time'])
            sleep(COMMENT_PAUSE_SECONDS)

    sleep(POST_PAUSE_SECONDS)

  0%|          | 0/17 [00:00<?, ?it/s]

  0%|          | 0/588 [00:00<?, ?it/s]

  0%|          | 0/272 [00:00<?, ?it/s]

In [20]:
# Turn this batch's accumulated comments into a DataFrame and save it as CSV.
comments_df_6 = pd.DataFrame.from_dict(comments)
comments_df_6.to_csv(path_or_buf='../data/comments_fb_6.csv', encoding='utf-8-sig')

In [21]:
# Preview the first rows of the batch, followed by its (rows, columns) shape.
display(comments_df_6.head(), comments_df_6.shape)

Unnamed: 0,comment_id,text,post_time
0,10158154349842115,How about the non graduating classes? Because they may have seniors at home?,2021-09-30 04:45:01
1,10158154323852115,Sebastian Wong,2021-09-30 04:45:01
2,10158154338022115,"AGAIN, for the nth time, and I will continue to do so:\n\nVaccination is not to make you IMMORTAL.\nVaccination does not PREVENT infection or death.\nIt’s to REDUCE your potential of infection/\ndeath.",2021-09-30 04:45:01
3,10158154412872115,How about secondary schools? We seem to have neglected this group.,2021-09-30 04:45:01
4,10158154350047115,https://\nmothership.sg/\n2021/09/\nseniors-stay-hom\ne-covid-19/,2021-09-30 04:45:01


(650, 3)

### Concatenating all comments

In [22]:
# Reload all six per-batch comment files, using the first CSV column as the index.
(
    comments_df_1,
    comments_df_2,
    comments_df_3,
    comments_df_4,
    comments_df_5,
    comments_df_6,
) = (
    pd.read_csv(f'../data/comments_fb_{batch_no}.csv', index_col=0)
    for batch_no in range(1, 7)
)

In [23]:
# Stack the six batch frames row-wise into one frame with a fresh 0..n-1 index.
comments_df_list = [
    comments_df_1,
    comments_df_2,
    comments_df_3,
    comments_df_4,
    comments_df_5,
    comments_df_6,
]
comments_dfs = pd.concat(objs=comments_df_list, axis='index', ignore_index=True)

In [24]:
# Discard rows whose comment text is missing (NaN); the remaining rows keep their index labels.
comments_dfs = comments_dfs.dropna(subset=['text'])

In [25]:
# Check the (rows, columns) shape of the combined comments frame.
comments_dfs.shape

(6607, 3)

In [26]:
# Write the combined comments to a single CSV file (UTF-8 with a byte-order mark).
comments_dfs.to_csv(path_or_buf='../data/comments_fb_all.csv', encoding='utf-8-sig')