Import required libraries:

In [None]:
import requests
import json
import bs4
import os
import threading
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

### Set SCRAPE=True in order to download the data from FB

In [None]:
# Master switch for the scraping cell: the scraper threads are started only
# when this value is truthy.
SCRAPE = True

Define two helper functions: one reads a JSON file and returns an empty list if its content is not valid JSON, the other appends an item to the list stored in a JSON file:

In [None]:
def read_json(path):
    """Load and return the JSON document stored in the file at *path*.

    Args:
        path: Path of the file to read.

    Returns:
        The decoded JSON value, or an empty list when the file content is
        not valid JSON (for example, when the file is empty).

    Raises:
        FileNotFoundError: If *path* does not exist.
    """
    # The context manager closes the handle deterministically, even when the
    # content later fails to decode.
    with open(path, "r") as f:
        content = f.read()
    try:
        return json.loads(content)
    except json.decoder.JSONDecodeError:
        return []

def json_append(path, post):
    """Append *post* to the JSON list stored in the file at *path*.

    The current content is loaded with ``read_json``, *post* is added at the
    end, and the whole list is serialized back over the file.

    Args:
        path: Path of the JSON file to update.
        post: JSON-serializable item to append.
    """
    items = read_json(path)
    items.append(post)
    with open(path, "w") as out:
        # Serialize after opening, so the file is rewritten in a single write.
        out.write(json.dumps(items))

Try to read "posts.json". If it does not exist, create it.

In [None]:
# Load the previously saved posts; on the first run "posts.json" does not
# exist yet, so create it empty and start from an empty list.
try:
    posts = read_json("posts.json")
except FileNotFoundError:
    # Opening in "w" mode creates the file; the context manager makes sure the
    # handle is actually closed.
    with open("posts.json", "w") as f:
        pass
    posts = []

Create a helper function to simplify API calls:

In [None]:
def get(url_):
    """Send a GET request to *url_* and return its body decoded from JSON.

    Args:
        url_: Full URL to request.

    Returns:
        The value produced by decoding the response text as JSON.
    """
    response = requests.get(url_)
    body = response.text
    return json.loads(body)

Define group ID variables:

In [None]:
# IDs of the confessions feeds that are queried through the Graph API.
# They are stored as strings because they are concatenated into request URLs.
TECHNION_CONFESSIONS_ID = "134517547222780"
TAU_CONFESSIONS_ID = "561380070875128"
IDC_CONFESSIONS_ID = "199527394120566"
HUJI_CONFESSIONS_ID = "323288791493138"
BGU_CONFESSIONS_ID = "151003595697352"

Authenticate to Graph API:

In [None]:
# A manually obtained access token can be assigned instead of requesting one
# below (never commit a real token to the notebook):
# TOKEN = "<paste an access token here>"

# NOTE(security): the app secret and app ID are committed in plain text. They
# can be overridden with the FB_APP_SECRET / FB_APP_ID environment variables;
# the literals remain only as backward-compatible fallbacks and should be
# rotated and removed.
s = os.environ.get("FB_APP_SECRET", "6cc937f2a9dbc9df92600f365c777d1a")
i = os.environ.get("FB_APP_ID", "652869818252649")
u = "https://graph.facebook.com/oauth/access_token?client_id={id}&client_secret={secret}&grant_type=client_credentials"

# Exchange the app ID and secret for an access token (client-credentials
# grant) and fail with a readable message when the API does not return one.
_auth_response = get(u.format(id=i, secret=s))
if 'access_token' not in _auth_response:
    raise RuntimeError(
        "Graph API authentication failed: {}".format(_auth_response)
    )
TOKEN = _auth_response['access_token']

Setup Graph API host and API node:

In [None]:
# Base address of the Graph API and the API version segment that follows it.
HOST = "https://graph.facebook.com/"
API_NODE = "v2.12/"

# The query string requests the plain fields below plus, for every reaction
# type, a summary-only reactions edge aliased to the lower-case reaction name.
_PLAIN_FIELDS = ["created_time", "message_tags", "message", "shares"]
_REACTION_TYPES = ("LIKE", "LOVE", "HAHA", "WOW", "SAD", "ANGRY")
_REACTION_FIELDS = [
    "reactions.type({}).limit(0).summary(1).as({})".format(kind, kind.lower())
    for kind in _REACTION_TYPES
]
QUERY = "?fields=" + ",".join(_PLAIN_FIELDS + _REACTION_FIELDS) + "&limit=10"

Declare the custom nodes to scrape - in this case, the confessions feeds of the Technion, TAU, IDC, HUJI and BGU:

In [None]:
# (node ID, label) pairs to scrape. The label is stored as the `origin` of
# every scraped item and is the base name of that feed's "<label>.json" file.
PAGES = [
    (TECHNION_CONFESSIONS_ID, "TECHNION"),
    (TAU_CONFESSIONS_ID, "TAU"),
    (IDC_CONFESSIONS_ID, "IDC"),
    (HUJI_CONFESSIONS_ID, "HUJI"),
    (BGU_CONFESSIONS_ID, "BGU"),
]

In [None]:
# Create (or truncate) one empty "<label>.json" output file per page.
# NOTE: this runs regardless of SCRAPE, so existing output files are emptied.
for _page_id, name in PAGES:
    json_name = "{}.json".format(name)
    # The page ID is not needed here; it is bound to a throwaway name so the
    # loop does not overwrite the module-level `i` that holds the app ID.
    with open(json_name, "w") as f:
        pass

Build the request URL - it should be the same for most of the calls:

In [None]:
def build_url(page_id):
    """Return the Graph API URL of the "/posts" edge of *page_id*.

    The URL is HOST + API_NODE + the page ID + "/posts", followed by the
    shared QUERY string and the access token.

    Args:
        page_id: Node ID as a string.
    """
    token_param = "&access_token={}".format(TOKEN)
    return "".join([HOST, API_NODE, page_id, "/posts", QUERY, token_param])

def build_comments_url(post_id):
    """Return the Graph API URL of the "/comments" edge of *post_id*.

    The URL is HOST + API_NODE + the post ID + "/comments", followed by the
    shared QUERY string and the access token.

    Args:
        post_id: Post ID as a string.
    """
    token_param = "&access_token={}".format(TOKEN)
    return "".join([HOST, API_NODE, post_id, "/comments", QUERY, token_param])

Create a function that scrapes a general object (post or comment):

In [None]:
def query_object(raw_object, object_type, object_origin, parent_id=None):
    """Flatten a raw API item (post or comment) into a plain dict.

    Args:
        raw_object: Decoded API item. It must contain "id", "created_time"
            and, for each of like/love/haha/wow/sad/angry, a nested
            ``["summary"]["total_count"]`` value. "message" is optional.
        object_type: Label stored under "type" (callers pass "POST" or
            "COMMENT").
        object_origin: Label stored under "origin".
        parent_id: ID of the parent object, stored under "parent_id";
            None when the object has no parent.

    Returns:
        A dict with the keys type, origin, id, parent_id, message,
        created_time, one count per reaction, and total_reactions (the sum
        of the six reaction counts).

    Raises:
        KeyError: If a required field is missing from *raw_object*.
    """
    reaction_names = ("like", "love", "haha", "wow", "sad", "angry")

    obj = {}

    obj['type'] = object_type
    obj['origin'] = object_origin

    obj['id'] = raw_object['id']
    # Use the literal key: indexing with the variable would create a key named
    # after the parent's ID (or None) instead of "parent_id".
    obj['parent_id'] = parent_id

    # Items without text simply carry no "message" field.
    obj['message'] = raw_object.get('message', '')

    obj['created_time'] = raw_object['created_time']

    for reaction in reaction_names:
        obj[reaction] = raw_object[reaction]["summary"]["total_count"]

    obj['total_reactions'] = sum(obj[reaction] for reaction in reaction_names)

    return obj

Now, create a function that will paginate through the posts of a URL and scrape them:

In [None]:
def get_posts(url, origin):
    """Scrape every page of posts starting at *url* into "<origin>.json".

    Each item under the response's "data" key is flattened with
    ``query_object`` (type "POST") and appended to the output file. The
    "paging" -> "next" link is followed until a response has none.

    Args:
        url: URL of the first page of posts.
        origin: Label stored on every post; also the base name of the
            output file.

    Returns:
        None.
    """
    json_path = "{}.json".format(origin)
    # Iterate rather than recurse: one stack frame per page would hit the
    # interpreter's recursion limit on feeds with many pages.
    while True:
        res = get(url)
        for raw_post in res.get('data', []):
            post = query_object(raw_post, "POST", origin)
            json_append(json_path, post)

        url = res.get('paging', {}).get('next')
        if not url:
            return

Create a function that will scrape a tag of a user:

In [None]:
def parse_tag(raw_tag, parent_id=None):
    """Convert a raw user tag into a flat "TAG" record.

    Args:
        raw_tag: Decoded tag object; must contain "id" and "name".
        parent_id: ID of the object the tag appeared in, stored under
            "parent_id". Defaults to None when the caller does not supply it.

    Returns:
        A dict with the keys type ("TAG"), parent_id, user_id and user_name.

    Raises:
        KeyError: If *raw_tag* lacks "id" or "name".
    """
    # The parent is taken from the argument: no comment object is in scope
    # inside this function.
    return {
        "type": "TAG",
        "parent_id": parent_id,
        "user_id": raw_tag["id"],
        "user_name": raw_tag["name"],
    }

Create a function that will call the API for each post comments, scrape them and get their tags:

In [None]:
def get_post_comments(url, post_id, post_origin):
    """Scrape every page of comments starting at *url* into "<post_origin>.json".

    Each item under the response's "data" key is flattened with
    ``query_object`` (type "COMMENT", parent *post_id*) and appended to the
    output file, followed by one record per entry of its "message_tags".
    The "paging" -> "next" link is followed until a response has none.

    Args:
        url: URL of the first page of comments.
        post_id: ID of the post the comments belong to.
        post_origin: Label stored on every comment; also the base name of
            the output file.

    Returns:
        None.
    """
    json_path = "{}.json".format(post_origin)
    # Iterate rather than recurse so long comment threads cannot exhaust the
    # interpreter's recursion limit.
    while True:
        res = get(url)
        for raw_comment in res.get('data', []):
            comment = query_object(raw_comment, "COMMENT", post_origin, parent_id=post_id)
            json_append(json_path, comment)

            # `.get` yields None when the comment has no "message_tags" key;
            # fall back to an empty list so the loop is simply skipped.
            for raw_tag in raw_comment.get("message_tags") or []:
                tag = parse_tag(raw_tag)
                # parse_tag receives only the tag itself, so the owning
                # comment is recorded here.
                tag["parent_id"] = raw_comment["id"]
                json_append(json_path, tag)

        url = res.get('paging', {}).get('next')
        if not url:
            return

And now, create a function that will scrape all of the posts' comments and comment tags:

In [None]:
def get_comments(path):
    """Scrape the comments of every item stored in the JSON file at *path*.

    The file is read once up front; each item's "id" and "origin" are then
    passed to ``get_post_comments``.

    Args:
        path: Path of the JSON file holding the scraped posts.
    """
    stored_posts = read_json(path)
    for entry in stored_posts:
        entry_id = entry["id"]
        get_post_comments(build_comments_url(entry_id), entry_id, entry["origin"])

I will use threading to query each Facebook Group in a different thread - this is the worker function:

In [None]:
def group_scraper_worker(node, name):
    """Scrape one feed: first its posts, then the comments of those posts.

    Args:
        node: ID of the node whose posts are requested.
        name: Label of the feed; its output file is "<name>.json".
    """
    output_path = "{}.json".format(name)
    first_page = build_url(node)
    get_posts(first_page, name)
    get_comments(output_path)

If SCRAPE is set, the script will start the threads (on Windows, stop them using `taskkill /f /im python.exe`):

In [None]:
# Launch one scraper thread per entry of PAGES; the threads are kept in
# `threads` and are not joined here.
if SCRAPE:
    threads = [
        threading.Thread(target=group_scraper_worker, args=(node, name))
        for node, name in PAGES
    ]
    for t in threads:
        t.start()

Show the head of the posts table:

In [None]:
# df = pd.read_json(open("posts.json", "r").read())

In [None]:
# ID_REGEX = "(#\d+|\d+#){1}"
# df["id"] = df["message"].str.findall(ID_REGEX).str.get(0)
# df["reply_to"] = df["message"].str.findall(ID_REGEX).str.get(1)
# df.head()

In [None]:
# df.groupby(["origin"]).agg(["count"])["id"]

Get the IDs

Get ID of the posts that the post is replying to

Get the posts likes

Get the comments of the posts

Get the likes for each comment

Get the replies for each comment

Get the name of the commenter (for each comment or reply)

Get the commenter gender

Get the posts a user reacted to or commented to

a list of the users commented or reacted