Import required libraries:

In [None]:
import requests
import json
import bs4
import os
import sqlite3
import threading

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

### Set SCRAPE=True in order to download the data from FB

In [None]:
# When True, posts are downloaded through the API; when False, they are loaded from a local CSV file instead.
SCRAPE = True

# File name for the JSON dump of scraped items.
UNIFIED_JSON_PATH = "posts.json"
# A progress line is printed whenever the number of collected items is a multiple of this value.
LOGGING_INTERVAL = 250

Define helper functions that read a JSON file (returning an empty list if its content cannot be parsed as JSON) and append a post to it:

In [None]:
def read_json(path):
    """Load and return the JSON document stored at ``path``.

    Args:
        path: Location of the JSON file to read.

    Returns:
        The decoded JSON content, or an empty list when the file does not
        exist or its content is not valid JSON (so that ``json_append`` can
        start a new file from scratch).
    """
    try:
        # The context manager guarantees the handle is closed even if parsing fails.
        with open(path, "r") as f:
            return json.load(f)
    except (FileNotFoundError, json.JSONDecodeError):
        return []

def json_append(path, post):
    """Append ``post`` to the JSON list stored at ``path`` and rewrite the file.

    The current content is loaded with ``read_json``, extended with ``post``
    and then written back, replacing the previous file content.
    """
    records = read_json(path)
    records.append(post)
    with open(path, "w") as out_file:
        out_file.write(json.dumps(records))

Create a supplemental function to simplify API calls:

In [None]:
def get(url_, timeout=30):
    """Perform a GET request against ``url_`` and return the decoded JSON body.

    Args:
        url_: Full request URL, including its query string.
        timeout: Seconds to wait for the server before ``requests`` raises a
            timeout error. ``requests`` applies no timeout by default, so
            without it a stalled connection would block the calling thread
            indefinitely.

    Returns:
        The response body parsed with ``json.loads``.
    """
    response = requests.get(url_, timeout=timeout)
    return json.loads(response.text)

Define group ID variables:

In [None]:
# Graph API node IDs of the confession pages to scrape, one per institution.
TECHNION_CONFESSIONS_ID = "134517547222780"   # Technion - Haifa
TAU_CONFESSIONS_ID = "561380070875128"        # Tel Aviv University
IDC_CONFESSIONS_ID = "199527394120566"        # Beintchumi - Herzlia
HUJI_CONFESSIONS_ID = "323288791493138"       # Hebrew University - Jerusalem
BGU_CONFESSIONS_ID = "151003595697352"        # Ben Gurion University - Beer Sheva
ARIEL_CONFESSIONS_ID = "465435713853175"      # Ariel University
HAIFA_CONFESSIONS_ID = "417201845375921"      # Haifa University
HADASSAH_CONFESSIONS_ID = "2044836585541512"  # Hadassah

Authenticate to the Graph API - it is preferable to use a manually generated token from the Graph API Explorer because it has better permissions:

In [None]:
# The access token is read from the environment so that no credential is stored
# in the notebook. Generate a token (e.g. with the Graph API Explorer) and export
# it as FB_GRAPH_API_TOKEN before starting the kernel.
TOKEN = os.environ.get("FB_GRAPH_API_TOKEN", "")

if not TOKEN:
    print("[WW] FB_GRAPH_API_TOKEN is not set - Graph API requests will not be authenticated.")

# Alternative: an app access token can be requested from
#   https://graph.facebook.com/oauth/access_token?client_id={id}&client_secret={secret}&grant_type=client_credentials
# and read from the 'access_token' key of the response. Load the app id and
# secret from the environment as well; never commit them to the notebook.

Setup Graph API host and API node:

In [None]:
# Base URL of the Graph API and the API version segment appended to it.
HOST = "https://graph.facebook.com/"
API_NODE = "v2.12/"

# Plain fields requested for every object.
_BASE_FIELDS = ["created_time", "message_tags", "message", "shares"]
# Reaction types; each one is requested as a summarised count aliased to its lower-case name.
_REACTION_TYPES = ["like", "love", "haha", "wow", "sad", "angry"]
_REACTION_FIELDS = [
    "reactions.type({}).limit(0).summary(1).as({})".format(reaction.upper(), reaction)
    for reaction in _REACTION_TYPES
]

# Query string shared by the request URLs: the field selection plus a page size of 10.
QUERY = "?fields=" + ",".join(_BASE_FIELDS + _REACTION_FIELDS) + "&limit=10"

Define the list of pages to be scraped, as (id, alias) tuples.

In [None]:
# (page_id, alias) pairs. The id is used to build the request URL and the alias
# is passed along as the `source` label of every item scraped from that page.
PAGES = [
    (TECHNION_CONFESSIONS_ID, "TECHNION"),
    (TAU_CONFESSIONS_ID, "TAU"),
    (IDC_CONFESSIONS_ID, "IDC"),
    (HUJI_CONFESSIONS_ID, "HUJI"),
    (BGU_CONFESSIONS_ID, "BGU"),
    (ARIEL_CONFESSIONS_ID, "ARIEL"),
    (HAIFA_CONFESSIONS_ID, "HAIFA"),
    (HADASSAH_CONFESSIONS_ID, "HADASSAH"),
]

Container for the scraped posts:

In [None]:
# Shared module-level accumulator: the scraping functions append post, comment and tag records to it.
posts = []
# Maps a source alias to the most recently requested page URL that still had a following page.
last_pages = {}  # This is intended for adding more posts without calling for scraped pages

Create supplemental builder functions for the API calls' URLs:

In [None]:
def build_url(page_id):
    """Return the request URL for the ``/posts`` edge of ``page_id``.

    The URL is assembled from the module-level ``HOST``, ``API_NODE``, ``QUERY``
    and ``TOKEN`` values.
    """
    token_param = "&access_token={}".format(TOKEN)
    return "".join((HOST, API_NODE, page_id, "/posts", QUERY, token_param))

def build_comments_url(post_id):
    """Return the request URL for the ``/comments`` edge of ``post_id``.

    The URL is assembled from the module-level ``HOST``, ``API_NODE``, ``QUERY``
    and ``TOKEN`` values.
    """
    # NOTE(review): this reuses the same QUERY as the posts request (including the
    # `shares` field) - confirm the API accepts that field selection for comments.
    token_param = "&access_token={}".format(TOKEN)
    return "".join((HOST, API_NODE, post_id, "/comments", QUERY, token_param))

Create a function that scrapes a general object (post or comment):

In [None]:
def query_object(raw_object, object_type, object_source, parent_id=None):
    """Flatten a raw API object (post or comment) into a flat record dict.

    Args:
        raw_object: Decoded API object. Must contain ``id``, ``created_time``
            and one ``{"summary": {"total_count": ...}}`` entry per reaction
            alias; ``message`` is optional.
        object_type: Label stored under ``type`` (e.g. "POST" or "COMMENT").
        object_source: Label stored under ``source``.
        parent_id: Id of the parent object, stored under ``parent_id``.

    Returns:
        A dict with the keys ``type``, ``source``, ``id``, ``parent_id``,
        ``message`` (empty string when absent), ``created_time``, one count per
        reaction and ``total_reactions`` (the sum of all reaction counts).

    Raises:
        KeyError: If a required key is missing from ``raw_object``.
    """
    reaction_names = ("like", "love", "haha", "wow", "sad", "angry")

    record = {
        'type': object_type,
        'source': object_source,
        'id': raw_object['id'],
        'parent_id': parent_id,
        'message': raw_object.get('message', ''),
        'created_time': raw_object['created_time'],
    }

    # Read every count before storing any of them, in the fixed reaction order.
    counts = [raw_object[name]["summary"]["total_count"] for name in reaction_names]
    record.update(zip(reaction_names, counts))
    record['total_reactions'] = sum(counts)

    return record

Now, create a function that will paginate through the posts of a URL and scrape them:

In [None]:
def get_posts(url, source):
    """Scrape every post reachable from ``url`` by following the pagination links.

    Each raw post is converted with ``query_object`` and appended to the
    module-level ``posts`` list. Pages are walked in a loop rather than
    recursively: with 10 items per page, a recursive walk would exceed Python's
    recursion limit on long feeds.

    Args:
        url: URL of the first page of posts.
        source: Alias of the page being scraped; stored on every record and
            used in the progress messages.

    Returns:
        None. Results are accumulated in ``posts`` and ``last_pages``.
    """
    while True:
        res = get(url)

        if 'data' in res:
            for raw_post in res['data']:
                posts.append(query_object(raw_post, "POST", source))
                if len(posts) % LOGGING_INTERVAL == 0:
                    print("{} is on item #{}".format(source, len(posts)))

        # Stop when the response carries no link to a following page.
        if 'paging' not in res or 'next' not in res['paging']:
            return

        # Remember the page that was just processed before moving to its successor.
        last_pages[source] = url
        url = res['paging']['next']

In [None]:
def parse_tag(raw_tag, parent_id, parent_source):
    """Convert a raw message tag into a flat "TAG" record.

    Args:
        raw_tag: Decoded tag dict; its ``id``, ``name`` and ``type`` keys are
            all optional.
        parent_id: Id of the object the tag belongs to.
        parent_source: Source label stored under ``source``.

    Returns:
        A dict with ``parent_id``, ``source``, ``type`` (always "TAG") and the
        ``tagged_user_id`` / ``tagged_user_name`` / ``tagged_user_type`` values,
        each ``None`` when the corresponding key is missing from ``raw_tag``.
    """
    return {
        "parent_id": parent_id,
        "source": parent_source,
        "type": "TAG",
        "tagged_user_id": raw_tag.get("id"),
        "tagged_user_name": raw_tag.get("name"),
        "tagged_user_type": raw_tag.get("type"),
    }

Create a function that will call the API for each post's comments, scrape them and get their tags:

In [None]:
def get_post_comments(url, post_id, post_source):
    """Scrape all comments of one post, together with the tags inside them.

    Every comment is converted with ``query_object`` (as a "COMMENT" whose
    parent is ``post_id``) and every entry of its ``message_tags`` with
    ``parse_tag`` (whose parent is the comment id). All records are appended to
    the module-level ``posts`` list. Pagination is followed in a loop rather
    than recursively, so long comment threads cannot exceed Python's recursion
    limit.

    Args:
        url: URL of the first page of comments.
        post_id: Id of the post the comments belong to.
        post_source: Source label stored on every record and used in the
            progress messages.

    Returns:
        None. Results are accumulated in ``posts``.
    """
    while True:
        res = get(url)

        if 'data' in res:
            for raw_comment in res['data']:
                comment = query_object(raw_comment, "COMMENT", post_source, parent_id=post_id)
                posts.append(comment)

                # A missing or empty tag list simply yields no tag records.
                for raw_tag in raw_comment.get("message_tags") or []:
                    posts.append(parse_tag(raw_tag, raw_comment["id"], post_source))

                if len(posts) % LOGGING_INTERVAL == 0:
                    print("{} is on item #{}".format(post_source, len(posts)))

        # Stop when the response carries no link to a following page.
        if 'paging' not in res or 'next' not in res['paging']:
            return

        url = res['paging']['next']

And now, create a function that will scrape all of the posts' comments and comment tags:

In [None]:
def chunks(l, n):
    """Return a generator over consecutive slices of ``l`` of at most ``n`` items.

    A chunk size below 1 is treated as 1. The last slice may be shorter than
    the others.
    """
    size = n if n > 1 else 1
    starts = range(0, len(l), size)
    return (l[start:start + size] for start in starts)

In [None]:
def get_comments(posts_list):
    """Scrape the comments of every record in ``posts_list``.

    Each record must provide an ``id`` and a ``source``; the comments URL is
    built from the id and handed to ``get_post_comments``.
    """
    for record in posts_list:
        record_id = record["id"]
        comments_url = build_comments_url(record_id)
        get_post_comments(comments_url, record_id, record["source"])

I will use threading to query each Facebook Group in a different thread - this is the worker function:

If SCRAPE is set, the script will run the threads (on Windows, stop them using `taskkill /f /im python.exe`):

In [None]:
if SCRAPE:

    # Scrape every page in its own thread.
    threads = []

    for node, name in PAGES:
        t = threading.Thread(target=get_posts, args=(build_url(node), name,))
        threads.append(t)
        t.start()

    # Wait until all pages have been scraped before continuing.
    for t in threads:
        t.join()

else:
    # Load previously saved records as a list of dicts - the same shape the
    # scraping functions produce. (`to_json` would return a single JSON string,
    # which breaks the later `pd.DataFrame(posts)` call.)
    posts = pd.read_csv("posts.csv").to_dict(orient='records')

In [None]:
# Save the items collected so far, using the path configured in UNIFIED_JSON_PATH
# instead of repeating the file name here.
with open(UNIFIED_JSON_PATH, "w") as f:
    f.write(json.dumps(posts))

In [None]:
if SCRAPE:

    threads = []

    print("[II] Scraping comments...")

    # Materialise the chunks before any worker starts: the workers append to
    # `posts`, and a lazily evaluated final slice could otherwise pick up
    # comment/tag records added by threads that are already running.
    post_chunks = list(chunks(posts, 20))

    for posts_list in post_chunks:
        t = threading.Thread(target=get_comments, args=(posts_list,))
        threads.append(t)
        t.start()

    # Wait for every worker; otherwise the following cells would run on a
    # `posts` list that is still being filled.
    for t in threads:
        t.join()

Create a dataframe:

In [None]:
# Build a DataFrame from the accumulated records.
df = pd.DataFrame(posts)

In [None]:
# Persist the DataFrame to the "posts" table of a local SQLite database.
# `if_exists="replace"` keeps this cell re-runnable (the default, "fail", raises
# once the table exists), and the connection is closed explicitly afterwards.
connection = sqlite3.connect("data.db")
try:
    df.to_sql("posts", connection, if_exists="replace")
finally:
    connection.close()

In [None]:
# Preview the first rows of the DataFrame.
df.head()