Import required libraries:

In [1]:
import requests
import json
import bs4
import os
import sqlite3
import threading

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

### Set SCRAPE=True in order to download the data from FB

In [2]:
# Master switch: when True the notebook downloads posts and comments through
# the Graph API; when False the posts are loaded from "posts.csv" instead.
SCRAPE = True

# Intended path of the unified JSON dump of the scraped items.
UNIFIED_JSON_PATH = "posts.json"
# A progress line is printed whenever the number of collected items is a
# multiple of this value.
LOGGING_INTERVAL = 150

Define helper functions that read a JSON file (returning an empty list if its content is not valid JSON) and append an item to it:

In [3]:
def read_json(path):
    """Load and return the JSON document stored at ``path``.

    An empty list is returned when the file does not exist yet or when its
    content is not valid JSON, so callers can always treat the result as the
    starting list of records.

    Args:
        path: Location of the JSON file to read.

    Returns:
        The decoded JSON value, or ``[]`` if the file is missing or malformed.
    """
    try:
        # The context manager closes the handle even when decoding fails.
        with open(path, "r") as f:
            return json.load(f)
    except (FileNotFoundError, json.decoder.JSONDecodeError):
        return []

def json_append(path, post):
    """Append ``post`` to the JSON list stored at ``path`` and rewrite the file.

    The current content is loaded through ``read_json``, extended with the new
    item, and the whole list is serialized back over the same file.
    """
    records = read_json(path)
    records.append(post)
    with open(path, "w") as out_file:
        out_file.write(json.dumps(records))

Create a helper function to simplify API calls:

In [4]:
def get(url_, timeout=30):
    """Send a GET request to ``url_`` and return the decoded JSON body.

    Args:
        url_: Full URL to request.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` can block indefinitely, which would stall the
            thread that issued the call.

    Returns:
        The response body parsed as JSON.

    Raises:
        requests.exceptions.Timeout: If the server does not answer in time.
        json.decoder.JSONDecodeError: If the body is not valid JSON.
    """
    # The HTTP status is deliberately not checked here: the decoded payload is
    # returned as-is and the caller decides what to do with it.
    response = requests.get(url_, timeout=timeout)
    return json.loads(response.text)

Define group ID variables:

In [5]:
# Facebook node IDs of the confession pages to scrape, one per institution.
# They are kept as strings because they are concatenated into request URLs.
TECHNION_CONFESSIONS_ID = "134517547222780"   # Technion - Haifa
TAU_CONFESSIONS_ID = "561380070875128"        # Tel Aviv University
IDC_CONFESSIONS_ID = "199527394120566"        # Beintchumi - Herzlia
HUJI_CONFESSIONS_ID = "323288791493138"       # Hebrew University - Jerusalem
BGU_CONFESSIONS_ID = "151003595697352"        # Ben Gurion University - Beer Sheva
ARIEL_CONFESSIONS_ID = "465435713853175"      # Ariel University
HAIFA_CONFESSIONS_ID = "417201845375921"      # Haifa University
HADASSAH_CONFESSIONS_ID = "2044836585541512"  # Hadassah

Authenticate to Graph API:

In [6]:
# The app credentials are read from the environment so that no secret is stored
# in the notebook. Export FB_APP_ID and FB_APP_SECRET before running this cell;
# a missing variable raises KeyError naming it.
APP_ID = os.environ["FB_APP_ID"]
APP_SECRET = os.environ["FB_APP_SECRET"]
TOKEN_URL = "https://graph.facebook.com/oauth/access_token?client_id={id}&client_secret={secret}&grant_type=client_credentials"

# Exchange the app credentials for an access token (client_credentials grant);
# TOKEN is appended to every request URL built later in the notebook.
TOKEN = get(TOKEN_URL.format(id=APP_ID, secret=APP_SECRET))['access_token']

Setup Graph API host and API node:

In [7]:
# Base address of the Graph API and the API version segment of every request.
HOST = "https://graph.facebook.com/"
API_NODE = "v2.12/"

# Plain fields requested for every object.
_BASE_FIELDS = ["created_time", "message_tags", "message", "shares"]
# One aliased summary-only sub-query per reaction type: limit(0) returns no
# individual reactions, summary(1) returns the total count.
_REACTION_TYPES = ["LIKE", "LOVE", "HAHA", "WOW", "SAD", "ANGRY"]
_REACTION_FIELDS = [
    "reactions.type({}).limit(0).summary(1).as({})".format(kind, kind.lower())
    for kind in _REACTION_TYPES
]

# Query string shared by the posts and comments requests (10 items per page).
QUERY = "?fields=" + ",".join(_BASE_FIELDS + _REACTION_FIELDS) + "&limit=10"

Define the list of pages to be scraped, as (id, alias) tuples.

In [8]:
# (Graph node id, alias) pairs. The alias is passed to the scraper as the
# "source" label stored on every scraped item and shown in the progress log.
PAGES = [
    (TECHNION_CONFESSIONS_ID, "TECHNION"),
    (TAU_CONFESSIONS_ID, "TAU"),
    (IDC_CONFESSIONS_ID, "IDC"),
    (HUJI_CONFESSIONS_ID, "HUJI"),
    (BGU_CONFESSIONS_ID, "BGU"),
    (ARIEL_CONFESSIONS_ID, "ARIEL"),
    (HAIFA_CONFESSIONS_ID, "HAIFA"),
    (HADASSAH_CONFESSIONS_ID, "HADASSAH"),
]

Container for the scraped posts:

In [9]:
# Shared accumulator: the scraping functions append every post, comment and
# tag dict to this list (including from the worker threads).
posts = []
# source alias -> URL of the most recently fetched posts page that had a
# 'next' link. Intended for fetching more posts later without requesting the
# already scraped pages again.
last_pages = {}

Create helper functions that build the URLs for the API calls:

In [10]:
def build_url(page_id):
    """Return the Graph API URL that lists the posts of ``page_id``.

    The URL is made of the API host and version, the page's ``/posts`` edge,
    the shared field query and the access token.
    """
    auth_suffix = "&access_token={}".format(TOKEN)
    return "".join((HOST, API_NODE, page_id, "/posts", QUERY, auth_suffix))

def build_comments_url(post_id):
    """Return the Graph API URL that lists the comments of ``post_id``.

    Same layout as the posts URL, but targeting the ``/comments`` edge of the
    given post.
    """
    auth_suffix = "&access_token={}".format(TOKEN)
    return "".join((HOST, API_NODE, post_id, "/comments", QUERY, auth_suffix))

Create a function that scrapes a general object (post or comment):

In [11]:
def query_object(raw_object, object_type, object_source, parent_id=None):
    """Flatten a raw API object (post or comment) into a flat record dict.

    Args:
        raw_object: Decoded API object. Must contain ``id``, ``created_time``
            and one ``{"summary": {"total_count": ...}}`` entry per reaction
            alias; ``message`` is optional.
        object_type: Label stored under ``type`` (e.g. "POST" or "COMMENT").
        object_source: Label stored under ``source``.
        parent_id: Id of the owning object, stored under ``parent_id``.

    Returns:
        A dict with the labels, ids, message (empty string when absent),
        creation time, one count per reaction and their sum under
        ``total_reactions``.

    Raises:
        KeyError: If a required key is missing from ``raw_object``.
    """
    reaction_names = ('like', 'love', 'haha', 'wow', 'sad', 'angry')

    record = {
        'type': object_type,
        'source': object_source,
        'id': raw_object['id'],
        'parent_id': parent_id,
        'message': raw_object.get('message', ''),
        'created_time': raw_object['created_time'],
    }

    # Copy the total of each reaction out of its nested summary.
    for reaction in reaction_names:
        record[reaction] = raw_object[reaction]["summary"]["total_count"]

    record['total_reactions'] = sum(record[reaction] for reaction in reaction_names)

    return record

Now, create a function that will paginate through the posts of a URL and scrape them:

In [12]:
def get_posts(url, source):
    """Scrape every page of posts starting at ``url`` into the ``posts`` list.

    Pages are followed through the ``paging.next`` link of each response. The
    traversal is a loop rather than one recursive call per page, so the number
    of pages is not limited by the interpreter's recursion depth.

    Args:
        url: URL of the first page of posts to fetch.
        source: Label stored on every scraped post and used in the progress
            log and as the key in ``last_pages``.

    Returns:
        None. Results are appended to the module-level ``posts`` list, and
        ``last_pages[source]`` is set to the URL of the latest fetched page
        that had a successor.
    """
    while True:
        res = get(url)
        if 'data' in res:
            for raw_post in res['data']:
                posts.append(query_object(raw_post, "POST", source))
                if len(posts) % LOGGING_INTERVAL == 0:
                    print("{} is on item #{}".format(source, len(posts)))

        # Stop on the last page (or on a response without paging information,
        # such as an error payload).
        if 'paging' not in res or 'next' not in res['paging']:
            return

        last_pages[source] = url
        url = res['paging']['next']

Create a function that will scrape a tag of a user:

In [13]:
def parse_tag(raw_tag, comment_id, source):
    """Build a flat "TAG" record for a user tagged in a comment.

    Args:
        raw_tag: Raw tag dict; its ``id`` and ``name`` entries are optional.
        comment_id: Id of the comment the tag belongs to, stored as
            ``parent_id``.
        source: Label stored under ``source``.

    Returns:
        A dict with ``type``, ``parent_id``, ``source``, ``user_id`` and
        ``user_name``; the last two are None when absent from ``raw_tag``.
    """
    return {
        "type": "TAG",
        "parent_id": comment_id,
        "source": source,
        "user_id": raw_tag.get("id"),
        "user_name": raw_tag.get("name"),
    }

Create a function that will call the API for a post's comments, scrape them and get their tags:

In [14]:
def get_post_comments(url, post_id, post_source):
    """Scrape every page of comments of one post into the ``posts`` list.

    Each comment is stored as a "COMMENT" record whose parent is ``post_id``;
    every entry of a comment's ``message_tags`` is stored right after it as a
    tag record whose parent is the comment. Pages are followed through
    ``paging.next`` in a loop rather than one recursive call per page, so the
    number of pages is not limited by the interpreter's recursion depth.

    Args:
        url: URL of the first page of comments to fetch.
        post_id: Id of the post the comments belong to.
        post_source: Label stored on every record and used in the progress log.

    Returns:
        None. Results are appended to the module-level ``posts`` list.
    """
    while True:
        res = get(url)
        if 'data' in res:
            for raw_comment in res['data']:
                comment = query_object(raw_comment, "COMMENT", post_source, parent_id=post_id)
                posts.append(comment)

                # 'message_tags' may be missing or empty; both mean "no tags".
                for raw_tag in raw_comment.get("message_tags") or []:
                    posts.append(parse_tag(raw_tag, raw_comment["id"], post_source))

                if len(posts) % LOGGING_INTERVAL == 0:
                    print("{} is on item #{}".format(post_source, len(posts)))

        # Stop on the last page (or on a response without paging information).
        if 'paging' not in res or 'next' not in res['paging']:
            return

        url = res['paging']['next']

And now, create a function that scrapes all of the posts' comments and comment tags:

In [15]:
def chunks(l, n):
    """Return a generator over consecutive slices of ``l`` of length ``n``.

    A chunk size below 1 is treated as 1. The last slice is shorter when
    ``len(l)`` is not a multiple of the chunk size.
    """
    size = n if n > 1 else 1
    starts = range(0, len(l), size)
    return (l[start:start + size] for start in starts)

In [16]:
def get_comments(posts_list):
    """Scrape the comments (and their tags) of every post in ``posts_list``.

    Each entry must provide the post's ``id`` and ``source``.
    """
    for entry in posts_list:
        post_id = entry["id"]
        comments_url = build_comments_url(post_id)
        get_post_comments(comments_url, post_id, entry["source"])

I will use threading to query each Facebook group in a separate thread, with `get_posts` as the worker function:

If `SCRAPE` is set, the notebook starts the scraping threads (on Windows they can be stopped with `taskkill /f /im python.exe`):

In [17]:
if SCRAPE:

    # One worker thread per page; each runs get_posts on the page's first
    # posts URL and appends its results to the shared `posts` list.
    threads = []

    for node, name in PAGES:
        t = threading.Thread(target=get_posts, args=(build_url(node), name,))
        threads.append(t)
        t.start()

    # Wait until every page has been scraped completely.
    for t in threads:
        t.join()

else:
    # Load the saved items as a list of row dicts, the same shape the scraper
    # produces. (to_json would return one JSON string instead of a list.)
    posts = pd.read_csv("posts.csv").to_dict(orient='records')

TAU is on item #150
TAU is on item #300
TAU is on item #450
TAU is on item #600
HUJI is on item #750
HUJI is on item #900
TAU is on item #1050
IDC is on item #1200
IDC is on item #1350
TAU is on item #1500
HAIFA is on item #1650
HAIFA is on item #1800
TECHNION is on item #1950
TECHNION is on item #2100
IDC is on item #2250
HUJI is on item #2400
TECHNION is on item #2550
HAIFA is on item #2700
HADASSAH is on item #2850
BGU is on item #3000
HAIFA is on item #3150
BGU is on item #3300
BGU is on item #3450
ARIEL is on item #3600
BGU is on item #3750
ARIEL is on item #3900
HUJI is on item #4050
IDC is on item #4200
TECHNION is on item #4350
ARIEL is on item #4500
IDC is on item #4650
BGU is on item #4800
IDC is on item #4950
ARIEL is on item #5100
TAU is on item #5250
IDC is on item #5400
TECHNION is on item #5550
TECHNION is on item #5700
TECHNION is on item #5850
BGU is on item #6000
BGU is on item #6150


In [18]:
# Save everything collected so far to the configured JSON dump
# (UNIFIED_JSON_PATH), overwriting any previous dump.
with open(UNIFIED_JSON_PATH, "w") as f:
    json.dump(posts, f)

In [19]:
if SCRAPE:

    print("[II] Scraping comments...")

    # Work on a snapshot: the workers append comments and tags to `posts`
    # while the chunks are still being handed out.
    scraped_posts = list(posts)

    # One worker thread per chunk of 20 posts.
    threads = []
    for posts_list in chunks(scraped_posts, 20):
        t = threading.Thread(target=get_comments, args=(posts_list,))
        threads.append(t)
        t.start()

    # Wait for every worker so that the following cells see the complete
    # list instead of reading it while it is still being filled.
    for t in threads:
        t.join()

[II] Scraping comments...
TECHNION is on item #6300
HUJI is on item #6450
TECHNION is on item #6600
HAIFA is on item #6750
TECHNION is on item #6900
ARIEL is on item #7050
TAU is on item #7200
HUJI is on item #7350
TAU is on item #7500
IDC is on item #7650
IDC is on item #7800
BGU is on item #7950
BGU is on item #8100
HADASSAH is on item #8250
ARIEL is on item #8400
ARIEL is on item #8550
HADASSAH is on item #8700
IDC is on item #8850
TECHNION is on item #9000
IDC is on item #9150
BGU is on item #9300
ARIEL is on item #9450


Create a dataframe:

In [21]:
# One row per collected item; a key that an item does not have (e.g. the tag
# columns on a post) is left empty in its row.
df = pd.DataFrame(posts)

In [22]:
# Write the frame to the "posts" table of data.db. if_exists="replace" keeps
# the cell re-runnable (the default raises once the table exists), and the
# connection is closed explicitly so the database file is not left open.
connection = sqlite3.connect("data.db")
try:
    df.to_sql("posts", connection, if_exists="replace")
finally:
    connection.close()

In [23]:
# Preview the first rows of the frame.
df.head()

Unnamed: 0,angry,created_time,haha,id,like,love,message,parent_id,sad,source,total_reactions,type,user_id,user_name,wow
0,0.0,2018-02-13T18:32:34+0000,0.0,465435713853175_470884616641618,7.0,0.0,"#2439 אחד החברה ניגש למועד ב על 90, שיהנה ויצל...",,1.0,ARIEL,8.0,POST,,,0.0
1,0.0,2018-02-13T18:31:51+0000,6.0,465435713853175_470884349974978,30.0,0.0,#2438 מאז שעברנו לספריה החדשה אני חולמת שיום א...,,0.0,ARIEL,36.0,POST,,,0.0
2,0.0,2018-02-13T18:31:33+0000,0.0,465435713853175_470884173308329,6.0,0.0,#2437 מתחיל לאבד את זה אחרי ארבעה נכשלים. \n#ע...,,0.0,ARIEL,6.0,POST,,,0.0
3,0.0,2018-02-13T18:31:21+0000,0.0,465435713853175_470884116641668,3.0,0.0,#2436 כשאת מודיעה לכל התואר שנכנס הכסף אחרי צב...,,0.0,ARIEL,3.0,POST,,,0.0
4,0.0,2018-02-13T18:31:10+0000,2.0,465435713853175_470884063308340,14.0,0.0,#2435 הצעת חוק: להקצות קומה שלמה בספריה לאנשים...,,0.0,ARIEL,16.0,POST,,,0.0
