Import required libraries:

In [65]:
import requests
import json
import bs4
import os
import sqlite3
import threading

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

### Set SCRAPE=True in order to download the data from FB

In [66]:
SCRAPE = True

UNIFIED_JSON_PATH = "posts.json"
LOGGING_INTERVAL = 250

Define a function to return an empty list if string is not JSON formatted:

In [67]:
def read_json(path):
    try:
        return json.loads(open(path, "r").read())
    except json.decoder.JSONDecodeError:
        return []

def json_append(path, post):
    data = read_json(path)
    data.append(post)
    with open(path, "w") as f:
        f.write(json.dumps(data))

Create a supplemental function to simplify API 

In [68]:
def get(url_):
    return json.loads(requests.get(url_).text)

Define gruop ID variables:

In [69]:
TECHNION_CONFESSIONS_ID = "134517547222780"      # Technion - Haifa
TAU_CONFESSIONS_ID = "561380070875128"           # Tel Aviv University
IDC_CONFESSIONS_ID = "199527394120566"           # Beintchumi - Herzlia
HUJI_CONFESSIONS_ID = "323288791493138"          # Hebrew University - Jerusalem
BGU_CONFESSIONS_ID = "151003595697352"           # Ben Gurion University - Beer Sheva
ARIEL_CONFESSIONS_ID = "465435713853175"         # Ariel University
HAIFA_CONFESSIONS_ID = "417201845375921"         # Haifa University
HADASSAH_CONFESSIONS_ID = "2044836585541512"     # Hadassah
ORT_BRAUDE_CONFESSIONS_ID = "1453869761388065"   # Ort-Braude
BAR_ILAN_CONFESSIONS_ID = "2124618411093048"     # Bar-Ilan

Authenticate to Graph API - it is preferred to use manual token from Graph API Explorer because it has better permissions:

In [70]:
TOKEN = "EAACEdEose0cBACSwnTEV6tqooruSSkKMlIbKE6mRuX7PmJ05ajvvTa8sCoHrCgQgSbQUjlEsgFNZBKxJGZCWbsRszhzSQBSIf4AH9dnX7sL0SrXx6OtBtCkeaWkbUDkp8dczHWZBEGwnC4h2sF2OWNA01uN1VeOJpx1XsnBdNMeEyJfniyTpv7ZAXcpm0oIZD"

# s = "6cc937f2a9dbc9df92600f365c777d1a"
# i = "652869818252649"
# u = "https://graph.facebook.com/oauth/access_token?client_id={id}&client_secret={secret}&grant_type=client_credentials"

# TOKEN = get(u.format(id=i, secret=s))['access_token']

Setup Graph API host and API node:

In [71]:
HOST = "https://graph.facebook.com/"
API_NODE = "v2.12/"
QUERY = "?fields=created_time,message_tags,message,shares,reactions.type(LIKE).limit(0).summary(1).as(like),reactions.type(LOVE).limit(0).summary(1).as(love),reactions.type(HAHA).limit(0).summary(1).as(haha),reactions.type(WOW).limit(0).summary(1).as(wow),reactions.type(SAD).limit(0).summary(1).as(sad),reactions.type(ANGRY).limit(0).summary(1).as(angry)&limit=10"

Define a list of the pages that are going to be scraped in a manner of (id, alias) tuples.

In [72]:
PAGES = [
    (TECHNION_CONFESSIONS_ID, "TECHNION"),
    (TAU_CONFESSIONS_ID, "TAU"),
    (IDC_CONFESSIONS_ID, "IDC"),
    (HUJI_CONFESSIONS_ID, "HUJI"),
    (BGU_CONFESSIONS_ID, "BGU"),
    (ARIEL_CONFESSIONS_ID, "ARIEL"),
    (HAIFA_CONFESSIONS_ID, "HAIFA"),
    (HADASSAH_CONFESSIONS_ID, "HADASSAH"),
    (ORT_BRAUDE_CONFESSIONS_ID, "BRAUDE"),
    (BAR_ILAN_CONFESSIONS_ID, "BIU"),
]

Container for the scraped posts:

In [73]:
posts = []
last_pages = {}  # This is intended for adding more posts without calling for scraped pages

Create a supplemental builder functions for the API calls' URLs:

In [74]:
def build_url(page_id):
    return HOST + API_NODE + page_id + "/posts" + QUERY + "&access_token={}".format(TOKEN)

def build_comments_url(post_id):
    return HOST + API_NODE + post_id + "/comments" + QUERY + "&access_token={}".format(TOKEN)

Create a function that scrapes a general object (post or comment):

In [75]:
def query_object(raw_object, object_type, object_source, parent_id=None):
    obj = {}   
    
    obj['type'] = object_type
    obj['source'] = object_source
    
    obj['id'] = raw_object['id']
    obj['parent_id'] = parent_id
    
    try:
        obj['message'] = raw_object['message']
    except KeyError:
        obj['message'] = ''

    obj['created_time'] = raw_object['created_time']

    like = raw_object["like"]["summary"]["total_count"]
    love = raw_object["love"]["summary"]["total_count"]
    haha = raw_object["haha"]["summary"]["total_count"]
    wow = raw_object["wow"]["summary"]["total_count"]
    sad = raw_object["sad"]["summary"]["total_count"]
    angry = raw_object["angry"]["summary"]["total_count"]

    obj['like'] = like
    obj['love'] = love
    obj['haha'] = haha
    obj['wow'] = wow
    obj['sad'] = sad
    obj['angry'] = angry
    
    obj['total_reactions'] = sum([like, love, haha, wow, sad, angry])
    
    return obj

Now, create a fumction that will paginate through the posts of a URL and scrape them:

In [76]:
def get_posts(url, source):
    res = get(url)
    if 'data' in res:
        for i, raw_post in enumerate(res['data']):
            post = query_object(raw_post, "POST", source)
            posts.append(post)
            if len(posts) % LOGGING_INTERVAL == 0:
                print("{} is on item #{}".format(source, len(posts)))
    if 'paging' in res:
        if 'next' in res['paging']:
            last_pages[source] = url
            return get_posts(res['paging']['next'], source)

In [77]:
def parse_tag(raw_tag, parent_id, parent_source):
    tag = {}
    
    tag["parent_id"] = parent_id
    tag["source"] = parent_source
    tag["type"] = "TAG"
    
    try:
        tag["tagged_user_id"] = raw_tag["id"]
    except KeyError:
        tag["tagged_user_id"] = None
        
    try:
        tag["tagged_user_name"] = raw_tag["name"]
    except KeyError:
        tag["tagged_user_name"] = None
            
    
    try:
        tag["tagged_user_type"] = raw_tag["type"]
    except KeyError:
        tag["tagged_user_type"] = None
            
    
    return tag

Create a function that will call the API for each post comments, scrape them and get their tags:

In [78]:
def get_post_comments(url, post_id, post_source):
    json_path = "{}.json".format(post_source)
    res = get(url)
    if 'data' in res:
        for raw_comment in res['data']:
            comment = query_object(raw_comment, "COMMENT", post_source, parent_id=post_id)
            posts.append(comment)
            
            if raw_comment.get("message_tags"):
                for raw_tag in raw_comment.get("message_tags"):
                    tag = parse_tag(raw_tag, raw_comment["id"], post_source)
                    posts.append(tag)
            
            if len(posts) % LOGGING_INTERVAL == 0:
                print("{} is on item #{}".format(post_source, len(posts)))
            
    if 'paging' in res:
        if 'next' in res['paging']:
            return get_post_comments(res['paging']['next'], post_id, post_source)

And now, create a function that will scrapes all of the posts' comments and comment tags:

In [79]:
def chunks(l, n):
    n = max(1, n)
    return (l[i:i+n] for i in range(0, len(l), n))

In [80]:
def get_comments(posts_list):
    for post in posts_list:
        get_post_comments(build_comments_url(post["id"]), post["id"], post["source"])

I will use threading to query each Facebook Group in a different thread - this is the worker funciton:

If SCRAPE, the script will run the threads (stop them using >>taskkill /f /im -"python.exe"):

In [81]:
if SCRAPE:
    
    threads = []
    
    for node, name in PAGES:
        t = threading.Thread(target=get_posts, args=(build_url(node), name,))
        threads.append(t)
        t.start()
    
    for t in threads:
        t.join()
        
else:
    posts = pd.read_csv("posts.csv").to_json(orient='records')

BIU is on item #250
BRAUDE is on item #500
BIU is on item #750
BIU is on item #1000


In [82]:
with open("posts.json", "w") as f:
    f.write(json.dumps(posts))

In [83]:
if SCRAPE:
    
    threads = []
    
    print("[II] Scraping comments...")
    for posts_list in chunks(posts, 20):
        t = threading.Thread(target=get_comments, args=(posts_list,))
        threads.append(t)
        t.start()

[II] Scraping comments...


Create a dataframe:

In [89]:
df = pd.DataFrame(posts)

In [90]:
df.to_sql("posts", sqlite3.connect("data.db"), if_exists="append")

In [86]:
df.head()

Unnamed: 0,angry,created_time,haha,id,like,love,message,parent_id,sad,source,tagged_user_id,tagged_user_name,tagged_user_type,total_reactions,type,wow
0,0.0,2018-02-14T09:39:00+0000,1.0,1453869761388065_1466778413430533,0.0,0.0,#319 מה עם הערומה מהמעונות? מישהו צילם?,,0.0,BRAUDE,,,,1.0,POST,0.0
1,0.0,2018-02-14T09:38:15+0000,0.0,1453869761388065_1466778086763899,0.0,0.0,"#318 בבראודה, יותר מכל מקום אחר, אני נתקל בשיר...",,0.0,BRAUDE,,,,0.0,POST,0.0
2,0.0,2018-02-14T09:37:43+0000,0.0,1453869761388065_1466777826763925,1.0,0.0,#317 מאז שהסינים כבר לא במכללה נגמרו לי חיי המין.,,0.0,BRAUDE,,,,1.0,POST,0.0
3,0.0,2018-02-14T09:37:08+0000,2.0,1453869761388065_1466776843430690,3.0,0.0,#316 אני עתודאי במעונות שנה א'. יש לי שותף די ...,,0.0,BRAUDE,,,,5.0,POST,0.0
4,0.0,2018-02-14T09:29:38+0000,0.0,1453869761388065_1466773130097728,1.0,0.0,#315 לפני כמה חודשים עברתי ליד החדר טלויזיה במ...,,0.0,BRAUDE,,,,1.0,POST,0.0


BIU is on item #2000
BRAUDE is on item #2250
BIU is on item #2750
BIU is on item #3250
BIU is on item #3500
BRAUDE is on item #4000
BRAUDE is on item #4500
BIU is on item #5250
BRAUDE is on item #5500
BIU is on item #6500
