Import required libraries:

In [1]:
import requests
import json
import bs4
import os
import sqlite3
import threading

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

### Set SCRAPE=True in order to download the data from FB

In [2]:
# Master switch for the scraping cells: when True the data is downloaded from
# the Graph API; when False the records are loaded from "posts.csv" instead.
SCRAPE = True

# Presumably the path of the combined JSON dump of all scraped records.
UNIFIED_JSON_PATH = "posts.json"
# The scraping functions print a progress line whenever the number of
# collected records is a multiple of this value.
LOGGING_INTERVAL = 150

Define helper functions to read a JSON file (returning an empty list if its contents are not valid JSON) and to append a record to it:

In [3]:
def read_json(path):
    """Load and return the JSON document stored in the file at ``path``.

    Args:
        path: Path of the JSON file to read.

    Returns:
        The decoded JSON value, or an empty list when the file does not exist
        or its contents are not valid JSON.
    """
    try:
        # The context manager guarantees the handle is closed even if decoding fails.
        with open(path, "r") as f:
            return json.load(f)
    except (FileNotFoundError, json.JSONDecodeError):
        # A missing or corrupt file is treated as "no records yet".
        return []

def json_append(path, post):
    """Append ``post`` to the JSON list stored at ``path`` and rewrite the file.

    The current contents are loaded with ``read_json``, ``post`` is added at
    the end, and the whole list is serialized back over the same file.
    """
    records = read_json(path)
    records.append(post)
    with open(path, "w") as out_file:
        serialized = json.dumps(records)
        out_file.write(serialized)

Create a helper function to simplify API calls:

In [4]:
def get(url_, timeout=30):
    """Issue an HTTP GET request and return the JSON-decoded response body.

    Args:
        url_: Full URL to request.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely, which would hang the
            calling scraper thread.

    Returns:
        The decoded JSON payload of the response.

    Raises:
        requests.exceptions.RequestException: On connection failure or timeout.
        json.JSONDecodeError: If the response body is not valid JSON.
    """
    response = requests.get(url_, timeout=timeout)
    # The HTTP status is deliberately not checked: callers inspect the decoded
    # payload themselves (e.g. for the presence of a 'data' key).
    return json.loads(response.text)

Define group ID variables:

In [5]:
# Graph API node IDs of the confession pages to scrape. Each ID is paired with
# a short alias in PAGES and is inserted into the request URL by build_url().
TECHNION_CONFESSIONS_ID = "134517547222780"
TAU_CONFESSIONS_ID = "561380070875128"
IDC_CONFESSIONS_ID = "199527394120566"
HUJI_CONFESSIONS_ID = "323288791493138"
BGU_CONFESSIONS_ID = "151003595697352"

Authenticate to Graph API:

In [6]:
# Credentials are read from the environment so that no secret is stored in the
# notebook. Set FB_APP_ID and FB_APP_SECRET before running this cell.
# NOTE(review): the app secret and the user token that were previously
# hard-coded in this cell should be treated as leaked and revoked.
APP_ID = os.environ["FB_APP_ID"]
APP_SECRET = os.environ["FB_APP_SECRET"]

# OAuth endpoint that exchanges the app credentials for an app access token.
TOKEN_URL = "https://graph.facebook.com/oauth/access_token?client_id={id}&client_secret={secret}&grant_type=client_credentials"

# Access token appended to every Graph API request built below.
TOKEN = get(TOKEN_URL.format(id=APP_ID, secret=APP_SECRET))['access_token']

Setup Graph API host and API node:

In [7]:
# Base address of the Graph API and the API version segment of the path.
HOST = "https://graph.facebook.com/"
API_NODE = "v2.12/"

# Plain fields requested for every object.
_BASE_FIELDS = ["created_time", "message_tags", "message", "shares"]
# One aliased summary-only sub-query (limit(0).summary(1)) per reaction type;
# the alias is the lower-cased reaction name.
_REACTION_TYPES = ["LIKE", "LOVE", "HAHA", "WOW", "SAD", "ANGRY"]
_REACTION_FIELDS = [
    "reactions.type({}).limit(0).summary(1).as({})".format(reaction, reaction.lower())
    for reaction in _REACTION_TYPES
]

# Query string shared by the posts and comments requests (10 objects per page).
QUERY = "?fields=" + ",".join(_BASE_FIELDS + _REACTION_FIELDS) + "&limit=10"

Define the list of pages to be scraped, as (id, alias) tuples.

In [8]:
# (page_id, alias) pairs. The ID is used to build the request URL; the alias is
# stored as the 'source' of every record scraped from that page and appears in
# the progress messages.
PAGES = [
    (TECHNION_CONFESSIONS_ID, "TECHNION"),
    (TAU_CONFESSIONS_ID, "TAU"),
    (IDC_CONFESSIONS_ID, "IDC"),
    (HUJI_CONFESSIONS_ID, "HUJI"),
    (BGU_CONFESSIONS_ID, "BGU"),
]

Container for the scraped posts:

In [9]:
# Shared accumulator: every scraped record (post, comment or tag dict) is
# appended to this list by the scraping functions.
posts = []
# Maps a source alias to a page URL recorded while paginating through its posts.
last_pages = {}  # This is intended for adding more posts without calling for scraped pages

Create helper functions that build the URLs for the API calls:

In [10]:
def build_url(page_id):
    """Return the Graph API URL that lists the posts of the page ``page_id``.

    The URL is assembled from the module-level HOST, API_NODE and QUERY
    settings, followed by the access token.
    """
    token_param = "&access_token={}".format(TOKEN)
    url_parts = [HOST, API_NODE, page_id, "/posts", QUERY, token_param]
    return "".join(url_parts)

def build_comments_url(post_id):
    """Return the Graph API URL that lists the comments of the post ``post_id``.

    The URL is assembled from the module-level HOST, API_NODE and QUERY
    settings, followed by the access token.
    """
    token_param = "&access_token={}".format(TOKEN)
    url_parts = [HOST, API_NODE, post_id, "/comments", QUERY, token_param]
    return "".join(url_parts)

Create a function that scrapes a general object (post or comment):

In [11]:
def query_object(raw_object, object_type, object_source, parent_id=None):
    """Flatten a raw Graph API object into a single-level record.

    Args:
        raw_object: Decoded API object. Must contain 'id', 'created_time' and
            one ``{"summary": {"total_count": ...}}`` entry for each of
            like, love, haha, wow, sad and angry. 'message' is optional.
        object_type: Label stored under 'type' (the callers pass "POST" or
            "COMMENT").
        object_source: Alias of the page the object came from.
        parent_id: ID of the parent object, or None.

    Returns:
        A dict with the metadata, the message ('' when absent), the six
        reaction counts and their sum under 'total_reactions'.

    Raises:
        KeyError: If a mandatory key is missing from ``raw_object``.
    """
    reaction_names = ("like", "love", "haha", "wow", "sad", "angry")

    record = {
        'type': object_type,
        'source': object_source,
        'id': raw_object['id'],
        'parent_id': parent_id,
        # Objects without any text simply have no 'message' key.
        'message': raw_object.get('message', ''),
        'created_time': raw_object['created_time'],
    }

    # Read every counter before storing any of them.
    counts = [raw_object[name]["summary"]["total_count"] for name in reaction_names]
    for name, count in zip(reaction_names, counts):
        record[name] = count

    record['total_reactions'] = sum(counts)

    return record

Now, create a function that will paginate through the posts of a URL and scrape them:

In [12]:
def get_posts(url, source):
    """Scrape every post reachable from ``url`` by following the pagination links.

    Each post is converted with ``query_object`` and appended to the shared
    ``posts`` list. Pages are fetched in a loop rather than by recursion, so a
    page with a long history cannot exhaust Python's recursion limit.

    Args:
        url: Graph API URL of the first page of posts to fetch.
        source: Alias of the page being scraped; stored on every record and
            used in the progress messages.

    Returns:
        None. Results are accumulated in the module-level ``posts`` list.
    """
    while True:
        res = get(url)

        if 'data' in res:
            for raw_post in res['data']:
                posts.append(query_object(raw_post, "POST", source))
                # The counter covers the shared list, i.e. records of all sources.
                if len(posts) % LOGGING_INTERVAL == 0:
                    print("{} is on item #{}".format(source, len(posts)))

        # Stop once the API no longer advertises a following page.
        if 'paging' not in res or 'next' not in res['paging']:
            return

        # Remember the page just processed before moving on to the next one.
        last_pages[source] = url
        url = res['paging']['next']

Create a function that will scrape a tag of a user:

In [13]:
def parse_tag(raw_tag, comment_id, source):
    """Build a TAG record for a user mentioned in a comment.

    Args:
        raw_tag: One entry of a comment's ``message_tags`` list.
        comment_id: ID of the comment containing the tag; stored as 'parent_id'.
        source: Alias of the page the comment came from.

    Returns:
        A dict with 'type' set to "TAG", the parent comment ID, the source and
        the tagged user's ID and name (None for whichever is missing).
    """
    return {
        "type": "TAG",
        "parent_id": comment_id,
        "source": source,
        # dict.get yields None when the key is absent.
        "user_id": raw_tag.get("id"),
        "user_name": raw_tag.get("name"),
    }

Create a function that will call the API for each post's comments, scrape them and get their tags:

In [14]:
def get_post_comments(url, post_id, post_source):
    """Scrape all comments of one post, together with the user tags they contain.

    Every comment is converted with ``query_object`` and every entry of its
    ``message_tags`` with ``parse_tag``; both kinds of record are appended to
    the shared ``posts`` list. Pages are fetched in a loop rather than by
    recursion, so a post with many comment pages cannot exhaust Python's
    recursion limit.

    Args:
        url: Graph API URL of the first page of comments.
        post_id: ID of the post; stored as 'parent_id' on each comment.
        post_source: Alias of the page the post came from.

    Returns:
        None. Results are accumulated in the module-level ``posts`` list.
    """
    while True:
        res = get(url)

        if 'data' in res:
            for raw_comment in res['data']:
                comment = query_object(raw_comment, "COMMENT", post_source, parent_id=post_id)
                posts.append(comment)

                # 'message_tags' is absent (or empty) when nobody is tagged.
                for raw_tag in raw_comment.get("message_tags") or []:
                    posts.append(parse_tag(raw_tag, raw_comment["id"], post_source))

                # Checked once per comment, after its tags have been added.
                if len(posts) % LOGGING_INTERVAL == 0:
                    print("{} is on item #{}".format(post_source, len(posts)))

        # Stop once the API no longer advertises a following page.
        if 'paging' not in res or 'next' not in res['paging']:
            return

        url = res['paging']['next']

And now, create a function that will scrape all of the posts' comments and comment tags:

In [15]:
def chunks(l, n):
    """Lazily split the sequence ``l`` into consecutive slices of length ``n``.

    A chunk size below 1 is treated as 1. The final slice may be shorter than
    the others. Returns a generator of slices.
    """
    size = max(1, n)
    starts = range(0, len(l), size)
    return (l[start:start + size] for start in starts)

In [16]:
def get_comments(posts_list):
    """Scrape the comments of every post record in ``posts_list``.

    Each record must provide the 'id' and 'source' keys; the scraped comments
    and tags are accumulated by ``get_post_comments``.
    """
    for record in posts_list:
        post_id = record["id"]
        comments_url = build_comments_url(post_id)
        get_post_comments(comments_url, post_id, record["source"])

I will use threading to query each Facebook Group in a different thread - this is the worker function:

If SCRAPE is set, the script will run one scraping thread per page (on Windows, the threads can be stopped with `taskkill /f /im python.exe`):

In [17]:
if SCRAPE:

    # One worker thread per page; all of them append to the shared `posts` list.
    threads = []

    for node, name in PAGES:
        t = threading.Thread(target=get_posts, args=(build_url(node), name,))
        threads.append(t)
        t.start()

    # Block until every page has been scraped completely.
    for t in threads:
        t.join()

else:
    # Load previously saved records as a list of dicts, the same structure the
    # scraper builds. (`to_json` would return one JSON string instead.)
    posts = pd.read_csv("posts.csv").to_dict(orient='records')

HUJI is on item #150
HUJI is on item #300
TAU is on item #450
TAU is on item #600
TAU is on item #750
IDC is on item #900
TAU is on item #1050
IDC is on item #1200
IDC is on item #1350
IDC is on item #1500
TECHNION is on item #1650
IDC is on item #1800
TECHNION is on item #1950
BGU is on item #2100
BGU is on item #2250
HUJI is on item #2400
TAU is on item #2550
HUJI is on item #2700
HUJI is on item #2850
BGU is on item #3000
TAU is on item #3150
TAU is on item #3300
TAU is on item #3450
HUJI is on item #3600
BGU is on item #3750
TECHNION is on item #3900
BGU is on item #4050
TECHNION is on item #4200
BGU is on item #4350


In [18]:
# Persist the records collected so far, using the path from the configuration
# cell instead of repeating the literal file name.
with open(UNIFIED_JSON_PATH, "w") as f:
    json.dump(posts, f)

In [19]:
if SCRAPE:

    # Only POST records are queried for comments. Comments and tags are
    # appended to `posts` while this cell runs, and TAG records have no "id"
    # key, so they must not be fed back into get_comments.
    posts_to_query = [record for record in posts if record.get("type") == "POST"]

    threads = []

    print("[II] Scraping comments...")
    # One worker thread per batch of 20 posts.
    for posts_list in chunks(posts_to_query, 20):
        t = threading.Thread(target=get_comments, args=(posts_list,))
        threads.append(t)
        t.start()

    # Wait for every worker so that later cells see the complete `posts` list.
    for t in threads:
        t.join()

[II] Scraping comments...
BGU is on item #4500
TAU is on item #4650
IDC is on item #4800
IDC is on item #4950
HUJI is on item #5100
TECHNION is on item #5250
BGU is on item #5400
IDC is on item #5550
BGU is on item #5700
TAU is on item #5850


Create a dataframe:

In [22]:
# One row per record accumulated in `posts`.
df = pd.DataFrame(posts)

In [32]:
# Store the records in the "posts" table of data.db. `if_exists="replace"`
# keeps the cell re-runnable (the default raises if the table already exists),
# and the connection is always closed afterwards.
conn = sqlite3.connect("data.db")
try:
    df.to_sql("posts", conn, if_exists="replace")
    conn.commit()
finally:
    conn.close()

In [26]:
# Preview the first rows of the frame.
df.head()

Unnamed: 0,angry,created_time,haha,id,like,love,message,parent_id,sad,source,total_reactions,type,user_id,user_name,wow
0,0.0,2018-02-13T21:10:00+0000,0.0,199527394120566_207322916674347,0.0,0.0,#927 לפעמים אני מעדיף קפה בקרנף כי קפה בשנקל ז...,,0.0,IDC,0.0,POST,,,0.0
1,0.0,2018-02-13T21:00:00+0000,0.0,199527394120566_207322670007705,0.0,0.0,#926 לפעמים אנשים שואלים שאלות בקבוצה של המסלו...,,0.0,IDC,0.0,POST,,,0.0
2,0.0,2018-02-13T20:50:00+0000,0.0,199527394120566_207322613341044,1.0,0.0,#925 אני וידיד שלי מהמסלול למדנו אצלו למבחן וכ...,,0.0,IDC,1.0,POST,,,0.0
3,0.0,2018-02-13T20:40:00+0000,1.0,199527394120566_207322470007725,3.0,0.0,#924 מחכה ליום שהאדמין ידרוש כופר בתמורה לאי פ...,,0.0,IDC,4.0,POST,,,0.0
4,0.0,2018-02-13T20:27:00+0000,1.0,199527394120566_207303816676257,1.0,0.0,#923 אני וידיד שלי מהמסלול למדנו אצלו למבחן וכ...,,0.0,IDC,2.0,POST,,,0.0


In [27]:
# Matches a confession number written as "#123" or "123#". The raw string
# passes `\d` to the regex engine verbatim instead of relying on an invalid
# string escape.
ID_REGEX = r"(#\d+|\d+#){1}"

# Extract all confession numbers from each message once and reuse the result.
confession_refs = df["message"].str.findall(ID_REGEX)

# NOTE(review): this overwrites the Graph API object ID in "id" with the first
# confession number found in the message, while "parent_id" still holds Graph
# API IDs - confirm that losing the original ID is intended.
df["id"] = confession_refs.str.get(0)
# A second number in the same message is taken as the confession being answered.
df["reply_to"] = confession_refs.str.get(1)
df.head()

Unnamed: 0,angry,created_time,haha,id,like,love,message,parent_id,sad,source,total_reactions,type,user_id,user_name,wow,reply_to
0,0.0,2018-02-13T21:10:00+0000,0.0,#927,0.0,0.0,#927 לפעמים אני מעדיף קפה בקרנף כי קפה בשנקל ז...,,0.0,IDC,0.0,POST,,,0.0,
1,0.0,2018-02-13T21:00:00+0000,0.0,#926,0.0,0.0,#926 לפעמים אנשים שואלים שאלות בקבוצה של המסלו...,,0.0,IDC,0.0,POST,,,0.0,
2,0.0,2018-02-13T20:50:00+0000,0.0,#925,1.0,0.0,#925 אני וידיד שלי מהמסלול למדנו אצלו למבחן וכ...,,0.0,IDC,1.0,POST,,,0.0,
3,0.0,2018-02-13T20:40:00+0000,1.0,#924,3.0,0.0,#924 מחכה ליום שהאדמין ידרוש כופר בתמורה לאי פ...,,0.0,IDC,4.0,POST,,,0.0,
4,0.0,2018-02-13T20:27:00+0000,1.0,#923,1.0,0.0,#923 אני וידיד שלי מהמסלול למדנו אצלו למבחן וכ...,,0.0,IDC,2.0,POST,,,0.0,


In [31]:
# Number of rows with a non-null "id" per source. Selecting the column before
# aggregating avoids counting every other column only to discard the result.
df.groupby(["source"])["id"].agg(["count"])

Unnamed: 0_level_0,count
source,Unnamed: 1_level_1
BGU,987
HUJI,846
IDC,765
TAU,869
TECHNION,964


Get the IDs

Get ID of the posts that the post is replying to

Get the posts likes

Get the comments of the posts

Get the likes for each comment

Get the replies for each comment

Get the name of the commenter (for each comment or reply)

Get the commenter gender

Get the posts a user reacted to or commented to

Get a list of the users who commented or reacted