In [1]:
"""
*******************************************************
Title: Python Notebook to download posts associated with public profiles

Organization: DANE
Author: Andrés D. Pérez
Version: 3.0
Modification date: 07/21/2021
Descripción:
    [Sec 1] Libraries
    [Sec 2] Data Gathering
    [Sec 3] Dataset preprocessing and building
    
    Returns:
        Posts dataset for each profile
*******************************************************
"""

'\n*******************************************************\nTitle: Python Notebook to download posts associated with public profiles\n\nOrganization: DANE\nAuthor: Andrés D. Pérez\nVersion: 3.0\nModification date: 07/21/2021\nDescripción:\n    [Sec 1] Libraries\n    [Sec 2] Data Gathering\n    [Sec 3] Dataset preprocessing and building\n    \n    Returns:\n        Posts dataset for each profile\n*******************************************************\n'

# 1. Libraries

In [2]:
#Library to scrape data from Facebook
#(kept first so the explicit imports below take precedence over any name the star import exports)
from facebook_scraper import *

#Library to estimate process progress
from tqdm import tqdm

#Data handling libraries
import pandas as pd
import numpy as np

#Data handling, time and random number generator libraries
import json, time, random

#File system library (used to create the output folder before saving)
import os

# 2. Data Gathering

In [3]:
"""
ok:
* Politics
gustavopetrourrego
AlvaroUribeVel
ivanduquemarquez
JuanManSantosC

* News
elespectadorcom
eltiempo
CanalRCN
NoticiasCaracol
Canal1Col
RevistaSemana

* Public order
Policianacionaldeloscolombianos
ejercitocolombia
FiscaliaCol

* Economics
RevistaDineroCol
Bancolombia
BancoDavivienda

* Government
MinisterioDeHaciendaYCreditoPublico
DIANCol
DANEColombia
RegistraduriaNacional
departamentonacionaldeplaneacion

* Sports
FCFSeleccionColPage
millosfcoficial
nacionaloficial
independiente.santafe
Americadecalisad
JuniorClubSA
DeportivoCaliOficial
oncecaldasoficial
fedecoltenis

* Artists
AMIGOSDEALEXCAMPOS
juanpisgonzalezWTF

* Other
parquemundoaventura
ParqueSalitreMagico
icetexcolombia
==============
--Noticias
--Finanzas
--Entidades Gobierno
--Deportes
--Artistas

==============
error:
shakira
pirryoficial
10Jamesrodriguez
MALUMAMUSIK
GarciaZarateRadamelFalcao
Juanes
JBalvinOficial
susoelpaspioficial
lopezman
marianapajon
"""


"""
New profiles

Mayors
ClaudiaLopezCL
DQuinteroCalle
JorgeIvanOspin
jaimepumarejoheins
JCardenasRey
JoseManuelRiosAlcalde
WilliamDauAlcalde
andreshurtadoalcalde
AlcaldeJorgeMendoza
jaimepumarejoheins
RuizLuisAntonio
quelleguelobueno
carlospenagos.si
WilliamDauAlcalde
MelloCastroG
JoseRamiroBermudezC
JohnsonVirna
gorkymunozc
andreshurtadoalcalde
EdgarTovarPedraza
lecYopal
harmanfelipe
Jair-Esteban-Beltran-Hinojosa-344509085732777
jc.lopezcastrillon
germanchamorrodlr
Martin-Emilio-Sánchez-Valencia-2251716985096279
JorgeIvanOspin
garialcalde2023

Public opinion
Antonio-Caballero-58050573565
GermanVargasLleras
PrensaJuanLozano
#31 perfiles nuevos
---------------------
error
ingjairoyanez
AlcaldeDeManizales
FunemeAlejandro
pablo.yuvabe
carlosandres.guevaragomez.5
MayaAlcaldePereira
ingjairoyanez

Public opinion
DanielCoronellPeriodista
mjduzan
DanielSamperO
saludhernandezmora1
vickydavilah
paginadesilvaromero
"""

listposts = []

#Profile whose posts are downloaded; change this value to gather another public profile
profile_id = "garialcalde2023"

#get_posts (from facebook_scraper) is iterated over up to 1000 pages of the profile.
#The try wraps the whole loop because request errors are raised while iterating
#get_posts, not by the append/sleep statements inside the loop body.
try:
    for post in tqdm(get_posts(profile_id, pages=1000, extra_info=True, timeout=10)):
        listposts.append(post)
        #Wait 1 to 3 seconds after each post to mitigate blocking by multiple requests
        time.sleep(random.randint(1, 3))
except Exception as err:
    #Best effort: keep the posts gathered so far, but report why the download stopped
    print(f"Download of '{profile_id}' stopped after {len(listposts)} posts: {err!r}")

141it [05:46,  2.46s/it]


In [4]:
#Check the number of gathered posts (length of the listposts list filled by the scraping loop)
len(listposts)

141

In [5]:
#Let's check which data keys (fields) the first gathered post provides
listposts[0].keys()

dict_keys(['post_id', 'text', 'post_text', 'shared_text', 'time', 'image', 'video', 'video_thumbnail', 'video_id', 'likes', 'comments', 'shares', 'post_url', 'link', 'user_id', 'username', 'is_live', 'factcheck', 'shared_post_id', 'shared_time', 'shared_user_id', 'shared_username', 'shared_post_url', 'available', 'images'])

In [6]:
#Let's check the 'post_id' value of the first gathered post
listposts[0]['post_id']

'141583840522602'

# 3. Dataset preprocessing and building

In [7]:
def check_ei(element, cols, missing='void_error'):
    '''
    Extract the values of the columns of interest from one post record.

    Each column is looked up with element[col]. When the lookup fails, the
    `missing` placeholder is stored instead, so the result always holds one
    entry per requested column, in the same order as `cols`.

    Args:
        element (dict):   Json dictionary with data
        cols (list):      List with interest columns
        missing:          Placeholder stored when a column cannot be read
                          (defaults to 'void_error')
    Returns:
        n_row_list (list):  List with pre processed and organized data
    '''
    n_row_list = []
    for col in cols:
        try:
            val = element[col]
        #Only lookup failures become the placeholder; KeyboardInterrupt and
        #SystemExit are no longer swallowed as they were by a bare except.
        except (LookupError, TypeError):
            val = missing
        n_row_list.append(val)
    return n_row_list

In [8]:
#Interest columns definition
df_cols = ['post_id', 'text', 'post_text', 'shared_text', 'time', 'image',
            'video', 'video_thumbnail', 'video_id', 'likes', 'comments',
            'shares', 'post_url', 'link', 'user_id', 'username', 'is_live',
            'factcheck', 'shared_post_id', 'shared_time', 'shared_user_id',
            'shared_username', 'shared_post_url', 'available', 'images',
            'reactions', 'w3_fb_url', 'fetched_time']

#Extract the interest columns of every gathered post (one list per post, ordered as df_cols).
#check_ei already substitutes a placeholder for the fields a post does not provide.
rows = [check_ei(element, df_cols) for element in tqdm(listposts)]

#Posts dataframe creation in a single step.
#The former row-by-row DataFrame.append was quadratic and was removed in pandas 2.0;
#wrapped in a bare except, that removal silently produced an empty dataset.
posts_df = pd.DataFrame(rows, columns=df_cols)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 141/141 [00:00<00:00, 181.02it/s]


In [9]:
#Let's preview the first rows of the posts dataset (head() with its default row count)
posts_df.head()

Unnamed: 0,post_id,text,post_text,shared_text,time,image,video,video_thumbnail,video_id,likes,...,shared_post_id,shared_time,shared_user_id,shared_username,shared_post_url,available,images,reactions,w3_fb_url,fetched_time
0,141583840522602,"DE LA MANO DE DIOS Y CON TU VOTO, ESTE 27 DE O...","DE LA MANO DE DIOS Y CON TU VOTO, ESTE 27 DE O...",Jorge Norberto GARI Hooker publicó un video en...,2019-10-26 13:46:03,,https://video.fbog15-1.fna.fbcdn.net/v/t42.904...,https://scontent.fbog15-1.fna.fbcdn.net/v/t15....,621360511602438.0,10,...,131938101487176.0,2019-09-26 12:04:00,105672677447052.0,Jorge Norberto GARI Hooker,https://facebook.com/story.php?story_fbid=1319...,True,[],void_error,void_error,void_error
1,141574783856841,"DE LA MANO DE DIOS Y CON TU VOTO, MAÑANA 27 DE...","DE LA MANO DE DIOS Y CON TU VOTO, MAÑANA 27 DE...",Jorge Norberto GARI Hooker\n2 de octubre de 20...,2019-10-26 13:03:49,,https://video.fbog15-1.fna.fbcdn.net/v/t42.904...,https://scontent.fbog15-1.fna.fbcdn.net/v/t15....,2316548485261855.0,34,...,133770721303914.0,2019-10-02 10:00:00,105672677447052.0,Jorge Norberto GARI Hooker,https://facebook.com/story.php?story_fbid=1337...,True,[],void_error,void_error,void_error
2,141573717190281,Este Domingo 27 de Octubre de 2019 VOTEMOS UNI...,Este Domingo 27 de Octubre de 2019 VOTEMOS UNI...,Jorge Norberto GARI Hooker publicó un video en...,2019-10-26 12:59:18,,https://video.fbog15-1.fna.fbcdn.net/v/t42.904...,https://scontent.fbog15-1.fna.fbcdn.net/v/t15....,355758055329426.0,10,...,123995182281468.0,2019-09-02 19:32:00,105672677447052.0,Jorge Norberto GARI Hooker,https://facebook.com/story.php?story_fbid=1239...,True,[],void_error,void_error,void_error
3,133770721303914,"""Perseverance and perspective until victory"" w...","""Perseverance and perspective until victory"" w...",,2019-10-02 10:00:53,,https://video.fbog15-1.fna.fbcdn.net/v/t42.904...,https://scontent.fbog15-1.fna.fbcdn.net/v/t15....,2316548485261855.0,55,...,,,,,,True,[],void_error,void_error,void_error
4,132639424750377,PROGRAMACIÓN EVENTO DE LANZAMIENTO,PROGRAMACIÓN EVENTO DE LANZAMIENTO,,2019-09-28 16:35:43,,,,,19,...,,,,,,True,[],void_error,void_error,void_error


In [10]:
#Let's preview the last rows of the posts dataset (tail() with its default row count)
posts_df.tail()

Unnamed: 0,post_id,text,post_text,shared_text,time,image,video,video_thumbnail,video_id,likes,...,shared_post_id,shared_time,shared_user_id,shared_username,shared_post_url,available,images,reactions,w3_fb_url,fetched_time
136,106055397408780,BUENOS DÍAS MI PUEBLO DE PROVIDENCIA Y SANTA C...,BUENOS DÍAS MI PUEBLO DE PROVIDENCIA Y SANTA C...,,2019-08-02 07:39:06,,,,,12,...,,,,,,True,[],void_error,void_error,void_error
137,105733674107619,Contamos Contigo\nWe are counting on you\n#Dec...,Contamos Contigo\nWe are counting on you\n#Dec...,,2019-08-02 01:03:58,,,,,4,...,,,,,,True,[],void_error,void_error,void_error
138,105676960779957,Jorge Norberto GARI Hooker actualizó su número...,,Jorge Norberto GARI Hooker actualizó su número...,2019-08-01 23:15:33,,,,,3,...,,,,,,True,[],void_error,void_error,void_error
139,105674237446896,,,,2019-08-01 23:12:20,,,,,1,...,,,,,,True,[],void_error,void_error,void_error
140,105673264113660,,,,2019-08-01 23:11:07,,,,,7,...,,,,,,True,[],void_error,void_error,void_error


In [11]:
#Let's save the posts dataset for this profile
output_path = './bases/popular_posts/garialcalde2023_fb_posts_06_10_2021.csv'
#Create the destination folder first so the gathered data is not lost to a missing directory
os.makedirs(os.path.dirname(output_path), exist_ok=True)
posts_df.to_csv(output_path)