# Basic setup for fetching data from the Facebook API

### Remember to add an active access token below and adjust the parameters and fields if necessary.

In [None]:
# must be specified to be able to use methods of package
import sys
sys.path.append('../src') 
import numpy as np
import pandas as pd
from political_ads.api_request import API_request
from political_ads.preprocessor import Preprocessor

# Generates a dataset (in the data directory).
# Pass your own active access token; never commit a real token to the notebook.
# requestor = API_request()
# requestor.generate_dataset(500, "Joe Biden", "<YOUR_ACCESS_TOKEN>")


In [None]:
# Parse the generated text file into a dataframe (with transformations applied)
generated_dataset_path = "..\\data\\generated_dataset.txt"
preprocess = Preprocessor()
data = preprocess.file_to_df(generated_dataset_path)

In [None]:
# Preview the first ten rows of `data`
data.head(10)

In [None]:
'''
Visualization attempts
https://pandas.pydata.org/pandas-docs/dev/getting_started/intro_tutorials/09_timeseries.html
'''
import matplotlib.pyplot as plt
import seaborn as sns

# Two stacked panels in one figure: impressions on top, spend below,
# each plotted against the ad creation time
fig, axes = plt.subplots(nrows=2, ncols=1, figsize=(8, 4))
impressions_ax, spend_ax = axes
data.plot(x="ad_creation_time", y="impressions", ax=impressions_ax)
data.plot(x="ad_creation_time", y="spend", ax=spend_ax)

In [None]:
# Shade the band between the lower and upper impression bounds, then draw
# the impressions series on top of it
creation_times = data["ad_creation_time"]
plt.fill_between(
    x=creation_times,
    y1=data["impressions_lo"],
    y2=data["impressions_hi"],
    alpha=1,
    color="green",
)
plt.plot(creation_times, data["impressions"])
plt.show()

In [None]:
# Amount spent by Facebook page
page_aggregations = {
    # Number of ads per page
    "no_ads": ("id", "count"),
    # Summed spend bounds and summed impression bounds
    "spend_lo": ("spend_lo", "sum"),
    "spend_hi": ("spend_hi", "sum"),
    "impressions_lo": ("impressions_lo", "sum"),
    "impressions_hi": ("impressions_hi", "sum"),
    # Mean impressions and mean spend per ad
    "avg_impressions": ("impressions", "mean"),
    "avg_spend": ("spend", "mean"),
}

grouped_by_page = data.groupby("page_name")
by_page = grouped_by_page.agg(**page_aggregations).reset_index()

by_page

In [None]:
# Display range of spend per page

# Maybe normalize it
page_names = by_page["page_name"]
plt.fill_between(x=page_names, y1=by_page["spend_lo"], y2=by_page["spend_hi"], alpha=1, color="green")
# `by_page` has no "spend" column (only the summed lo/hi bounds and
# avg_spend), so by_page["spend"] raised a KeyError. Draw the midpoint of the
# summed bounds as the line inside the band instead.
spend_mid = (by_page["spend_lo"] + by_page["spend_hi"]) / 2
plt.plot(page_names, spend_mid)
plt.show()

In [None]:
# Congress member filtering
# Get page-ids:
# https://commentpicker.com/find-facebook-id.php

# read_csv already returns a DataFrame, so the extra pd.DataFrame(...) wrapper
# was redundant and has been dropped
congress_members = pd.read_csv("..\\src\\data_sets\\legislators-current.csv")

In [None]:
# Show the "facebook" column of the legislators table
congress_members["facebook"]

In [None]:
# Count the legislators whose "facebook" value is missing
congress_members["facebook"].isnull().sum()

In [None]:
# Scraper experiment
# https://github.com/kevinzg/facebook-scraper

from facebook_scraper import get_page_info

# Fetch the page info for one known account and display the result
sample_account = "SenatorBobCasey"
page = get_page_info(account=sample_account)

page

In [None]:
import time

members_fbNames_test = []  # list of (facebook handle, page name, page identifier) tuples

# Only legislators with a Facebook handle can be looked up, so skip missing values
for handle in congress_members["facebook"].dropna():
    page = get_page_info(account=handle)
    print(page)
    # Both keys must be present. The previous `"name" and "identifier" in page`
    # only tested for "identifier", because the non-empty literal "name" is
    # always truthy.
    if "name" in page and "identifier" in page:
        members_fbNames_test.append((handle, page["name"], page["identifier"]))
        print(f"{page['name']} id:{page['identifier']}")
    # Pause after each request (presumably to throttle the scraper); rows
    # without a handle no longer trigger a pointless sleep
    time.sleep(0.5)

In [None]:
# Turn the collected (handle, name, identifier) tuples into a dataframe
members_names = pd.DataFrame(
    data=members_fbNames_test,
    columns=["facebook", "page_name", "identifier"],
)
# Persist the lookup table as csv
members_names.to_csv(
    "..\\src\\data_sets\\legislators_page_ids.csv",
    index=False,
    header=True,
)

In [None]:
# Keep an independent copy of the scraped names dataframe
members_names_cp = members_names.copy()

In [None]:
# Restrict the legislators table to the columns needed downstream
legislator_columns = [
    "first_name",
    "last_name",
    "full_name",
    "type",
    "state",
    "district",
    "party",
    "facebook",
]
congress_members_fb = congress_members[legislator_columns]

# Left join on the Facebook handle: every legislator is kept, scraped page
# details are attached where a handle matched
merged = pd.merge(congress_members_fb, members_names, on="facebook", how="left")

In [None]:
# Replace missing identifiers with 0 and store the column as 64-bit integers
merged["identifier"] = merged["identifier"].fillna(0).astype(np.int64)

In [None]:
# Cast the identifier column to int64
# NOTE(review): looks redundant once the fillna(0).astype(np.int64) cell has run — confirm
merged["identifier"] = merged["identifier"].astype(np.int64)

In [None]:
# Save the merged legislator table as csv (with header row, without the index)
merged.to_csv("..\\src\\data_sets\\legislators_fb_info.csv", index=False, header=True)

In [None]:
# Load the saved legislator table from csv
congress = pd.read_csv("..\\src\\data_sets\\legislators_fb_info.csv")

In [None]:
# Count the rows with a missing full_name
congress["full_name"].isna().sum()

In [None]:
# Load the Facebook Ad Library report csv (lifelong US advertisers, per the file name)
report = pd.read_csv("..\\data\\FacebookAdLibraryReport_2021-10-15_US_lifelong_advertisers.csv")

In [None]:
# Sanity check: evaluates to True when no "Page Name" contains the nonsense string
len(report[report["Page Name"].str.contains("Kpoadjioasdjqwodjas", na=False)]) == 0

In [None]:
def get_page_id(data: pd.DataFrame, name: str):
    """Return the first-column value of the first row whose "Page Name" contains ``name``.

    Args:
        data: Dataframe with a "Page Name" column. The returned value is taken
            from its first column (presumably the page id — confirm against
            the report's column layout).
        name: Text to look for. It is matched literally as a case-sensitive
            substring, so regex metacharacters such as "." or "(" in a name
            are harmless.

    Returns:
        The first-column value of the first matching row, or the string
        "no match" when no row matches or ``name`` is not a string
        (e.g. NaN for a missing name).
    """
    # A missing name (NaN) cannot be searched for; report it as unmatched
    # instead of letting str.contains raise a TypeError
    if not isinstance(name, str):
        return "no match"
    # regex=False: treat the name as plain text, not as a regular expression
    match = data[data["Page Name"].str.contains(name, na=False, regex=False)]
    if match.empty:
        return "no match"
    # One positional lookup (row 0, column 0) instead of chained indexing
    return match.iloc[0, 0]


In [None]:
# Try the lookup for one known name
get_page_id(report, "Bernie Sanders")

In [None]:
def _page_id_for_row(row):
    """Look up the report's page id for the full_name of one legislator row."""
    return get_page_id(report, row["full_name"])


# Row-wise lookup; rows without a match receive the "no match" marker
congress["page_id"] = congress.apply(_page_id_for_row, axis=1)

In [None]:
# Per-column count of non-null values among the rows without a matched page id
unmatched_rows = congress["page_id"] == "no match"
congress[unmatched_rows].count()

# Write the table, now including page_id, back to disk
congress.to_csv(
    "..\\src\\data_sets\\legislators_fb_info.csv",
    index=False,
    header=True,
)


In [None]:
import pandas as pd
# Reload the saved legislator table from csv
congress = pd.read_csv("..\\src\\data_sets\\legislators_fb_info.csv")

In [None]:
# Display the legislator table
congress

In [None]:
# must be specified to be able to use methods of package
import sys
sys.path.append('../src') 
import numpy as np
import pandas as pd
from political_ads.api_request import API_request
from political_ads.preprocessor import Preprocessor

import os

# Generates a dataset (in the data directory) for a handful of test page ids
test_ids = [6266829799, 9351652533, 2220944231249057, 101501768597429, 876319055750124, 512954815727434]
requestor = API_request()
# Read the access token from the environment instead of hard-coding it in the
# notebook (a committed token is leaked to everyone with repository access).
# Raises KeyError if FB_ACCESS_TOKEN is not set.
access_token = os.environ["FB_ACCESS_TOKEN"]
requestor.generate_dataset_by_pageId(500, test_ids, access_token)


In [None]:
import json

dataset_path = "..\\data\\dataset_by_pageId.txt"

# read json file (its top-level value must be a list, since it is extended below)
with open(dataset_path) as f:
    json_data = json.load(f)

# Snapshot of the loaded records: extending a list with an alias of itself is
# easy to misread, so make the copy explicit
json_data_1 = list(json_data)

# Append the records a second time, i.e. the stored content is duplicated
json_data.extend(json_data_1)

# Serialise before opening the target so a serialisation error cannot leave a
# truncated file behind; the context manager guarantees the file is closed
final_str = json.dumps(json_data)
with open(dataset_path, "w") as json_file:  # filepath and name specified here!
    json_file.write(final_str)


In [None]:
 # load existing file
with open('..\\data\\dataset_by_pageId.txt') as f:
    existing_file = json.load(f)
# NOTE: extend() with a str adds one list element per character of the placeholder
existing_file.extend("bablabl") # add string to file
# Serialise before opening the target so a serialisation error cannot leave a
# truncated file behind; the context manager guarantees the file is closed
final_file_str = json.dumps(existing_file)
with open("..\\data\\dataset_by_pageId_appended.txt", "w") as json_file: # filepath and name specified here!
    json_file.write(final_file_str)

In [None]:
# must be specified to be able to use methods of package
import sys
sys.path.append('../src') 
import numpy as np
import pandas as pd
from political_ads.api_request import API_request
from political_ads.preprocessor import Preprocessor

# Parse the per-page-id dataset into a dataframe and show its dimensions
page_id_dataset_path = "..\\data\\dataset_by_pageId.txt"
preprocess = Preprocessor()
data = preprocess.file_to_df(page_id_dataset_path)

data.shape

In [None]:
# Reload the saved legislator table and preview its first ten rows
congress = pd.read_csv("..\\src\\data_sets\\legislators_fb_info.csv")
congress.head(10)

In [None]:
# test
# trying to make continuous loop to fetch all data
import sys
sys.path.append('../src') 
import numpy as np
import pandas as pd
from political_ads.api_request import API_request
from political_ads.preprocessor import Preprocessor

import os

requestor = API_request()

# Read the access token from the environment instead of hard-coding it in the
# notebook. Raises KeyError if FB_ACCESS_TOKEN is not set.
access_token = os.environ["FB_ACCESS_TOKEN"]

batch_size = 10  # number of legislator rows handled per request
# Never look past the end of the table: the previous `while count < 20`
# never terminated when `congress` had fewer than 20 rows, because `count`
# stopped growing at len(congress).
row_limit = min(20, len(congress))

count = 0
while count < row_limit:
    # Next slice of up to `batch_size` page ids (shorter at the end of the table)
    query = congress["page_id"].iloc[count:count + batch_size].tolist()
    count += len(query)
    # clean query / remove "no match"
    clean_query = [page_id for page_id in query if page_id != "no match"]
    print(clean_query)

    if not clean_query:
        # No usable page id in this batch, so there is nothing to request
        continue
    requestor.append_dataset_by_pageId(500, clean_query, access_token)
        

In [18]:
# Inspect the row labelled 0 of the legislator table
congress.loc[0]

first_name                Sherrod
last_name                   Brown
full_name           Sherrod Brown
type                          sen
state                          OH
district                      NaN
party                    Democrat
facebook      SenatorSherrodBrown
page_name                     NaN
identifier                      0
page_id                6266829799
Name: 0, dtype: object

In [21]:
# test
# fetch all ads by one page and concatenate them to final file
import sys
sys.path.append('../src') 
import numpy as np
import pandas as pd
from political_ads.api_request import API_request
from political_ads.preprocessor import Preprocessor
import json


import os

requestor = API_request()

# Read the access token from the environment instead of hard-coding it in the
# notebook. Raises KeyError if FB_ACCESS_TOKEN is not set.
access_token = os.environ["FB_ACCESS_TOKEN"]

# Collect the responses of all legislators with a known page id in one list
final_file = []
for politician_name, page_id in zip(congress["full_name"], congress["page_id"]):
    print(f"Try politician {politician_name}")
    if page_id == "no match":
        continue
    print("Page ID exists!")
    final_file.extend(requestor.dataset_by_pageId_asString(500, [page_id], access_token))

# Serialise before opening the target so a serialisation error cannot leave a
# truncated file behind; the context manager guarantees the file is closed
final_file_str = json.dumps(final_file)
with open("..\\data\\dataset_by_pageId_appended.txt", "w") as json_file:  # filepath and name specified here!
    json_file.write(final_file_str)


SyntaxError: f-string: unmatched '[' (Temp/ipykernel_7068/1672595199.py, line 19)

In [None]:


        # # load existing file
        # with open('..\\data\\dataset_by_pageId_appended.txt') as f:
        #     existing_file = json.load(f)
# existing_file.extend(final_response) # add string to file
import json

# Serialise before opening the target so a serialisation error cannot leave a
# truncated file behind; the context manager guarantees the file is closed
final_file_str = json.dumps(final_file)
with open("..\\data\\dataset_by_pageId_appended.txt", "w") as json_file:  # filepath and name specified here!
    json_file.write(final_file_str)

In [None]:
# must be specified to be able to use methods of package
import sys
sys.path.append('../src') 
import numpy as np
import pandas as pd
from political_ads.api_request import API_request
from political_ads.preprocessor import Preprocessor

# Parse the appended per-page-id dataset into a dataframe
appended_dataset_path = "..\\data\\dataset_by_pageId_appended.txt"
preprocess = Preprocessor()
data = preprocess.file_to_df(appended_dataset_path)

In [None]:
# Per page_name, count the non-null values in every other column
data.groupby(["page_name"]).agg("count")