# Index
1. ### [Get list of URLs to parse](#URLlist)
1. ### [Content scraping](#content)
1. ### [Saving to CSV file](#export)

<a id='URLlist'></a>

## Get list of URLs to parse

In [3]:
import ast
with open('vanguard_fc_links.txt', 'r') as f:
    file_contents = f.read()
    fc_links = ast.literal_eval(file_contents)


Category: meta-programme, 145
Category: blog, 10
Category: report, 141
Category: spotcheck, 20
Classified URLS: 316
Total URLs retrieved: 328
Some of the URLs retrieved were lost, but you can proceed.


<a id='content'></a>

## Content scraping
#### Title, Article, Author, Date, URL

In [4]:
import requests
from bs4 import BeautifulSoup
import re
from datetime import datetime
from langdetect import detect

meta_export_list = []

def is_english(text):
    try:
        lang = detect(text)
        if lang == 'en':
            return True
        else:
            return False
    except:
        return False
    
def replace_blanks_and_linebreaks(s):
    # Replace '\xa0' with a space
    s = s.replace('\xa0', ' ')
    # Replace '\n' with a space
    s = s.replace('\n', ' ')
    # Replace '\xa0\n' with a space
    s = s.replace('\xa0\n', ' ')
    return s

url = "https://www.vanguardngr.com/2023/03/2023-polls-how-hurricane-obi-changed-political-landscape-in-nigeria-fayose/"
result = requests.get(url)
src = result.content     
soup = BeautifulSoup(src, 'html.parser')

claims, veredicts, justifications = [], [], []

# extracting date of claims
date_raw = soup.find('span', class_= 'published').text
date = (re.findall(r'Published\son\s(\w+\s\w+\s\w+)', str(date_raw)))
date_obj = datetime.strptime(date[0], '%d %B %Y')
formatted_date = date_obj.strftime('%d/%m/%Y')
dates = [str(formatted_date)]



label = soup.find_all('img')[2].get('alt').split('_')[1]
label = soup.find_all('img')[2].get('src').split('_')[-1].split('.png')[0]
veredicts.append(str(label))

in_short = soup.find('div', class_='clearfix text-formatted field field--name-field-report-summary field--type-text-long field--label-hidden field__item').text.split('IN SHORT: ')[1].strip()
if is_english(str(in_short)):
claims.append(in_short)

# extracting justifying text
justifying_text = soup.find('div', class_= 'clearfix text-formatted field field--name-body field--type-text-with-summary field--label-hidden field__item').text.strip()
justifying_text = replace_blanks_and_linebreaks(justifying_text)
justifications.append(justifying_text)

everything = list(zip(claims, veredicts, justifications, dates, urls))
for item in everything:
meta_export_list.append(item)

except:
#print('Something is wrong', url)
pass

print(len(meta_export_list))

68


<a id='report'></a>

## Reports scraping 

In [15]:
import requests
from bs4 import BeautifulSoup
import re
from datetime import datetime

report_export_list = []

def is_english(text):
    try:
        lang = detect(text)
        if lang == 'en':
            return True
        else:
            return False
    except:
        return False

def replace_blanks_and_linebreaks(s):
    # Replace '\xa0' with a space
    s = s.replace('\xa0', ' ')
    # Replace '\n' with a space
    s = s.replace('\n', ' ')
    # Replace '\xa0\n' with a space
    s = s.replace('\xa0\n', ' ')
    return s

for report_link in url_groups['report']:
    try:
        url = report_link
        result = requests.get(url)
        src = result.content     
        soup = BeautifulSoup(src, 'html.parser')
        input_string = str(soup)
        soup = BeautifulSoup(input_string, 'html.parser')

        claims = []
        veredicts = []
        justifications = []

        # number of claims in document
        num_claims = input_string.count('report-claim')

        # extracting date of claims
        date_raw = soup.find('span', class_= 'published').text
        date = (re.findall(r'Published\son\s(\w+\s\w+\s\w+)', str(date_raw)))
        date_obj = datetime.strptime(date[0], '%d %B %Y')
        formatted_date = date_obj.strftime('%d/%m/%Y')
        dates = [str(formatted_date)]*num_claims

        # extracting veredicts
        veredicts_raw = soup.find_all('div', class_ = 'report-verdict indicator')
        for ver in veredicts_raw:
            for j in ver.find('span'):
                veredicts.append(j)

        # extracting urls
        urls = [str(url)]*num_claims

        for i in range(num_claims):
            claim = soup.find('div', class_='report-claim').find('p', class_='claim-content').text.strip()[1:-1]
            if is_english(str(claim)):
                input_string = input_string[input_string.index(str(claim))+len(claim):].strip()
                soup = BeautifulSoup(input_string, 'html.parser')
                justifying_text = soup.find('div', class_='clearfix text-formatted field field--name-field-wysiwyg field--type-text-long field--label-hidden field__item').text.strip()
                justifying_text = replace_blanks_and_linebreaks(justifying_text)
                claims.append(claim)
                justifications.append(justifying_text)
            else:
                pass

        everything = list(zip(claims, veredicts, justifications, dates, urls))
        for item in everything:
            report_export_list.append(item)    
    except:
        pass

print(len(report_export_list))

219


<a id='export'></a>

## Saving to CSV file

In [133]:
res_list = meta_export_list + report_export_list

In [136]:
import pandas as pd
df = pd.DataFrame.from_records(res_list, columns=["Claim", "Label", "Evidence", "Date", "URL"])
pd.options.display.max_rows = None
df

Unnamed: 0,Claim,Label,Evidence,Date,URL
0,A video is being shared on social media with t...,False,A video shared on Facebook claims that a house...,06/04/2023,https://africacheck.org//fact-checks/meta-prog...
1,"Despite persistent rumours on social media, th...",False,A claim is circulating on Facebook in Nigeria ...,31/03/2023,https://africacheck.org//fact-checks/meta-prog...
2,The result of Nigeria's presidential election ...,False,Nigerians hate themselves but yet find it diff...,30/03/2023,https://africacheck.org//fact-checks/meta-prog...
3,An open letter addressed to Bola Tinubu is doi...,False,A post shared on Facebook claims that Adeyinka...,30/03/2023,https://africacheck.org//fact-checks/meta-prog...
4,As the courts examine Bola Tinubu’s electoral ...,False,Nigeria president-elect Bola Tinubu has said t...,28/03/2023,https://africacheck.org//fact-checks/meta-prog...
5,The election in this northeastern state in Nig...,/sites/default/files/inline-images/Binanichecked,Nigeria held elections for governors and state...,24/03/2023,https://africacheck.org//fact-checks/meta-prog...
6,A Nigerian-American professor of history winni...,/sites/default/files/inline-images/Saheefalse,Screenshots circulating on Facebook in Nigeria...,24/03/2023,https://africacheck.org//fact-checks/meta-prog...
7,Did a top Nigerian government official really ...,False,“I am now 100% against Binani becoming the Gov...,24/03/2023,https://africacheck.org//fact-checks/meta-prog...
8,"As Nigerians vote for governors, claims have b...",/sites/default/files/inline-images/Oworufalse,"A photo of Olumide Oworu, the Labour Party can...",18/03/2023,https://africacheck.org//fact-checks/meta-prog...
9,Just days before elections for governor across...,False,A message circulating on WhatsApp and Facebook...,17/03/2023,https://africacheck.org//fact-checks/meta-prog...


In [137]:
set(df['Label'])

{'/sites/default/files/inline-images/Binanichecked',
 '/sites/default/files/inline-images/FasholaChecked',
 '/sites/default/files/inline-images/Oworufalse',
 '/sites/default/files/inline-images/Saheefalse',
 '/sites/default/files/inline-images/Tinubufake',
 '/sites/default/files/inline-images/falseCBN',
 '/sites/default/files/pictures/2022-05/Image%20from%20iOS%20%286%29.jpg',
 '/sites/default/files/users/103.jpg',
 'Correct',
 'Downplayed',
 'Exaggerated',
 'Fake',
 'False',
 'Inccorect',
 'Incorrect',
 'MContext',
 'Misleading',
 'Mostly Correct',
 'Mostly correct',
 'PFalse',
 'Scam',
 'Understated',
 'Unproven',
 'Unrpoven'}

In [138]:
# handling undetected labels
label_list = ['Correct', 'Downplayed', 'Exaggerated', 'Fake', 'False', 'Incorrect', 'MContext', 'Misleading', 'Mostly correct', 'PFalse', 'Scam', 'Understated', 'Unproven', 'Checked']
for i, label in enumerate(df['Label']):
    if label not in label_list:
        #print(i,label)
        if label == 'Mostly Correct':
            df.at[i, 'Label'] = 'Mostly correct'
        elif label == 'Inccorect':
            df.at[i, 'Label'] = 'Incorrect'
        elif label == 'Unrpoven':
            df.at[i, 'Label'] = 'Unproven'
        elif label == 'MContext':
            df.at[i, 'Label'] = 'Missing context'
        elif label == 'PFalse':
            df.at[i, 'Label'] = 'Partly false'
        else: # example: /sites/default/files/inline-images/Binanichecked
            unfinished = label.split('/')[-1]
            unfinished_lower = unfinished.lower()
            for label in label_list:
                label_lower = label.lower()
                if label_lower in unfinished_lower:
                    df.at[i, 'Label'] = label
                #else:
                #    df = df.drop([df.index[i]])

In [140]:
set(df['Label'])

{'/sites/default/files/pictures/2022-05/Image%20from%20iOS%20%286%29.jpg',
 '/sites/default/files/users/103.jpg',
 'Checked',
 'Correct',
 'Downplayed',
 'Exaggerated',
 'Fake',
 'False',
 'Incorrect',
 'MContext',
 'Misleading',
 'Mostly correct',
 'PFalse',
 'Scam',
 'Understated',
 'Unproven'}

In [141]:
label_list = ['Checked','Correct','Downplayed','Exaggerated','Fake','False','Incorrect','MContext','Misleading','Mostly correct','PFalse','Scam','Understated', 'Unproven']
for i, label in enumerate(df['Label']):
    if label not in label_list:
        df = df.drop([df.index[i]])

In [143]:
df

Unnamed: 0,Claim,Label,Evidence,Date,URL
0,A video is being shared on social media with t...,False,A video shared on Facebook claims that a house...,06/04/2023,https://africacheck.org//fact-checks/meta-prog...
1,"Despite persistent rumours on social media, th...",False,A claim is circulating on Facebook in Nigeria ...,31/03/2023,https://africacheck.org//fact-checks/meta-prog...
2,The result of Nigeria's presidential election ...,False,Nigerians hate themselves but yet find it diff...,30/03/2023,https://africacheck.org//fact-checks/meta-prog...
3,An open letter addressed to Bola Tinubu is doi...,False,A post shared on Facebook claims that Adeyinka...,30/03/2023,https://africacheck.org//fact-checks/meta-prog...
4,As the courts examine Bola Tinubu’s electoral ...,False,Nigeria president-elect Bola Tinubu has said t...,28/03/2023,https://africacheck.org//fact-checks/meta-prog...
5,The election in this northeastern state in Nig...,Checked,Nigeria held elections for governors and state...,24/03/2023,https://africacheck.org//fact-checks/meta-prog...
6,A Nigerian-American professor of history winni...,False,Screenshots circulating on Facebook in Nigeria...,24/03/2023,https://africacheck.org//fact-checks/meta-prog...
7,Did a top Nigerian government official really ...,False,“I am now 100% against Binani becoming the Gov...,24/03/2023,https://africacheck.org//fact-checks/meta-prog...
8,"As Nigerians vote for governors, claims have b...",False,"A photo of Olumide Oworu, the Labour Party can...",18/03/2023,https://africacheck.org//fact-checks/meta-prog...
9,Just days before elections for governor across...,False,A message circulating on WhatsApp and Facebook...,17/03/2023,https://africacheck.org//fact-checks/meta-prog...


In [144]:
# check a second time, just in case
for i, label in enumerate(df['Label']):
    if label not in label_list:
        df = df.drop([df.index[i]])

In [145]:
df

Unnamed: 0,Claim,Label,Evidence,Date,URL
0,A video is being shared on social media with t...,False,A video shared on Facebook claims that a house...,06/04/2023,https://africacheck.org//fact-checks/meta-prog...
1,"Despite persistent rumours on social media, th...",False,A claim is circulating on Facebook in Nigeria ...,31/03/2023,https://africacheck.org//fact-checks/meta-prog...
2,The result of Nigeria's presidential election ...,False,Nigerians hate themselves but yet find it diff...,30/03/2023,https://africacheck.org//fact-checks/meta-prog...
3,An open letter addressed to Bola Tinubu is doi...,False,A post shared on Facebook claims that Adeyinka...,30/03/2023,https://africacheck.org//fact-checks/meta-prog...
4,As the courts examine Bola Tinubu’s electoral ...,False,Nigeria president-elect Bola Tinubu has said t...,28/03/2023,https://africacheck.org//fact-checks/meta-prog...
5,The election in this northeastern state in Nig...,Checked,Nigeria held elections for governors and state...,24/03/2023,https://africacheck.org//fact-checks/meta-prog...
6,A Nigerian-American professor of history winni...,False,Screenshots circulating on Facebook in Nigeria...,24/03/2023,https://africacheck.org//fact-checks/meta-prog...
7,Did a top Nigerian government official really ...,False,“I am now 100% against Binani becoming the Gov...,24/03/2023,https://africacheck.org//fact-checks/meta-prog...
8,"As Nigerians vote for governors, claims have b...",False,"A photo of Olumide Oworu, the Labour Party can...",18/03/2023,https://africacheck.org//fact-checks/meta-prog...
9,Just days before elections for governor across...,False,A message circulating on WhatsApp and Facebook...,17/03/2023,https://africacheck.org//fact-checks/meta-prog...


In [147]:
# drop rows with empty claims
df.dropna(subset=['Claim'], inplace=True)

In [150]:
# save to csv file
df.to_csv("africa_check.csv", index=False)