In [1]:
from bs4 import BeautifulSoup
import requests
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor
from functools import reduce
from operator import iconcat 
import pandas as pd
import numpy as np

In [2]:
def crawl_speaker(url):
    fail = []
    results = []
    
    try:
        page = requests.get(url)
        soup = BeautifulSoup(page.content, 'lxml')
    except:
        fail.append(url)
        
    try:
        name = soup.find(class_ = 'm-pageheader__title')
        name_str = name.get_text()
        
        name_wiki = 'https://en.wikipedia.org/wiki/' + name_str
        
        description = soup.find(class_ = 'm-pageheader__body')
        description_str = description.get_text().strip()
        
        score = soup.find_all(class_ = 'm-scorecard__checks')
        score_vals = [int(i.get_text().strip()[0]) for i in score]
        
    except:
        fail.append(url)
        
    results.append([name_str,
                    name_wiki,
                    description_str] + score_vals)
    
    return results, fail

In [3]:
def crawl_factchecker(url):
    fail = []
    results = []
    
    try:
        page = requests.get(url)
        soup = BeautifulSoup(page.content, 'lxml')
    except:
        fail.append(url)
        
    try:
        name = soup.find(class_ = 'm-pageheader__title')
        name_str = name.get_text()
    except:
        print(url)
        
    position = soup.find(class_ = 'label-2')
    position_str = position.get_text().strip()
    

    
    description_str = ''
        
    if soup.find(class_ = 'm-pageheader__body').find('p') != None:
         description = soup.find(class_ = 'm-pageheader__body').find('p')   
         
         description_str = description.get_text().strip()
    
    elif soup.find(class_ = 'm-pageheader__body').find('b') != None:
         description = soup.find(class_ = 'm-pageheader__body').find('b')
         
         description_str = description.get_text().strip()
            
    elif soup.find(class_ = 'secondary-bio text-left bio-6') != None:
         description = soup.find(class_ = 'secondary-bio text-left bio-6') 
         
         description_str = description.get_text().strip()
            
    
    if description_str == '':
         description = soup.find(class_ = 'm-pageheader__body')
        
         description_str = description.contents[-2].get_text().strip()      
                    

                
                
                
        

    

    results.append([name_str,
                    position_str,
                    description_str])

    
    return results, fail

In [4]:
df = pd.read_csv('politifact.csv', encoding = 'latin1')

In [5]:
df.head()

Unnamed: 0.1,Unnamed: 0,url,speaker,speaker_info,statement,rating,where_when,title,content,content_html,factchecker,factchecker_info,factcheck_date
0,0,https://www.politifact.com/factchecks/2022/mar...,Viral image,https://www.politifact.com/personalities/viral...,"CNN staged a fire in Edmonton, Canada, and cla...",https://static.politifact.com/politifact/rulin...,"stated on March 28, 2022 in a Facebook post:","No, this Edmonton jacket isnât proof CNN sta...",CNN anchor Don Lemon has been reporting from U...,"<article class=""m-textblock"">\n<p>CNN anchor D...",Ciara O'Rourke,https://www.politifact.com/staff/ciara-orourke/,"March 28, 2022"
1,1,https://www.politifact.com/factchecks/2022/mar...,Facebook posts,https://www.politifact.com/personalities/faceb...,The United Nations sent out an email instructi...,https://static.politifact.com/politifact/rulin...,"stated on March 17, 2022 in a Facebook post:",Claim that the UN told staff not to use âwar...,The United Nations is being criticized by some...,"<article class=""m-textblock"">\n<p>The United N...",Samantha Putterman,https://www.politifact.com/staff/samantha-putt...,"March 28, 2022"
2,2,https://www.politifact.com/factchecks/2022/mar...,YouTube videos,https://www.politifact.com/personalities/youtu...,A video shows Ukraine President Volodymyr Zele...,https://static.politifact.com/politifact/rulin...,"stated on March 19, 2022 in a YouTube video's ...",Does video show Zelenskyy singing âImagineâ...,A viral YouTube video watched more than 1 mill...,"<article class=""m-textblock"">\n<p>A viral YouT...",Bill McCarthy,https://www.politifact.com/staff/bill-mccarthy/,"March 28, 2022"
3,3,https://www.politifact.com/factchecks/2022/mar...,Eli Bremer,https://www.politifact.com/personalities/eli-b...,âIf you look at windmillsâ and the greenho...,https://static.politifact.com/politifact/rulin...,"stated on March 23, 2022 in an interview with ...",A GOP Senate candidateâs muddled claim about...,Eli Bremer is one of seven Colorado Republican...,"<article class=""m-textblock"">\n<p>Eli Bremer i...",Jon Greenberg,https://www.politifact.com/staff/jon-greenberg/,"March 28, 2022"
4,4,https://www.politifact.com/factchecks/2022/mar...,Viral image,https://www.politifact.com/personalities/viral...,Republicans ânominated the 1st Black woman t...,https://static.politifact.com/politifact/rulin...,"stated on March 25, 2022 in a Facebook post:","No, Janice Rogers Brown wasnât the first Bla...",News reports have described Judge Ketanji Brow...,"<article class=""m-textblock"">\n<p>News reports...",Ciara O'Rourke,https://www.politifact.com/staff/ciara-orourke/,"March 28, 2022"


In [6]:
speaker_list = list(set(df.speaker_info))

In [7]:
len(speaker_list)

4550

In [8]:
results = []
fails = []
for i in speaker_list:
    info = crawl_speaker(i)
    results.append(info[0])
    fails.append(info[1])

In [9]:
df = pd.DataFrame(reduce(iconcat, results), columns=['speaker', 
                                                         'wiki_link', 
                                                         'description', 
                                                         'true', 
                                                         'mostly_true', 
                                                         'half_true', 
                                                         'mostly_false', 
                                                         'false', 
                                                         'pants_on_fire'])

df.to_csv('speaker.csv')

In [10]:
len(df)

4550

In [11]:
df = pd.read_csv('politifact.csv', encoding = 'latin1')

In [12]:
factchecker_list = list(set(df.factchecker_info))

In [13]:
len(factchecker_list)

550

In [14]:
results = []
fails = []
for i in factchecker_list:
    info = crawl_factchecker(i)
    results.append(info[0])
    fails.append(info[1])

In [15]:
df = pd.DataFrame(reduce(iconcat, results), columns=['factchecker', 
                                                         'position', 
                                                         'description'])

df.to_csv('factchecker.csv')

In [16]:
len(df)

550

In [17]:
speaker = pd.read_csv('speaker.csv')

In [18]:
factchecker = pd.read_csv('factchecker.csv')

In [19]:
speaker.head()

Unnamed: 0.1,Unnamed: 0,speaker,wiki_link,description,true,mostly_true,half_true,mostly_false,false,pants_on_fire
0,0,Alex Padilla,https://en.wikipedia.org/wiki/Alex Padilla,Alex Padilla was appointed to fill a U.S. Sena...,0,1,0,0,0,0
1,1,Campaign for America's Future,https://en.wikipedia.org/wiki/ Campaign for Am...,The Campaign for America's Future bills itself...,0,0,0,0,0,1
2,2,Juan Williams,https://en.wikipedia.org/wiki/Juan Williams,Juan Williams is a Fox pundit.,1,1,3,2,0,0
3,3,Matt Tighe,https://en.wikipedia.org/wiki/Matt Tighe,Matt Tighe is the campaign manager for Democra...,0,0,0,1,0,0
4,4,Brent Barton,https://en.wikipedia.org/wiki/Brent Barton,Brent Barton is an attorney and former state r...,0,0,0,1,0,0


In [20]:
factchecker.head()

Unnamed: 0.1,Unnamed: 0,factchecker,position,description
0,0,Mylana Chico,Reporter,Mylana Chico is a reporter with PolitiFact Wes...
1,1,Jacob Carpenter,Reporter,Jacob Carpenter is a reporter at the Milwaukee...
2,2,Jaren Holmes,Reporter,Jaren English is a reporter with PolitiFact Ne...
3,3,Peter Lord,Reporter,Peter B. Lord has been a reporter at The Provi...
4,4,Therese Bottomly,Managing Editor,Therese Bottomly is a managing editor at The O...


In [21]:
speaker.isna().sum()

Unnamed: 0       0
speaker          0
wiki_link        0
description      8
true             0
mostly_true      0
half_true        0
mostly_false     0
false            0
pants_on_fire    0
dtype: int64

In [22]:
factchecker.isna().sum()

Unnamed: 0     0
factchecker    0
position       0
description    0
dtype: int64