# Crawling fact check data
Source: PolitiFact

## Load libraries

In [87]:
from pathlib import Path
from time import sleep

from bs4 import BeautifulSoup
import pandas as pd
import requests
import spacy

## Check the HTML structure of a listing page

In [88]:
# Fetch a single listing page (page 2 of "elections" claims rated "true")
# so its markup can be inspected before writing the crawler.
url = 'https://www.politifact.com/factchecks/list/?page=2&category=elections&ruling=true'

# A timeout keeps the cell from hanging forever on a stalled connection.
response = requests.get(url, timeout=30)
# Fail loudly on an HTTP error instead of silently parsing an error page.
response.raise_for_status()

soup = BeautifulSoup(response.text, "html5lib")
# Every fact-check on the listing is an <li class="o-listicle__item"> element.
page = soup.find_all('li', class_='o-listicle__item')

# Dump the first item to see which tags/classes hold the fields of interest.
print(page[0])

<li class="o-listicle__item">
<article class="m-statement m-statement--is-medium m-statement--true">
<div class="m-statement__author">
<div class="m-statement__avatar">
<div class="m-statement__image">
<div class="c-image" style="padding-top: 119.27710843373494%;">
<img class="c-image__thumb" height="99" src="https://static.politifact.com/CACHE/images/politifact/mugs/Screen_Shot_2019-04-23_at_2.53.39_PM/3214d9831a1ebeb94994638553905ac6.jpg" width="83"/>
<picture>
<img class="c-image__original" height="178" src="https://static.politifact.com/CACHE/images/politifact/mugs/Screen_Shot_2019-04-23_at_2.53.39_PM/e0db4deb19a51268fcdde8871510580a.jpg" width="166"/>
</picture>
</div>
</div>
</div>
<div class="m-statement__meta">
<a class="m-statement__name" href="/personalities/mac-heller/" title="Mac Heller">
Mac Heller
</a>
<div class="m-statement__desc">
stated on April 7, 2019 in an interview on CNN:
</div>
</div>
</div>
<div class="m-statement__content">
<div class="m-statement__body">
<div

In [57]:
# Smoke-test the field extraction on the first item of the inspected page
post = page[0]

# Locate the three elements that carry the speaker, the "stated on ..." line and the claim
name_tag = post.find('a', class_='m-statement__name')
desc_tag = post.find('div', class_='m-statement__desc')
quote_tag = post.find('div', class_='m-statement__quote')

# Pull the visible text out of each element and trim surrounding whitespace
name, date, text = (tag.get_text().strip() for tag in (name_tag, desc_tag, quote_tag))

print(name, date, text, sep='\n')

Mac Heller
stated on April 7, 2019 in an interview on CNN:
"The 2008 election was the first election in which voters of color comprised over 25 percent of the electorate, and that number is going up."


## Crawl all pages

In [84]:
# Crawl every listing page for each (category, ruling) combination and collect
# speaker name, statement date, claim text, topic and label into parallel lists.
category = ['elections','taxes','environment','immigration','health-check','coronavirus','foreign-policy']
ruling=['true','mostly-true','half-true','barely-true','false','pants-fire']

# Highest page number requested per combination (pages 1..49, same as before).
max_pages = 49
# Seconds to wait on a request before giving up instead of hanging forever.
request_timeout = 30
# Optional pause between requests, in seconds; raise it to be gentler on the server.
request_delay = 0

names = []
dates = []
texts = []
topics = []
labels = []

# Loaded once up front; the pipeline is reused for every post.
nlp = spacy.load("en_core_web_sm")

for cat in category:
    for rul in ruling:
        print(cat,rul)

        for i in range(1, max_pages + 1):
            if request_delay:
                sleep(request_delay)

            url = 'https://www.politifact.com/factchecks/list/?page={0}&category={1}&ruling={2}'.format(i,cat,rul)

            response = requests.get(url, timeout=request_timeout)
            # A 404 is allowed to fall through to the empty-page check below
            # (it may simply mean "past the last page"). Any other HTTP error
            # is raised so a server failure or rate limit is not mistaken for
            # the end of pagination, which would silently truncate the data.
            if response.status_code != 404:
                response.raise_for_status()

            soup = BeautifulSoup(response.text, "html5lib")
            page = soup.find_all('li', class_='o-listicle__item')

            # No items on the page: pagination for this combination is done.
            if len(page)==0:
                print('break at page',i)
                break

            for post in page:
                name_tag = post.find('a', class_='m-statement__name')
                desc_tag = post.find('div', class_='m-statement__desc')
                quote_tag = post.find('div', class_='m-statement__quote')

                # Skip items missing any expected element rather than
                # aborting the whole crawl with an AttributeError.
                if name_tag is None or desc_tag is None or quote_tag is None:
                    continue

                name = name_tag.get_text().strip()
                desc = desc_tag.get_text().strip()
                text = quote_tag.get_text().strip()

                # The description reads like "stated on <date> in <venue>:";
                # keep the first DATE entity spaCy finds, or None when it
                # finds none (previously an IndexError).
                date = next(
                    (ent.text for ent in nlp(desc).ents if ent.label_ == 'DATE'),
                    None,
                )

                names.append(name)
                dates.append(date)
                texts.append(text)
                topics.append(cat)
                labels.append(rul)

elections true
break at page 8
elections mostly-true
break at page 8
elections half-true
break at page 8
elections barely-true
break at page 8
elections false
break at page 18
elections pants-fire
break at page 10
taxes true
break at page 8
taxes mostly-true
break at page 12
taxes half-true
break at page 13
taxes barely-true
break at page 12
taxes false
break at page 12
taxes pants-fire
break at page 5
environment true
break at page 4
environment mostly-true
break at page 5
environment half-true
break at page 5
environment barely-true
break at page 5
environment false
break at page 6
environment pants-fire
break at page 3
immigration true
break at page 5
immigration mostly-true
break at page 8
immigration half-true
break at page 9
immigration barely-true
break at page 9
immigration false
break at page 12
immigration pants-fire
break at page 5
health-check true
break at page 2
health-check mostly-true
break at page 2
health-check half-true
break at page 2
health-check barely-true
break 

## Generate a dataframe

In [85]:
# Combine the parallel result lists into one table, one row per fact-check;
# keyword order fixes the column order: name, date, text, topic, label.
df = pd.DataFrame(dict(name=names, date=dates, text=texts, topic=topics, label=labels))

display(df)

Unnamed: 0,name,date,text,topic,label
0,Joni Ernst,"January 23, 2022","In Iowa, “since we have put a number of the vo...",elections,true
1,Chris Larson,"September 10, 2021","The GOP-controlled Legislature has ""refused to...",elections,true
2,Chip Roy,"July 29, 2021",A quorum-breaking Texas Democrat said in 2007 ...,elections,true
3,Robert Martwick,"May 10, 2021",Opposition to having a fully elected Chicago B...,elections,true
4,Facebook posts,"March 2, 2021",The Georgia House passed a bill that “makes it...,elections,true
...,...,...,...,...,...
7174,Republican Party of Florida,"May 28, 2008","""Fidel Castro endorses Obama.""",foreign-policy,pants-fire
7175,Bill Clinton,"April 10, 2008","""Hillary (Clinton), one time late at night whe...",foreign-policy,pants-fire
7176,Hillary Clinton,"March 17, 2008","""I remember landing under sniper fire.""",foreign-policy,pants-fire
7177,Hillary Clinton,"February 26, 2008","Obama ""basically threatened to bomb Pakistan.""",foreign-policy,pants-fire


## Save the dataset in CSV format

In [86]:
# Write the crawled dataset to disk. The target directory is created first so
# the cell also works on a fresh checkout where `data/` does not exist yet.
output_path = Path('data/politifact.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(output_path)