In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import string
import json
import time
import random

In [2]:
def get_html_page(page_url, timeout=30):
    """Download a page and parse it into a BeautifulSoup tree.

    Parameters
    ----------
    page_url : str
        URL of the page to fetch.
    timeout : float or tuple, optional
        Seconds to wait for the server, passed straight to ``requests.get``.
        Defaults to 30 so that a stalled connection cannot hang a long
        scraping run forever (``requests`` applies no timeout by default).

    Returns
    -------
    BeautifulSoup
        The response body parsed with the built-in ``html.parser``.

    Raises
    ------
    requests.exceptions.RequestException
        On connection failures or when the timeout elapses.

    Notes
    -----
    The HTTP status code is not checked: an error page is parsed like any
    other response, and callers decide what to do when expected elements
    are missing from the returned tree.
    """
    res = requests.get(page_url, timeout=timeout)
    return BeautifulSoup(res.content, "html.parser")

## 1. Gather all the issues

In [3]:
# Index page from which the issue names are read.
issues_url = "https://www.politifact.com/issues/"
issues_soup = get_html_page(issues_url)

# Every <ul> carrying exactly this class string holds a group of issue entries.
issues_tables = issues_soup.find_all('ul', class_='m-list m-list--columns m-list--breezy')

# Flatten the entries of all lists into one list of lower-cased, trimmed names.
all_issues = [
    entry.text.strip().lower()
    for issue_list in issues_tables
    for entry in issue_list.find_all('li', class_='m-list__item')
]

In [4]:
# Sanity check: how many issue names were collected from the index page.
len(all_issues)

154

## 2. Scrape data from each page of every issue (if any)

In [5]:
# Translation table that deletes every ASCII punctuation character.
# Built once here instead of once per listing.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def _parse_claim_details(claim_details):
    """Split a description like 'stated on <date> in <source>:' into its parts.

    Parameters
    ----------
    claim_details : str
        Raw text of a listing's description element.

    Returns
    -------
    tuple of (str, str)
        ``(claim_date, claim_source)``. Either value is ``'NA'`` when the
        corresponding marker ('on ' for the date, ' in ' for the source) is
        absent from the text.
    """
    on_parts = claim_details.split('on ')
    if len(on_parts) > 1:
        # Without a source the date is followed directly by ':' (e.g.
        # 'stated on [date]:'), so trim that and surrounding whitespace.
        claim_date = on_parts[1].split(' in')[0].strip().rstrip(':').strip()
    else:
        claim_date = 'NA'

    # maxsplit=1 keeps sources that themselves contain ' in '
    # (e.g. 'a speech in Ohio') intact instead of truncating them.
    in_parts = claim_details.split(' in ', 1)
    if len(in_parts) > 1:
        claim_source = in_parts[1].translate(_PUNCTUATION_TABLE).strip()  # remove punctuation
    else:
        claim_source = 'NA'  # e.g. 'stated on [date]:'

    return claim_date, claim_source


def _parse_listing(listing, issue):
    """Turn one listing element into a row dict for the results table.

    Raises ``AttributeError`` if the listing lacks any of the expected
    elements (name link, description, quote, rating image with alt text).

    Further fields (fact checker, ruling date, article URL) could be read from
    the listing's footer and quote link; they are currently not collected.
    """
    claim_details = listing.find('div', class_='m-statement__desc').text
    claim_date, claim_source = _parse_claim_details(claim_details)
    return {
        "name": listing.find('a', class_='m-statement__name').text.strip(),
        "claim_date": claim_date,
        "claim_source": claim_source,
        "claim": listing.find('div', class_='m-statement__quote').text.strip(),
        'issue': issue,
        # The rating label is carried by the alt text of the rating image.
        "accuracy_rating": listing.find('img', class_='c-image__original', alt=True).get('alt'),
    }


curr_data = []
for issue in all_issues:
    # NOTE(review): `issue` is the lower-cased display name; confirm that the
    # site's `category` parameter accepts names containing spaces or punctuation.
    curr_page_num = 1

    while True:
        curr_url = f"https://www.politifact.com/factchecks/list/?page={curr_page_num}&category={issue}"
        soup = get_html_page(curr_url)

        # An explicit None check replaces the former bare `except:`, which also
        # swallowed KeyboardInterrupt and any unrelated error.
        listicle = soup.find('ul', class_='o-listicle__list')
        curr_page = listicle.find_all('li') if listicle is not None else []

        # No result list, or an empty one, means this issue has no more pages.
        # Stopping on an empty list also prevents paging forever.
        if not curr_page:
            time.sleep(5)
            break

        for listing in curr_page:
            curr_data.append(_parse_listing(listing, issue))

        curr_page_num += 1

        time.sleep(random.randint(5, 10))  # pause 5-10 seconds between page requests

KeyboardInterrupt: 

## 3. Convert data to a pandas DataFrame

In [6]:
# One row per scraped listing; the columns come from the keys of the row dicts.
curr_data_df = pd.DataFrame(data=curr_data)

# Preview the first five rows.
curr_data_df.head(n=5)

Unnamed: 0,name,claim_date,claim_source,claim,issue,accuracy rating
0,Yesli Vega,"October 20, 2022",an interview,They were “never my comments” that a woman can...,abortion,half-true
1,Kris Mayes,"September 25, 2022",a tweet,"Abraham Hamadeh says he will “lock up doctors,...",abortion,half-true
2,Tony Evers,"October 19, 2022",Campaign ad,If a 12-year-old girl became pregnant because ...,abortion,mostly-true
3,National Republican Senatorial Committee,"October 6, 2022",News release,“Mandela Barnes came out in favor of abortion ...,abortion,barely-true
4,Katie Hobbs,"October 9, 2022",an interview on CBS News Face the Nation,Kari Lake has “gone on the record saying she s...,abortion,mostly-true


In [None]:
# (number of rows, number of columns) of the scraped table.
curr_data_df.shape

(26202, 10)

## 4. Remove invalid accuracy ratings

In [None]:
# Drop all rows whose rating is one of the flip/flop labels below
# (presumably position-change labels rather than truthfulness ratings — TODO confirm).
# The column name must be 'accuracy_rating' (underscore), matching the key written
# for every scraped row; looking up 'accuracy rating' raises KeyError on a fresh run.
drop_values = ['full-flop', 'half-flip', 'no-flip']
curr_data_df = curr_data_df[~curr_data_df['accuracy_rating'].isin(drop_values)]

## 5. Save data to a CSV file

In [None]:
# Write the table to disk without the row index. The 'utf-8-sig' codec
# prepends a byte-order mark to the UTF-8 output.
curr_data_df.to_csv(
    'politifact_data.csv',
    encoding='utf-8-sig',
    index=False,
)