In [None]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import string
import json
import time
import random

In [None]:
def get_html_page(page_url, timeout=30):
    """Download a page and return it parsed as a BeautifulSoup tree.

    Args:
        page_url: URL of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests.get`` can block indefinitely on a stalled
            connection, which would hang the whole scrape.

    Returns:
        A ``BeautifulSoup`` object built from the response body with the
        built-in ``html.parser``.

    Raises:
        requests.exceptions.RequestException: if the request fails or times out.

    Note:
        The HTTP status code is not checked; an error page is parsed like any
        other response.
    """
    res = requests.get(page_url, timeout=timeout)
    return BeautifulSoup(res.content, "html.parser")

## 1. Gather all the issues

In [None]:
# Fetch the issues index page and collect the text of every listed issue,
# stripped of surrounding whitespace and lower-cased.
issues_url = "https://www.politifact.com/issues/"
issues_soup = get_html_page(issues_url)

issues_tables = issues_soup.find_all('ul', class_='m-list m-list--columns m-list--breezy')

all_issues = [
    item.text.strip().lower()
    for issues_table in issues_tables
    for item in issues_table.find_all('li', class_='m-list__item')
]

In [None]:
# Display how many issue names were collected into all_issues.
len(all_issues)

154

## 2. Scrape the listings from every page of each issue (issues with no listings are skipped)

In [None]:
# Pause lengths, in seconds, used to throttle requests to the site.
END_OF_ISSUE_PAUSE = 5
MIN_PAGE_PAUSE, MAX_PAGE_PAUSE = 5, 10


def _parse_claim_details(claim_details):
    """Split a listing's description text into ``(claim_date, claim_source)``.

    The text is expected to look like ``stated on <date> in <source>:``.
    When there is no `` in <source>`` part, the source is reported as ``'NA'``.
    Raises ``IndexError`` if the text contains no ``'on '`` at all.
    """
    claim_date = claim_details.split('on ')[1].split(' in')[0]
    # Without a source the date runs to the end of the text; drop the trailing
    # colon and whitespace it would otherwise carry into the data.
    claim_date = claim_date.strip().rstrip(':').strip()

    # Split on the first ' in ' only, so a source that itself contains ' in '
    # (e.g. 'a speech in Ohio') is kept whole instead of being truncated.
    pieces = claim_details.split(' in ', 1)
    if len(pieces) < 2:
        return claim_date, 'NA'  # e.g. 'stated on [date]:'

    claim_source = pieces[1].translate(str.maketrans('', '', string.punctuation)).strip()  # remove punctuation
    return claim_date, claim_source


# One dict per scraped listing; converted to a DataFrame in a later cell.
curr_data = []
for issue in all_issues:

    curr_page_num = 1

    while True:
        # NOTE(review): `issue` is the lower-cased display name inserted as-is;
        # confirm it matches the site's category values for multi-word issues.
        curr_url = f"https://www.politifact.com/factchecks/list/?page={curr_page_num}&category={issue}"

        soup = get_html_page(curr_url)

        # `find` returns None when the page has no listing list at all.
        listing_list = soup.find('ul', class_='o-listicle__list')
        curr_page = listing_list.find_all('li') if listing_list is not None else []

        if not curr_page:
            # Missing or empty list: this issue has no (more) results. Breaking
            # on an empty list as well prevents paging forever past the end.
            time.sleep(END_OF_ISSUE_PAUSE)
            break

        for listing in curr_page:
            name = listing.find('a', class_='m-statement__name').text.strip()
            claim_details = listing.find('div', class_='m-statement__desc').text
            claim_date, claim_source = _parse_claim_details(claim_details)
            claim = listing.find('div', class_='m-statement__quote').text.strip()
            # The rating is read from the alt text of the listing's rating image.
            accuracy_rating = listing.find('img', class_='c-image__original', alt=True).get('alt')

            # Optional fields, currently disabled:
            # fact_check_details = listing.find('footer', class_='m-statement__footer').text.strip().split(' • ')
            # try:
            #     fact_checker = fact_check_details[0].split('By ')[1]
            # except IndexError:
            #     fact_checker = 'NA'
            # fact_check_date = fact_check_details[1]
            # source_url = 'https://www.politifact.com' + listing.find('div', class_='m-statement__quote').a.get('href')

            row = {
                "name": name,
                "claim_date": claim_date,
                "claim_source": claim_source,
                "claim": claim,
                'issue': issue,
                "accuracy rating": accuracy_rating,
                # "fact checker": fact_checker,
                # "fact check ruling date": fact_check_date,
                # "url": source_url,
                # "page_num": curr_page_num,
            }

            curr_data.append(row)

        curr_page_num += 1

        # Wait a random 5-10 seconds between page requests.
        time.sleep(random.randint(MIN_PAGE_PAUSE, MAX_PAGE_PAUSE))

## 3. Convert data to Pandas dataframe

In [None]:
# Build a DataFrame with one row per scraped listing and preview the first rows.
# `head` must be called: the bare attribute only displays the bound method.
curr_data_df = pd.DataFrame(curr_data)
curr_data_df.head()

<bound method NDFrame.head of                      name       claim date  \
0            CatholicVote  August 17, 2022   
1              Mark Kelly   August 5, 2022   
2           Stacey Abrams   August 4, 2022   
3           Beto O'Rourke    June 27, 2022   
4      Karine Jean-Pierre   August 3, 2022   
...                   ...              ...   
26200        Barack Obama     May 27, 2009   
26201       Arlen Specter   March 24, 2009   
26202      Michelle Obama  August 25, 2008   
26203      Michelle Obama  August 25, 2008   
26204        Barack Obama   April 18, 2008   

                                            claim source  \
0                                                  an ad   
1                                                  an ad   
2                                                  an ad   
3                                      an Instagram post   
4                                 a White House briefing   
...                                                  ... 

In [None]:
# Display the (rows, columns) dimensions of the scraped DataFrame.
curr_data_df.shape

(26202, 10)

## 4. Remove invalid accuracy ratings

In [None]:
# Keep only the rows whose accuracy rating is not one of the values listed
# in drop_values.
drop_values = ['full-flop', 'half-flip', 'no-flip']
has_dropped_rating = curr_data_df['accuracy rating'].isin(drop_values)
curr_data_df = curr_data_df.loc[~has_dropped_rating]

## 5. Save data to csv file

In [None]:
# Write the filtered data to disk without the index column; the 'utf-8-sig'
# codec writes UTF-8 with a leading byte-order mark.
curr_data_df.to_csv(
    'politifact_data.csv',
    encoding='utf-8-sig',
    index=False,
)