# Project

In [1]:
#  imports and set magics
import numpy as np
import pandas as pd
import tqdm
import time
import os
import requests
from bs4 import BeautifulSoup
import re

## Web scraping 

### Define functions:

In [2]:
def log(response: requests.Response):
    """
    Creates or appends a log-file with information from a requests.get()-call.
    
    The information gathered is:
    - - - - - - - -
        timestamp   :   Current local time.
        status_code :   Status code from requests call.
        length      :   Length of the HTML-string.
        output_path :   Current working directory.
        url         :   The URL of the response.
    """

    # Open or create the csv file
    if os.path.isfile('log'):
        log = open('log','a')
    else: 
        log = open('log','w')
        header = ['timestamp', 'status_code', 'length', 'output_file', 'url'] # Header names
        log.write(';'.join(header) + "\n")
        
    # Gather log information
    status_code = response.status_code # Status code from the request result
    timestamp = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())) # Local time
    length = len(response.text) # Length of the HTML-string
    output_path = os.getcwd() # Output path
    url = response.url # URL-string
    
    # Open the log file and append the gathered log information
    with open('log','a') as log:
        log.write(f'{timestamp};{status_code};{length};{output_path};{url}' + "\n") 


def create_url(page: int) -> str:
    """
    Creates a PolitiFact URL with the given pagenumber.

    Input:
    - - - - - - - -
    page (int) :    Pagenumber for the PolitiFact website.

    Returns:
    - - - - - - - -
    url (str)  :    URL of the PolitiFact website for given page. 
    """

    url = f'https://www.politifact.com/factchecks/list/?page={page}' # Construct url with f-string

    return url


def get_soup(url: str, header: dict) -> BeautifulSoup:
    """
    Constructs a HTML-string from a request of the given URL. 
    Requests are logged, see log(). 

    Input:
    - - - - - - - - 
    url (str)     :    URL of the website to receive the HTML-string from. \n
    header (dict) :    Dictionary to send in the query string for the request.

    Returns:
    - - - - - - - - 
    soup (BeautifulSoup) :  HTML-string in the class of BeutifulSoup with 'lxml' parser.
    """

    response = requests.get(url, headers=header) # Request
    log(response) # Log 
    soup = BeautifulSoup(response.content, 'lxml') # Convert to response to HTML

    return soup


def extract_articles(soup: BeautifulSoup) -> list:
    """
    Extracts articles from HTML-string from the PolitiFact website.

    Input:
    - - - - - - - -
    soup (BeautifulSoup) : HTML-string from the PolitiFact website.

    Returns:
    - - - - - - - - 
    list_of_articles (list) : A list of all articles in the given soup. \n
                              Each element is an article of data structure as BeautifulSoup.
    """
    
    articles = soup.find(class_='o-listicle__list') # Find section with articles
    list_of_articles = articles.find_all('li') # Find all articles as a list

    return list_of_articles


def extract_info(article: BeautifulSoup) -> list:
    """
    Extracts all relevant information from an article on the PolitiFact website.

    Input:
    - - - - - - - - 
    article (BeautifulSoup) :  Article to extract data from, see extract_articles().

    Returns:
    - - - - - - - - 
    [name_txt, name_href, description_txt, quote_txt, quote_href, meter, footer] (list) \n 
    The name and URL of the quoted person, the description of the quote, the quote itself \n
    and link hereof, the truthfulness index, and information on the article in string-format.
    """

    # Statement name 
    name = article.find(class_='m-statement__name')
    name_txt = name.text # name 
    name_href = name['href'] # href

    # Statement description
    description_txt = article.find(class_='m-statement__desc').text

    # Statement quote
    quote = article.find(class_='m-statement__quote').a
    quote_txt = quote.text # name 
    quote_href = quote['href'] # href

    # Statement meter
    meter = article.find(class_='m-statement__meter').div.img['alt']

    # Statement footer
    footer = article.find(class_='m-statement__footer').text

    return [name_txt, name_href, description_txt, quote_txt, quote_href, meter, footer]


def data_politifact(startpage: int, endpage: int, header: dict) -> list:
    """
    Compound function that scrapes an interval of pages from PolitiFact and extracts information for analysis. \n
    Saves extracted information for each page in '/data'-folder as CSV, and logs requests in 'log'. 

    Input:
    - - - - - - - -
    startpage (int) :  The first page to scrape. \n
    endpage   (int) :  The last page to scrape. \n
    header    (dict):  Dictionary to send in the query string for the request.

    Returns:
    - - - - - - - -
    list_of_dfs (list) : A list of pandas.DataFrame containing the extracted information from each page.
    """

    list_of_dfs = [] # initialize empty list for dataframes

    # Loop through pages and track progress with tqdm
    for page in tqdm.tqdm(range(startpage, endpage+1)):
        url = create_url(page) # create url

        try: # circumvent problem with empty pages
            soup = get_soup(url, header) # construct html
            articles = extract_articles(soup) # extract articles 

            output = [] # initialize empty for articles 

            # Loop through articles 
            for article in articles:
                info = extract_info(article) # extract relevant information
                output.append(info) # append output

        except: # skip page
            continue

        # Create DataFrame
        output_df = pd.DataFrame(output, columns=['name_txt', 'name_href', 'description_txt', 'quote_txt', 'quote_href', 'meter', 'footer'])

        # Create data-folder if it doesn't exist
        path = os.getcwd() + '/data/'
        if not os.path.exists(path):
            os.makedirs(path)

        # Save CSV-file and append list of DataFrames
        output_df.to_csv(path + f'data_p{page}', index=False) # save csv
        list_of_dfs.append(output_df) # append df

        

        time.sleep(0.5) # sleep for 0.5 sec 

    return list_of_dfs

### Scrape all pages:

Do *one* of the following three:
1. Download all data. **NB!** Takes ~30 minutes.
2. Load data from data folder if data has been downloaded.
3. Load full dataset if data has been downloaded and concatenated.

Option 1:

In [8]:
header = {  'name_1':'Marius Heltberg Lassen'   ,'email_1':'pgb206@alumni.ku.dk', 
            'name_2':'Jørgen Baun Høst'         ,'email_2':'pjz633@alumni.ku.dk',
            'intention':'Train supervised ML model for academic purposes' } # state names and (non-commerical/academic) intentions for data scraping
#dfs = data_politifact(1, 728, header)
data_full = pd.concat(dfs)
data_full.to_csv('data_full')

Option 2:

In [19]:
dfs = []
for file in os.listdir('data'):
    dfs.append(pd.read_csv('data/' + file))
data_full = pd.concat(dfs)
data_full.to_csv('data_full', index=False)

Option 3:

In [3]:
data_full = pd.read_csv('data_full')

## Data Structuring

In [4]:
data_full.head()

Unnamed: 0,name_txt,name_href,description_txt,quote_txt,quote_href,meter,footer
0,\nRobert Hurt\n,/personalities/robert-hurt/,"\nstated on April 16, 2015 in a statement.:\n","\nSays the estate tax, ""in many cases,"" forces...",/factchecks/2015/may/03/robert-hurt/hurt-amiss...,false,"\nBy Warren Fiske • May 3, 2015\n"
1,\nMarco Rubio\n,/personalities/marco-rubio/,"\nstated on April 13, 2015 in an interview on ...","\n""The Iranians are now saying that what we're...",/factchecks/2015/may/01/marco-rubio/iran-unite...,true,"\nBy Lauren Carroll • May 1, 2015\n"
2,\nCity of Atlanta\n,/personalities/city-atlanta/,"\nstated on August 8, 2014 in press release:\n",\nTyler Perry’s plan to turn a majority of the...,/factchecks/2015/may/01/city-atlanta/Studio-pl...,half-true,"\nBy Nancy Badertscher • May 1, 2015\n"
3,\nRepresent.us\n,/personalities/representus/,"\nstated on April 30, 2015 in a meme on social...","\n""The U.S. representatives that voted to keep...",/factchecks/2015/apr/30/representus/did-lawmak...,mostly-true,"\nBy Louis Jacobson • April 30, 2015\n"
4,\nSteve Crisafulli\n,/personalities/steve-crisafulli/,"\nstated on April 28, 2015 in an op-ed in the ...","\n""If we choose Obamacare expansion, 600,000 w...",/factchecks/2015/apr/30/steve-crisafulli/crisa...,mostly-true,"\nBy Joshua Gillin • April 30, 2015\n"


### Define article data extraction functions

In [23]:
def get_article_data(article: BeautifulSoup) -> list:
    """
    A function that scrapes each individual article for relevant data. \n

    Input:
    - - - - - - - -
    article (BeautifulSoup) : BeatifulSoup element of article. \n
    
    Returns:
    - - - - - - - -
    [tags,sub_header,text_body,sources, quote_href] : A list of of all relevant data from each politifact article.

    """
    
    quote_href = article.find('meta', property='og:url')['content']
    quote_href = quote_href.replace('https://www.politifact.com', '') #Extract the quote_href from the meta data

    tag_soup = article.find(class_='m-list m-list--horizontal')\
        .find_all('a') #Find all tags
    
    list_of_tags = [] #Create empty list for tags 
    
    for tag in tag_soup:
        list_of_tags.append(tag['title']) #Append each tag to list of tags

    sub_header = article.find(class_='c-title c-title--subline').text #conclusion by journalist

    text_block = article.find(class_='m-textblock') #Find article's body text
    text_body = []

    for paragraph in text_block.find_all('p'): #Find all paragraphs in article
        text_body.append(paragraph.text) #append them to list

    text_body=' '.join(text_body) #Convert to a single string

    source_block = article.find(class_='m-superbox__content')\
                    .find_all('p') #Find article's source block and paragraphs
    
    source_body = []
    source_link = []

    for paragraph in source_block:
        source_body.append(paragraph.text) #Find text in source paragraph and append

    for paragraph in source_block:
        try:
            source_link.append(paragraph.a['href']) #append link if it's there
        except:
            continue
        source_link.append('No link') #Append 'no link' if there's no url. 
                                        #Is this how we wanna do it??
            
    sources = list(zip(source_body,source_link))

    return [list_of_tags, sub_header,text_body,sources, quote_href]

In [20]:
def get_all_articles(list_of_url: list) -> pd.DataFrame:
    """
    A compound function that scrapes relevant data from each article on politifact.com and stores this in a DataFrame. \n

    Input:
    - - - - - - - -
    list_of_url (list) : A list of URL's for each article to scrape. \n
    
    Returns:
    - - - - - - - -
    list_of_dfs (list) : A list of of dataframe for each article.

    """

    output = []

    for article_url in tqdm.tqdm(list_of_url): 
        article = get_soup(article_url, header=header) #Get BeautifulSoup element for each article
        article_data = get_article_data(article) #Extract data from article
        output.append(article_data) #Append data to output list
    
        output_df = pd.DataFrame(data=output, columns=['tags', 'sub_header', 'text_body', 'sources', 'quote_href']) #Convert to DataFrame


    return output_df   

In [24]:
#Let's give it a spin for the first 10 articles
article_url_list = []
url_base = 'https://politifact.com'

for quote_href in data_full['quote_href']:
    article_url_list.append(url_base+quote_href)



100%|██████████| 10/10 [00:02<00:00,  3.58it/s]


In [None]:

chunked_list = list()
chunk_size = 500
for i in range(0, len(article_url_list), chunk_size):
    chunked_list.append(article_url_list[i:i+chunk_size])

chunked_list

In [None]:
test = get_all_articles(article_url_list[:10])

In [29]:
test['text_body'][0]

'U.S. Rep. Robert Hurt full-heartedly joined his Republican House colleagues recently in their near unanimous vote to end the estate tax. "The death tax causes serious problems for family farmers and small business owners who want to have their children and grandchildren continue their life’s work," Hurt, who represents the rural 5th Congressional District, said in a statement posted to his website on \xa0April 16. "In many cases, this excessive and duplicative tax forces them to sell the farm or business just to pay these punitive taxes, keeping them from passing down the culmination of their lifetime of work to the next generation." We wondered whether Hurt was accurate in saying that the tax, "in many cases," forces the heirs of family farms and small businesses to pull up stakes. Many Republicans -- including other members of Virginia’s congressional delegation and House Speaker John Boehner -- made similar statements after the voted repeal vote. Republicans voted 233-3 for the leg