# Project

In [1]:
#  imports and set magics
import numpy as np
import pandas as pd
import tqdm
import time
import datetime
import os
import requests
from bs4 import BeautifulSoup
import re
import pyarrow

## Web scraping 

### Define functions:

In [2]:
def log(response: requests.Response):
    """
    Creates or appends a log-file with information from a requests.get()-call.
    
    The information gathered is:
    - - - - - - - -
        timestamp   :   Current local time.
        status_code :   Status code from requests call.
        length      :   Length of the HTML-string.
        output_path :   Current working directory.
        url         :   The URL of the response.
    """

    # Open or create the csv file
    if os.path.isfile('log'):
        log = open('log','a')
    else: 
        log = open('log','w')
        header = ['timestamp', 'status_code', 'length', 'output_file', 'url'] # Header names
        log.write(';'.join(header) + "\n")
        
    # Gather log information
    status_code = response.status_code # Status code from the request result
    timestamp = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())) # Local time
    length = len(response.text) # Length of the HTML-string
    output_path = os.getcwd() # Output path
    url = response.url # URL-string
    
    # Open the log file and append the gathered log information
    with open('log','a') as log:
        log.write(f'{timestamp};{status_code};{length};{output_path};{url}' + "\n") 


def create_url(page: int) -> str:
    """
    Creates a PolitiFact URL with the given pagenumber.

    Input:
    - - - - - - - -
    page (int) :    Pagenumber for the PolitiFact website.

    Returns:
    - - - - - - - -
    url (str)  :    URL of the PolitiFact website for given page. 
    """

    url = f'https://www.politifact.com/factchecks/list/?page={page}' # Construct url with f-string

    return url


def get_soup(url: str, header: dict) -> BeautifulSoup:
    """
    Constructs a HTML-string from a request of the given URL. 
    Requests are logged, see log(). 

    Input:
    - - - - - - - - 
    url (str)     :    URL of the website to receive the HTML-string from. \n
    header (dict) :    Dictionary to send in the query string for the request.

    Returns:
    - - - - - - - - 
    soup (BeautifulSoup) :  HTML-string in the class of BeutifulSoup with 'lxml' parser.
    """

    response = requests.get(url, headers=header) # Request
    log(response) # Log 
    soup = BeautifulSoup(response.content, 'lxml') # Convert to response to HTML

    return soup


def extract_articles(soup: BeautifulSoup) -> list:
    """
    Extracts articles from HTML-string from the PolitiFact website.

    Input:
    - - - - - - - -
    soup (BeautifulSoup) : HTML-string from the PolitiFact website.

    Returns:
    - - - - - - - - 
    list_of_articles (list) : A list of all articles in the given soup. \n
                              Each element is an article of data structure as BeautifulSoup.
    """
    
    articles = soup.find(class_='o-listicle__list') # Find section with articles
    list_of_articles = articles.find_all('li') # Find all articles as a list

    return list_of_articles


def extract_info(article: BeautifulSoup) -> list:
    """
    Extracts all relevant information from an article on the PolitiFact website.

    Input:
    - - - - - - - - 
    article (BeautifulSoup) :  Article to extract data from, see extract_articles().

    Returns:
    - - - - - - - - 
    [name_txt, name_href, description_txt, quote_txt, quote_href, meter, footer] (list) \n 
    The name and URL of the quoted person, the description of the quote, the quote itself \n
    and link hereof, the truthfulness index, and information on the article in string-format.
    """

    # Statement name 
    name = article.find(class_='m-statement__name')
    name_txt = name.text # name 
    name_href = name['href'] # href

    # Statement description
    description_txt = article.find(class_='m-statement__desc').text

    # Statement quote
    quote = article.find(class_='m-statement__quote').a
    quote_txt = quote.text # name 
    quote_href = quote['href'] # href

    # Statement meter
    meter = article.find(class_='m-statement__meter').div.img['alt']

    # Statement footer
    footer = article.find(class_='m-statement__footer').text

    return [name_txt, name_href, description_txt, quote_txt, quote_href, meter, footer]


def data_politifact(startpage: int, endpage: int, header: dict) -> list:
    """
    Compound function that scrapes an interval of pages from PolitiFact and extracts information for analysis. \n
    Saves extracted information for each page in '/data'-folder as CSV, and logs requests in 'log'. 

    Input:
    - - - - - - - -
    startpage (int) :  The first page to scrape. \n
    endpage   (int) :  The last page to scrape. \n
    header    (dict):  Dictionary to send in the query string for the request.

    Returns:
    - - - - - - - -
    list_of_dfs (list) : A list of pandas.DataFrame containing the extracted information from each page.
    """

    list_of_dfs = [] # initialize empty list for dataframes

    # Loop through pages and track progress with tqdm
    for page in tqdm.tqdm(range(startpage, endpage+1)):
        url = create_url(page) # create url

        try: # circumvent problem with empty pages
            soup = get_soup(url, header) # construct html
            articles = extract_articles(soup) # extract articles 

            output = [] # initialize empty for articles 

            # Loop through articles 
            for article in articles:
                info = extract_info(article) # extract relevant information
                output.append(info) # append output

        except: # skip page
            continue

        # Create DataFrame
        output_df = pd.DataFrame(output, columns=['name_txt', 'name_href', 'description_txt', 'quote_txt', 'quote_href', 'meter', 'footer'])

        # Create data-folder if it doesn't exist
        path = os.getcwd() + '/page_data/'
        if not os.path.exists(path):
            os.makedirs(path)

        # Save CSV-file and append list of DataFrames
        output_df.to_parquet(path + f'data_p{page}.pq') # save parquet file (this keeps datatypes)
        list_of_dfs.append(output_df) # append df

        
        time.sleep(0.5) # sleep for 0.5 sec 

    return list_of_dfs


def get_article_data(article: BeautifulSoup) -> list:
    """
    A function that scrapes each individual article for relevant data. \n

    Input:
    - - - - - - - -
    article (BeautifulSoup) : BeatifulSoup element of article. \n
    
    Returns:
    - - - - - - - -
    [tags, sub_header, text_body, quote_href] : A list of of all relevant data from each politifact article.

    """

    # Extract tags
    tag_soup = article.find(class_='m-list m-list--horizontal')\
        .find_all('a') #Find all tags
    
    list_of_tags = [] #Create empty list for tags 
    
    for tag in tag_soup:
        list_of_tags.append(tag['title']) #Append each tag to list of tags

    # Extract sub-header
    sub_header = article.find(class_='c-title c-title--subline').text #conclusion by journalist

    # Extract entire text body
    text_block = article.find(class_='m-textblock') #Find article's body text
    text_body = []

    for paragraph in text_block.find_all('p'): #Find all paragraphs in article
        text_body.append(paragraph.text) #append them to list

    text_body=' '.join(text_body) #Convert to a single string

    source_block = article.find(class_='m-superbox__content')\
                    .find_all('p') #Find article's source block and paragraphs

    source_body = []
    source_link = []

    for paragraph in source_block:
        source_body.append(paragraph.text) #Find text in source paragraph and append

    for paragraph in source_block:
        try:
            source_link.append(paragraph.a['href']) #append link if it's there
        except:
            continue
        source_link.append('No link') #Append 'no link' if there's no url. 


    sources = [x for x in zip(source_body, source_link)] #Store  sources in list of tuples (the sources 'text' and, if applicable, the link itself)

    return [list_of_tags, sub_header, text_body, sources]


def get_all_articles(list_of_url: list, header: dict) -> pd.DataFrame:
    """
    A compound function that scrapes relevant data from each article on politifact.com and stores this in a DataFrame. \n

    Input:
    - - - - - - - -
    list_of_url (list) : A list of URL's for each article to scrape. \n
    header      (dict) : Dictionary to send in the query string for the request.
    
    Returns:
    - - - - - - - -
    list_of_dfs (list) : A list of of dataframe for each article.

    """

    # Split list_of_url into chunks of 30 URLs
    chunked_list = []
    for i in range(0, len(list_of_url), 30):
        chunked_list.append(list_of_url[i:i+30])

    # Create data-folder if it doesn't exist
    path = os.getcwd() + '/article_data/'
    if not os.path.exists(path):
        os.makedirs(path)

    list_of_dfs = [] # Initialize empty list for dataframes

    for it, chunk in tqdm.tqdm(enumerate(chunked_list)):
        output = [] # Initialize empty output list 

        # Loop through list of URls
        for article_url in chunk: 
            full_url = 'https://www.politifact.com' + article_url
            article = get_soup(full_url, header=header) #Get BeautifulSoup element for each article
            try:    
                article_data = get_article_data(article) #Extract data from article
            except: 
                continue
            article_data.append(article_url) #Quote_href
            output.append(article_data) #Append data to output list
            time.sleep(0.5) #Sleep for 0.5 seconds

        # Create DataFrame
        output_df = pd.DataFrame(data=output, columns=['tags', 'sub_header', 'text_body', 'sources', 'quote_href']) #Convert to DataFrame
        
        # Save pq-file and append list of DataFrames
        name_it = it
        filename = f'article_data_{name_it}.pq'
        while os.path.exists(path + filename):
            name_it += 1
            filename = f'article_data_{name_it}.pq'
        print(filename)
        output_df.to_parquet(path + filename)
        list_of_dfs.append(output_df)

    return list_of_dfs 

### Scrape all pages:

The code will do *one* of the following three:
1. Load full dataset if data has been downloaded and concatenated.
2. Load data from data folder if data has been downloaded. (Note: You will be asked to delete the data folder or dowload missing files manually, if all data has not already been downloaded and saved in the folder.)
3. Download all data. **NB!** Takes ~30 minutes.

When the code has been run, the dataset `data_full`, will contain raw data with summary information for all articles on [PolitiFact](https://www.politifact.com).

In [3]:
header = {  'name_1':'Marius Heltberg Lassen'   ,'email_1':'pgb206@alumni.ku.dk', 
            'name_2':'Jørgen Baun Høst'         ,'email_2':'pjz633@alumni.ku.dk',
            'intention':'Train supervised ML model for academic purposes' } # state names and (non-commerical/academic) intentions for data scraping

path = os.getcwd()
if os.path.exists('page_data_merged.pq'): 
    page_data_merged = pd.read_parquet('page_data_merged.pq')
elif os.path.exists('page_data'):
    assert len(os.listdir('page_data')) >= 722, "Delete folder 'page_data', or download missing files manually"
    dfs = []
    for file in os.listdir('page_data'):
        dfs.append(pd.read_parquet('page_data/' + file))
    page_data_merged = pd.concat(dfs)
    page_data_merged.to_parquet('page_data_merged.pq')
else: 
    dfs = data_politifact(1, 728, header)
    page_data_merged = pd.concat(dfs)
    page_data_merged.to_parquet('page_data_merged.pq')

In [4]:
path = os.getcwd()
if os.path.exists('article_data_merged.pq'):
    article_data_merged = pd.read_parquet('article_data_merged.pq')
elif os.path.exists('article_data'):
    assert len(os.listdir('article_data')) >= 722, "Delete folder 'article_data', or download missing files manually"
    dfs = [] 
    for file in os.listdir('article_data'):
        dfs.append(pd.read_parquet('article_data/' + file))
    article_data_merged = pd.concat(dfs)
    article_data_merged.to_parquet('article_data_merged.pq')
else: 
    dfs = get_all_articles(page_data_merged['quote_href'], header)
    article_data_merged = pd.concat(dfs)
    article_data_merged.to_parquet('article_data_merged.pq')

In [5]:
data_complete = pd.merge(page_data_merged, article_data_merged, how='left', on='quote_href')
data_complete.to_parquet('data_complete.pq')

## Data Structuring

In [6]:
def remove_newline(document):
    document = re.sub('\n', '', document)
    return document 

def description_date(document):
    document = re.sub('stated on ', '', document)
    document = re.findall(r'[\w]* [\d]+, \d\d\d\d', document)[0]
    document = datetime.datetime.strptime(document, '%B %d, %Y')
    return document

def description_forum(document):
    document = re.sub(r'stated on [\w]* [\d]+, \d\d\d\d in ', '', document)
    document = re.sub('\.:', '', document)
    document = re.sub(':', '', document)
    document = re.sub(r'^[a][n]* ', '', document)
    return document

def footer_split(document):
    document = re.sub('By ', '', document).split('•')
    return document 

In [7]:
def cleaner(data: pd.DataFrame) -> pd.DataFrame:
    # Make copy and set index
    df = data.copy()
    df.rename(columns={'quote_href': 'URL'}, inplace=True)
    df.set_index('URL', inplace=True)

    # Remove '\n'
    df['name_txt'] = df['name_txt'].apply(remove_newline)
    df['description_txt'] = df['description_txt'].apply(remove_newline)
    df['quote_txt'] = df['quote_txt'].apply(remove_newline)
    df['footer'] = df['footer'].apply(remove_newline)
    df['sub_header'] = df['sub_header'].apply(remove_newline)

    # Extract description info
    df['description_date'] = df['description_txt'].apply(description_date)
    df['description_forum'] = df['description_txt'].apply(description_forum)
    df.drop('description_txt', axis=1, inplace=True)

    # Extract footer info
    df['footer_split'] = df['footer'].apply(footer_split)
    df['footer_name'] = [x[0].strip() for x in df['footer_split'] ]
    df['footer_date'] = [datetime.datetime.strptime(x[1].strip(), '%B %d, %Y') for x in df['footer_split']]
    df.drop(['footer_split', 'footer'], axis=1, inplace=True)
    
    # Drop sources, and order and rename columns 
    df.rename(columns={'description_date'   : 'Date', 
                       'name_txt'           : 'Name',
                       'name_href'          : 'Name URL',
                       'description_forum'  : 'Forum',
                       'quote_txt'          : 'Quote',
                       'sub_header'         : 'Conclusion',
                       'text_body'          : 'Article',
                       'meter'              : 'Meter',
                       'footer_date'        : 'Article date',
                       'footer_name'        : 'Author'},
              inplace=True)
    df = df[['Date', 'Name', 'Name URL', 'Forum', 'Quote', 'Conclusion', 'Article', 'Meter', 'Article date', 'Author', 'tags']]

    # Extract and sort tags
    tag_values = pd.Series([x for list in df['tags'] for x in list]).value_counts()
    few_tag_obs = tag_values[tag_values <= 50].index.values
    many_tag_obs = tag_values[tag_values > 50].index.values
    for tag in many_tag_obs:
        df[f'Tag: {tag}'] = df['tags'].map(lambda x: tag in x)
    df['Tag: Other'] = df['tags'].map(lambda x: any(tag in x for tag in few_tag_obs))
    df.drop('tags', axis=1, inplace=True)

    return df

In [8]:
import warnings
warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning)

data_clean = cleaner(data_complete)

In [9]:
data_clean.to_parquet('data_clean.pq')

## Descriptive Statistics

In [10]:
data_clean = pd.read_parquet('data_clean.pq')

## Analysis

In [11]:
test = data_clean['Article'][1].encode('ascii', 'ignore')
test.decode()

'A video being shared on social media adds what may seem like a worrisome layer to the monkeypox health emergency in the United States: that the virus is being put in the water. "Monkey pox in the water," someone can be heard saying while recording a news broadcast at a water reclamation facility in Fulton County, where Atlanta is the county seat. "ATL, oh man, they put something else in the water." This post was flagged as part of Facebooks efforts to combat false news and misinformation on its News Feed. (Read more about our partnership with Facebook.) The July 26 news broadcast in the post reported on scientists testing wastewater for COVID-19 and the monkeypox virus to better gauge infection rates in the area. Some people have interpreted the video to mean that theres monkeypox in the Atlanta areas drinking water, and as multiple fact-checkers have noted, thats wrong. Monkeypox was detected in the wastewater there, but that doesnt mean someone put it there, or intentionally tampere

In [12]:
import nltk

In [13]:
def prepare(document):

    # Lowercase and remove non-alphanumeric characters
    document = document.lower()
    document = re.sub(r'[^\w\s]', '', document)

    # Tokenize 
    document_tokens = nltk.tokenize.word_tokenize(document)

    # Delete stop-words
    document_nostop = [i for i in document_tokens if i not in nltk.corpus.stopwords.words('english')]

    # Lemmatize 
    document_lemmatized = [nltk.WordNetLemmatizer().lemmatize(i) for i in document_nostop]
    
    # Covert from list back to string 
    document = ' '.join(document_lemmatized)
    
    return document 

In [14]:
data_clean['Quote clean'] = data_clean['Quote'].apply(prepare)

In [15]:
def partition(df, true, fake): 
    
    part = df[df['Meter'].isin(true + fake)]
    part['Fake'] = part['Meter'].isin(fake).astype(int)

    return part

In [41]:
data_clean['Meter'].unique()

array(['pants-fire', 'false', 'half-flip', 'half-true', 'barely-true',
       'mostly-true', 'true', 'full-flop', 'no-flip'], dtype=object)

'true' > 'mostly-true' > 'half-true' > 'barely-true' > 'false' > 'pants-fire'

In [None]:
(['true'], ['pants-fire']),
(['true'], ['pants-fire', 'false'])
(['true'], ['pants-fire', 'false', 'barely-true'])


In [42]:
test = partition(data_clean, ['true', 'mostle-true'], ['pants-fire', 'false'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  part['Fake'] = part['Meter'].isin(fake).astype(int)


In [43]:
X_train = test['Quote clean'].values
y_train = test['Fake'].values

In [44]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer()
X_train_bag = tfidf.fit_transform(X_train)

from sklearn.linear_model import LogisticRegression

lr = LogisticRegression(random_state=0) #Text classifier
lr.fit(X_train_bag,y_train)
train_preds = lr.predict(X_train_bag)

features = ['_'.join(s.split()) for s in tfidf.get_feature_names_out()]
coefficients = lr.coef_
coefs_df = pd.DataFrame.from_records(coefficients, columns=features)

In [45]:
coefs_df.T.nlargest(10, 0)

Unnamed: 0,0
biden,3.32958
show,2.875544
vaccine,2.745449
covid19,2.672034
photo,2.229911
joe,2.156875
say,2.050232
obamacare,2.023995
coronavirus,2.015709
covid,1.870258


In [46]:
 np.mean([(train_preds==y_train)])

0.8277303289115028

In [None]:


from sklearn.model_selection import KFold 
kfolds = Kfold(n_split=10)


In [65]:
from sklearn.feature_extraction.text import TfidfVectorizer
X_lem_values = X['ready'].values
X_bag = TfidfVectorizer().fit_transform(X_lem_values)