In [278]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
from datetime import datetime
from selenium import webdriver


In [227]:
def make_url_list(first_url, num_pages=20):
    urls = [first_url]
    
    base_url = first_url[0:26]
    
    for i in range(2,num_pages):
        modified_url = base_url + 'page/' + str(i)
        urls.append(modified_url)
        
    return(urls)

In [243]:
def get_claims(url):
    '''
    URL taken from factcheck.afp.com
    
    This just extracts the url, headline and date from the list
    Could actually navigate to extracted URL and get more info
    
    Returns df with url, headline, date, time
    '''
    page = requests.get(url)

    soup = BeautifulSoup(page.content, 'html.parser')

    urls = list()
    base_url = 'https://checkyourfact.com'

    headlines = list()

    dates = list()

    feedpost = soup.find('atom')
    feats = feedpost.find('features')
    
    if feats: 
        cards = feats.find_all("a")
    
        for card in cards:
            url = card['href']
            urls.append(base_url + url)
            hl = card.find("span").get_text()
            headlines.append(hl)
            dt = card['href'][1:11]
            dates.append(dt) # needs refactoring
        
    article_cards = feedpost.find('articles').find_all("a")
    
    for card in article_cards:
        url = card['href']
        urls.append(base_url + url)
        hl = card.find("name").get_text()
        headlines.append(hl)
        dt = card['href'][1:11]
        dates.append(dt)

    df = pd.DataFrame(list(zip(urls, headlines, dates)), 
                       columns =['url', 'headline', 'date']) 

    df['date'] = df['date'].map(lambda dt_obj: datetime.strptime(dt_obj, '%Y/%m/%d').strftime('%d-%m-%Y'))

    return(df)

In [244]:
first_url = 'https://checkyourfact.com/?fbclid=IwAR1WS2XZjpejAlLm34KrHtYvT0fy9ep5FFbTVLSREAPtxGVPLrxHXRPkm0k'

urls = make_url_list(first_url, num_pages=25)

all_data = pd.DataFrame()

for url in urls:
    print(url)
    page_data = get_claims(url)
    all_data = pd.concat([all_data, page_data])
    
all_data.to_csv('/Users/madelinecampbell/Documents/GitHub/projects/misinformation_2021/data/checkyourfact.csv')


https://checkyourfact.com/?fbclid=IwAR1WS2XZjpejAlLm34KrHtYvT0fy9ep5FFbTVLSREAPtxGVPLrxHXRPkm0k
https://checkyourfact.com/page/2
https://checkyourfact.com/page/3
https://checkyourfact.com/page/4
https://checkyourfact.com/page/5
https://checkyourfact.com/page/6
https://checkyourfact.com/page/7
https://checkyourfact.com/page/8
https://checkyourfact.com/page/9
https://checkyourfact.com/page/10
https://checkyourfact.com/page/11
https://checkyourfact.com/page/12
https://checkyourfact.com/page/13
https://checkyourfact.com/page/14
https://checkyourfact.com/page/15
https://checkyourfact.com/page/16
https://checkyourfact.com/page/17
https://checkyourfact.com/page/18
https://checkyourfact.com/page/19
https://checkyourfact.com/page/20
https://checkyourfact.com/page/21
https://checkyourfact.com/page/22
https://checkyourfact.com/page/23
https://checkyourfact.com/page/24


In [230]:
urls


['https://checkyourfact.com/?fbclid=IwAR1WS2XZjpejAlLm34KrHtYvT0fy9ep5FFbTVLSREAPtxGVPLrxHXRPkm0k',
 'https://checkyourfact.com/page/2',
 'https://checkyourfact.com/page/3',
 'https://checkyourfact.com/page/4',
 'https://checkyourfact.com/page/5',
 'https://checkyourfact.com/page/6',
 'https://checkyourfact.com/page/7',
 'https://checkyourfact.com/page/8',
 'https://checkyourfact.com/page/9',
 'https://checkyourfact.com/page/10',
 'https://checkyourfact.com/page/11',
 'https://checkyourfact.com/page/12',
 'https://checkyourfact.com/page/13',
 'https://checkyourfact.com/page/14',
 'https://checkyourfact.com/page/15',
 'https://checkyourfact.com/page/16',
 'https://checkyourfact.com/page/17',
 'https://checkyourfact.com/page/18',
 'https://checkyourfact.com/page/19',
 'https://checkyourfact.com/page/20',
 'https://checkyourfact.com/page/21',
 'https://checkyourfact.com/page/22',
 'https://checkyourfact.com/page/23',
 'https://checkyourfact.com/page/24']

In [280]:
url = 'https://factcheck.thedispatch.com/archive'

# This starts an instance of the Chrome browser to be controlled
# by the selenium webdriver
driver = webdriver.Chrome()
 
# This is simply the driver getting the page
driver.get(url)

# try-except acts as a loop that on each iteration is asking if more exists
while True:
    try:
        loadmore = driver.find_element_by_id("bottomPager")
        loadmore.click()
    except NoSuchElementException:
        print("Reached bottom of page")
        break

soup = BeautifulSoup(driver.page_source,'html.parser')

# page = requests.get(url)

# soup = BeautifulSoup(page.content, 'html.parser')

urls = list()

headlines = list()
reviews = list()

dates = list()

feedpost = soup.find('div', attrs={'class': 'portable-archive-list'})
cards = feedpost.find_all('div', {'class':'post-preview portable-archive-post has-image has-author-line'})
for card in cards:
    url = card.find("a")['href']
    urls.append(url)
#     hl = card.find("a").get_text()
#     headlines.append(hl)
    rev = card.find_all("a")[1].get_text()
    reviews.append(rev)
    
    page2 = requests.get(url)
    soup2 = BeautifulSoup(page2.content, 'html.parser')
    title = soup2.find("h1").get_text()
    headlines.append(title)
    dt = soup2.find("table").find('td', {'class': 'post-meta-item post-date'}).get_text()
    dates.append(dt) # needs refactoring

df = pd.DataFrame(list(zip(urls, headlines, dates, claims)), 
                   columns =['url', 'headline', 'review', 'date']) 

#df['date'] = df['date'].map(lambda dt_obj: datetime.strptime(dt_obj, '%Y/%m/%d').strftime('%d-%m-%Y'))


WebDriverException: Message: 'chromedriver' executable needs to be in PATH. Please see https://sites.google.com/a/chromium.org/chromedriver/home


In [275]:
df

Unnamed: 0,url,headline,review,date
0,https://factcheck.thedispatch.com/p/did-the-wa...,Did the Washington Post Correct a Report About...,13 hr ago,CLAIM: Video of President Joe Biden was digita...
1,https://factcheck.thedispatch.com/p/do-60-perc...,Do 60 Percent of Republicans Support the $1.9 ...,Mar 12,"THE CLAIM: On Tuesday, hours before police say..."
2,https://factcheck.thedispatch.com/p/did-a-cali...,Did a California Mayor Pro Tempore Plead Guilt...,Mar 11,As Congress considers a sweeping bid to overha...
3,https://factcheck.thedispatch.com/p/did-oprah-...,Did Oprah Winfrey ‘Pimp’ Young Girls to Convic...,Mar 11,"CLAIM: COVID-19 vaccines contain aluminum, a t..."
4,https://factcheck.thedispatch.com/p/is-the-eff...,Is the Effort to Recall Gavin Newsom About ‘Te...,Mar 9,CLAIM: Columbia University is holding separate...
5,https://factcheck.thedispatch.com/p/did-navy-s...,Did Navy SEALs Loyal to President Trump Arrest...,Mar 9,"CLAIM: In all of 2020, there were only 9,000 i..."
6,https://factcheck.thedispatch.com/p/was-the-cp...,Was the CPAC Stage Designed to Look Like Nazi ...,Mar 9,CLAIM: Photos showing gasoline listed at more ...
7,https://factcheck.thedispatch.com/p/did-evange...,Did Evangelical Leaders Pray Over the Golden T...,Mar 2,CLAIM: President Joe Biden is not screening im...
8,https://factcheck.thedispatch.com/p/did-merric...,Did Merrick Garland Deny Antifa Attacks Were D...,Mar 1,CLAIM: If you receive the COVID-19 vaccine and...
9,https://factcheck.thedispatch.com/p/did-joe-bi...,Did Joe Biden Revoke the Keystone Pipeline Per...,Mar 1,CLAIM: An image shows the first photograph eve...


In [254]:
url = 'https://factcheck.thedispatch.com/archive'

page = requests.get(url)

soup = BeautifulSoup(page.content, 'html.parser')

urls = list()
# base_url = 'https://checkyourfact.com'

headlines = list()

dates = list()

feedpost = soup.find('div', attrs={'class': 'portable-archive-list'})
cards = feedpost.find_all('div', {'class':'post-preview portable-archive-post has-image has-author-line'})

In [262]:
#feedpost

cards[0].find("a")['href']

'https://factcheck.thedispatch.com/p/did-the-washington-post-correct-a'

In [263]:
url = 'https://factcheck.thedispatch.com/p/did-the-washington-post-correct-a'
page2 = requests.get(url)
soup2 = BeautifulSoup(page2.content, 'html.parser')
title = soup2.find("h1").get_text()
dt = soup2.find("table").find('td', {'class': 'post-meta-item post-date'}).get_text()

'Did the Washington Post Correct a Report About Trump’s Call With the Georgia Secretary of State?'

In [272]:
soup2.find("table").find('td', {'class': 'post-meta-item post-date'}).get_text()

'13 hr ago'

In [240]:
# feedpost

# feedpost.find('features').find_all("a")[0]

feedpost = soup.find('atom')
cards = feedpost.find('articles').find_all("a")
cards[0].find("name").get_text()


# feedpost.find('articles')



In [213]:
dt_obj = cards[0]['href'][1:11] #[1].get_text() #.find("title").get_text()

datetime.strptime(dt_obj, '%Y/%m/%d').strftime('%d-%m-%Y')

'17-03-2021'

In [218]:
url[0:26]

'https://checkyourfact.com/'

In [176]:
df = pd.DataFrame(list(zip(urls, headlines, dates, claims)), 
                   columns =['url', 'headline', 'date', 'claim']) 

df.head()

Unnamed: 0,url,headline,date,claim
0,https://apnews.com/article/fact-checking-afs:C...,Video of Biden with reporters was not digitall...,"March 17, 2021 GMT",CLAIM: Video of President Joe Biden was digita...
1,https://apnews.com/article/fact-checking-afs:C...,Racist Facebook post attributed to Atlanta sho...,"March 17, 2021 GMT","THE CLAIM: On Tuesday, hours before police say..."
2,https://apnews.com/article/fact-checking-afs:C...,Misleading claims swirl around US election ove...,"March 17, 2021 GMT",As Congress considers a sweeping bid to overha...
3,https://apnews.com/article/fact-checking-afs:C...,US and EU COVID vaccines don’t contain aluminum,"March 16, 2021 GMT","CLAIM: COVID-19 vaccines contain aluminum, a t..."
4,https://apnews.com/article/fact-checking-afs:C...,"Columbia University holds a main graduation, a...","March 16, 2021 GMT",CLAIM: Columbia University is holding separate...


In [182]:
temp_dt = df.date[0]
temp_dt

'March 17, 2021 GMT'

In [187]:
dt_obj = datetime.strptime(temp_dt, '%B %d, %Y %Z')
dt_obj.strftime('%d-%m-%Y')

'17-03-2021'

In [190]:
df['date'] = df['date'].map(lambda dt_obj: datetime.strptime(dt_obj, '%B %d, %Y %Z').strftime('%d-%m-%Y'))



Unnamed: 0,url,headline,date,claim,date2
0,https://apnews.com/article/fact-checking-afs:C...,Video of Biden with reporters was not digitall...,"March 17, 2021 GMT",CLAIM: Video of President Joe Biden was digita...,17-03-2021
1,https://apnews.com/article/fact-checking-afs:C...,Racist Facebook post attributed to Atlanta sho...,"March 17, 2021 GMT","THE CLAIM: On Tuesday, hours before police say...",17-03-2021
2,https://apnews.com/article/fact-checking-afs:C...,Misleading claims swirl around US election ove...,"March 17, 2021 GMT",As Congress considers a sweeping bid to overha...,17-03-2021
3,https://apnews.com/article/fact-checking-afs:C...,US and EU COVID vaccines don’t contain aluminum,"March 16, 2021 GMT","CLAIM: COVID-19 vaccines contain aluminum, a t...",16-03-2021
4,https://apnews.com/article/fact-checking-afs:C...,"Columbia University holds a main graduation, a...","March 16, 2021 GMT",CLAIM: Columbia University is holding separate...,16-03-2021
