In [265]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
from datetime import datetime
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
import time
import os

## Notes

- USA Today only goes as far back as Feb 19
- AP News goes back to Feb 12

In [266]:
def make_url_list(first_url, num_pages=20):
    '''
    Build the list of paginated archive URLs, starting from the first page.

    first_url : URL of page 1; its `page=<n>` query parameter is rewritten
                for every following page
    num_pages : exclusive upper bound on the page number - the list holds
                first_url followed by pages 2 .. num_pages - 1
                (so num_pages=61 yields 60 URLs)

    returns list of URLs
    '''
    import re  # local import: keeps this cell runnable on its own

    # Match the page number itself rather than relying on fixed string
    # offsets, so multi-digit page numbers or a different pageSize value
    # in first_url no longer corrupt the generated URLs.
    page_param = re.compile(r'([?&]page=)\d+')

    if page_param.search(first_url):
        def page_url(page_number):
            return page_param.sub(
                lambda match: match.group(1) + str(page_number),
                first_url,
                count=1,
            )
    else:
        # No `page=` parameter to rewrite: keep the historical fixed-offset
        # behaviour (page digit sits 13 characters from the end).
        base_url = first_url[:-13]
        end_url = first_url[-12:]

        def page_url(page_number):
            return base_url + str(page_number) + end_url

    return [first_url] + [page_url(i) for i in range(2, num_pages)]


In [267]:
def get_claims(url):
    '''
    Scrape one page of the Reuters archive listing (www.reuters.com).

    This just extracts the url, headline, teaser text and date from the list.
    Could actually navigate to each extracted URL and get more info.

    url : URL of one archive listing page

    Returns df with url, headline, details, date (date as dd-mm-YYYY).
    Rows whose date stamp contains 'EDT' come first, followed by the rows
    that carry a calendar date; the original row index is preserved.

    Raises ValueError when the story-list container is not on the page.
    '''
    from urllib.parse import urljoin  # local import: keeps the cell self-contained

    base_url = 'https://www.reuters.com/'

    page = requests.get(url)
    soup = BeautifulSoup(page.content, 'html.parser')

    feedpost = soup.find('div', attrs={'class': 'column1 col col-10'})
    if feedpost is None:
        # Fail with the offending URL instead of an AttributeError on None.
        raise ValueError('No story list found on ' + url)

    records = []
    for card in feedpost.find_all('article'):
        records.append({
            # urljoin avoids the doubled slash that plain concatenation of
            # 'https://www.reuters.com/' and '/article/...' produced.
            'url': urljoin(base_url, card.find('a')['href']),
            'headline': card.find('h3').get_text().strip(),
            'details': card.find('p').get_text(),
            'date': card.find('span').get_text(),
        })

    df = pd.DataFrame(records, columns=['url', 'headline', 'details', 'date'])

    # Rows whose stamp contains 'EDT' are treated as published today and get
    # the scrape date; every other stamp is parsed as e.g. 'Mar 18 2021'.
    # NOTE(review): only 'EDT' is recognised - presumably an 'EST' stamp would
    # fall through to strptime and fail; confirm against the live site.
    is_today = df['date'].str.contains('EDT')

    # .copy() so the assignments below write to independent frames rather
    # than to views of df (avoids SettingWithCopyWarning).
    today = df[is_today].copy()
    today['date'] = datetime.today().strftime('%d-%m-%Y')

    before = df[~is_today].copy()
    before['date'] = before['date'].map(
        lambda raw: datetime.strptime(raw, '%b %d %Y').strftime('%d-%m-%Y')
    )

    return pd.concat([today, before])


In [268]:
# Scrape every Reuters archive listing page and save the combined table.
url = 'https://www.reuters.com/news/archive/reutersComService?view=page&page=1&pageSize=10'
urls = make_url_list(url, num_pages=61)

# Collect one frame per page and concatenate once at the end; growing
# all_data with pd.concat inside the loop re-copies every earlier row on
# each iteration.
page_frames = []

for url in urls:
    print(url)
    page_data = get_claims(url)
    page_frames.append(page_data)

all_data = pd.concat(page_frames) if page_frames else pd.DataFrame()

# NOTE(review): absolute, machine-specific output path - consider a
# configurable data directory so the notebook runs on other machines.
all_data.to_csv('/Users/madelinecampbell/Documents/GitHub/projects/misinformation_2021/data/reuters.csv')


https://www.reuters.com/news/archive/reutersComService?view=page&page=1&pageSize=10
https://www.reuters.com/news/archive/reutersComService?view=page&page=2&pageSize=10
https://www.reuters.com/news/archive/reutersComService?view=page&page=3&pageSize=10
https://www.reuters.com/news/archive/reutersComService?view=page&page=4&pageSize=10
https://www.reuters.com/news/archive/reutersComService?view=page&page=5&pageSize=10
https://www.reuters.com/news/archive/reutersComService?view=page&page=6&pageSize=10
https://www.reuters.com/news/archive/reutersComService?view=page&page=7&pageSize=10
https://www.reuters.com/news/archive/reutersComService?view=page&page=8&pageSize=10
https://www.reuters.com/news/archive/reutersComService?view=page&page=9&pageSize=10
https://www.reuters.com/news/archive/reutersComService?view=page&page=10&pageSize=10
https://www.reuters.com/news/archive/reutersComService?view=page&page=11&pageSize=10
https://www.reuters.com/news/archive/reutersComService?view=page&page=12&p

In [259]:
# Scratch cell: repeats the fetch/parse steps of the Reuters get_claims on
# page 1 at notebook level so that `soup`, `feedpost` and `cards` can be
# inspected interactively in the following cells.
url = 'https://www.reuters.com/news/archive/reutersComService?view=page&page=1&pageSize=10'

page = requests.get(url)
soup = BeautifulSoup(page.content,'html.parser')

base_url = 'https://www.reuters.com/'
urls = list()
details = list()
headlines = list()

dates = list()

# Container <div> holding the story list; `cards` is every <article> in it.
feedpost = soup.find('div', attrs={'class': 'column1 col col-10'})

cards = feedpost.find_all('article')
# Everything below is disabled: it is the per-card extraction and date
# normalisation already implemented inside the Reuters get_claims.
# for card in cards:
#     url = card.find("a")['href']
#     urls.append(base_url + url)
#     hl = card.find("h3").get_text().strip()
#     headlines.append(hl)
#     deet = card.find("p").get_text()
#     details.append(deet)
#     dt = card.find('span').get_text()
#     dates.append(dt) #needs refactoring

# df = pd.DataFrame(list(zip(urls, headlines, details, dates)), 
#                    columns =['url', 'headline', 'details', 'date'])


# today = df[df['date'].str.contains('EDT')]
# today['date'] = datetime.today().strftime('%d-%m-%Y')

# before = df[~df['date'].str.contains('EDT')]
# before['date'] = before['date'].map(lambda dt_obj: datetime.strptime(dt_obj, '%b %d %Y').strftime('%d-%m-%Y'))

# all_together = pd.concat([today, before])

In [260]:
# Display the first element of `cards` to inspect the raw markup being scraped.
cards[0]

<article class="story">
<div class="story-photo lazy-photo">
<a href="/article/uk-factcheck-side-effects/fact-check-video-misinterprets-report-into-health-impact-events-after-vaccination-idUSKBN29Q2B6">
<img alt="" border="0" org-src="https://s4.reutersmedia.net/resources/r/?m=02&amp;d=20210121&amp;t=2&amp;i=1548604297&amp;w=200&amp;fh=&amp;fw=&amp;ll=&amp;pl=&amp;sq=&amp;r=LYNXMPEH0K1A9" src="https://s1.reutersmedia.net/resources_v2/images/1x1.png"/>
</a>
</div><div class="story-content">
<a href="/article/uk-factcheck-side-effects/fact-check-video-misinterprets-report-into-health-impact-events-after-vaccination-idUSKBN29Q2B6">
<h3 class="story-title">
								Fact check: Video misinterprets report into health impact events after vaccination</h3>
</a>
<div class="contributor"></div>
<p>Correction, March 11, 2021: An earlier version of this check incorrectly attributed remarks in paragraphs nine and twelve to a spokeswoman from Pfizer. The remarks were in fact made by a spokeswoman fro

In [32]:
# Scratch cell: fetch the leadstories.com search/index page and collect its
# <article> elements into `cards` for interactive inspection.
url = 'https://leadstories.com/cgi-bin/mt/mt-search.fcgi?IncludeBlogs=1&archive_type=Index'
page = requests.get(url)
soup = BeautifulSoup(page.content,'html.parser')

# Accumulators for a later extraction loop; nothing in this cell fills them.
urls = list()

headlines = list()
details = list()

dates = list()

# Container <div> for the listing; raises AttributeError on the next line
# if the class is not present on the page (find returns None).
feedpost = soup.find('div', attrs={'class': 'mod-static-content-inner'})

cards = feedpost.find_all('article')


In [36]:
# Text of the first <small> tag in the first card (a date string, per the
# recorded output).
cards[0].find('small').get_text()

'Mar 18, 2021'

In [14]:
# Text of the first <p> in the first card.
# NOTE(review): this cell's execution count (14) is lower than that of the
# cell defining the current `cards` (32), so the recorded output presumably
# came from a different page - confirm by re-running in order.
cards[0].find('p').get_text() #'div', {'class':'entry-meta'}).get_text()[3:].strip()

'A Food and Drug Administration presentation on monitoring the safety of COVID-19 vaccines listed possible adverse events the agency might track. But an Instagram post misrepresents the document, falsely claiming it shows the vaccines are known to cause harmful side effects — including death.'

In [254]:
# Scratch cell: fetch The Dispatch fact-check archive page and collect its
# post-preview <div> elements into `cards` for interactive inspection.
url = 'https://factcheck.thedispatch.com/archive'

page = requests.get(url)

soup = BeautifulSoup(page.content, 'html.parser')

# Accumulators for a later extraction loop; nothing in this cell fills them.
urls = list()
# base_url = 'https://checkyourfact.com'

headlines = list()

dates = list()

# Container <div> for the archive list. The card filter below is a dict
# lookup on the class attribute with the full space-separated class string.
# NOTE(review): presumably this skips previews whose class list differs
# (e.g. no image) - verify against the page.
feedpost = soup.find('div', attrs={'class': 'portable-archive-list'})
cards = feedpost.find_all('div', {'class':'post-preview portable-archive-post has-image has-author-line'})

In [262]:
#feedpost

# href of the first <a> inside the first archive card.
cards[0].find("a")['href']

'https://factcheck.thedispatch.com/p/did-the-washington-post-correct-a'

In [263]:
# Scratch cell: open a single Dispatch article and pull its title and date.
url = 'https://factcheck.thedispatch.com/p/did-the-washington-post-correct-a'
page2 = requests.get(url)
soup2 = BeautifulSoup(page2.content, 'html.parser')
# Title is the text of the first <h1> on the article page.
title = soup2.find("h1").get_text()
# Date is read from the post-date cell of the first <table>.
# NOTE(review): that cell can hold relative text such as '13 hr ago' rather
# than a calendar date, so it would need normalising before being stored.
# NOTE(review): the cell ends with an assignment, so the title string
# recorded as its output presumably comes from an earlier version - confirm.
dt = soup2.find("table").find('td', {'class': 'post-meta-item post-date'}).get_text()

'Did the Washington Post Correct a Report About Trump’s Call With the Georgia Secretary of State?'

In [272]:
# Show the raw text of the article's post-date cell (same lookup as `dt`).
soup2.find("table").find('td', {'class': 'post-meta-item post-date'}).get_text()

'13 hr ago'

In [240]:
# Scratch cell: drill into the <atom> / <articles> elements of the current
# `soup` and read the <name> text of the first link found there.
# NOTE(review): `soup` is whatever page was last fetched in the kernel; no
# visible cell fetches a page with these tags (presumably checkyourfact.com,
# judging by the cells that follow) - confirm before relying on this.
# feedpost

# feedpost.find('features').find_all("a")[0]

feedpost = soup.find('atom')
cards = feedpost.find('articles').find_all("a")
cards[0].find("name").get_text()


# feedpost.find('articles')



In [213]:
# Characters 1-10 of the first link's href are taken as a 'YYYY/MM/DD' date
# (index 0 is skipped) and reformatted as 'DD-MM-YYYY'.
dt_obj = cards[0]['href'][1:11] #[1].get_text() #.find("title").get_text()

datetime.strptime(dt_obj, '%Y/%m/%d').strftime('%d-%m-%Y')

'17-03-2021'

In [218]:
# First 26 characters of whatever `url` currently holds in the kernel; the
# recorded output shows this was the site root at the time.
url[0:26]

'https://checkyourfact.com/'

In [176]:
# Assemble the scraped lists into one frame, one row per article.
# NOTE(review): `urls`, `headlines`, `dates` and `claims` must already be
# populated in the kernel; no visible notebook-level cell fills them
# (`claims` only appears as a local inside the AP get_claims) - confirm.
df = pd.DataFrame(list(zip(urls, headlines, dates, claims)), 
                   columns =['url', 'headline', 'date', 'claim']) 

df.head()

Unnamed: 0,url,headline,date,claim
0,https://apnews.com/article/fact-checking-afs:C...,Video of Biden with reporters was not digitall...,"March 17, 2021 GMT",CLAIM: Video of President Joe Biden was digita...
1,https://apnews.com/article/fact-checking-afs:C...,Racist Facebook post attributed to Atlanta sho...,"March 17, 2021 GMT","THE CLAIM: On Tuesday, hours before police say..."
2,https://apnews.com/article/fact-checking-afs:C...,Misleading claims swirl around US election ove...,"March 17, 2021 GMT",As Congress considers a sweeping bid to overha...
3,https://apnews.com/article/fact-checking-afs:C...,US and EU COVID vaccines don’t contain aluminum,"March 16, 2021 GMT","CLAIM: COVID-19 vaccines contain aluminum, a t..."
4,https://apnews.com/article/fact-checking-afs:C...,"Columbia University holds a main graduation, a...","March 16, 2021 GMT",CLAIM: Columbia University is holding separate...


In [182]:
# Take the first raw date string as a sample for working out the parse format.
temp_dt = df.date[0]
temp_dt

'March 17, 2021 GMT'

In [187]:
# Parse the sample with full month name, day, year and timezone name, then
# reformat it as 'DD-MM-YYYY'.
dt_obj = datetime.strptime(temp_dt, '%B %d, %Y %Z')
dt_obj.strftime('%d-%m-%Y')

'17-03-2021'

In [190]:
# Apply the same parse/reformat to the whole column.
# NOTE(review): this overwrites df['date'] in place, so re-running the cell
# raises ValueError (the converted strings no longer match the input format).
# NOTE(review): the recorded output shows an unchanged `date` plus a `date2`
# column, so it presumably predates this version of the cell - confirm.
df['date'] = df['date'].map(lambda dt_obj: datetime.strptime(dt_obj, '%B %d, %Y %Z').strftime('%d-%m-%Y'))



Unnamed: 0,url,headline,date,claim,date2
0,https://apnews.com/article/fact-checking-afs:C...,Video of Biden with reporters was not digitall...,"March 17, 2021 GMT",CLAIM: Video of President Joe Biden was digita...,17-03-2021
1,https://apnews.com/article/fact-checking-afs:C...,Racist Facebook post attributed to Atlanta sho...,"March 17, 2021 GMT","THE CLAIM: On Tuesday, hours before police say...",17-03-2021
2,https://apnews.com/article/fact-checking-afs:C...,Misleading claims swirl around US election ove...,"March 17, 2021 GMT",As Congress considers a sweeping bid to overha...,17-03-2021
3,https://apnews.com/article/fact-checking-afs:C...,US and EU COVID vaccines don’t contain aluminum,"March 16, 2021 GMT","CLAIM: COVID-19 vaccines contain aluminum, a t...",16-03-2021
4,https://apnews.com/article/fact-checking-afs:C...,"Columbia University holds a main graduation, a...","March 16, 2021 GMT",CLAIM: Columbia University is holding separate...,16-03-2021


In [269]:
first_url = 'https://www.factcheck.org/?fbclid=IwAR1PzRjDeRQyQc1eYBDC2pL3VXAGO8vo1ScycnWmyf1ibKY4Kk7InLD4JMw'

def make_url_list(first_url, num_pages=20):
    '''
    Build the list of paginated listing URLs, starting from the first page.

    Page n is addressed by appending 'page/<n>/' to the path of first_url;
    the query string (e.g. the fbclid tracking parameter) is carried over
    unchanged.

    first_url : URL of the first listing page
    num_pages : exclusive upper bound on the page number - the list holds
                first_url followed by pages 2 .. num_pages - 1

    returns list of URLs
    '''
    from urllib.parse import urlsplit, urlunsplit  # local import: keeps the cell self-contained

    # Split the URL properly instead of slicing at fixed character offsets,
    # which only worked for a 26-character 'https://www.factcheck.org/' prefix.
    parts = urlsplit(first_url)
    listing_path = parts.path.rstrip('/')

    urls = [first_url]
    for i in range(2, num_pages):
        paged = parts._replace(path=listing_path + '/page/' + str(i) + '/')
        urls.append(urlunsplit(paged))

    return urls

urls = make_url_list(first_url, num_pages=20)



In [270]:
urls

['https://www.factcheck.org/?fbclid=IwAR1PzRjDeRQyQc1eYBDC2pL3VXAGO8vo1ScycnWmyf1ibKY4Kk7InLD4JMw',
 'https://www.factcheck.org/page/2/?fbclid=IwAR1PzRjDeRQyQc1eYBDC2pL3VXAGO8vo1ScycnWmyf1ibKY4Kk7InLD4JMw',
 'https://www.factcheck.org/page/3/?fbclid=IwAR1PzRjDeRQyQc1eYBDC2pL3VXAGO8vo1ScycnWmyf1ibKY4Kk7InLD4JMw',
 'https://www.factcheck.org/page/4/?fbclid=IwAR1PzRjDeRQyQc1eYBDC2pL3VXAGO8vo1ScycnWmyf1ibKY4Kk7InLD4JMw',
 'https://www.factcheck.org/page/5/?fbclid=IwAR1PzRjDeRQyQc1eYBDC2pL3VXAGO8vo1ScycnWmyf1ibKY4Kk7InLD4JMw',
 'https://www.factcheck.org/page/6/?fbclid=IwAR1PzRjDeRQyQc1eYBDC2pL3VXAGO8vo1ScycnWmyf1ibKY4Kk7InLD4JMw',
 'https://www.factcheck.org/page/7/?fbclid=IwAR1PzRjDeRQyQc1eYBDC2pL3VXAGO8vo1ScycnWmyf1ibKY4Kk7InLD4JMw',
 'https://www.factcheck.org/page/8/?fbclid=IwAR1PzRjDeRQyQc1eYBDC2pL3VXAGO8vo1ScycnWmyf1ibKY4Kk7InLD4JMw',
 'https://www.factcheck.org/page/9/?fbclid=IwAR1PzRjDeRQyQc1eYBDC2pL3VXAGO8vo1ScycnWmyf1ibKY4Kk7InLD4JMw',
 'https://www.factcheck.org/page/10/?fbclid=

In [3]:
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from datetime import datetime

def get_claims(url):
    '''
    Scrape the AP News fact-checking hub listing (apnews.com).

    This just extracts the url, headline, date and claim text from the list.
    Could actually navigate to each extracted URL and get more info.

    url : URL of the hub page

    Returns df with url, headline, date, claim (date as dd-mm-YYYY)

    Raises ValueError when no story feed is found on the page.
    '''
    base_url = 'https://apnews.com'

    page = requests.get(url)
    soup = BeautifulSoup(page.content, 'html.parser')

    # Prefer the feed <article>; fall back to <main> when that class is absent.
    feedpost = soup.find('article', attrs={'class': 'feed-0-2-16 feed'})
    if feedpost is None:
        feedpost = soup.find('main')

    feed = feedpost.find('article') if feedpost is not None else None
    if feed is None:
        # Fail with the offending URL instead of an AttributeError on None.
        raise ValueError('No story feed found on ' + url)

    cards = feed.find_all('div', {'class': 'FeedCard Component-wireStory-0-2-104'})

    rows = []
    for card in cards:
        # Distinct name so the `url` parameter is not overwritten mid-loop.
        link = base_url + card.find("a")['href']
        hl = card.find("h1").get_text()
        # The date is read from the second <span> of the card (needs refactoring).
        dt = card.find_all("span")[1].get_text()
        cl = card.find("p").get_text()
        rows.append((link, hl, dt, cl))

    df = pd.DataFrame(rows, columns=['url', 'headline', 'date', 'claim'])

    # Raw stamps look like 'March 17, 2021 GMT'.
    df['date'] = df['date'].map(
        lambda raw: datetime.strptime(raw, '%B %d, %Y %Z').strftime('%d-%m-%Y')
    )

    return df


# Scrape the AP fact-checking hub (a single page) and save it as a CSV.
url = 'https://apnews.com/hub/fact-checking'

df = get_claims(url)
# NOTE(review): absolute, machine-specific output path - consider a
# configurable data directory so the notebook runs on other machines.
df.to_csv('/Users/madelinecampbell/Documents/GitHub/projects/misinformation_2021/data/ap.csv')

In [4]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup


def make_url_list(base_URL, num_pages=20):
    '''
    Build the AFP listing URLs for a run of pages.

    base_URL :  URL of the first AFP listing page; returned unchanged as
                the first entry
    num_pages : the result is base_URL followed by base_URL + '&page=<i>'
                for i = 1 .. num_pages - 1

    returns list of URLs
    '''
    page_suffixes = ('&page=' + str(page_index) for page_index in range(1, num_pages))
    return [base_URL, *(base_URL + suffix for suffix in page_suffixes)]


def get_claims(URL):
    '''
    Scrape one AFP Fact Check listing page (factcheck.afp.com).

    Only the listing cards are read - link, headline and the date/time
    stamp - the linked articles themselves are not opened.

    Returns df with url, headline, date, time (date as dd-mm-YYYY)
    '''
    site_root = 'https://factcheck.afp.com'

    response = requests.get(URL)
    parsed = BeautifulSoup(response.content, 'html.parser')

    # Use the featured-post container when present, otherwise all of <main>.
    container = parsed.find('div', attrs={'class': 'featured-post'})
    if container is None:
        container = parsed.find('main')

    rows = []
    for card in container.select(".card"):
        link = site_root + card.find("a")['href']
        # The first two characters of the <h4> text are dropped before trimming.
        title = card.find("h4").get_text()[2:].strip()
        # The <small> text is split on single spaces; field 2 is used as the
        # date and field 4 as the time.
        stamp = card.find("small").get_text().split(' ')
        rows.append((link, title, stamp[2], stamp[4]))

    df = pd.DataFrame(rows, columns=['url', 'headline', 'date', 'time'])

    def _to_dashed(raw_date):
        # 'dd/mm/YYYY' -> 'dd-mm-YYYY'
        return datetime.strptime(raw_date, '%d/%m/%Y').strftime('%d-%m-%Y')

    df['date'] = df['date'].map(_to_dashed)

    return df



### run all the code

base_URL = 'https://factcheck.afp.com/afp-usa?fbclid=IwAR3In37HRkagU6Lc-63sRxwo7wq_lkAighsVI4EDmtDkwE6WoJEVg6bEDoc'

all_urls = make_url_list(base_URL, num_pages=24)

# Collect one frame per page and concatenate once at the end; growing
# all_data with pd.concat inside the loop re-copies every earlier row on
# each iteration.
page_frames = []

for url in all_urls:
    page_data = get_claims(url)
    page_frames.append(page_data)

all_data = pd.concat(page_frames) if page_frames else pd.DataFrame()


### save df to data folder
# NOTE(review): absolute, machine-specific output path - consider a
# configurable data directory so the notebook runs on other machines.
all_data.to_csv('/Users/madelinecampbell/Documents/GitHub/projects/misinformation_2021/data/afp.csv')
