In [112]:
from bs4 import BeautifulSoup
import requests
import re
import textract
import pandas as pd
import os

In [2]:
# URL of the New York Times interactive page that the cells below fetch and parse.
link = "https://www.nytimes.com/interactive/2021/us/civilian-casualty-files.html"

In [6]:
# Fetch the page. A timeout keeps the cell from hanging forever on a stalled
# connection, and raise_for_status() stops the notebook on an HTTP error
# instead of letting an error page be parsed as if it were the article.
b = requests.get(link, timeout=30)
b.raise_for_status()

In [9]:
# Name the parser explicitly: without it BeautifulSoup picks whichever parser
# happens to be installed (and warns about it), so results can differ between
# environments. "html.parser" ships with the standard library.
doc = BeautifulSoup(b.text, "html.parser")

In [120]:
def extract_pdf(pdf_path: str) -> str:
    """Return the text textract extracts from the file at ``pdf_path``.

    An empty ``pdf_path`` yields an empty string without calling textract.
    Otherwise the bytes returned by ``textract.process`` are decoded as UTF-8.
    """
    has_path = len(pdf_path) > 0
    if has_path:
        raw_bytes = textract.process(pdf_path, encoding="utf-8")
        return raw_bytes.decode("utf-8")
    return ""

In [23]:
# Find the first <div> whose data-preview-slug attribute names the
# airstrikes documents section; later cells search inside it.
files = doc.find(
    'div',
    attrs={'data-preview-slug': '2021-11-12-airstrikes-docs'},
)

In [29]:
# ids of the three report sections looked up inside `files`.
KINDS = ['credible-reports', 'noncredible-reports', 'process-docs']

# Maps each kind to the <div> (id ending in "-table") found inside its section.
reports = {}
for kind in KINDS:
    section = files.find('div', attrs={'id': kind})
    if section is None:
        raise ValueError(f"section with id {kind!r} not found on the page")
    # BeautifulSoup passes None to a filter function for tags that lack the
    # attribute, so guard before calling .endswith().
    table = section.find(
        'div',
        id=lambda value: bool(value) and value.endswith("-table"),
    )
    if table is None:
        raise ValueError(f"no '-table' div found inside section {kind!r}")
    reports[kind] = table

In [82]:
# Class-name suffixes of the plain-text cells read from every table row.
ROW_TITLES = [
    'date',
    'location',
    'killed',
    'injured',
]


def _find_cell(row, suffix):
    """Return the first <div> under ``row`` with a class ending in ``suffix``, or None."""
    # BeautifulSoup passes None to the filter for tags without a class
    # attribute, so guard before calling .endswith().
    return row.find(
        'div',
        class_=lambda value: bool(value) and value.endswith(suffix),
    )


def _cell_text(row, suffix):
    """Return the stripped text of the matching cell, or '' if the cell is absent."""
    cell = _find_cell(row, suffix)
    return cell.text.strip() if cell is not None else ''


def _cell_link(row, suffix):
    """Return the href of the first <a> in the matching cell, or '' if there is none."""
    cell = _find_cell(row, suffix)
    if cell is None or not cell.text.strip():
        return ''
    anchor = cell.find('a')
    if anchor is None:
        return ''
    return anchor.attrs.get('href', '')


def parse_table(reports_table):
    """Convert a reports table element into a DataFrame, one record per row.

    Args:
        reports_table: element whose direct <div> children are the table
            rows. The first child is skipped (presumably the header row).

    Returns:
        pandas.DataFrame with the ``ROW_TITLES`` columns (stripped cell text)
        followed by ``docs`` and ``pr`` (link URLs). A cell that is missing,
        empty, or has no link yields ``''``, so every value is a string and
        the columns exist even when the table has no data rows.
    """
    link_titles = ('docs', 'pr')
    rows = reports_table.find_all('div', recursive=False)
    data = []
    for row in rows[1:]:
        record = {title: _cell_text(row, title) for title in ROW_TITLES}
        for title in link_titles:
            record[title] = _cell_link(row, title)
        data.append(record)

    return pd.DataFrame(data, columns=[*ROW_TITLES, *link_titles])

In [105]:
# Preview the first rows parsed from the credible-reports table.
parse_table(reports['credible-reports']).head()

Unnamed: 0,date,location,killed,injured,docs,pr
0,"Nov. 5, 2014","Harim, Syria",2,2.0,https://int.nyt.com/data/documenttools/c-11-5-...,https://int.nyt.com/data/documenttools/press-r...
1,"March 13, 2015","Hatra, Iraq",11,,https://int.nyt.com/data/documenttools/c-3-13-...,https://int.nyt.com/data/documenttools/press-r...
2,"April 12, 2015","Hawija, Iraq",2,,https://int.nyt.com/data/documenttools/c-4-12-...,https://int.nyt.com/data/documenttools/press-r...
3,"May 13, 2015","Baiji, Iraq",6,,https://int.nyt.com/data/documenttools/c-5-13-...,https://int.nyt.com/data/documenttools/press-r...
4,"June 2, 2015","Hawija, Iraq",70,,https://int.nyt.com/data/documenttools/c-6-2-1...,https://int.nyt.com/data/documenttools/press-r...


In [113]:
def download_pdf(link, dest_dir="./downloads", timeout=30):
    """Download ``link`` into ``dest_dir`` and return the local file path.

    The file name is built from the URL components after the fifth ``/``
    joined with ``-`` (for the document URLs seen in this notebook that is
    everything after ``.../data/documenttools/``). A file that already
    exists is reused without a new request.

    Args:
        link: URL to download. An empty or non-string value (e.g. None/NaN
            from a DataFrame column) means "nothing to download".
        dest_dir: directory the file is stored in; created if missing.
        timeout: seconds to wait for the server before giving up.

    Returns:
        The path of the local file, or ``''`` when there is no link.

    Raises:
        ValueError: if no file name can be derived from ``link``.
        requests.HTTPError: if the server answers with an error status.
    """
    if not isinstance(link, str) or not link:
        return ''
    name = '-'.join(link.split('/')[5:])
    if not name:
        raise ValueError(f"cannot derive a file name from link: {link!r}")
    out = f"{dest_dir}/{name}"
    if os.path.exists(out):
        return out
    # Fetch before opening the file: a failed request must not leave behind
    # an empty file that a later run would mistake for a finished download.
    r = requests.get(link, timeout=timeout)
    r.raise_for_status()
    os.makedirs(dest_dir, exist_ok=True)
    with open(out, "wb") as outfile:
        outfile.write(r.content)
    return out
        
def download_and_parse(df):
    """Download the files linked in ``docs`` and ``pr`` and record their paths.

    Adds ``doc_file`` and ``pr_file`` columns to ``df`` (the frame is
    modified in place) holding what ``download_pdf`` returns for each link,
    then returns the same frame.
    """
    link_to_file = (('docs', 'doc_file'), ('pr', 'pr_file'))
    for link_col, file_col in link_to_file:
        df[file_col] = df[link_col].apply(download_pdf)
    return df

In [95]:
#'/'.join('https://int.nyt.com/data/documenttools/c-11-5-14-syria/33f211120650b542/full.pdf'.split("/")[5:])

'c-11-5-14-syria/33f211120650b542/full.pdf'

In [104]:
#download_pdf('https://int.nyt.com/data/documenttools/c-11-5-14-syria/33f211120650b542/full.pdf')

In [114]:
# Parse the credible-reports table into a DataFrame (one row per report).
credible_reports = parse_table(reports['credible-reports'])

In [115]:
# Download every linked PDF and add the doc_file / pr_file path columns.
# download_and_parse also modifies the frame it is given in place.
credible_reports = download_and_parse(credible_reports)

In [117]:
# Preview the frame now that the local file-path columns are present.
credible_reports.head()

Unnamed: 0,date,location,killed,injured,docs,pr,doc_file,pr_file
0,"Nov. 5, 2014","Harim, Syria",2,2.0,https://int.nyt.com/data/documenttools/c-11-5-...,https://int.nyt.com/data/documenttools/press-r...,./downloads/c-11-5-14-syria-33f211120650b542-f...,./downloads/press-release-5-21-2015-9cda9f1a51...
1,"March 13, 2015","Hatra, Iraq",11,,https://int.nyt.com/data/documenttools/c-3-13-...,https://int.nyt.com/data/documenttools/press-r...,./downloads/c-3-13-15-iraq-98765d6845453fbb-fu...,./downloads/press-release-4-30-2017-ccdbcdad55...
2,"April 12, 2015","Hawija, Iraq",2,,https://int.nyt.com/data/documenttools/c-4-12-...,https://int.nyt.com/data/documenttools/press-r...,./downloads/c-4-12-15-iraq-74481cf840344fc8-fu...,./downloads/press-release-1-15-2016-b1abcd39f5...
3,"May 13, 2015","Baiji, Iraq",6,,https://int.nyt.com/data/documenttools/c-5-13-...,https://int.nyt.com/data/documenttools/press-r...,./downloads/c-5-13-15-iraq-437a8a6b6adb3ee6-fu...,./downloads/press-release-7-7-2017-e4cb887cba3...
4,"June 2, 2015","Hawija, Iraq",70,,https://int.nyt.com/data/documenttools/c-6-2-1...,https://int.nyt.com/data/documenttools/press-r...,./downloads/c-6-2-15-iraq-055d09f8f8b256a4-ful...,./downloads/press-release-12-5-2019-50dc7f4f2c...


In [118]:
def extract_pdf_from_pandas(df):
    """Extract text for the files listed in ``doc_file`` and ``pr_file``.

    Adds ``doc_text`` and ``pr_text`` columns to ``df`` (the frame is
    modified in place) by applying ``extract_pdf`` to each path, then
    returns the same frame.
    """
    text_from_file = {'doc_text': 'doc_file', 'pr_text': 'pr_file'}
    for text_col, file_col in text_from_file.items():
        df[text_col] = df[file_col].apply(extract_pdf)
    return df

In [122]:
def write_output(out_path, df):
    """Write one plain-text section per row of ``df`` to ``out_path``.

    Each section has a header (date, location, killed, injured), the
    extracted document text, the extracted press-release text, and a
    separator line, each on its own line.

    Args:
        out_path: file to (over)write; its parent directory is created if
            it does not exist.
        df: DataFrame with ``date``, ``location``, ``killed``, ``injured``,
            ``doc_text`` and ``pr_text`` columns.
    """
    parent = os.path.dirname(out_path)
    if parent:
        os.makedirs(parent, exist_ok=True)

    separator = "----------------------------------------------"
    # Explicit UTF-8 so non-ASCII text from the PDFs is written the same way
    # on every platform.
    with open(out_path, "w", encoding="utf-8") as out:
        for row in df.itertuples(index=False):
            # The header previously referenced an undefined name `location`
            # and raised NameError; read the value from the row itself.
            header_str = (
                f"{row.date} - {row.location}\n"
                f"Killed: {row.killed}\n"
                f"Injured: {row.injured}\n"
            )
            out.write(header_str)
            # Newlines keep the markers, texts and separator from running
            # together on a single line.
            out.write("DOCUMENT TEXT BELOW\n")
            out.write(f"{row.doc_text}\n")
            out.write("PR TEXT BELOW\n")
            out.write(f"{row.pr_text}\n")
            out.write(f"{separator}\n")

In [123]:
# Extract the text of every downloaded PDF, then write the combined report.
# NOTE(review): the recorded output of this cell is an AttributeError on a
# None value; presumably a helper returned None when it was last run —
# TODO confirm by re-running on a fresh kernel.
credible_reports = extract_pdf_from_pandas(credible_reports)
write_output("./output/credible_reports.txt", credible_reports)

AttributeError: 'NoneType' object has no attribute 'itertuples'