### Plot the Deaths with Demographics

In [1]:
import tabula
import requests
import datetime
import pandas as pd
import json
import os
from dateutil import parser
from datetime import datetime


from filecmp import cmp 
from pathlib import Path

In [2]:
url = 'https://www.sandiegocounty.gov/content/dam/sdc/hhsa/programs/phs/Epidemiology/COVID-19%20Deaths%20by%20Demographics.pdf'
pdf= requests.get(url)
with open(f'./Data/Deaths_by_demographics/Temp/deaths_by_demographics_{datetime.now().strftime("%d-%b-%Y-%H_%M_%S")}.pdf','wb') as f:
    f.write(pdf.content)



In [3]:
latest_pdfs = !ls -Art1 ./Data/Deaths_by_demographics/*.pdf
latest_pdfs_paths = [Path(pdf) for pdf in latest_pdfs]

def get_date_from_pdf(pdf):
    try:
        pdf_download_date = datetime.strptime("".join(str(pdf.name).split('_')[3:]).split('.')[0],"%d-%b-%Y-%H%M%S") 
    except ValueError as e:
        pdf_download_date = datetime.strptime("".join(str(pdf.name).split('_')[3:]).split('.')[0],"%Y-%M-%d") 
    return pdf_download_date

latest_pdfs_time = [get_date_from_pdf(pdf) for pdf in latest_pdfs_paths] 
latest_pdf = latest_pdfs_paths[latest_pdfs_time.index(max(latest_pdfs_time))]

In [4]:

downloaded_pdfs = !ls -Art1 ./Data/Deaths_by_demographics/Temp/*.pdf | tail -1
downloaded_pdf = Path(downloaded_pdfs[0])
print(f'Latest PDF {latest_pdf}')
print(f'Downloaded PDF {downloaded_pdf}')

Latest PDF Data/Deaths_by_demographics/deaths_by_demographics_09-Aug-2020-18_10_38.pdf
Downloaded PDF Data/Deaths_by_demographics/Temp/deaths_by_demographics_10-Aug-2020-19_13_54.pdf


In [5]:
def tabula_convert_pdf_to_df(pdf):
    pdf_download_date = datetime.strptime("".join(str(pdf.name).split('_')[3:]).split('.')[0],"%d-%b-%Y-%H%M%S") 
    
    
    raw_df, title = tabula.read_pdf_with_template(pdf,
                                              './Data/Deaths_by_demographics/deaths_by_demographics_16-May-2020-20_06_10.tabula-template.json',
                                              pages=1,stream=True)
    
    deaths_by_age = (raw_df[raw_df['Unnamed: 0'].str.contains("years", na=False)]).reset_index(drop=True)
#    deaths_by_age = raw_df.iloc[5:14].reset_index(drop=True)
    deaths_by_age.columns = ['Age Group','Count-percent']

    deaths_by_gender = raw_df[raw_df['Unnamed: 0'].isin(['Male','Female','Gender Unknown'])].reset_index(drop=True)
    deaths_by_gender.columns = ['Gender','Count-percent']


    
    deaths_by_race = raw_df[raw_df['Unnamed: 0'].isin(['Hispanic or Latino', 'White', 'Black or African American',
                                                        'Asian', 'Pacific Islander', 'American Indian', 'Multiple Race',
                                                        'Race/Ethnicity Unknown'])].reset_index(drop=True)
    deaths_by_race.columns = ['Race','Count-percent']
    for df in [deaths_by_age,deaths_by_gender, deaths_by_race]:
        df[['Count','Percent']] =df['Count-percent'].str.split(expand=True)
        df.drop(columns = 'Count-percent',inplace=True)
        df['Count'] = df['Count'].apply(lambda x: int(x) if x else x)
        df['Percent'] = df['Percent'].apply(lambda x: float(x.replace('%','')) if x else x )
        
    deaths_by_age['Age Group'] = deaths_by_age['Age Group'].apply(lambda x: x.replace('years',''))
    deaths_df = pd.concat([deaths_by_age, deaths_by_gender, deaths_by_race], keys=['Deaths-Age', 'Deaths-Gender', 'Deaths-Race'])
    deaths_df = deaths_df.reset_index()
    
    #Parse Dates
    title=title.columns[0]
    dates = []
    for _ in title.split():
        try:
            dates.append(parser.parse(_, fuzzy=True))
        except Exception as e:
            pass
    date = dates[0]

    updated_time = datetime.combine(dates[1],datetime.time(dates[2]))

    deaths_df['ReportedDate'] = date 
    deaths_df['UpdatedDatetime'] = updated_time
    deaths_df = deaths_df.rename(columns={'level_0':'Type', 'level_1':'Index'})
    return deaths_df

    
    

In [6]:
# First time
#new_deaths_df = tabula_convert_pdf_to_df(latest_pdf)
#new_deaths_df.to_json('./Data/Deaths_by_demographics/all_dates_deaths_df.json')
#new_deaths_df


In [7]:
from filecmp import cmp

# If downloaded file is the same as the latest pdf, then delete the file and do nothing
if cmp(downloaded_pdf, latest_pdf, shallow=True):
    print('File already exists. Deleting the temp file')
    os.remove(downloaded_pdf)
    
# If downloaded file is different from the latest pdf, then 
else:
    print('Downloaded file is unique')
    # step 1: Open old json file
    try:
        with open('./Data/Deaths_by_demographics/all_dates_deaths_df.json') as f:
            old_deaths_df = pd.read_json(f, convert_dates=['ReportedDate','UpdateDatetime'])
    except:
        old_deaths_df = pd.DataFrame()

    # Step2 read from latest pdf file 
    new_deaths_df = tabula_convert_pdf_to_df(downloaded_pdf)
    
    ## Step 3 add the new dataframe to old dataframe
    updated_deaths_df = pd.concat([old_deaths_df,new_deaths_df])
    updated_deaths_df = updated_deaths_df.sort_values(by='ReportedDate', ascending=False).reset_index(drop=True)
    updated_deaths_df
    ## Step 4 Write the a new json f
    updated_deaths_df.to_json('./Data/Deaths_by_demographics/all_dates_deaths_df.json')
    
    # Step 5
    print('Moving the file from TEMP directory to directory with all unique pdfs')
    os.rename(downloaded_pdf,latest_pdf.parent/downloaded_pdf.name)

Downloaded file is unique
Moving the file from TEMP directory to directory with all unique pdfs


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  deaths_df = pd.concat([deaths_by_age, deaths_by_gender, deaths_by_race], keys=['Deaths-Age', 'Deaths-Gender', 'Deaths-Race'])
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  updated_deaths_df = pd.concat([old_deaths_df,new_deaths_df])


In [8]:
 with open('./Data/Deaths_by_demographics/all_dates_deaths_df.json','r') as f:
    all_deaths_df = pd.read_json(f,convert_dates=['ReportedDate', 'UpdatedDatetime'])
#all_deaths_df = all_deaths_df[all_deaths_df['ReportedDate'] != '2020-05-26']
#all_deaths_df.to_json('./Data/Deaths_by_demographics/all_dates_deaths_df.json')
all_deaths_df

Unnamed: 0,Age Group,Count,Gender,Index,Percent,Race,ReportedDate,Type,UpdatedDatetime
0,,2,,3,0.3,Multiple Race,2020-08-09,Deaths-Race,2020-08-10 08:00:00
1,,25,,2,4.2,Black or African American,2020-08-09,Deaths-Race,2020-08-10 08:00:00
2,,229,,1,38.7,White,2020-08-09,Deaths-Race,2020-08-10 08:00:00
3,,269,,0,45.4,Hispanic or Latino,2020-08-09,Deaths-Race,2020-08-10 08:00:00
4,,0,Gender Unknown,1,,,2020-08-09,Deaths-Gender,2020-08-10 08:00:00
...,...,...,...,...,...,...,...,...,...
1399,,0,Gender Unknown,2,,,2020-05-14,Deaths-Gender,2020-05-15 08:00:00
1400,,117,Male,1,56.3,,2020-05-14,Deaths-Gender,2020-05-15 08:00:00
1401,20-29,2,,2,1.0,,2020-05-14,Deaths-Age,2020-05-15 08:00:00
1402,,81,,0,42.9,Hispanic or Latino,2020-05-14,Deaths-Race,2020-05-15 08:00:00
