In [1]:
import pandas as pd
import numpy as np
import os
from tabula import read_pdf
import re
from dateutil import parser
from datetime import datetime
from datetime import date

# pdf examples 090520 080920 040720 120420 170620 210320 240820

In [2]:
# READ PDF

def get_table(file_number):
    """Read the table on page 2 of one daily report PDF.

    Parameters
    ----------
    file_number : str
        Identifier embedded in the file name. The file that is read is
        ``<directory>/DL<file_number>.pdf``, where ``directory`` is a
        module-level global that must be set before this function is called.

    Returns
    -------
    pandas.DataFrame
        The first table that ``read_pdf`` extracted from page 2.

    Raises
    ------
    IndexError
        If ``read_pdf`` returned no table for page 2. The type is the same one
        the bare ``table[0]`` lookup used to raise; the message now names the
        file so the error log is actionable.
    """
    pdf_path = directory + '/DL' + file_number + '.pdf'
    tables = read_pdf(pdf_path, pages='2')
    if len(tables) == 0:
        raise IndexError('no table found on page 2 of ' + pdf_path)
    return pd.DataFrame(tables[0])

In [3]:
# FIND THE (ROW, COLUMN) POSITIONS OF A VALUE IN A DATAFRAME (USED TO LOCATE 'ACEH', 'GORONTALO' AND 'TOTAL')

def getIndexes(dfObj, value):
    """Locate every cell of ``dfObj`` that equals ``value``.

    Parameters
    ----------
    dfObj : pandas.DataFrame
        Frame to search.
    value : object
        Value to look for (matched with ``DataFrame.isin``).

    Returns
    -------
    list of tuple
        ``(row_label, column_label)`` pairs, grouped by column in column
        order and, within a column, in row order. Empty when nothing matches.
    """
    matches = dfObj.isin([value])
    positions = []
    for col_label, has_match in matches.any().items():
        if not has_match:
            continue
        col_hits = matches[col_label]
        # Boolean-mask the column with itself to keep only the matching rows.
        positions.extend((row_label, col_label) for row_label in col_hits[col_hits].index)
    return positions

In [4]:
def process_table(df):
    """Clean the raw page-2 table and attach the report date and tested count.

    Besides ``df`` this function reads two module-level globals:
    ``file_number`` (six characters, parsed as ddmmyy for the ``TGL`` column)
    and ``directory`` (folder holding ``DL<file_number>.pdf``, whose page 1 is
    read again here to find the 'diperiksa' figure).

    Parameters
    ----------
    df : pandas.DataFrame
        Table as returned by ``get_table``. It is not modified; every step
        works on a new frame.

    Returns
    -------
    pandas.DataFrame
        Columns ``PROVINSI``, the nine count columns (cast to int), ``TGL``
        and ``DPRIKS``, with a fresh 0..n-1 index. ``DPRIKS`` is NaN on every
        row except ``TOTAL``.

    Raises
    ------
    IndexError
        If 'ACEH' or 'GORONTALO' cannot be located, if the table yields more
        than nine number columns, or if no 'diperiksa' figure is found on
        page 1. (Same exception type as before; the latter two now carry a
        descriptive message.)
    KeyError
        If fewer than nine number columns were produced, so some expected
        column names are missing at the final type conversion.
    """
    # DROP SOME UNUSED ROWS AND COLUMNS
    # Everything above the first 'ACEH' cell is treated as header material.

    pos_aceh = getIndexes(df, 'ACEH')[0][0]
    unused_columns = [df.columns[0], df.columns[2]]
    df = df.drop(list(range(0, pos_aceh)))
    df = df.drop(labels=unused_columns, axis=1)

    # RENAME COLUMNS: 'PROVINSI' FIRST, THEN 'TEMP<i>' FOR THE COLUMNS THAT CONTAIN THE NUMBERS

    df.columns = ['PROVINSI'] + ['TEMP' + str(i) for i in range(1, len(df.columns))]

    # SPLIT THE SPACE-SEPARATED NUMBERS OF EACH TEMP COLUMN INTO SEPARATE '<i>-<z>' COLUMNS
    # (the range is fixed before the loop starts, so the columns added inside it are not revisited)

    for i in range(1, len(df.columns)):
        parts = df['TEMP' + str(i)].str.split(' ', expand=True)
        for z in parts.columns:
            df[str(i) + '-' + str(z)] = parts[z]

    # DROP COLUMNS THAT ARE COMPLETELY EMPTY AFTER THE SPLITTING PROCESS

    df = df.replace("", np.nan).dropna(how='all', axis=1)

    # DROP TEMP COLUMNS

    df = df.drop([name for name in df.columns if name.startswith('TEMP')], axis=1)

    # RENAME 'TOTAL' ROW AND ADD 'TGL' COLUMN
    # A single .iloc[row, col] write is used on purpose: the chained form
    # df.iloc[-1][0] = ... may assign to a temporary copy and be lost.
    # NOTE(review): assumes the last row is the national total - confirm against the PDFs.

    df.iloc[-1, 0] = 'TOTAL'

    date_text = file_number[:2] + "-" + file_number[2:4] + "-" + file_number[4:]
    report_date = datetime.strptime(date_text, '%d-%m-%y').date()
    df['TGL'] = [report_date] * len(df)

    # RENAME NUMBER COLUMNS NAMES (everything between 'PROVINSI' and 'TGL')

    number_column_names = ['PSTF H-1', 'PSTF', 'PSTF KUM', 'SMBH H-1', 'SMBH', 'SMBH KUM',
                           'MNGL H-1', 'MNGL', 'MNGL KUM']

    value_columns = list(df.columns[1:-1])
    if len(value_columns) > len(number_column_names):
        raise IndexError('expected at most ' + str(len(number_column_names))
                         + ' number columns in DL' + file_number + ', found '
                         + str(len(value_columns)))
    df = df.rename(columns=dict(zip(value_columns, number_column_names)))

    # CLEAN ROWS AT THE END (anything between 'GORONTALO' and 'TOTAL')

    pos_gorontalo = getIndexes(df, 'GORONTALO')[0][0]
    pos_total = getIndexes(df, 'TOTAL')[0][0]
    df = df.drop(np.arange(pos_gorontalo + 1, pos_total))

    # COLLAPSE RUNS OF WHITESPACE (applied to every string cell, 'PROVINSI' included)

    df = df.replace(to_replace=r'\s\s+', value=' ', regex=True)

    # ADD 'DPRIKS' COLUMN

    df['DPRIKS'] = np.nan

    # GET THE 'DIPERIKSA' DATA
    # READ PDF SPECIFIC AREA OF PAGE 1
    # NOTE(review): box is presumably [top, left, bottom, right] in the units
    # read_pdf expects for `area` - confirm against the tabula documentation.

    box = [212.00, 325.00, 300.00, 600.00]
    extract_data = read_pdf(directory + '/DL' + file_number + '.pdf', pages='1', area=[box], stream=True)

    # FIND THE 'DIPERIKSA' DATA AND CLEAN IT
    # The number is captured directly, so only its separators need stripping.

    string_extract = ''.join(map(str, extract_data))
    diperiksa_match = re.search(r'diperiksa?\s*:\s*([0-9.,]+)', string_extract)
    if diperiksa_match is None:
        raise IndexError("no 'diperiksa' figure found on page 1 of DL" + file_number)
    diperiksa_data = int(diperiksa_match.group(1).replace(',', '').replace('.', ''))

    # ADD 'DIPERIKSA' DATA TO TABLE (boolean mask with .loc; .at is for single scalar labels)

    df.loc[df['PROVINSI'] == 'TOTAL', 'DPRIKS'] = diperiksa_data

    # RESET INDEX

    df = df.reset_index(drop=True)

    # CONVERT ALL NUMBER COLUMNS TO INT

    convert_columns = {name: int for name in number_column_names}
    df = df.astype(convert_columns)
    return df

In [5]:
def raw_to_excel(df):
    """Write ``df`` to ``<directory>/RAW<file_number>.xlsx`` and report it.

    ``directory`` and ``file_number`` are read from module-level globals, so
    both must be set before this is called.
    """
    target_path = ''.join([directory, '/RAW', file_number, '.xlsx'])
    df.to_excel(target_path)

    done_message = "processing RAW" + file_number + " done!"
    print(done_message)

In [6]:
def clean_table(df):
    """Return a per-province view of the processed table.

    The columns at positions 1, 4, 7 and 11 are removed, along with the row
    whose label ``getIndexes`` reports first for the value 'TOTAL'. ``df``
    itself is left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Processed table; it needs at least twelve columns and a 'TOTAL' cell.

    Returns
    -------
    pandas.DataFrame
        New frame without those columns and without the 'TOTAL' row.
    """
    unused_positions = (1, 4, 7, 11)
    unused_columns = [df.columns[position] for position in unused_positions]
    total_row_label = getIndexes(df, 'TOTAL')[0][0]
    return df.drop(labels=unused_columns, axis=1).drop(total_row_label)

In [7]:
def clean_to_excel(df_clean):
    """Write ``df_clean`` to ``<directory>/CL<file_number>.xlsx`` and report it.

    ``directory`` and ``file_number`` are read from module-level globals, so
    both must be set before this is called.
    """
    target_path = ''.join([directory, '/CL', file_number, '.xlsx'])
    df_clean.to_excel(target_path)

    done_message = 'processing CL' + file_number + ' done!'
    print(done_message)

In [8]:
# CFR = MNGL/PSTF x 100% ... case fatality rate
# PR = PSTF/DPRIKS x 100% ... positivity rate
# NGTF = DPRIKS - PSTF

def get_total(df):
    """Build the national summary from the 'TOTAL' row(s) of ``df``.

    The columns at positions 0, 1, 4 and 7 are discarded, ``DPRIKS`` is cast
    to int, and three percentages are added:

    * ``PR KUM``  = 100 * ``PSTF KUM`` / ``DPRIKS``
    * ``CFR HRN`` = 100 * ``MNGL`` / ``PSTF``
    * ``CFR KUM`` = 100 * ``MNGL KUM`` / ``PSTF KUM``

    Parameters
    ----------
    df : pandas.DataFrame
        Processed table containing a ``PROVINSI`` column and the count columns.

    Returns
    -------
    pandas.DataFrame
        Rows where ``PROVINSI == 'TOTAL'``, with columns in the fixed order
        ``DPRIKS, PSTF, PSTF KUM, SMBH, SMBH KUM, MNGL, MNGL KUM, PR KUM,
        CFR HRN, CFR KUM, TGL``.
    """
    # SELECT THE NATIONAL ROW(S) AND DISCARD THE UNUSED COLUMNS

    is_total_row = df['PROVINSI'] == 'TOTAL'
    unused_columns = [df.columns[position] for position in (0, 1, 4, 7)]
    totals = df[is_total_row].drop(labels=unused_columns, axis=1)

    # CALCULATE INDICES

    tested = totals['DPRIKS'].astype(int)
    totals = totals.assign(**{
        'DPRIKS': tested,
        'PR KUM': 100 * totals['PSTF KUM'] / tested,
        'CFR HRN': 100 * totals['MNGL'] / totals['PSTF'],
        'CFR KUM': 100 * totals['MNGL KUM'] / totals['PSTF KUM'],
    })

    # REORDER COLUMNS

    ordered_columns = ['DPRIKS', 'PSTF', 'PSTF KUM', 'SMBH', 'SMBH KUM', 'MNGL', 'MNGL KUM',
                       'PR KUM', 'CFR HRN', 'CFR KUM', 'TGL']
    return totals[ordered_columns]

In [9]:
def process_all(files):
    """Run the whole pipeline for the report named by the global ``file_number``.

    Steps, in order: read the PDF table, process it, derive the per-province
    and national frames, then write the RAW and CL workbooks.

    NOTE(review): the ``files`` argument is accepted but never read; the
    report is selected through the module-level ``file_number`` instead, so
    the caller must keep that global in sync with what it passes here.

    Returns
    -------
    tuple
        ``(df_total, df_clean)`` - the national summary and the per-province
        table.
    """
    processed = process_table(get_table(file_number))
    provinces = clean_table(processed)
    national = get_total(processed)

    # Workbooks are written only after both derived frames were built.
    raw_to_excel(processed)
    clean_to_excel(provinces)
    return national, provinces

In [10]:
error_files = []
error_msgs = []

# SET THE FOLDER NAME
# NOTE: `directory` (set here) and `file_number` (set in the loop below) are
# read as globals by the functions defined in the earlier cells.

month_name = 'Oktober'
directory = 'Downloaded/' + month_name

# COLUMN LAYOUT OF THE AGGREGATED TABLES (used as-is when no file could be processed)

clean_columns = ['PROVINSI', 'PSTF', 'PSTF KUM', 'SMBH', 'SMBH KUM', 'MNGL', 'MNGL KUM', 'TGL']
total_columns = ['DPRIKS', 'PSTF', 'PSTF KUM', 'SMBH', 'SMBH KUM', 'MNGL', 'MNGL KUM', 'PR KUM', 'CFR HRN', 'CFR KUM', 'TGL']

agg_df_clean = pd.DataFrame(columns=clean_columns)
agg_df_total = pd.DataFrame(columns=total_columns)

# SELECT ONLY THE SOURCE PDFS ('DL' + six digits + '.pdf'), SORTED BY FILE NAME
# The RAW*/CL*/Agg*/ErrorLog* workbooks are written into this same folder. A
# plain listdir would feed them back in on a re-run: 'CL091020.xlsx'[2:8] is
# '091020', so that PDF would be processed (and aggregated) a second time,
# and the other workbooks would only produce spurious error-log entries.

pdf_files = sorted(name for name in os.listdir(directory)
                   if re.fullmatch(r'DL\d{6}\.pdf', name, flags=re.IGNORECASE))

# IMPLEMENT FUNCTION WITH ITERATION OF FILES IN FOLDER
# Frames are collected and concatenated once afterwards; DataFrame.append was
# removed in pandas 2.0 and re-copies the accumulated data on every call.

clean_frames = []
total_frames = []

for file in pdf_files:
    file_number = file[2:8]
    print('Processing ' + file_number)
    try:
        df_total, df_clean = process_all(file_number)
    except Exception as e:
        # Best effort: log the failure and move on to the next report.
        error_files.append(file_number)
        error_msgs.append(str(e))  # plain text, so the Excel writer can always store it
        print(e)
    else:
        clean_frames.append(df_clean)
        total_frames.append(df_total)

if clean_frames:
    agg_df_clean = pd.concat(clean_frames, ignore_index=True)
if total_frames:
    agg_df_total = pd.concat(total_frames, ignore_index=True)

# HANDLE LOG OF ERRORS

error_dict = {'Error Files': error_files, 'Error Messages': error_msgs}
df_error = pd.DataFrame(data=error_dict)
df_error.to_excel(directory + '/ErrorLog_' + month_name + '.xlsx')

# WRITE AGGREGATED DATA TO EXCEL

agg_df_clean.to_excel(directory + '/AggCL_' + month_name + '.xlsx')
agg_df_total.to_excel(directory + '/AggTTL_' + month_name + '.xlsx')

Processing 091020


Got stderr: Oct 14, 2020 4:32:21 PM org.apache.pdfbox.pdfparser.COSParser parseXref
Oct 14, 2020 4:32:22 PM org.apache.pdfbox.pdfparser.COSParser validateStreamLength

Got stderr: Oct 14, 2020 4:32:24 PM org.apache.pdfbox.pdfparser.COSParser parseXref
Oct 14, 2020 4:32:25 PM org.apache.pdfbox.pdfparser.COSParser validateStreamLength
Oct 14, 2020 4:32:25 PM org.apache.fontbox.ttf.CmapSubtable processSubtype14



processing RAW091020 done!
processing CL091020 done!
Processing 101020


Got stderr: Oct 14, 2020 4:32:25 PM org.apache.pdfbox.pdfparser.COSParser parseXref
Oct 14, 2020 4:32:26 PM org.apache.pdfbox.pdfparser.COSParser validateStreamLength

Got stderr: Oct 14, 2020 4:32:28 PM org.apache.pdfbox.pdfparser.COSParser parseXref
Oct 14, 2020 4:32:28 PM org.apache.pdfbox.pdfparser.COSParser validateStreamLength
Oct 14, 2020 4:32:28 PM org.apache.fontbox.ttf.CmapSubtable processSubtype14



processing RAW101020 done!
processing CL101020 done!
Processing 111020


Got stderr: Oct 14, 2020 4:32:29 PM org.apache.pdfbox.pdfparser.COSParser parseXref
Oct 14, 2020 4:32:29 PM org.apache.pdfbox.pdfparser.COSParser validateStreamLength

Got stderr: Oct 14, 2020 4:32:32 PM org.apache.pdfbox.pdfparser.COSParser parseXref
Oct 14, 2020 4:32:32 PM org.apache.pdfbox.pdfparser.COSParser validateStreamLength
Oct 14, 2020 4:32:32 PM org.apache.fontbox.ttf.CmapSubtable processSubtype14



processing RAW111020 done!
processing CL111020 done!
Processing 121020
list index out of range
