In [1]:
# Hunger Map PDF reader: extracts the last two tables of "global-summary.pdf", cleans them, and exports the result to Excel.

In [2]:
# import libraries:
import tabula as tab, pandas as pd, numpy as np

In [3]:
def extract_and_read_pdf(path="global-summary.pdf"):
    """Parse every table in a PDF and return its last two tables, trimmed.

    Parameters
    ----------
    path : str, optional
        PDF file handed to ``tabula.read_pdf``. Defaults to
        ``"global-summary.pdf"``, the file this notebook was written for.

    Returns
    -------
    list of pandas.DataFrame
        ``[last_table, second_to_last_table]``; each one has its first four
        rows removed.

    Raises
    ------
    IndexError
        If fewer than two tables were extracted from the PDF.
    """
    tables = tab.read_pdf(path,
                          pages="all",
                          silent=True)

    if len(tables) < 2:
        # Same exception type the bare negative indexing would raise, but with
        # a message that says what actually went wrong.
        raise IndexError(
            f"expected at least 2 tables in {path!r}, found {len(tables)}"
        )

    # Rows 0-3 are discarded -- presumably header/units rows of the PDF table;
    # TODO confirm against the source document.
    last_tbl = tables[-1].iloc[4:, :]
    second_to_last_tbl = tables[-2].iloc[4:, :]

    return [last_tbl, second_to_last_tbl]  # [0] and [1]

In [4]:
def create_and_rename_cols(tables=None):
    """Rename the raw PDF column headers of both tables to analysis names.

    Parameters
    ----------
    tables : sequence of pandas.DataFrame, optional
        The two tables to rename (only elements ``[0]`` and ``[1]`` are used).
        When omitted, the tables are obtained from ``extract_and_read_pdf()``,
        which is called exactly once so the PDF is not parsed twice.

    Returns
    -------
    list of pandas.DataFrame
        ``[first, second]`` -- renamed copies; the input frames are left
        untouched. Columns that are not listed in the mapping keep their name.
    """
    # NOTE: the hyphen inside the COVID key does not appear to be the ASCII
    # "-"; it must stay exactly as extracted from the PDF or the rename will
    # silently skip that column.
    column_names = {

         "Unnamed: 0":"country",
         "TOTAL":"population(M)",
         "PEOPLE WITH":"insufficient_food_consumption(M)",
         "PEOPLE USING CRISIS OR":"based_coping_strategies(M)",
         "PEOPLE USING CRISIS OR.1":"emergency_coping_strategies(M)",
         "PEOPLE REPORTING":"reporting_challenges(M)",
         "COVID‐19":"covid_19(000)",
         "CONFLICT":"conflict_related_fatalities(000)"
    }

    if tables is None:
        tables = extract_and_read_pdf()

    # rename() without inplace returns a new frame, so no explicit copy needed.
    d0 = tables[0].rename(columns=column_names)
    d1 = tables[1].rename(columns=column_names)

    return [d0, d1]

In [5]:
def union_tbl(tables=None):
    """Stack the two renamed tables into a single frame.

    Parameters
    ----------
    tables : sequence of pandas.DataFrame, optional
        The two frames to concatenate (only elements ``[0]`` and ``[1]`` are
        used). When omitted they come from ``create_and_rename_cols()``, which
        is called once instead of once per table.

    Returns
    -------
    pandas.DataFrame
        Rows of the first table followed by rows of the second, with a fresh
        0..n-1 index.
    """
    if tables is None:
        tables = create_and_rename_cols()

    data0, data1 = tables[0], tables[1]

    return pd.concat(objs=[data0, data1],
                     ignore_index=True)

In [6]:
def status(df=None):
    """Split the ACTUAL / PREDICTED marker out of the ``country`` column.

    A new ``status`` column receives ``"PREDICTED"`` or ``"ACTUAL"`` when the
    country text contains that word (``"PREDICTED"`` is checked first) and NaN
    otherwise. The words ``ACTUAL`` and ``PREDICTED`` and every comma are then
    removed from ``country``; surrounding whitespace is left as is.

    Parameters
    ----------
    df : pandas.DataFrame, optional
        Frame with a ``country`` column. It is copied, not modified. When
        omitted, the frame is obtained from ``union_tbl()``.

    Returns
    -------
    pandas.DataFrame
        The frame with the added ``status`` column and cleaned ``country``.
    """

    def get_status(s):
        # Missing cells arrive as float NaN; `"X" in NaN` would raise
        # TypeError, so anything that is not text simply has no status.
        if not isinstance(s, str):
            return np.nan
        if "PREDICTED" in s:
            return "PREDICTED"
        if "ACTUAL" in s:
            return "ACTUAL"
        return np.nan

    df = union_tbl() if df is None else df.copy()

    df["status"] = df["country"].apply(get_status)

    # The patterns are plain words, so match them literally and use the same
    # string accessor for all three removals.
    df["country"] = (
        df["country"]
        .str.replace("ACTUAL", "", regex=False)
        .str.replace("PREDICTED", "", regex=False)
        .str.replace(",", "", regex=False)
    )

    return df

In [7]:
def prep_one(df=None):
    """Back-fill ``status`` and expand truncated country names.

    Parameters
    ----------
    df : pandas.DataFrame, optional
        Frame with ``country`` and ``status`` columns. It is copied, not
        modified. When omitted, the frame is obtained from ``status()``.

    Returns
    -------
    pandas.DataFrame
        Copy of the input where each missing ``status`` takes the next
        non-missing value below it, and ``country`` cells that exactly equal a
        key of the mapping are replaced by the mapped full name.
    """
    # Exact-match lookup: only cells equal to a key are rewritten.
    # NOTE(review): 'Timor- Leste' contains a space after the hyphen -- looks
    # like a typo for 'Timor-Leste'; kept unchanged in case downstream code
    # matches on it. Confirm before fixing.
    countries = {

         'Moldova': 'Republic of Moldova',
         'Papua New': 'Papua New Guinea',
         'Sao Tome and': 'Sao Tome and Principe',
         'Solomon': 'Solomon Islands',
         'South': 'South Sudan',
         'State of': 'State of Palestine',
         'Syrian Arab': 'Syrian Arab Republic',
         'Timor-': 'Timor- Leste',
         'United Republic': 'United Republic of Tanzania',
         'Central African': 'Central African Republic',
         'Democratic': 'Democratic Republic of the Congo',
         'Dominican': 'Dominican Republic',
         'Guinea-': 'Guinea-Bissau',
         'Iran (Islamic': 'Iran (Islamic Republic of)',
         "Lao People's": "Lao People's Democratic Republic"
    }

    df_main = status() if df is None else df.copy()

    # Assign the result instead of calling fillna(..., inplace=True) on the
    # column selection: the chained in-place form does not update the frame
    # under pandas Copy-on-Write, and fillna(method=...) is deprecated.
    df_main["status"] = df_main["status"].bfill()

    return df_main.replace({"country": countries})

In [8]:
def prep_two():
    """Reorder columns, drop incomplete rows and blank out dash placeholders.

    Takes the frame produced by ``prep_one()``, moves ``status`` to the second
    column position, removes every row that has at least one missing value,
    and finally replaces cells matching the pattern "—" with NaN. The
    replacement runs after the row filter, so rows holding a dash are kept.

    Returns
    -------
    pandas.DataFrame
        The reordered, filtered frame.
    """
    frame = prep_one()

    # pop() runs first (argument evaluation), then the column is re-inserted
    # at position 1.
    frame.insert(1, 'status', frame.pop('status'))

    complete_rows = frame.dropna(how='any')

    return complete_rows.replace("—", np.nan, regex = True)

In [9]:
def prep_final(df_main=None):
    """Strip asterisks from the measure columns and sort by country.

    Parameters
    ----------
    df_main : pandas.DataFrame, optional
        Frame containing ``country`` and the seven measure columns listed
        below as text. It is copied, not modified. When omitted, the frame is
        obtained from ``prep_two()``.

    Returns
    -------
    pandas.DataFrame
        The frame with every ``*`` removed from the measure columns, sorted
        ascending by ``country``.
    """
    measure_columns = (
        "population(M)",
        "insufficient_food_consumption(M)",
        "based_coping_strategies(M)",
        "emergency_coping_strategies(M)",
        "reporting_challenges(M)",
        "covid_19(000)",
        "conflict_related_fatalities(000)",
    )

    df_main = prep_two() if df_main is None else df_main.copy()

    for column in measure_columns:
        # "*" on its own is not a valid regular expression ("nothing to
        # repeat"), so it has to be matched as a literal character.
        df_main[column] = df_main[column].str.replace("*", "", regex=False)

    return df_main.sort_values(by="country", ascending=True)

In [10]:
def save_file(name):
    """Run the cleaning pipeline and write the result to an Excel workbook.

    The frame returned by ``prep_final()`` is written to a sheet called
    "main" (without the index) using the xlsxwriter engine; columns A to I
    are set to width 32 with centred text.

    Parameters
    ----------
    name : str
        Path of the .xlsx file to create.

    Returns
    -------
    None
        Success or failure is reported on stdout. Failures are not re-raised
        (best-effort behaviour is kept), but the cause is now printed.
    """
    try:
        df = prep_final()

        # The context manager closes (and thereby saves) the workbook;
        # ExcelWriter.save() and to_excel(encoding=...) no longer exist in
        # pandas >= 2.0.
        with pd.ExcelWriter(name, engine="xlsxwriter") as excel_file:
            df.to_excel(excel_file,
                        sheet_name="main",
                        index=False)

            workbook = excel_file.book
            worksheet = excel_file.sheets["main"]
            fr = workbook.add_format({"align": "center"})
            # Width and format must be given in the same call: a second
            # set_column() on the same range replaces the first one, which
            # previously dropped the centred format.
            worksheet.set_column("A:I", 32, fr)

        print(f"Excel file {name} properly saved!")

    except Exception as exc:
        # Still best-effort, but say why it failed and let KeyboardInterrupt /
        # SystemExit propagate.
        print(f"ERROR, file not properly saved! ({type(exc).__name__}: {exc})")
        
# Entry point of the notebook: runs the whole pipeline through save_file() and
# writes the workbook; the argument is the output file name.
save_file("hunger_map.xlsx") # specify the name of the file