In [1]:
import numpy as np
import pandas as pd
from tabula import read_pdf
import pickle
import fitz

In [2]:
# Labels for the report's headline indicator categories. The list ends with
# two empty strings, exactly as in the original definition (purpose not
# visible in this notebook -- presumably placeholders; verify before removing).
mainIndicatorList = [
    'Human Freedom',
    'Economic Freedom',
    'Rule of Law',
    'Security and Safety',
    'Movement',
    'Religious Freedom',
    'Assoc., Assembly, and Civil Soc.',
    'Expression and Information',
    'Identity and Relationships',
    '',
    '',
]

# Column headings for a per-country summary (score and rank pairs). Not
# referenced by any other cell visible in this notebook.
lists = [
    'Country/Territory',
    'Personal Freedom',
    'PF Rank',
    'Economic Freedom',
    'EF Rank',
    'Human Freedom',
    'HF Rank',
]

In [3]:
def _strip_spaces(value):
    """Return ``value`` with all spaces removed if it is a string, else unchanged."""
    return value.replace(" ", "") if isinstance(value, str) else value


def get_data(page_no, pdf_path='hfi2020.pdf', max_page=420):
    """Extract the first table that tabula finds on one page of the PDF.

    Parameters
    ----------
    page_no : int
        Page to read; it is passed to ``read_pdf`` as ``str(page_no)``.
        Must satisfy ``0 <= page_no <= max_page``.
        NOTE(review): tabula page numbers are presumably 1-based, so 0 may
        not be a usable page; the original lower bound is kept -- confirm.
    pdf_path : str, optional
        PDF file to read. Defaults to the report used by this notebook.
    max_page : int, optional
        Largest page number accepted. Defaults to 420.

    Returns
    -------
    pandas.DataFrame or None
        The first table on the page with:
          * column ``'Unnamed: 0'`` renamed to ``'Indicator'``,
          * every row containing a missing value dropped,
          * spaces removed from string cells in all columns but the first,
          * a fresh ``0..n-1`` index.
        ``None`` (after printing a message) when ``page_no`` is out of range
        or cannot be compared with a number.

    Raises
    ------
    IndexError
        If ``read_pdf`` finds no table on the page.
    """
    # Validate explicitly rather than with `assert`: assertions are removed
    # under `python -O`, and a non-numeric page would raise TypeError instead
    # of producing the invalid-page message.
    try:
        in_range = 0 <= page_no <= max_page
    except TypeError:
        in_range = False
    if not in_range:
        print(f"{page_no} looks Like an Invalid Page number")
        return None

    print("Getting data for page no. : ", page_no , "\n")

    tables = read_pdf(pdf_path, pages=str(page_no))
    if len(tables) == 0:
        # Same exception type the bare `tables[0]` would raise, but with context.
        raise IndexError(f"no table found on page {page_no} of {pdf_path}")

    # Build a new frame in one chain: nothing returned by read_pdf is mutated
    # in place, and the later column assignments act on an independent frame.
    table = (
        tables[0]
        .rename(columns={'Unnamed: 0': 'Indicator'})
        .dropna()
        .reset_index(drop=True)
    )

    # The first column is left untouched; only the remaining columns have
    # spaces stripped from their string cells.
    for column in table.columns[1:]:
        table[column] = table[column].apply(_strip_spaces)

    print("Data for pageno ", page_no, " successfully imported.\n","--"*40,"\n")
    return table

In [9]:
def clean_transpose(dictionary):
    """Transpose and numerically clean every table in ``dictionary``.

    For each ``key -> DataFrame`` entry the frame is turned on its side so
    that the values of its first column become the column labels and the
    remaining original columns become rows. Cells equal to ``'-'`` are
    treated as missing, every column is converted with ``pd.to_numeric``,
    and missing values are filled with that column's mean.

    Parameters
    ----------
    dictionary : dict
        Mapping of keys to DataFrames whose first column holds the labels.

    Returns
    -------
    dict
        New mapping with the same keys, in the same order, to the cleaned,
        transposed frames. The input frames are not modified.
    """

    def _fill_with_mean(column):
        # Replace missing entries by the mean of the column's present values.
        return column.fillna(column.mean())

    def _reshape(frame):
        # Labels come from the frame as given (before '-' is replaced).
        labels = list(frame.T.iloc[0])
        body = frame.replace('-', np.nan).T.iloc[1:]
        body.columns = labels
        body = body.transform(pd.to_numeric)
        return body.transform(_fill_with_mean)

    return {key: _reshape(frame) for key, frame in dictionary.items()}

In [5]:
def store_data(datadict, filename = 'HFI_Countries.pickle'):
    """Pickle ``datadict`` to ``filename`` and check the file can be loaded.

    Parameters
    ----------
    datadict : object
        Any picklable object; it is written with the highest pickle protocol.
    filename : str, optional
        Destination path. An existing file at that path is overwritten.

    Returns
    -------
    None
        A "Done" banner is printed once the write and read-back succeed.
    """
    # Store data (serialize)
    with open(filename, 'wb') as handle:
        pickle.dump(datadict, handle, protocol=pickle.HIGHEST_PROTOCOL)

    # Load data (deserialize) purely as a sanity check: the result is not
    # needed, but loading raises if the file just written is unreadable.
    # The file was produced a moment ago by this function, so it is trusted;
    # do not reuse this pattern on pickle files from unknown sources.
    with open(filename, 'rb') as handle:
        pickle.load(handle)

    print("--"*18 + "Done" + "--"*18)
    return None


In [6]:
# Build {country name: table} for every country profile in the report.
datadict = {}

# Each entry: (pages handed to get_data, offset added to that page number to
# get the fitz page index, index of the line holding the country name).
#   * First range: fitz is asked for page i-1 while get_data receives i.
#   * Second range: fitz is asked for page i itself and the name is taken from
#     the third line of the block. NOTE(review): this is kept exactly as in the
#     original code; presumably the layout differs for these pages -- verify
#     that the name and the table really belong to the same country.
page_layouts = [
    (range(57, 364, 2), -1, 0),
    (range(364, 380, 2), 0, 2),
]

# NOTE(review): `loadPage` / `getText` are the camelCase PyMuPDF names; newer
# PyMuPDF releases document `load_page` / `get_text` instead -- confirm the
# installed version still provides the names used here.
doc = fitz.open('hfi2020.pdf')
try:
    for pages, fitz_offset, name_line in page_layouts:
        for i in pages:
            page = doc.loadPage(i + fitz_offset)
            text = page.getText('blocks')
            # Last block on the page; element [-3] of a block is read as its
            # text content, from which one line is taken as the country name.
            countryname = text[-1][-3].split("\n")[name_line]
            print(countryname)
            df = get_data(i)
            # A repeated name silently replaces the earlier table.
            datadict[countryname] = df
finally:
    # Release the PDF handle even if a page fails to parse.
    doc.close()

Getting data for page no. :  57 

Data for pageno  57  successfully imported.
 -------------------------------------------------------------------------------- 

ALBANIA
Getting data for page no. :  59 

Data for pageno  59  successfully imported.
 -------------------------------------------------------------------------------- 

ALGERIA
Getting data for page no. :  61 

Data for pageno  61  successfully imported.
 -------------------------------------------------------------------------------- 

ANGOLA
Getting data for page no. :  63 

Data for pageno  63  successfully imported.
 -------------------------------------------------------------------------------- 

ARGENTINA
Getting data for page no. :  65 

Data for pageno  65  successfully imported.
 -------------------------------------------------------------------------------- 

ARMENIA
Getting data for page no. :  67 

Data for pageno  67  successfully imported.
 ----------------------------------------------------------------------

Data for pageno  153  successfully imported.
 -------------------------------------------------------------------------------- 

ETHIOPIA
Getting data for page no. :  155 

Data for pageno  155  successfully imported.
 -------------------------------------------------------------------------------- 

FIJI
Getting data for page no. :  157 

Data for pageno  157  successfully imported.
 -------------------------------------------------------------------------------- 

FINLAND
Getting data for page no. :  159 

Data for pageno  159  successfully imported.
 -------------------------------------------------------------------------------- 

FRANCE
Getting data for page no. :  161 

Data for pageno  161  successfully imported.
 -------------------------------------------------------------------------------- 

GABON
Getting data for page no. :  163 

Data for pageno  163  successfully imported.
 -------------------------------------------------------------------------------- 

GAMBIA, THE
Gett

Data for pageno  249  successfully imported.
 -------------------------------------------------------------------------------- 

MAURITIUS
Getting data for page no. :  251 

Data for pageno  251  successfully imported.
 -------------------------------------------------------------------------------- 

MEXICO
Getting data for page no. :  253 

Data for pageno  253  successfully imported.
 -------------------------------------------------------------------------------- 

MOLDOVA
Getting data for page no. :  255 

Data for pageno  255  successfully imported.
 -------------------------------------------------------------------------------- 

MONGOLIA
Getting data for page no. :  257 

Data for pageno  257  successfully imported.
 -------------------------------------------------------------------------------- 

MONTENEGRO
Getting data for page no. :  259 

Data for pageno  259  successfully imported.
 -------------------------------------------------------------------------------- 

MOROCC

Data for pageno  345  successfully imported.
 -------------------------------------------------------------------------------- 

TANZANIA
Getting data for page no. :  347 

Data for pageno  347  successfully imported.
 -------------------------------------------------------------------------------- 

THAILAND
Getting data for page no. :  349 

Data for pageno  349  successfully imported.
 -------------------------------------------------------------------------------- 

TIMOR-LESTE
Getting data for page no. :  351 

Data for pageno  351  successfully imported.
 -------------------------------------------------------------------------------- 

TOGO
Getting data for page no. :  353 

Data for pageno  353  successfully imported.
 -------------------------------------------------------------------------------- 

TRINIDAD AND TOBAGO
Getting data for page no. :  355 

Data for pageno  355  successfully imported.
 -------------------------------------------------------------------------------

In [10]:
# Clean and transpose every extracted table, then pickle the result under
# store_data's default filename.
cleaned_datadict = clean_transpose(datadict)
store_data(cleaned_datadict)


----------------------------------------Done----------------------------------------
