In [2]:
import pandas as pd
import numpy as np
import matplotlib as plt

In [3]:
def removeGranular(df):
    '''
    Procedure: removeGranular
    Inputs:
        df         Dataframe with at least the columns 'Area Code', 'Item Code',
                   'Unit', 'Flag' and 'Year Code' (FAO Food Balance Sheet layout)
    Outputs:
        Dataframe  new, filtered dataframe; the input is not modified
    Purpose:
    Imported data from the Food Balance Sheet of the Food and Agriculture
    Organization of the United Nations is filtered to exclude data not needed
    in this analysis. Granular details such as specific fruits, veggies and
    dairy are removed, as are the pre-aggregated country groupings (regions).
    Columns that are not used later are dropped as well.
    '''
    # Area codes below 1000 are individual countries; higher codes are the
    # regional groupings (per the original author's note).
    isCountry = df['Area Code'] < 1000

    # Item codes above 2900 are category groups. Rows measured in kg are
    # skipped because the analysis works on per-person supply.
    isCategory = (df['Item Code'] > 2900) & (df['Unit'] != 'kg')

    # Item 2901 in g/capita/day was called out explicitly by the original author
    # ("Total Protein data"). It already satisfies isCategory, but it is kept
    # as its own named condition so the intent stays visible.
    isProteinTotal = (df['Unit'] == 'g/capita/day') & (df['Item Code'] == 2901)

    # The country test must apply to BOTH alternatives. Without the explicit
    # parentheses `&` binds tighter than `|`, and region-level rows for item
    # 2901 would slip through.
    kept = df.loc[isCountry & (isCategory | isProteinTotal)]

    return kept.drop([
                      'Area Code',   # numeric country code, not used elsewhere
                      'Flag',        # quality indicator of the data point
                      'Year Code'    # duplicate of Year
                     ], axis=1)      # axis=1: drop columns, not rows

In [10]:
def loadFiles(foodPath='Data/FoodBalanceSheets_E_All_Data_(Normalized).csv',
              gdpPath='Data/API_NY.GDP.PCAP.CD_DS2_en_csv_v2_10181232.csv',
              nrows=500000):
    '''
    Procedure: loadFiles
    Inputs:
        foodPath   path of the FAO Food Balance Sheet CSV (normalized layout)
        gdpPath    path of the World Bank GDP-per-capita CSV
        nrows      maximum number of FAO rows to read; None reads the whole file.
                   With the default, rows beyond the first 500000 are ignored.
    Outputs:
        Tuple      (food, income) - two dataframes, in that order. The first holds
                   the filtered food & population details, the second the income
                   details.
    Purpose:
    Load in the two source files.
    Remove unused data to minimize memory usage and improve performance.
    '''
    # FAO file structure. The builtin `str` is used for text columns: the
    # `np.str` alias was deprecated in NumPy 1.20 and removed in 1.24.
    foodDataType = {
                    'Area Code': np.int16,
                    'Area': str,
                    'Item Code': np.int16,
                    'Item': str,
                    'Element Code': np.int16,
                    'Element': str,
                    'Year Code': np.int16,
                    'Year': np.int16,
                    'Unit': str,
                    'Value': np.float32,
                    'Flag': str
                   }

    # import the World Bank income dataset.
    # note the first four rows are either blank or have date information and are skipped.
    df_GDP = pd.read_csv(gdpPath, skiprows=4)
    df_GDP = df_GDP.drop([
                        'Country Code',    # country code, not used in this analysis
                        'Indicator Code',  # data reference - consistent for all records.
                        '1960'             # 1960 contains very bad data
                        ], axis=1)         # axis=1: drop columns, not rows

    # import the Food and Agriculture Organization dataset.
    df_Load = pd.read_csv(foodPath,
                          dtype=foodDataType,    # file structure
                          nrows=nrows,
                          encoding='ISO-8859-1')
    df_Food = removeGranular(df_Load)

    return df_Food, df_GDP

In [5]:
def UpdateValues(df, updateColumn, searchFor,
                 secondColumn='', secondSearch='',
                 replaceWith='', keepBefore=True, keepAfter=True):
    '''
    Procedure: UpdateValues
    Inputs:
        df             pandas dataframe. no specific structure
        updateColumn   name of the (text) column to be checked and rewritten
        searchFor      literal text to search for (not a regular expression).
                       Its first occurrence is dropped from the value unless
                       replaced via replaceWith.
        secondColumn   optional name of a second column that must also match
        secondSearch   value the second column has to equal for a row to be updated
        replaceWith    text inserted where the matching text was
        keepBefore     indicator if text before the searched value should be kept
        keepAfter      indicator if text after the searched value should be kept
    Outputs:
        None           df is modified in place
    Purpose:
    Used to get country names consistent between two datasets.
    Each matching value becomes  [before] + replaceWith + [after], where
    before/after are the parts around the FIRST occurrence of searchFor.
    '''
    column = df[updateColumn]

    # Literal containment test; missing values never match.
    matches = column.str.contains(searchFor, regex=False, na=False)
    if secondColumn != '':
        matches = matches & (df[secondColumn] == secondSearch)

    if not matches.any():
        return

    # partition() splits on the first literal occurrence only, so the search
    # and the split always agree and nothing after a later occurrence is lost.
    pieces = column[matches].str.partition(searchFor)

    rebuilt = replaceWith
    if keepBefore:
        rebuilt = pieces[0] + rebuilt
    if keepAfter:
        rebuilt = rebuilt + pieces[2]

    # Hand over plain values so the assignment is positional for the masked
    # rows and does not depend on index alignment.
    if isinstance(rebuilt, pd.Series):
        rebuilt = rebuilt.to_numpy()

    # Write through df.loc so the change lands in df itself; updating the
    # Series returned by df[updateColumn] is not guaranteed to reach df.
    df.loc[matches, updateColumn] = rebuilt

In [6]:
def fixMappings(df1, df2):
    '''
    Procedure: fixMappings
    Inputs:
        df1            pandas dataframe - FAO (columns 'Area', 'Element', 'Item' are rewritten)
        df2            pandas dataframe - World Bank (column 'Country Name' is rewritten)
    Outputs:
        None           original dataframes are modified
    Purpose:
    Used to get country names consistent between two datasets. All the necessary
    rules are applied here, in order: generic clean-up rules first, then country
    specific ones, and finally surrounding blanks are removed.
    The order matters: the parenthesis rule has to run before the
    ' Republic of' rule, otherwise a name ending in '... Republic of)' would be
    reduced to just ')'.
    '''
    # first set: fix multiple inconsistencies
    UpdateValues(df2, 'Country Name', ',', keepAfter=False)
    UpdateValues(df2, 'Country Name', 'St.', replaceWith='Saint')
    UpdateValues(df1, 'Area', ',', keepBefore=False)
    # Cut everything from the opening parenthesis onwards. UpdateValues looks
    # the text up literally, so the bare one-character '(' is used; an escaped
    # regex form would contain a backslash and never match anything.
    UpdateValues(df1, 'Area', '(', keepAfter=False)
    UpdateValues(df1, 'Area', ' People', keepBefore=False)
    UpdateValues(df1, 'Area', ' Republic of', keepBefore=False)
    UpdateValues(df2, 'Country Name', 'PDR', keepAfter=False)

    # below are country specific updates
    UpdateValues(df2, 'Country Name', 'Czech Republic', replaceWith='Czechoslovakia')
    UpdateValues(df2, 'Country Name', 'Kyrgyz Republic', replaceWith='Kyrgyzstan')
    UpdateValues(df2, 'Country Name', 'United States', replaceWith='United States of America')
    # NOTE(review): '⌠' is presumably a mis-decoded accented 'o' - confirm against the data.
    UpdateValues(df1, 'Area', '⌠', replaceWith='o')
    UpdateValues(df1, 'Area', 'Viet Nam', replaceWith='Vietnam')
    UpdateValues(df1, 'Element', '(', keepAfter=False)
    UpdateValues(df1, 'Item', " -", keepAfter=False)

    # The rules above cut text next to a separator and can leave a blank at the
    # start or end of a value (e.g. the text after ',' begins with a space).
    # Remove those blanks so that names can be compared between the datasets.
    for frame, column in ((df1, 'Area'), (df1, 'Element'), (df1, 'Item'),
                          (df2, 'Country Name')):
        frame[column] = frame[column].str.strip()

In [7]:
def updateMissingData(df, on_axis):
    '''
    Procedure: updateMissingData
    Inputs:
        df             pandas dataframe - generic, numeric data
        on_axis        axis along which values are interpolated
                       (0 = down each column, 1 = across each row)
    Outputs:
        Dataframe      new dataframe with gaps filled; the input is not modified
    Purpose:
    Used to estimate missing data (NaN) along the given axis in two passes.
    Pass 1: a gap between two known points is filled on the straight line
            between them, for at most 10 consecutive missing values.
    Pass 2: missing values outside the known range are filled, for at most
            3 consecutive values. With the default (forward) direction these
            are the trailing values, which receive the last known value.
    Gaps at the beginning of the row (or column) are not populated.
    '''
    # (maximum consecutive NaNs to fill, region the pass is allowed to touch)
    passes = ((10, 'inside'), (3, 'outside'))

    filled = df
    for maxRun, region in passes:
        filled = filled.interpolate(method='linear', axis=on_axis,
                                    limit=maxRun, limit_area=region)
    return filled

In [8]:
def __main__():
    '''
    Procedure: __main__
    Inputs:
        None
    Outputs:
        None           writes 'food.csv' and 'incomes.csv' to the working directory
    Purpose:
    Run the whole preparation: load both source files, make the country names
    consistent, fill gaps in the income series and store the results.
    '''
    food, incomes = loadFiles()
    fixMappings(food, incomes)

    # Every column that is not a row header is a data point. Selecting them by
    # name (instead of a fixed position) keeps the first years included:
    # loadFiles has already dropped columns, so a positional offset based on
    # the raw file layout would skip the leading year columns.
    idColumns = ['Country Name', 'Indicator Name']
    valueColumns = [name for name in incomes.columns if name not in idColumns]
    incomes[valueColumns] = updateMissingData(incomes[valueColumns], 1)

    # undo the pivot table so years are in a single column. This will make graphs easier.
    incomes = pd.melt(incomes, id_vars=idColumns, var_name='Year', value_name='Value')

    food.to_csv('food.csv')
    incomes.to_csv('incomes.csv')

In [11]:
# Run the pipeline: load both datasets, harmonize the names, fill gaps in the
# income data and write 'food.csv' and 'incomes.csv'.
__main__()