In [1]:
import pandas as pd

In [2]:
# Fetch the 2014 CAP payments workbook; -nc skips the download when the file is already present locally.
!wget -nc "http://cap-payments.defra.gov.uk/Download/2014_All_CAP_Search_Results_Data_P14.xls"

File '2014_All_CAP_Search_Results_Data_P14.xls' already there; not retrieving.



In [3]:
# Open the downloaded workbook once; individual sheets are parsed from this handle.
excel_file = pd.ExcelFile('2014_All_CAP_Search_Results_Data_P14.xls')

In [4]:
# Columns that identify a payment row. Every other column of a sheet is
# treated as a scheme column and melted into (scheme, amount) pairs.
id_vars = [
    'Year',
    'BeneficiaryCode',
    'BeneficiaryName_F201',
    'PostcodePrefix_F202B',
    'TownCity_F202C',
    # NOTE(review): the total columns below were disabled by the original
    # author. If a sheet does contain them they are melted as if they were
    # schemes -- confirm against the workbook.
    # 'OtherEAGFTotal', 'DirectEAGFTotal', 'RuralDevelopmentTotal', 'Total',
    'PayingAgencyLink',
]


def get_melted_df(excel_file, sheet_name):
    """Parse one worksheet and reshape it to long format.

    Parameters
    ----------
    excel_file : object with a ``parse(sheet_name)`` method
        Typically a ``pandas.ExcelFile``.
    sheet_name : str
        Name of the worksheet to read.

    Returns
    -------
    pandas.DataFrame
        The ``id_vars`` columns plus ``scheme`` (the stripped name of the
        source column) and ``amount``. Rows whose ``Year`` is null are
        dropped before melting, and rows whose ``amount`` is null are
        dropped afterwards.
    """
    df_raw = excel_file.parse(sheet_name)
    df_raw = df_raw[df_raw['Year'].notnull()]

    # Keep the sheet's own column order. The previous set difference made
    # the order of the melted schemes arbitrary from run to run.
    scheme_columns = [col for col in df_raw.columns if col not in id_vars]

    # Header cells may carry stray whitespace; non-string labels are left
    # untouched instead of crashing on .strip().
    stripped_names = {
        col: col.strip() if isinstance(col, str) else col
        for col in scheme_columns
    }
    df_raw = df_raw.rename(columns=stripped_names)

    df = pd.melt(
        df_raw,
        id_vars=id_vars,
        value_vars=[stripped_names[col] for col in scheme_columns],
        var_name='scheme',
        value_name='amount',
    )
    return df[df['amount'].notnull()]

# One long-format frame per worksheet, stacked into a single table.
# concat is called without ignore_index, so each sheet keeps its own row
# labels and labels can repeat across sheets.
per_sheet_frames = [
    get_melted_df(excel_file, sheet_name)
    for sheet_name in excel_file.sheet_names
]
df = pd.concat(per_sheet_frames)
df.head()

Unnamed: 0,Year,BeneficiaryCode,BeneficiaryName_F201,PostcodePrefix_F202B,TownCity_F202C,PayingAgencyLink,scheme,amount
415,2014.0,1007209.0,*******,BT46,MAGHERA,DARDNI,Additional amounts of aid,43.59
902,2014.0,1181623.0,*******,BT37,NEWTOWNABBEY,DARDNI,Additional amounts of aid,36.79
1316,2014.0,1203672.0,*******,BT80,COOKSTOWN,DARDNI,Additional amounts of aid,23.96
2786,2014.0,1002313.0,*******,BT41,ANTRIM,DARDNI,Additional amounts of aid,16.65
4266,2014.0,1289780.0,*******,BT34,NEWRY,DARDNI,Additional amounts of aid,14.15


In [5]:
# Map the workbook's column names onto the output schema.
df = df.rename(columns={
    'Year': 'year',
    'BeneficiaryCode': 'recipient_id',
    'BeneficiaryName_F201': 'recipient_name',
    'PostcodePrefix_F202B': 'recipient_postcode',
    'TownCity_F202C': 'recipient_location',
    'PayingAgencyLink': 'agency',
})
df['country'] = 'GB'
df['currency'] = 'GBP'
df['year'] = df['year'].astype('int')

# Trim stray whitespace from the free-text recipient fields.
for text_column in ('recipient_name', 'recipient_postcode', 'recipient_location'):
    df[text_column] = df[text_column].str.strip()

# Rows that carry a beneficiary code get their name blanked (the sample
# output shows such rows with a masked '*******' name).
# DataFrame.set_value was removed in pandas 1.0 and was a scalar setter, so
# use a boolean-mask .loc assignment. The mask is passed as a plain array
# because the concatenated frame has repeating row labels.
has_beneficiary_code = df['recipient_id'].notnull().values
df.loc[has_beneficiary_code, 'recipient_name'] = None
df.head()

Unnamed: 0,year,recipient_id,recipient_name,recipient_postcode,recipient_location,agency,scheme,amount,country,currency
415,2014,1007209.0,,BT46,MAGHERA,DARDNI,Additional amounts of aid,43.59,GB,GBP
902,2014,1181623.0,,BT37,NEWTOWNABBEY,DARDNI,Additional amounts of aid,36.79,GB,GBP
1316,2014,1203672.0,,BT80,COOKSTOWN,DARDNI,Additional amounts of aid,23.96,GB,GBP
2786,2014,1002313.0,,BT41,ANTRIM,DARDNI,Additional amounts of aid,16.65,GB,GBP
4266,2014,1289780.0,,BT34,NEWRY,DARDNI,Additional amounts of aid,14.15,GB,GBP


In [6]:
def set_recipient_id(row):
    """Rewrite ``row['recipient_id']`` into its 'GB-...' form and return ``row``.

    * A non-null id becomes ``'GB-<int(id)>'``.
    * A null id becomes ``'GB-<recipient_postcode>-<recipient_name>'``.
    * An id that is already a string starting with ``'GB-'`` is left as is,
      so re-running the cell that applies this function does not fail on
      ``int('GB-...')``.

    The row is modified in place and the same object is returned, which is
    the shape ``DataFrame.apply(..., axis=1)`` expects.
    """
    current_id = row['recipient_id']

    # Already converted by an earlier run: nothing to do.
    if isinstance(current_id, str) and current_id.startswith('GB-'):
        return row

    if pd.isnull(current_id):
        row['recipient_id'] = 'GB-%s-%s' % (row['recipient_postcode'], row['recipient_name'])
    else:
        row['recipient_id'] = 'GB-%s' % int(current_id)
    return row

# Run set_recipient_id on every row (axis=1) and keep the rewritten frame.
df = df.apply(set_recipient_id, axis=1)
df.head()

Unnamed: 0,year,recipient_id,recipient_name,recipient_postcode,recipient_location,agency,scheme,amount,country,currency
415,2014,GB-1007209,,BT46,MAGHERA,DARDNI,Additional amounts of aid,43.59,GB,GBP
902,2014,GB-1181623,,BT37,NEWTOWNABBEY,DARDNI,Additional amounts of aid,36.79,GB,GBP
1316,2014,GB-1203672,,BT80,COOKSTOWN,DARDNI,Additional amounts of aid,23.96,GB,GBP
2786,2014,GB-1002313,,BT41,ANTRIM,DARDNI,Additional amounts of aid,16.65,GB,GBP
4266,2014,GB-1289780,,BT34,NEWRY,DARDNI,Additional amounts of aid,14.15,GB,GBP


In [7]:
# Row count of the long-format table (one row per non-null scheme amount).
len(df)

1085257

In [8]:
# Number of rows per recipient id, most frequent first.
df['recipient_id'].value_counts()

GB-BT24-THE NATIONAL TRUST                       54
GB-BT79-MR MICHAEL MCCULLAGH                     33
GB-HS6-Mr A Macdonald                            30
GB-BT60-MR PATRICK HUGHES                        25
GB-BT35-MR MICHAEL MCPARLAND                     25
GB-HS2-DONALD MACLEOD                            25
GB-BT93-MR PATRICK MAGUIRE                       24
GB-BT79-MR PETER MCCULLAGH                       24
GB-BT79-MR PATRICK MCCULLAGH                     24
GB-LL53-MESSRS WILLIAMS                          24
GB-BT70-MR BRIAN QUINN                           22
GB-NP26-MONMOUTHSHIRE COUNTY COUNCIL             21
GB-BT92-MR THOMAS MAGUIRE                        21
GB-BT79-MR MARTIN KELLY                          20
GB-BT70-MR PATRICK DONNELLY                      20
GB-BT60-MR MICHAEL HUGHES                        19
GB-BT60-MR JOHN HUGHES                           19
GB-NE61-NORTHUMBERLAND COUNTY COUNCIL            19
GB-BT78-MR JOSEPH TEAGUE                         18
GB-BT79-MR P

In [9]:
# Write the result as UTF-8 CSV, leaving out the row index.
df.to_csv('gb_2014.csv', index=False, encoding='utf-8')