# LAD geocoding

Here we geocode Gateway to Research (GtR) organisations at the Local Authority District (LAD) level.

## 0. Preamble

In [None]:
# Runs the shared preamble script, which is not shown in this notebook. Presumably it defines
# `pd`, `np` and `today_str`, which later cells use without importing them — TODO confirm.
%run notebook_preamble.ipy

In [None]:
def flatten_list(a_list):
    '''
    Removes one level of nesting: the elements of every iterable inside a_list are
    concatenated, in order, into a single new list.
    '''
    flat = []
    for inner in a_list:
        flat.extend(inner)
    return flat


### Imports

In [None]:
#For geocoding into lads
import geopandas as gp

from shapely.geometry import Point

### LAD shapefile

In [None]:
# NOTE(review): absolute, machine-specific path — this cell only runs where that directory exists.
lad_shape_path = '/Users/jmateosgarcia/links/sc/data/aux/lad_shape/Local_Authority_Districts_December_2017_Full_Clipped_Boundaries_in_Great_Britain.shp'

# Read the LAD boundary polygons into a GeoDataFrame
lad_shape = gp.read_file(lad_shape_path)

### Change the projection

In [None]:
# Reproject the boundaries to EPSG:4326 so they can be compared with longitude/latitude points
lad_shape = lad_shape.to_crs(epsg=4326)

### Lookups

In [None]:
# Names of the Scottish local authority districts, used later to flag projects with Scottish
# organisations. Membership is tested against the harmonised `lad_name` values, so
# 'Edinburgh, City of' and 'Eilean Siar' are spelled as in `rename_lads`.
scottish_lads = {'Aberdeen City',
 'Aberdeenshire',
 'Angus',
 'Argyll and Bute',
 'Clackmannanshire',
 'Dumfries and Galloway',
 'Dundee City',
 'East Ayrshire',
 'East Dunbartonshire',
 'East Lothian',
 'East Renfrewshire',
 'Edinburgh, City of',
 'Eilean Siar',
 'Falkirk',
 'Fife',
 'Glasgow City',
 'Highland',
 'Inverclyde',
 'Midlothian',
 'Moray',
 'North Ayrshire',
 'North Lanarkshire',
 'Orkney Islands',
 'Perth and Kinross',
 'Renfrewshire',
 'Scottish Borders',
 'Shetland Islands',
 'South Ayrshire',
 'South Lanarkshire',
 'Stirling',
 'West Dunbartonshire',
 'West Lothian'}

### Some renaming

In [None]:
# LAD code -> name overrides. Any code found here replaces the shapefile's own `lad17nm`
# when the `lad_name` column is built in the next cell.
# NOTE(review): the '95xx' keys look like legacy Northern Ireland district codes, while the
# shapefile name says "Great Britain" — these entries may never match; confirm.
rename_lads = {'E07000146': 'King`s Lynn and West Norfolk',
 'E07000112': 'Shepway',
 'W06000001': 'Anglesey',
 'W06000014': 'The Vale of Glamorgan',
 'W06000016': 'Rhondda, Cynon, Taff',
 'S12000036': 'Edinburgh, City of',
 'S12000013': 'Eilean Siar',
 '95AA': 'Antrim',
 '95BB': 'Ards',
 '95CC': 'Armagh',
 '95DD': 'Ballymena',
 '95EE': 'Ballymoney',
 '95FF': 'Banbridge',
 '95GG': 'Belfast',
 '95HH': 'Carrickfergus',
 '95II': 'Castlereagh',
 '95JJ': 'Coleraine',
 '95KK': 'Cookstown',
 '95LL': 'Craigavon',
 '95MM': 'Derry',
 '95NN': 'Down',
 '95OO': 'Dungannon',
 '95PP': 'Fermanagh',
 '95QQ': 'Larne',
 '95RR': 'Limavady',
 '95SS': 'Lisburn',
 '95TT': 'Magherafelt',
 '95UU': 'Moyle',
 '95VV': 'Newry and Mourne',
 '95WW': 'Newtownabbey',
 '95XX': 'North Down',
 '95YY': 'Omagh',
 '95ZZ': 'Strabane'}

In [None]:
# Harmonised LAD name: use the override for codes listed in rename_lads, otherwise keep
# the name that ships with the shapefile
lad_shape['lad_name'] = [rename_lads.get(code, shapefile_name) for code, shapefile_name in
                         zip(lad_shape['lad17cd'], lad_shape['lad17nm'])]

## 1. Load files and spatial join

In [None]:
# Directory holding the raw GtR extract (the trailing slash is part of the value)
gtr_dir = '../data/raw/gtr/2019-05-02/'

# gtr_dir already ends with '/', so the file names carry no leading slash; the original
# concatenation produced '...2019-05-02//gtr_organisations.csv'
orgs = pd.read_csv(gtr_dir+'gtr_organisations.csv')
orgs_locs = pd.read_csv(gtr_dir+'gtr_organisations_locations.csv')

In [None]:
# Build one shapely Point per location, x = longitude and y = latitude
orgs_locs['coordinates'] = [Point(lon, lat) for lon, lat in
                            zip(orgs_locs['longitude'], orgs_locs['latitude'])]

In [None]:
# The longitude/latitude points carry no CRS of their own. Tag them with the CRS of lad_shape
# (reprojected to EPSG:4326 earlier) so the spatial join does not run on mismatched CRS metadata.
# Assumes the GtR coordinates are WGS84 longitude/latitude — TODO confirm.
org_locs = gp.GeoDataFrame(orgs_locs,geometry='coordinates',crs=lad_shape.crs)

Spatial join (point in polygon)

In [None]:
# Point-in-polygon join: each organisation location is paired with the LAD polygon it falls within.
# Locations that fall in no polygon are checked further down via the matched ids.
# NOTE(review): newer geopandas releases rename `op` to `predicate`; revisit when upgrading.
lad_gtr = gp.sjoin(org_locs,lad_shape,op='within')

In [None]:
# Sanity check: the LADs with the most matched organisation locations
lad_gtr.lad_name.value_counts().head()

In [None]:
# Number of rows produced by the spatial join
len(lad_gtr)

In [None]:
# Number of organisation locations before the join, for comparison
len(org_locs)

In [None]:
# IDs of the organisation locations that were matched to a LAD
matched_ids = set(lad_gtr['id'])

# Countries of the locations with no LAD match. The mask is built from the same frame it is
# applied to (the original mixed `org_locs` and `orgs_locs`), and rows and column are
# selected in a single .loc call
orgs_locs.loc[~orgs_locs['id'].isin(matched_ids),'country_name'].value_counts().head()

Most of the unmatched orgs have missing geographical information

In [None]:
#Create an org id - lad lookup
org_lad_lookup = {x['id']:[x['lad17cd'],x['lad_name']] for n,x in lad_gtr.iterrows()}

## Create dfs for matching

I want a df where every row is a project. The columns represent:

* The LAD of the lead organisation
* The LADs of the participant organisations
* Flags for whether the lead and participating organisations are Scottish or not

In [None]:
# Project <-> entity link table; read from the same GtR extract directory as the organisation
# files instead of repeating the path literal
link = pd.read_csv(gtr_dir+'gtr_link_table.csv')

In [None]:
# Keep only the organisation relations (rel contains the literal '_ORG'). na=False treats a
# missing rel as "not an organisation link" instead of raising on the NaN.
# reset_index(drop=False) keeps the original row labels in an 'index' column.
org_link = link.loc[link['rel'].str.contains('_ORG',regex=False,na=False)].reset_index(drop=False)

In [None]:
# Inspect the available columns (includes the 'index' column kept by reset_index(drop=False))
org_link.columns

In [None]:
# One row per (project, relation type) holding the de-duplicated organisation ids.
# dict.fromkeys keeps first-seen order, so the lists are identical from run to run
# (the order of list(set(...)) changes between kernels for string ids)
org_link_grouped = org_link.groupby(['project_id','rel'])['id'].apply(lambda ids: list(dict.fromkeys(ids))).reset_index(drop=False)

In [None]:
# Reshape to one row per project with one column per relation type (e.g. LEAD_ORG), each cell
# holding the list of organisation ids. org_link_grouped has a single row per (project_id, rel),
# so the aggfunc only unwraps that one list. Projects lacking a relation type get a missing
# value in that column, which lad_allocator handles.
org_link_grouped_wide = pd.pivot_table(org_link_grouped,index='project_id',columns='rel',values='id',aggfunc=lambda x: list(x)[0])

#### Run the lookup - we need some nested loops to deal with missing values and missing orgs

In [None]:
def lad_allocator(var_name,df,lookup=None):
    '''
    Looks up the lad code and name of organisations participating in a project

    Args:
        var_name: relation prefix, e.g. 'LEAD'. The organisation ids are read from the
            column f'{var_name}_ORG'.
        df: dataframe with one row per project. It is modified in place: the columns
            f'{var_name.lower()}_lad_code' and f'{var_name.lower()}_lad_name' are added.
        lookup: mapping org id -> [lad code, lad name]. Defaults to the notebook-level
            org_lad_lookup.

    Returns:
        The same df object, with the two list-valued columns added. Projects with no
        organisations for this relation, and organisations missing from the lookup,
        contribute nothing to the lists.
    '''
    if lookup is None:
        lookup = org_lad_lookup

    prefix = var_name.lower()

    lad_codes, lad_names = [], []

    for org_ids in df[f'{var_name}_ORG']:
        # A project without this relation type holds a missing value (NaN / None) rather than
        # a list. isinstance also catches numpy floats, which type(x)==float does not
        if org_ids is None or isinstance(org_ids,float):
            matches = []
        else:
            matches = [lookup[org_id] for org_id in org_ids if org_id in lookup]

        lad_codes.append([match[0] for match in matches])
        lad_names.append([match[1] for match in matches])

    df[f'{prefix}_lad_code'] = lad_codes
    df[f'{prefix}_lad_name'] = lad_names

    return(df)
    
    

Each of these returns a geolabelled dataset

In [None]:
# Each call adds the <relation>_lad_code / <relation>_lad_name columns to
# org_link_grouped_wide in place and returns that same frame, so org_geo ends up
# carrying the columns for every relation type
for relation in ['LEAD','PARTICIPANT','PP','COLLAB','FELLOW']:
    org_geo = lad_allocator(relation,org_link_grouped_wide)

We combine the LAD codes and names into two pairs of list columns: `all_*` covers every organisation in the project, and `involved_*` covers every organisation except the lead.

In [None]:
involved_roles = ['participant','pp','collab','fellow']

# 'all' = lead + involved organisations; 'involved' = everyone except the lead.
# For each row, the per-role lists are concatenated in role order.
for group, roles in [('all',['lead']+involved_roles),('involved',involved_roles)]:
    for var in ['code','name']:
        role_columns = [org_geo[f'{role}_lad_{var}'] for role in roles]
        org_geo[f'{group}_lad_{var}'] = [flatten_list(per_role) for per_role in zip(*role_columns)]

In [None]:
# Save the project-level LAD labels. `today_str` is not defined in this notebook (presumably it
# comes from the preamble — confirm).
# NOTE(review): the output is zip-compressed although the file name ends in .csv, so it must be
# read back with compression='zip'.
org_geo.to_csv(f'../data/temp_scotland/{today_str}_gtr_org_lad_labelled.csv',compression='zip')

## Merge with the combined df

In [None]:
# Combined GtR projects table (a zip-compressed csv)
df = pd.read_csv('../data/processed/9_5_2019_combined_gtr_projects.csv',compression='zip')

# Discard the 'Unnamed...' columns (presumably indexes saved by an earlier export)
kept_columns = [col for col in df.columns if 'Unnamed' not in col]
df = df[kept_columns]

In [None]:
# Inner join: only projects present in both the combined table and the LAD-labelled table are kept
df_w_geo = df.merge(org_geo,how='inner',left_on='project_id',right_on='project_id')

In [None]:
# Difference in row counts before and after the merge
len(df)-len(df_w_geo)

In [None]:
# Project ids that survived the merge (note: this rebinds matched_ids, which earlier held org ids)
matched_ids = set(df_w_geo.project_id)

# Projects from the combined table with no organisation/LAD information
is_matched = df['project_id'].isin(matched_ids)
unmatched = df.loc[~is_matched]

In [None]:
# Ids of the projects that found no match in the LAD-labelled table
unmatched_ids = set(unmatched['project_id'])

In [None]:
# Which relation types exist in the link table for the unmatched projects?
link.loc[link['project_id'].isin(unmatched_ids),'rel'].value_counts()

No organisation data for the unmatched ones! 

**Todo** check with Joel and Russ about this

In [None]:
#df_w_geo_w_lead = df_w_geo.dropna(axis=0,subset=['lead_lad_name'])

In [None]:
# Flag projects whose lead / non-lead organisations include at least one Scottish LAD
for flag_name, lad_column in [('lead_scot','lead_lad_name'),('inv_scot','involved_lad_name')]:
    df_w_geo[flag_name] = [not scottish_lads.isdisjoint(lad_names) for lad_names in df_w_geo[lad_column]]

In [None]:
# Number of Scottish LAD entries among the project's organisations.
# The builtin sum always yields an int; np.sum returns 0.0 for an empty list, which turned
# the whole column into floats.
# NOTE(review): the column is called inv_scot_n but it reads all_lad_name (lead included),
# not involved_lad_name — confirm which one is intended.
df_w_geo['inv_scot_n'] = [sum(el in scottish_lads for el in x) for x in df_w_geo['all_lad_name']]

In [None]:
# Save the geo-labelled projects table.
# NOTE(review): the output is zip-compressed although the file name ends in .csv, so it must be
# read back with compression='zip'.
df_w_geo.to_csv(f'../data/temp_scotland/{today_str}_gtr_projects_geo_labelled.csv',compression='zip')

In [None]:
# How often each LAD appears as the location of a lead organisation, across all projects
pd.Series(flatten_list(df_w_geo['lead_lad_name'])).value_counts()

In [None]:
# Organisation-project links per LAD. This uses the shapefile's own lad17nm,
# not the harmonised lad_name.
pd.merge(org_link,lad_gtr,left_on='id',right_on='id')['lad17nm'].value_counts()

In [None]:
# Organisation records joined to their LAD match, used below to spot-check single organisations
la = pd.merge(orgs,lad_gtr,left_on='id',right_on='id')

In [None]:
# Ids of the organisation records named exactly 'University of Cambridge'
is_cambridge_uni = la['name']=='University of Cambridge'
cam = set(la.loc[is_cambridge_uni,'id'])

In [None]:
# Link-table rows that point at any of the University of Cambridge ids
la2 = link.loc[link['id'].isin(cam)]

In [None]:
# Display the matched University of Cambridge ids
cam

In [None]:
# Ids recorded under the alternative spelling 'Cambridge University'
# (cam2 is not used again in the cells visible here)
cam2 = la.loc[la.name=='Cambridge University']['id']

In [None]:
# List, alphabetically, every organisation located in the Cambridge LAD,
# leaving two blank lines after each name
cambridge_org_names = sorted(la.loc[la['lad17nm']=='Cambridge','name'])
for org_name in cambridge_org_names:
    print(org_name, end='\n\n\n')