# Label arXiv papers with Local Authority District (LAD) data

We will use these labels in our subnational analysis of DL research.

In [None]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [None]:
# Run the shared preamble script.
# NOTE(review): `pd`, `np` and `today_str` are used below without being
# imported or defined in this notebook, so they presumably come from this
# preamble -- confirm.
%run notebook_preamble.ipy

In [None]:
import geopandas as gp

from shapely.geometry import Point

## Load data

In [None]:
# GRID address table (read below through its `grid_id`, `lat` and `lng` columns).
grid_lat_lon = pd.read_csv(filepath_or_buffer='../data/external/grid/full_tables/addresses.csv')
# GRID institute table (filtered below on its `Name` column).
grid = pd.read_csv(filepath_or_buffer='../data/external/grid/grid.csv')

### Fix a fuzzy-matching bug: UCL papers were matched to UCL Australia instead of University College London (UK)

In [None]:
# Select the GRID record(s) named 'University College London' and relabel the
# columns with the `institute_*` naming used for the matched-paper table.
# The relabel is positional, so it requires `grid` to have exactly five columns.
ucl = grid[grid['Name'].eq('University College London')].reset_index(drop=True)
ucl.columns = [
    'institute_id',
    'institute_name',
    'institute_city',
    'institute_state',
    'institute_country',
]

In [None]:
# Fetch the address entry whose `grid_id` equals the id of the first UCL
# record (row label 0), then copy its coordinates onto `ucl`.
ucl_address = grid_lat_lon.set_index(keys='grid_id').loc[ucl.at[0, 'institute_id']]
ucl['institute_lat'] = ucl_address['lat']  # latitude first: keeps the column order
ucl['institute_lon'] = ucl_address['lng']


In [None]:
# Paper-to-institute matches. `article_id` is forced to str so that ids such
# as '0905.0201' are not parsed as numbers.
grid_matched = pd.read_csv(
    '../data/external/1_8_2019_paper_institute_locations.csv',
    dtype={'article_id': str},
    compression='zip',
)

In [None]:
# Keep only the matches that carry both a latitude and a longitude.
grid_matched_w_data = grid_matched.dropna(
    subset=['institute_lat', 'institute_lon'],
    how='any',
)

In [None]:
# Papers that were matched to 'UCL Australia' (only rows with coordinates,
# because the selection is made on grid_matched_w_data).
ucl_papers = grid_matched_w_data.loc[grid_matched_w_data['institute_name'] == 'UCL Australia']

# Keep the first four columns and drop the rest, which are re-attached from
# `ucl` by the merge below. The explicit .copy() makes the relabel write to
# an independent frame rather than to a slice of grid_matched_w_data, which
# would raise SettingWithCopyWarning.
# NOTE(review): the selection is positional -- confirm the first four columns
# are the paper-level ones intended.
ucl_papers_without_geo = ucl_papers.iloc[:, :4].copy()
ucl_papers_without_geo['institute_name'] = 'University College London'

# Attach the University College London institute fields (id, city, state,
# country, lat, lon) to every relabelled paper row.
ucl_papers_geo = pd.merge(ucl_papers_without_geo, ucl, on='institute_name')

In [None]:
# Replace the 'UCL Australia' matches with their corrected University College
# London rows.
# Note: the rows are removed from the unfiltered grid_matched, while
# ucl_papers_geo was built from grid_matched_w_data, so 'UCL Australia' rows
# lacking coordinates are dropped without a replacement.
# ignore_index=True gives the combined frame a fresh 0..n-1 index; without it
# the labels of ucl_papers_geo would duplicate labels of the first part.
grid_matched_clean = pd.concat(
    [
        grid_matched.loc[grid_matched['institute_name'] != 'UCL Australia'],
        ucl_papers_geo,
    ],
    axis=0,
    ignore_index=True,
)

grid_matched_clean.head()

In [None]:
# Local Authority District boundaries (December 2018 shapefile).
lads = gp.read_file(
    filename='../data/external/lad_shape/Local_Authority_Districts_December_2018_Boundaries_GB_BFC.shp',
)

In [None]:
# Disabled export of the cleaned paper-institute table; uncomment to write it out.
#grid_matched_clean.to_csv(f'../data/external/{today_str}_papers_institution_ucl_cleaned.csv',compression='zip')

In [None]:
# Admin-1 (states / provinces) boundaries used for the all-countries merge.
admin_shapes = gp.read_file(
    filename='../data/external/admin/ne_10m_admin_1_states_provinces.shp',
)

## Merge (all countries)

In [None]:
# Create a lookup between cities and coordinates.
# We create the city_country key ("<city>_<country>") to avoid mistakes with
# duplicate city names (Cambridge, England vs Cambridge, Mass.).
# The key is NaN when either the city or the country is missing; checking
# only the city would raise a TypeError (str + float) for a row that has a
# city but no country.
grid_matched_clean['city_country'] = [
    city + '_' + country if pd.notnull(city) and pd.notnull(country) else np.nan
    for city, country in zip(
        grid_matched_clean['institute_city'],
        grid_matched_clean['institute_country'],
    )
]

# One row per city_country key, taking the coordinates of its first complete
# occurrence. reset_index(drop=False) keeps the original row label as an
# `index` column.
city_coords = (
    grid_matched_clean
    .dropna(axis=0, subset=['institute_city', 'institute_country', 'institute_lat', 'institute_lon'])
    .drop_duplicates(['city_country', 'institute_country'])
    [['city_country', 'institute_country', 'institute_lat', 'institute_lon']]
    .reset_index(drop=False)
)

In [None]:
# #This is to create the geodf
# grid_df = gp.GeoDataFrame(
#     grid_matched_clean, geometry=[Point(x, y) for x, y in zip(grid_matched_clean['institute_lon'], grid_matched_clean['institute_lat'])])


# Turn the city lookup into a GeoDataFrame with one point (lon, lat) per city.
# The CRS is declared explicitly so the spatial join against boundary layers
# reprojected to EPSG:4326 compares frames with matching CRS, rather than a
# frame with no CRS at all.
# NOTE(review): assumes institute_lat / institute_lon are WGS84 degrees, as
# implied by the boundaries being reprojected to EPSG:4326 -- confirm.
city_df = gp.GeoDataFrame(
    city_coords,
    geometry=[
        Point(lon, lat)
        for lon, lat in zip(city_coords['institute_lon'], city_coords['institute_lat'])
    ],
    crs={'init': 'epsg:4326'},
)

In [None]:
# Reproject both boundary layers to EPSG:4326 (WGS84 longitude / latitude).
admin_shapes = admin_shapes.to_crs(crs={'init': 'epsg:4326'})
lads = lads.to_crs(crs={'init': 'epsg:4326'})

In [None]:
#sp_joined = gp.sjoin(lads,grid_df,how='left',op='contains')

# Attach to every city point the admin-1 polygon it falls within; the left
# join keeps cities that fall inside no polygon.
sp_joined = gp.sjoin(
    left_df=city_df,
    right_df=admin_shapes,
    op='within',
    how='left',
)

In [None]:
# Preview the city key next to the matched admin-1 name.
sp_joined.loc[:, ['city_country', 'name_en']].head(n=5)

In [None]:
# Attach the admin-1 fields to every paper-institute row through the city
# key. The merge is an inner join (pandas default), so rows whose key has no
# counterpart in sp_joined are dropped.
papers_geo = grid_matched_clean.merge(
    sp_joined[['city_country', 'name_en', 'geometry', 'iso_a2']],
    on='city_country',
)
# Persist the labelled table as a zipped CSV stamped with today_str.
papers_geo.to_csv(
    f'../data/processed/{today_str}_grid_geo_admin_all.csv',
    compression='zip',
)

### LAD merge

In [None]:
#Reproject
# Make sure the LAD boundaries are in EPSG:4326 before the LAD join.
lads = lads.to_crs(crs={'init': 'epsg:4326'})

In [None]:
# Create the paper-level GeoDataFrame: one point (lon, lat) per
# paper-institute row.
# The CRS is declared explicitly so it matches the LAD boundaries, which are
# reprojected to EPSG:4326 before the join; otherwise the frame has no CRS.
# NOTE(review): assumes institute_lat / institute_lon are WGS84 degrees -- confirm.
grid_df = gp.GeoDataFrame(
    grid_matched_clean,
    geometry=[
        Point(lon, lat)
        for lon, lat in zip(grid_matched_clean['institute_lon'], grid_matched_clean['institute_lat'])
    ],
    crs={'init': 'epsg:4326'},
)

In [None]:
# Keep only rows flagged is_multinational == 0. Rows where the flag is
# missing compare False and are dropped as well.
grid_df = grid_df[grid_df['is_multinational'].eq(0)]

In [None]:
# For every LAD polygon, attach the paper points it contains. The left join
# keeps LADs that contain no point (their paper fields are missing).
sp_joined = gp.sjoin(
    left_df=lads,
    right_df=grid_df,
    op='contains',
    how='left',
)

In [None]:
# Disabled: the same export is already performed in the all-countries merge.
#papers_geo.to_csv(f'../data/processed/{today_str}_grid_geo_admin_all.csv',compression='zip')

In [None]:
# One row per article_id, holding the list of LAD codes and the list of LAD
# names that the paper's institutes fall in.
papers_grouped = pd.concat(
    [
        sp_joined.groupby('article_id')[column].apply(list)
        for column in ('lad18cd', 'lad18nm')
    ],
    axis=1,
)

In [None]:
# Write the article -> LAD lookup as JSON keyed by article_id.
papers_grouped.to_json(
    path_or_buf=f'../data/processed/{today_str}_arxiv_lads.json',
    orient='index',
)

In [None]:
# Article ids with a United Kingdom institute that did not end up in any LAD
# after the spatial join.
missing_ids = set(
    grid_df.loc[grid_df['institute_country'] == 'United Kingdom', 'article_id']
).difference(sp_joined['article_id'])

In [None]:
# Inspect the institute name and country recorded for one article id.
grid_matched_clean.loc[
    grid_matched_clean['article_id'] == '0905.0201',
    ['institute_name', 'institute_country'],
]