## Collect additional local authority district (LAD) data for journalism analysis

This includes:

* Population data
* Economic activity (e.g. unemployment)
* Health
* Education
* Crime



### Preamble



In [None]:
%run ../notebook_preamble.ipy

### Load data

#### Population

We use the NOMIS API from [here](https://www.nomisweb.co.uk/datasets/pestsyoala)

In [None]:
# Population data for LADs from the NOMIS API (dataset NM_2002_1).
# The query string is split into its parameters for readability; the
# adjacent literals concatenate into the same single URL.
lad_pop = pd.read_csv(
    'https://www.nomisweb.co.uk/api/v01/dataset/NM_2002_1.data.csv'
    '?geography=1820327937...1820328318'
    '&date=latestMINUS1-latest'
    '&gender=0'
    '&c_age=200,209'
    '&measures=20100'
)

In [None]:
lad_pop.head()

In [None]:
# Normalise the column names to lower case
lad_pop.columns = list(map(str.lower, lad_pop.columns))

In [None]:
lad_pop.columns

In [None]:
# We are interested in the date name, the geography name, the age name and the observed value

In [None]:
# Columns of the population table that are kept for the analysis
my_vars = [
    'date_name',
    'geography_name',
    'c_age_name',
    'obs_value',
    'geography_code',
]



In [None]:
# Keep the 2017 rows and reshape to one row per area with one column per age group
distr = (
    lad_pop.loc[lad_pop['date_name'] == 2017, my_vars]
    .pivot(index='geography_name', columns='c_age_name', values='obs_value')
)

# The new names are assigned by position.
# NOTE(review): this assumes the 65+ column comes before the all-ages column
# in the pivoted table - confirm against the c_age_name labels.
distr.columns = ['age_over_65', 'age_all']

# Share of the population in the older age group
distr['age_over_65_share'] = distr['age_over_65'].div(distr['age_all'])

#### Also download longitudinal population data to measure the number of journalists per capita

In [None]:
# Same NOMIS dataset as the population query above, but over the date range
# latestMINUS9-latest and for age code 200 only.
lad_pop_long_url = (
    'https://www.nomisweb.co.uk/api/v01/dataset/NM_2002_1.data.csv'
    '?geography=1820327937...1820328318'
    '&date=latestMINUS9-latest'
    '&gender=0'
    '&c_age=200'
    '&measures=20100'
)

pop_long = pd.read_csv(lad_pop_long_url)

# Normalise the column names to lower case
pop_long.columns = list(map(str.lower, pop_long.columns))

In [None]:
# Keep the date, the area identifiers and the observed value
pop_long_selected = pop_long.loc[
    :, ['date_name', 'geography_name', 'geography_code', 'obs_value']
]

# index_label=False suppresses the header field for the index column;
# the row index itself is still written to the file.
pop_long_selected.to_csv(
    f'../../data/processed/{today_str}_lad_pop_longitudinal.csv',
    index_label=False,
)

#### Economic activity and education

We obtain this from the NOMIS API (dataset `NM_17_5`, queried in the cell below).

In [None]:
# Economic activity and qualification variables from the NOMIS API (dataset NM_17_5).
# The adjacent literals concatenate into a single URL.
econ = pd.read_csv(
    'https://www.nomisweb.co.uk/api/v01/dataset/NM_17_5.data.csv'
    '?geography=1946157057...1946157436'
    '&date=latestMINUS5'
    '&variable=18,45,83,111,1487,290,344'
    '&measures=20599,21001,21002,21003'
)

In [None]:
# Normalise the column names to lower case
econ.columns = list(map(str.lower, econ.columns))

In [None]:
econ.columns

In [None]:
econ['variable_name'].value_counts()

In [None]:
# Columns of the economic table that are kept for the analysis
econ_vars = [
    'date_name',
    'geography_name',
    'geography_code',
    'variable_name',
    'measures_name',
    'obs_value',
]

# Keep only the rows whose measure is 'Variable' (rather than the numerator,
# denominator or confidence interval) and renumber them from zero.
econ_val = econ.loc[econ['measures_name'] == 'Variable', econ_vars].reset_index(drop=True)

In [None]:
# Reshape to one row per area and one column per variable
econ_wide = econ_val.pivot_table(
    values='obs_value',
    index='geography_name',
    columns='variable_name',
)

In [None]:
# Short names for the seven variables, assigned by position.
# NOTE(review): this relies on the order of the pivoted columns - confirm that
# each short name lines up with the intended variable_name.
econ_wide.columns = [
    'inactive_want_job_pc',
    'inactive_pc',
    'education_tertiary_pc',
    'education_no_qual_pc',
    'activity_rate_pc',
    'employment_rate_pc',
    'unemployment_rate_pc',
]

#### Health

We obtain this from [here](https://fingertips.phe.org.uk/profile/health-profiles/data#page/9/gid/1938132696/pat/6/par/E12000004/ati/101/are/E07000032)

In [None]:
# Health indicators, read from a local file under data/external
health = pd.read_csv(
    '../../data/external/indicators-DistrictUApre419.data.csv',
)

In [None]:
health.shape

In [None]:
# Lower-case the column names and replace spaces with underscores
health.columns = [name.lower().replace(' ', '_') for name in health.columns]

In [None]:
health.columns

In [None]:
health.area_type.value_counts()

In [None]:
health.indicator_name.value_counts()

The periods for which data is available vary between indicators. We select some variables of interest and, for each of them, the time period to use.

In [None]:
# Indicators to pull out of the health table. The strings are compared against
# the indicator_name column, so they must match it exactly.
# The two commented-out life expectancy indicators are currently switched off.
vars_of_interest = [
    # 'Life expectancy at birth',
    'Under 75 mortality rate: all causes',
    'Suicide rate',
    # 'Inequality in life expectancy at birth',
    'Infant mortality',
    'Violent crime (including sexual violence) - violence offences per 1,000 population',
    'Average Attainment 8 score',
    'Deprivation score (IMD 2015)',
    'Statutory homelessness - Eligible homeless people not in priority need',
]

In [None]:
# Time period to use for each entry of vars_of_interest, matched by position.
# The two switched-off life expectancy indicators would each need a
# '2015 - 17' entry at the matching position if they were re-enabled.
years_of_interest = [
    '2015 - 17',  # Under 75 mortality rate: all causes
    '2015 - 17',  # Suicide rate
    '2015 - 17',  # Infant mortality
    '2016/17',    # Violent crime (including sexual violence)
    '2015/16',    # Average Attainment 8 score
    '2015',       # Deprivation score (IMD 2015)
    '2016/17',    # Statutory homelessness
]

In [None]:
# One Series per indicator, indexed by area name and named after the indicator
health_container = []

for indicator, period in zip(vars_of_interest, years_of_interest):

    # Rows for this indicator in its chosen period, restricted to the
    # pre-April-2019 district geography and to the 'Persons' sex category
    is_selected = (
        (health['indicator_name'] == indicator)
        & (health['time_period'] == period)
        & (health['area_type'] == 'District & UA (pre 4/19)')
        & (health['sex'] == 'Persons')
    )

    indicator_values = (
        health.loc[is_selected]
        .set_index('area_name')['value']
        .rename(indicator)
    )

    health_container.append(indicator_values)
    #health_container.append(out[['indicator_name','value']])
        
    
    
    
    
    


In [None]:
# Districts that were abolished in the April 2019 local government
# reorganisation, mapped to the authority that replaced them.
# The original cell listed only the keys, with no values (and one colon inside
# a string), so it could not run.
# NOTE(review): the successor names were filled in during review - confirm
# this is the intended use of the lookup; nothing else in this notebook
# reads `clean`.
clean = {
    'Bournemouth': 'Bournemouth, Christchurch and Poole',
    'Christchurch': 'Bournemouth, Christchurch and Poole',
    'East Dorset': 'Dorset',
    'Forest Heath': 'West Suffolk',
    'North Dorset': 'Dorset',
    'Poole': 'Bournemouth, Christchurch and Poole',
    'Purbeck': 'Dorset',
    'St Edmundsbury': 'West Suffolk',
    'Suffolk Coastal': 'East Suffolk',
    'Taunton Deane': 'Somerset West and Taunton',
    'Waveney': 'East Suffolk',
    'West Dorset': 'Dorset',
    'West Somerset': 'Somerset West and Taunton',
    'Weymouth and Portland': 'Dorset',
}

In [None]:
# Put the per-indicator Series side by side: one row per area, one column per indicator
health_df = pd.concat(health_container, axis=1)

# Area names in the health data and the spelling to use instead.
# NOTE(review): presumably the replacement spellings are the ones used by the
# other tables in this notebook - confirm.
clean_health_lad_lookup = {
    'Bristol': 'Bristol, City of',
    'Folkestone & Hythe': 'Folkestone and Hythe',
    'Herefordshire': 'Herefordshire, County of',
    'Kingston upon Hull': 'Kingston upon Hull, City of',
    'St. Edmundsbury': 'St Edmundsbury',
}

# Swap in the replacement where one is listed; every other name is kept as it is
health_df.index = [clean_health_lad_lookup.get(name, name) for name in health_df.index]

In [None]:
# Short column names, in the same order as the indicators in vars_of_interest
health_df.columns = [
    'mortality_rate',
    'suicide_rate',
    'infant_mortality',
    'violent_crime_per_1000',
    'average_atainment',
    'deprivation_score',
    'statutory_homelessness',
]

### Brexit data

Accessed from [here](https://www.electoralcommission.org.uk/who-we-are-and-what-we-do/elections-and-referendums/past-elections-and-referendums/eu-referendum/results-and-turnout-eu-referendum)

In [None]:
# EU referendum results published by the Electoral Commission
brex = pd.read_csv(
    'https://www.electoralcommission.org.uk/sites/default/files/2019-07/EU-referendum-result-data.csv',
)

In [None]:
# Lower-case the column names and replace spaces with underscores
brex.columns = [name.lower().replace(' ', '_') for name in brex.columns]

In [None]:
# Leave votes as a fraction of all votes cast in the area
brex['leave_share'] = brex['leave'].div(brex['votes_cast'])

In [None]:
# Index by area name and keep only the leave share column
brex = brex.set_index('area')[['leave_share']]

### Output

Check potential issues with indices

In [None]:
from itertools import combinations,permutations

In [None]:
def missing_indices(dict_of_dfs):
    '''
    Print the index labels that are not shared between each pair of dataframes.

    Useful for spotting mismatched labels before merging. For every pair of
    keys in dict_of_dfs, the sorted labels found in the first index but not
    the second are printed, and then the reverse.

    Args:
        dict_of_dfs: dict mapping a name to an object with an .index attribute.

    Returns:
        None. The report is printed.

    '''

    for left, right in combinations(dict_of_dfs, 2):

        print(f'{left} and {right}')
        print('====')

        print('\n')

        # Report the set difference in both directions
        for first, second in ((left, right), (right, left)):

            only_in_first = set(dict_of_dfs[first].index).difference(
                set(dict_of_dfs[second].index))

            print(f'Disjoint {first}-{second}')
            print('---')
            print(sorted(only_in_first))

            print('\n')
        
        
            
        
    

In [None]:
# The four tables to be combined, keyed by a short label used in the index report
dict_of_dfs = {
    'pop': distr,
    'econ': econ_wide,
    'health': health_df,
    'brex': brex,
}

In [None]:
missing_indices(dict_of_dfs)

Some of the gaps here seem to be due to changes in the boundaries of LADs, e.g. check Bournemouth and Poole.

In [None]:
# Join the four tables side by side, aligned on their index labels
output = pd.concat(
    [distr, econ_wide, health_df, brex],
    axis='columns',
)

output

In [None]:
# Save the combined table under data/processed, with today_str as the file name prefix
output.to_csv(
    path_or_buf=f'../../data/processed/{today_str}_secondary_data.csv',
)

In [None]:
import seaborn as sns

In [None]:
# Clustered heatmap of the pairwise correlations between the output columns
sns.clustermap(data=output.corr(), cmap='bwr')