In [41]:
# Modelling

%run ../notebook_preamble.ipy

import pandas as pd
import numpy as np
import altair as alt
import sg_covid_impact
from sg_covid_impact.secondary_data import read_secondary
from sg_covid_impact.descriptive import (
    read_official,
    read_claimant_counts,claimant_count_norm,read_search_trends,make_exposure_shares,
                                         search_trend_norm,make_high_exposure,rank_sector_exposures,
                                        make_exposure_shares_detailed)
from sg_covid_impact.make_sic_division import extract_sic_code_description,load_sic_taxonomy
from sg_covid_impact.diversification import (
    load_predicted,extract_sectors,extract_network,make_diversification_options,make_sector_space_base)

# Project directory as exposed by the sg_covid_impact package
# (presumably the repository root — TODO confirm).
project_dir = sg_covid_impact.project_dir


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [102]:
def make_exposure_share_variable(exposure_thres=7):
    """Build the 'exposure_share' variable in long (variable / value) layout.

    Chains ``make_sector_exposure_table``, ``make_local_exposure_table`` and
    ``make_high_exposure`` (keyed on 'geo_cd'), then relabels the result.

    Args:
        exposure_thres: exposure level handed to ``make_high_exposure`` as
            its ``level`` argument.

    Returns:
        The output of ``make_high_exposure`` with a constant ``variable``
        column equal to 'exposure_share' and its ``share`` column renamed
        to ``value``.
    """
    logging.info("Calculating sector exposure")
    sector_ranks = make_sector_exposure_table()

    logging.info("Calculating local exposure shares")
    local_exposure = make_local_exposure_table(sector_ranks)

    logging.info(f"Calculating high exposure shares level {exposure_thres}")
    high_exposure = make_high_exposure(
        local_exposure, geo='geo_cd', level=exposure_thres)
    labelled = high_exposure.assign(variable='exposure_share')
    return labelled.rename(columns={'share': 'value'})

def make_sector_exposure_table():
    """Rank sector exposure at the 'division' level from the search trends.

    Reads the search trends, normalises them with ``search_trend_norm`` and
    hands the result to ``rank_sector_exposures`` together with 'division'.

    Returns:
        Whatever ``rank_sector_exposures`` returns; other code in this
        notebook reads its ``division``, ``month`` and ``rank`` columns.
    """
    search_trends = read_search_trends()
    return rank_sector_exposures(search_trend_norm(search_trends), 'division')

def make_local_exposure_table(exposures_ranked):
    """Combine ranked sector exposures with the official data and aggregate.

    Args:
        exposures_ranked: table with a ``division`` column, as produced by
            ``make_sector_exposure_table``.

    Returns:
        The result of ``make_exposure_shares`` applied to the inner join of
        ``exposures_ranked`` and ``read_official()`` on ``division``, using
        'geo_cd' as the geography.
    """
    official = read_official()
    # Both sides share the key name, so a plain `on` join is sufficient.
    joined = exposures_ranked.merge(official, on='division')
    return make_exposure_shares(joined, 'geo_cd')


def make_div_share_variable(exposure_level = 7, div_level = 3):
    """Build the 'low_diversification_share' variable by local area and month.

    Steps:
      1. Rank sector exposure and index the ranks by (division, month).
      2. Build a sector space from the predicted table restricted to the
         ranked divisions.
      3. Join the ranked exposures to the official data on ``division``.
      4. For each month, score diversification options and bin the scores
         into quartiles (``divers_ranking``); because the binned quantity is
         the negated ``mean``, code 3 is the quartile with the lowest mean.
      5. Aggregate shares by ``divers_ranking`` and keep ``div_level``.

    Args:
        exposure_level: lower bound of ``range(exposure_level, 10)``, the
            exposure ranks passed to ``make_diversification_options``.
        div_level: ``divers_ranking`` code (0-3) whose share is returned.

    Returns:
        DataFrame with columns ``month``, ``geo_cd``, ``variable`` (always
        'low_diversification_share') and ``value``, with a fresh index.
    """
    division_names = extract_sic_code_description(load_sic_taxonomy(), "Division")

    logging.info("Calculating sector exposure")
    exposures_ranked = make_sector_exposure_table()
    division_month_exposure_dict = (
        exposures_ranked.set_index(['division', 'month'])['rank'].to_dict())

    logging.info("Making sector space")
    # First-appearance order keeps the column selection reproducible between
    # sessions (iterating a set of strings is hash-dependent).
    my_divisions = exposures_ranked['division'].drop_duplicates().tolist()
    predicted = load_predicted()
    # NOTE(review): 0.5 and extra_edges=70 are tuning values whose meaning is
    # defined by the diversification helpers, not visible here.
    sectors = extract_sectors(predicted[my_divisions], 0.5)
    div_space = extract_network(sectors)
    # Only the second element of the returned triple is used below.
    _, sector_graph, _ = make_sector_space_base(sector_space=div_space, extra_edges=70)

    logging.info("Calculating local exposure shares")
    bres = read_official()
    exposure_levels = exposures_ranked.merge(bres, on='division')
    exposure_levels['division_name'] = exposure_levels['division'].map(division_names)

    logging.info("Calculating diversification share rankings")
    # NOTE(review): months 3-10 are hard-coded; confirm they match the months
    # present in the exposure ranking.
    monthly_diversification_rankings = pd.concat([
        (make_diversification_options(
            sector_graph, division_month_exposure_dict, month,
            range(exposure_level, 10), [0, 1, 2, 3])
         .sort_values('mean', ascending=False)
         .assign(divers_ranking=lambda x: pd.qcut(
             -x['mean'], q=np.arange(0, 1.1, 0.25), labels=False))
         .assign(month=month))
        for month in range(3, 11)])

    # Merge with diversification information
    logging.info(f"Calculating diversification shares level {div_level}")
    diversification_lad_detailed = exposure_levels.merge(
        monthly_diversification_rankings, on=['division', 'month'], how='outer')

    # Rows without a diversification ranking get an explicit label so they
    # still form their own group when shares are aggregated.
    diversification_lad_detailed['divers_ranking'] = (
        diversification_lad_detailed['divers_ranking'].fillna('Less exposed'))

    shares_by_ranking = make_exposure_shares(
        diversification_lad_detailed, geography='geo_cd', variable='divers_ranking')

    diversification_shares = (
        shares_by_ranking
        .query(f"divers_ranking == {div_level}")
        .assign(variable='low_diversification_share')
        .drop(columns=['value'])
        .rename(columns={'share': 'value'})
        [['month', 'geo_cd', 'variable', 'value']]
        .reset_index(drop=True))

    return diversification_shares
    
def make_claimant_count_variable():
    """Stack the raw and normalised claimant counts in long format.

    The raw series keeps only the rows whose ``measure_name`` is
    'Claimants as a proportion of residents aged 16-64' and is labelled
    'cl_count'; the normalised series comes from ``claimant_count_norm`` and
    is labelled 'cl_count_norm'.

    Returns:
        DataFrame with columns ``geo_cd``, ``month``, ``value`` and
        ``variable``; the raw rows come first and original row indices are
        kept.
    """
    def _as_long(frame, value_column, label):
        # Keep the geography/month keys plus one value column, then rename to
        # the shared long-format schema and tag the rows with their label.
        trimmed = frame[['geography_code', 'month', value_column]]
        renamed = trimmed.rename(
            columns={'geography_code': 'geo_cd', value_column: 'value'})
        return renamed.assign(variable=label)

    claimants = read_claimant_counts()

    proportion_rows = claimants.query(
        "measure_name=='Claimants as a proportion of residents aged 16-64'")
    raw_long = _as_long(proportion_rows, 'obs_value', 'cl_count')

    normalised = claimant_count_norm(claimants)
    normalised_long = _as_long(normalised, 'cl_norm', 'cl_count_norm')

    return pd.concat([raw_long, normalised_long])
    
def make_secondary_variables():
    """Load the secondary data trimmed to the shared long-format columns.

    Returns:
        The table from ``read_secondary`` with ``geography_code`` renamed to
        ``geo_cd`` and only the ``geo_cd``, ``variable`` and ``value``
        columns kept, in that order.
    """
    wanted_columns = ['geo_cd', 'variable', 'value']
    renamed = read_secondary().rename(columns={'geography_code': 'geo_cd'})
    return renamed[wanted_columns]
    

In [106]:
# 'exposure_share' variable by local area, using the default threshold (7).
exp = make_exposure_share_variable()

2021-01-10 13:14:14,918 - root - INFO - Calculating sector exposure
2021-01-10 13:14:17,918 - root - INFO - Calculating local exposure shares
2021-01-10 13:14:22,832 - root - INFO - Calculating high exposure shares level 7


In [69]:
# 'low_diversification_share' variable with the defaults
# (exposure_level=7, div_level=3).
div = make_div_share_variable()

2021-01-10 12:59:49,456 - root - INFO - Calculating sector exposure
2021-01-10 12:59:52,245 - root - INFO - Making sector space
2021-01-10 13:00:15,949 - root - INFO - Calculating local exposure shares
2021-01-10 13:00:37,971 - root - INFO - Calculating diversification share rankings
2021-01-10 13:00:38,053 - root - INFO - Calculating diversification shares level 3


In [94]:
# Raw ('cl_count') and normalised ('cl_count_norm') claimant counts, stacked.
cl = make_claimant_count_variable()

In [103]:
# Secondary variables in long format (geo_cd / variable / value); this table
# has no month column.
secondary = make_secondary_variables()

In [234]:
# Short display labels for the secondary variables, keyed by the original
# variable name.
short_secondary_titles = {
    '% with NVQ4+ - aged 16-64': '% tertiary',
    '% with no qualifications (NVQ) - aged 16-64': '% no qual',
    '% with other qualifications (NVQ) - aged 16-64': '% other qual',
    'Annual pay - gross': 'Gross annual pay',
    'Economic activity rate - aged 16-64': 'Econ Activity rate',
    'Employment rate - aged 16-64': 'Emp rate',
    'cl_count': 'Claimant count',
    'cl_count_norm': 'Claimant (normalised)',
    'smd_high_deprivation_share': 'SMDI',
}

# Facet order for the correlation chart. Every entry must be spelled exactly
# like a label in `short_secondary_titles`, otherwise it cannot match a facet.
sort_vars = [
    'Claimant count',
    'Claimant (normalised)',
    'Econ Activity rate',
    'Emp rate',
    'Gross annual pay',
    '% tertiary',
    '% no qual',
]

In [239]:
### Visualise relationships between variables

web_vars = [exp,div]
other_vars = [cl,secondary]


def _monthly_correlations(web_df, other_df):
    """Correlate one web-derived variable with every variable in another table.

    For each variable in ``other_df`` and each month in ``web_df`` the two
    tables are joined on ``geo_cd`` and the correlation between their
    ``value`` columns is computed. If ``other_df`` has a ``month`` column it
    is restricted to the same month; otherwise the same rows are reused for
    every month.

    Args:
        web_df: long-format table with ``geo_cd``, ``month``, ``variable``
            and ``value`` columns.
        other_df: long-format table with ``geo_cd``, ``variable`` and
            ``value`` columns, and optionally ``month``.

    Returns:
        A list of dicts with keys 'primary', 'secondary', 'month', 'corr'.
    """
    if web_df.empty:
        return []

    # The web tables are expected to hold a single variable label, so it is
    # read once rather than on every iteration.
    primary_name = web_df['variable'].iloc[0]
    other_is_monthly = 'month' in other_df.columns
    months = sorted(web_df['month'].dropna().unique())

    records = []
    # `unique()` keeps first-appearance order, so row order is reproducible
    # (iterating a set of strings is hash-dependent).
    for secondary_name in other_df['variable'].unique():
        other_rows = other_df.loc[other_df['variable'] == secondary_name]
        for month in months:
            web_rows = web_df.loc[web_df['month'] == month]
            if other_is_monthly:
                matched_rows = other_rows.loc[other_rows['month'] == month]
            else:
                matched_rows = other_rows
            merged = web_rows.merge(matched_rows, on='geo_cd')
            # Built-in float() replaces the removed `np.float` alias.
            corr = float(merged[['value_x', 'value_y']].corr().iloc[0, 1])
            records.append({'primary': primary_name,
                            'secondary': secondary_name,
                            'month': month,
                            'corr': corr})
    return records


out = [record
       for w in web_vars
       for o in other_vars
       for record in _monthly_correlations(w, o)]

correlation_df = pd.DataFrame(out, columns=['primary', 'secondary', 'month', 'corr'])

correlation_df = (correlation_df
                  .assign(secondary_short = lambda x: x['secondary'].map(short_secondary_titles)))
# Drop the secondary variables that are not shown in the chart.
correlation_df= correlation_df.loc[~correlation_df['secondary_short'].isin(['% other qual','SMDI',
                                                                           'Econ Activity rate'])]

In [287]:
# Scatter of correlation by month, one small panel per secondary variable
# (two panels per row, ordered by `sort_vars`); colour and shape both encode
# the primary (web-derived) variable.
ch = (
    alt.Chart(correlation_df)
    .mark_point(filled=True)
    .encode(
        x=alt.X('month'),
        y=alt.Y('corr'),
        color=alt.Color('primary'),
        shape=alt.Shape('primary'),
        facet=alt.Facet(
            'secondary_short',
            columns=2,
            sort=sort_vars,
            header=alt.Header(labelAngle=0, labelAnchor='start', labelOrient='top'),
        ),
    )
    .properties(width=300, height=50)
)

In [288]:
# Bare expression so the notebook renders the chart as the cell output.
ch