# Explore migration scaling, parameter uplift

In [None]:
import glob
import os
import re

import geopandas as gpd
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import simim.data_apis

from ukpopulation.myedata import MYEData
from ukpopulation.nppdata import NPPData
from ukpopulation.snppdata import SNPPData

In [None]:
# Draw every figure in this notebook on a 10 x 10 canvas.
plt.rcParams.update({"figure.figsize": (10, 10)})

In [None]:
def split_output_path(csv_path, root='../simim/data/output'):
    """Split a result file path into ``(experiment, filename)`` relative to *root*.

    ``os.path.relpath`` is used instead of stripping a hard-coded prefix so the
    result is the same whichever path separator ``glob`` returns on the host
    platform.
    """
    return os.path.split(os.path.relpath(csv_path, root))


# One (experiment directory, result file name) pair per CSV found one
# directory level below the output directory.
paths = [split_output_path(p) for p in glob.glob('../simim/data/output/**/*.csv')]
# Peek at the first entry; show nothing rather than fail when no output exists.
paths[0] if paths else None

In [None]:
# Load the December 2016 local authority district boundaries and discard the
# attribute columns listed below.
lads = gpd.read_file(
    '../simim/data/cache/Local_Authority_Districts_December_2016_Ultra_Generalised_Clipped_Boundaries_in_Great_Britain.shp'
).drop(
    columns=['objectid', 'lad16nmw', 'bng_e', 'bng_n', 'long', 'lat', 'st_lengths']
)
lads.head(n=1)

In [None]:
# Table of the LADs in the study area; later cells read its `geo_code` and
# `geo_label` columns.
arc_lads = pd.read_csv(
    '../simim/data/scenarios/camkox_lads.csv'
)
arc_lads.head(n=1)

In [None]:
def read_baseline(arc_lads):
    """Assemble the baseline table of jobs, GVA, dwellings, households and people.

    Households come from the simim data API for 2015-2050, for every LAD code
    in the module-level ``lads`` frame (this function reads that global; it is
    not a parameter). Population comes from ukpopulation: mid-year estimates
    for 2015, sub-national projections for 2030 and an extrapolation for 2050,
    restricted to the LADs in ``arc_lads``. Employment, GVA and dwellings are
    read from the ``arc_*__baseline.csv`` files.

    Parameters
    ----------
    arc_lads : pandas.DataFrame
        Must provide ``geo_code`` (LAD code) and ``geo_label`` (LAD name)
        columns; the result is limited to these LADs.

    Returns
    -------
    pandas.DataFrame
        One row per arc LAD for each of the years 2015, 2030 and 2050 that is
        present in every input, with ``SCENARIO`` set to ``'baseline'``.
    """
    # The simim instance is configured with paths relative to the simim
    # directory ("./data/cache", "./data/output"), so work from there. The
    # starting directory is restored in `finally` so that a failure while
    # loading does not leave the notebook in the wrong working directory,
    # and so that the directory the notebook runs from need not have a
    # particular name.
    start_dir = os.getcwd()
    os.chdir("../simim")
    try:
        # households
        simim_data = simim.data_apis.Instance({
            "coverage": "GB",
            "cache_dir": "./data/cache",
            "output_dir": "./data/output",
            "model_type": "none",
            "base_projection": "ppp",
            "scenario": "none",
            "attractors": []
        })
        households = pd.concat(
            [
                simim_data.get_households(year, lads.lad16cd.unique())
                for year in range(2015, 2051)
            ],
            sort=False,
        ).rename(columns={"PROJECTED_YEAR_NAME": "YEAR"})

        # population
        # NOTE(review): the ukpopulation loaders are kept inside the simim
        # working directory as before - presumably they also resolve their
        # cache relative to the cwd; confirm.
        lad_cds = list(arc_lads.geo_code.unique())
        mye = MYEData()
        years = [2015]
        pop_mye = mye.aggregate(["GENDER", "C_AGE"], lad_cds, years)

        npp = NPPData()
        snpp = SNPPData()
        snpp_years = [2030]
        extra_years = [2050]
        pop_snpp = snpp.aggregate(["GENDER", "C_AGE"], lad_cds, snpp_years)
        pop_ex = snpp.extrapolagg(["GENDER", "C_AGE"], npp, lad_cds, extra_years)
        pop = pd.concat([pop_mye, pop_snpp, pop_ex], axis=0) \
            .rename(columns={'OBS_VALUE': 'PEOPLE', 'PROJECTED_YEAR_NAME': 'YEAR'})
        pop["PEOPLE"] = pop["PEOPLE"].astype(int)
        # merged in later, once everything else has been subset
    finally:
        os.chdir(start_dir)

    # employment, gva, dwellings
    df_emp = pd.read_csv("../simim/data/arc/arc_employment__baseline.csv")
    df_gva = pd.read_csv("../simim/data/arc/arc_gva__baseline.csv")
    df_dwl = pd.read_csv("../simim/data/arc/arc_dwellings__baseline.csv")

    # merge to single dataframe
    df = df_gva \
        .merge(df_emp, on=["timestep", "lad_uk_2016"], how="left") \
        .merge(df_dwl, on=["timestep", "lad_uk_2016"], how="left")

    # Both "gva" and "gva_per_sector" map to GVA; presumably only one of them
    # is present in the input file - confirm.
    baseline = df.reset_index().rename(columns={
        "timestep": "YEAR",
        "lad_uk_2016": "GEOGRAPHY_CODE",
        "employment": "JOBS",
        "gva": "GVA",
        "gva_per_sector": "GVA",
        "dwellings": "DWELLINGS"
    })[[
        "YEAR", "GEOGRAPHY_CODE", "JOBS", "GVA", "DWELLINGS"
    ]].merge(
        households, on=["GEOGRAPHY_CODE", "YEAR"]
    )
    baseline["GVA"] = baseline["GVA"].round(6)
    # convert from 1000s jobs to jobs
    baseline["JOBS"] = (baseline["JOBS"] * 1000).round().astype(int)

    # keep only the arc LADs and the three years of interest
    baseline = baseline[
        baseline.GEOGRAPHY_CODE.isin(arc_lads.geo_code)
        & baseline.YEAR.isin([2015, 2030, 2050])
    ]
    # attach population, then LAD names
    baseline = baseline \
        .merge(pop, on=['GEOGRAPHY_CODE', 'YEAR']) \
        .merge(arc_lads, left_on='GEOGRAPHY_CODE', right_on='geo_code') \
        .drop(['geo_code'], axis=1) \
        .rename(columns={'geo_label': 'GEOGRAPHY_NAME'})

    baseline['SCENARIO'] = 'baseline'

    return baseline

baseline = read_baseline(arc_lads)

# Sanity check: (number of distinct years, number of distinct LADs, row count).
(
    len(baseline['YEAR'].unique()),
    len(baseline['GEOGRAPHY_CODE'].unique()),
    len(baseline),
)

In [None]:
# Inspect the last five baseline rows.
baseline.tail(n=5)

In [None]:
def read_output_and_scenario(arc_lads, baseline, scenario_key, output_path):
    """Join one model output file to the scenario inputs it was run with.

    Parameters
    ----------
    arc_lads : pandas.DataFrame
        LAD lookup with ``geo_code`` and ``geo_label`` columns.
    baseline : pandas.DataFrame
        Baseline table; its ``YEAR``, ``GEOGRAPHY_CODE``, ``DWELLINGS`` and
        ``HOUSEHOLDS`` columns are used to rebase scenario dwellings.
    scenario_key : str
        Scenario name, used to pick the ``arc_*__<key>.csv`` input files and
        written to the ``SCENARIO`` column.
    output_path : str
        Path of the model output CSV.

    Returns
    -------
    pandas.DataFrame
        Scenario inputs for the arc LADs in 2015, 2030 and 2050, left-joined
        to the model output, tagged with ``SCENARIO``.
    """
    # Two scenarios take their GVA and employment files from another scenario;
    # their dwellings file still uses their own key.
    econ_aliases = {
        "3-new-cities23": "1-new-cities",
        "4-expansion23": "2-expansion",
    }
    econ_key = econ_aliases.get(scenario_key, scenario_key)

    gva = pd.read_csv("../simim/data/arc/arc_gva__{}.csv".format(econ_key))
    employment = pd.read_csv("../simim/data/arc/arc_employment__{}.csv".format(econ_key))
    dwellings = pd.read_csv("../simim/data/arc/arc_dwellings__{}.csv".format(scenario_key))

    # merge to single dataframe
    join_keys = ["timestep", "lad_uk_2016"]
    inputs = gva.merge(employment, on=join_keys, how="left")
    inputs = inputs.merge(dwellings, on=join_keys, how="left")
    inputs = inputs.drop("lad16nm", axis=1)
    inputs = inputs.rename(columns={
        "timestep": "YEAR",
        "lad_uk_2016": "GEOGRAPHY_CODE",
        "gva_per_sector": "GVA",
        "employment": "JOBS",
        "dwellings": "HOUSEHOLDS"})

    # attach LAD names
    named = inputs.merge(arc_lads, left_on='GEOGRAPHY_CODE', right_on='geo_code')
    named = named.drop(['geo_code'], axis=1)
    named = named.rename(columns={'geo_label':'GEOGRAPHY_NAME'})

    # restrict to the arc LADs and the three years of interest
    is_arc_lad = named.GEOGRAPHY_CODE.isin(arc_lads.geo_code)
    is_wanted_year = named.YEAR.isin([2015, 2030, 2050])
    named = named[is_arc_lad & is_wanted_year]

    # rebase scenario households (dwelling) numbers on baseline households - this is what simim sees as input
    baseline_cols = baseline[['YEAR','GEOGRAPHY_CODE','DWELLINGS','HOUSEHOLDS']]
    rebased = named.merge(baseline_cols, on=['YEAR','GEOGRAPHY_CODE'])
    rebased['HOUSEHOLDS_x'] = rebased.HOUSEHOLDS_x - rebased.DWELLINGS + rebased.HOUSEHOLDS_y
    rebased = rebased.drop(['HOUSEHOLDS_y'], axis=1)
    rebased = rebased.rename(columns={'HOUSEHOLDS_x':'HOUSEHOLDS'})

    rebased['GVA'] = rebased.GVA.round(6)
    # convert from 1000s jobs to jobs
    rebased['JOBS'] = (rebased.JOBS * 1000).round().astype(int)
    rebased['HOUSEHOLDS'] = rebased.HOUSEHOLDS.round().astype(int)

    # model results, keyed the same way as the scenario inputs
    results = pd.read_csv(os.path.join(output_path))
    results = results.rename(columns={'PROJECTED_YEAR_NAME': 'YEAR'})

    combined = rebased.merge(results, on=["YEAR", "GEOGRAPHY_CODE"], how='left')
    combined = combined.drop(['PEOPLE_SNPP', 'RELATIVE_DELTA'], axis=1)

    combined['SCENARIO'] = scenario_key

    return combined

In [None]:
# Tag the baseline rows so they can be stacked with the experiment results.
baseline['EXPERIMENT'] = 'baseline'
dfs = [baseline]
for experiment, result in paths:
    # only result files whose name mentions 'od_rail' are of interest here
    if 'od_rail' in result:
        path = os.path.join('../simim/data/output', experiment, result)

        # The scenario key is the text following "scenario" in the path, up to
        # the next underscore; fall back to the whole path when there is none.
        m = re.search(r'scenario([^_]+)', path)
        scen = m.group(1) if m else path

        df = read_output_and_scenario(arc_lads, baseline, scen, path)
        # one population line per LAD over the years, one figure per result file
        df.pivot(index='YEAR', columns='GEOGRAPHY_NAME', values='PEOPLE') \
            .plot(title=' '.join((experiment, scen)))
        df['EXPERIMENT'] = experiment

        dfs.append(df)

# stack baseline and all experiment results into one long table
dataset = pd.concat(dfs, axis=0, sort=True)

In [None]:
# First row of the combined table (expected to be a baseline row, since baseline is first in `dfs`).
dataset.head(n=1)

In [None]:
# Last row of the combined table.
dataset.tail(n=1)

In [None]:
# People per household for every row.
dataset['PPH'] = dataset['PEOPLE'].div(dataset['HOUSEHOLDS'])

In [None]:
# Rows where there are fewer than two people per household.
dataset.loc[dataset['PPH'] < 2]

In [None]:
# Totals per year / scenario / experiment.
summary = dataset.groupby(by=["YEAR", 'SCENARIO', 'EXPERIMENT']).sum()
# A sum of per-LAD ratios is meaningless, so recompute PPH from the summed totals.
summary['PPH'] = summary['PEOPLE'] / summary['HOUSEHOLDS']
summary

In [None]:
summary = summary.reset_index()
# Attach each year's baseline PPH to every row as PPH_BASELINE.
summary = summary.merge(
    summary.loc[summary['SCENARIO'] == 'baseline', ['YEAR', 'PPH']],
    how='left',
    on='YEAR',
    suffixes=('', '_BASELINE'),
)

In [None]:
# Population expected if households were occupied at the baseline PPH ...
summary['EXP_POP'] = summary['HOUSEHOLDS'].mul(summary['PPH_BASELINE'])
# ... and the factor that scales the modelled population to that expectation.
summary['PEOPLE_SCALE_FACTOR'] = summary['EXP_POP'].div(summary['PEOPLE'])

In [None]:
# Display the summary, now including EXP_POP and PEOPLE_SCALE_FACTOR.
summary

In [None]:
# Bring the per year / scenario / experiment scale factor back onto each row.
dataset = dataset.merge(
    summary.loc[:, ['YEAR', 'SCENARIO', 'EXPERIMENT', 'PEOPLE_SCALE_FACTOR']],
    how='left',
    on=['YEAR', 'SCENARIO', 'EXPERIMENT'],
)
dataset

In [None]:
# Apply the scale factor to each row's population and recompute people per household.
dataset['SCALED_PEOPLE'] = dataset['PEOPLE'].mul(dataset['PEOPLE_SCALE_FACTOR'])
dataset['SCALED_PPH'] = dataset['SCALED_PEOPLE'].div(dataset['HOUSEHOLDS'])
dataset

In [None]:
# Rows still below two people per household after scaling.
dataset.loc[dataset['SCALED_PPH'] < 2]

In [None]:
# Wide layout: one row per LAD and year, one column group per experiment / scenario.
# No `values` or `aggfunc` is given, so pandas' defaults apply.
pivot = dataset.pivot_table(
    index=['GEOGRAPHY_CODE', 'GEOGRAPHY_NAME', 'YEAR'],
    columns=['EXPERIMENT', 'SCENARIO'],
)
pivot

In [None]:
# Save the wide table in the current working directory.
pivot.to_csv(path_or_buf='scaled_factor_experiments.csv')