# Population

## Setup

In [159]:
from matplotlib.colors import ListedColormap 
from matplotlib.ticker import (MultipleLocator, FormatStrFormatter, AutoMinorLocator)
from lib import formatting as sd_formatting
import pyodbc 
import toml
import config_logging
import logging
import urllib.request
import pandas as pd
import colorcet as cc
import numpy as np

log = logging.getLogger(__name__)
external_ip = urllib.request.urlopen('https://ident.me').read().decode('utf8')

properties = toml.load("./.streamlit/secrets.toml")

database_props = properties[properties["database"]["flavour"]] 


## Reload

In [160]:
### Set up

%load_ext autoreload
%autoreload 3

from data.daos import dao_facade_local as dao_fac
from lib import db_tools as db_tools
from lib import masters_data_analytics_lib as mlib
from lib import stats as stats

db_conn = None
db_conn = db_tools.get_db_conn(database_props)


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
2022-08-27 17:41:16,594 [INFO] lib.db_tools: DATABASE CONNECTIVITY


## Search Criteria

In [161]:
year_from = 2012
year_to   = 2021
city      = "London"

borough   = "Islington"
ward_name = "Holloway"

## 27% Black African Carribean at Borough Level
borough   = "Lewisham"
ward_name = "Bellingham"

borough   = "Islington"
ward_name = "Holloway"

## 27% Black African Carribean at Borough Level
borough   = "Lewisham"
ward_name = "Bellingham"

borough   = "Barking and Dagenham"
ward_name = "Abbey"

## FEMALE
### HIGH RATION
borough   = "Croydon"
ward_name = "New Addington North"

### LOW RATIO
# borough   = "Westminster"
# ward_name = "Bloomsbury"

# borough   = "Westminster"    # Highest
# borough   = "Brent"          # Middle
# borough   = "City of London" # Lowest

## We may not have data in the date range we have chosen
## this is true for ALL so fix it Neal

population_year_from = year_from
population_year_to   = year_to

# What do we have? We are only using the max for now
population_min_max_year_df = dao_fac.population_min_max_year(db_conn)

population_year_min = population_min_max_year_df["MIN_YEAR"].values[0]
population_year_max = population_min_max_year_df["MAX_YEAR"].values[0]

population_year_from_orig = population_year_from
population_year_to_orig = population_year_to

if population_year_from < int(population_year_min):
    population_year_from = int(population_year_min)
elif population_year_from > int(population_year_max):
    population_year_from = int(population_year_max)

if population_year_to > int(population_year_max):
    population_year_to = int(population_year_max)
elif population_year_to < int(population_year_min):
    population_year_to = int(population_year_min)

log.debug(f"population orig_year_to    :{population_year_to_orig}")
log.debug(f"population search_year_to  :{population_year_to}")

search_term = {"year_from":population_year_from,
               "year_to":population_year_to,
               "borough":borough,
               "ward_name":ward_name}

log.debug(search_term)


2022-08-27 17:41:16,727 [DEBUG] __main__: population orig_year_to    :2021
2022-08-27 17:41:16,728 [DEBUG] __main__: population search_year_to  :2011
2022-08-27 17:41:16,729 [DEBUG] __main__: {'year_from': 2011, 'year_to': 2011, 'borough': 'Westminster', 'ward_name': 'Bloomsbury'}


## Build the Data

In [162]:
population_by_borough_ward_year_df = dao_fac.population_year(db_conn, search_term)

2022-08-27 17:41:16,859 [DEBUG] root: retrieving london population year




# BOROUGH WARD LEVEL

## GENDER

### DATA

In [163]:
## BOROUGH WARD LEVEL

## PERCENTAGES MALE - FEMALE - STUDENT 
### make a copy so we can go back
pct_population_by_borough_ward_year_df = population_by_borough_ward_year_df[["YEAR", "BOROUGH", "WARD_NAME", "ALL", "MALE", "FEMALE", "STUDENT"]].copy()
pct_population_by_borough_ward_year_df["MALE"] = pct_population_by_borough_ward_year_df["MALE"]/pct_population_by_borough_ward_year_df["ALL"]
pct_population_by_borough_ward_year_df["FEMALE"] = pct_population_by_borough_ward_year_df["FEMALE"]/pct_population_by_borough_ward_year_df["ALL"]
pct_population_by_borough_ward_year_df["STUDENT"] = pct_population_by_borough_ward_year_df["STUDENT"]/pct_population_by_borough_ward_year_df["ALL"]

### DISPLAY

In [164]:
gender_population_by_borough_ward_year_filtered = pct_population_by_borough_ward_year_df.loc[(population_by_borough_ward_year_df["BOROUGH"]==borough)
                                                                                    & (population_by_borough_ward_year_df["WARD_NAME"]==ward_name)]

gender_population_by_borough_ward_year_filtered = gender_population_by_borough_ward_year_filtered.drop(["YEAR","STUDENT"], axis=1)
gender_population_by_borough_ward_year_filtered.T


Unnamed: 0,670
BOROUGH,Westminster
WARD_NAME,Bloomsbury
ALL,229.0
MALE,0.668122
FEMALE,0.331878


## POPULATION

### DATA

In [165]:
### There seem to be some extreme values in the ward level pph
### These will be removed when binning and if the value falls into that
### category then they will be in the highest bin

pph_population_by_borough_ward_year_df = population_by_borough_ward_year_df[["YEAR", "BOROUGH", "WARD_NAME", "DENSITY_PPH"]].copy()

min_borough_ward_density_pph = pph_population_by_borough_ward_year_df["DENSITY_PPH"].quantile(.01)
max_borough_ward_density_pph = pph_population_by_borough_ward_year_df["DENSITY_PPH"].quantile(.99)

log.debug(f"min_borough_ward_density_pph:{min_borough_ward_density_pph}")
log.debug(f"max_borough_ward_density_pph:{max_borough_ward_density_pph}")

## Creat3 the labels and bin
pph_population_by_borough_ward_lbls = ["Scarcely Populated", "Slightly Populated", "Averagely Populated", "Highly Populated",  "Very Highly Populated"]
pph_population_by_borough_ward_bins = np.arange(min_borough_ward_density_pph
                                              , max_borough_ward_density_pph-min_borough_ward_density_pph
                                              , (max_borough_ward_density_pph-min_borough_ward_density_pph)/len(pph_population_by_borough_ward_lbls)+1).tolist()
pph_population_by_borough_ward_bins.append(np.inf)
log.debug(pph_population_by_borough_ward_bins)

# log.debug(f"{(max_borough_ward_density_pph - min_borough_ward_density_pph)/len(pph_population_by_borough_ward_lbls)}")
# log.debug(f"pph_population_by_borough_ward_lbls:{pph_population_by_borough_ward_lbls}")
# log.debug(f"pph_population_by_borough_ward_bins:{pph_population_by_borough_ward_bins}")

pph_population_by_borough_ward_year_df['POPULATION_STATUS'] = pd.cut(pph_population_by_borough_ward_year_df["DENSITY_PPH"], bins=pph_population_by_borough_ward_bins, labels=pph_population_by_borough_ward_lbls)

2022-08-27 17:41:19,426 [DEBUG] __main__: min_borough_ward_density_pph:7.766666666666667
2022-08-27 17:41:19,427 [DEBUG] __main__: max_borough_ward_density_pph:293.08796992481206
2022-08-27 17:41:19,429 [DEBUG] __main__: [7.766666666666667, 65.83092731829575, 123.89518796992483, 181.95944862155395, 240.02370927318302, inf]


### DISPLAY

In [166]:
## Filter to just our borough and ward
pph_population_by_borough_ward_year_filtered = pph_population_by_borough_ward_year_df.loc[(pph_population_by_borough_ward_year_df["BOROUGH"]==borough)
                                                                                        & (pph_population_by_borough_ward_year_df["WARD_NAME"]==ward_name)]

pph_population_by_borough_ward_year_filtered = pph_population_by_borough_ward_year_filtered.drop(["YEAR"], axis=1)
pph_population_by_borough_ward_year_filtered

# ## Make a display version
borough_ward_population_table = pph_population_by_borough_ward_year_filtered.copy()
borough_ward_population_table["DENSITY_PPH"] = round(borough_ward_population_table["DENSITY_PPH"],2)
borough_ward_population_table.columns = [["Borough", "Ward", "Persons per Hectare", "Population Level"]]

borough_ward_population_table

Unnamed: 0,Borough,Ward,Persons per Hectare,Population Level
670,Westminster,Bloomsbury,72.5,Slightly Populated


# BOROUGH LEVEL

## GENDER

### DATA

In [167]:
## BOROUGH LEVEL
population_by_borough_year_df = population_by_borough_ward_year_df.groupby(["YEAR", "BOROUGH"], as_index=False)\
                                                                  .agg({"ALL":"sum", "MALE":"sum", "FEMALE":"sum", "STUDENT":"sum", "DENSITY_PPH":"mean" })

## PERCENTAGES MALE - FEMALE - STUDENT 
### make a copy so we can go back
pct_population_by_borough_year_df = population_by_borough_year_df[["YEAR", "BOROUGH", "ALL", "MALE", "FEMALE", "STUDENT"]].copy()
pct_population_by_borough_year_df["MALE"] = pct_population_by_borough_year_df["MALE"]/pct_population_by_borough_year_df["ALL"].astype(float)
pct_population_by_borough_year_df["FEMALE"] = pct_population_by_borough_year_df["FEMALE"]/pct_population_by_borough_year_df["ALL"].astype(float)
pct_population_by_borough_year_df["STUDENT"] = pct_population_by_borough_year_df["STUDENT"]/pct_population_by_borough_year_df["ALL"].astype(float)

### DISPLAY

In [168]:
gender_population_by_borough_year_filtered = pct_population_by_borough_year_df.loc[(pct_population_by_borough_year_df["BOROUGH"]==borough)]
gender_population_by_borough_year_filtered = gender_population_by_borough_year_filtered.drop(["YEAR","STUDENT"], axis=1)
gender_population_by_borough_year_filtered.T

Unnamed: 0,32
BOROUGH,Westminster
ALL,222681.0
MALE,0.509482
FEMALE,0.490518


## POPULATION

### DATA

In [169]:
pph_population_by_borough_year_df = population_by_borough_year_df[["YEAR", "BOROUGH", "DENSITY_PPH"]].copy()
pph_population_by_borough_year_df["DENSITY_PPH"] = pph_population_by_borough_year_df["DENSITY_PPH"].astype(float)


min_borough_density_pph = pph_population_by_borough_year_df["DENSITY_PPH"].quantile(.01)
max_borough_density_pph = pph_population_by_borough_year_df["DENSITY_PPH"].quantile(.99)


# min_borough_density_pph = pph_population_by_borough_year_df.min().values[2]
# max_borough_density_pph = pph_population_by_borough_year_df.max().values[2]

log.debug(f"min_borough_density_pph:{min_borough_density_pph}")
log.debug(f"max_borough_density_pph:{max_borough_density_pph}")

## Create the labels and bin
pph_population_by_borough_lbls = ["Scarcely Populated", "Slightly Populated", "Averagely Populated", "Highly Populated",  "Very Highly Populated"]
pph_population_by_borough_bins = np.arange((max_borough_density_pph-min_borough_density_pph)/len(pph_population_by_borough_lbls)-1, max_borough_density_pph-min_borough_density_pph, (max_borough_density_pph-min_borough_density_pph)/len(pph_population_by_borough_lbls)-1).tolist()
pph_population_by_borough_bins.append(np.inf)

# log.debug(pph_population_by_borough_bins)
# log.debug(f"{(max_borough_density_pph - min_borough_density_pph)/len(pph_population_by_borough_lbls)}")
# log.debug(f"pph_population_by_borough_lbls:{pph_population_by_borough_lbls}")
# log.debug(f"pph_population_by_borough_bins:{pph_population_by_borough_bins}")

pph_population_by_borough_year_df['POPULATION_STATUS'] = pd.cut(pph_population_by_borough_year_df["DENSITY_PPH"], bins=pph_population_by_borough_bins, labels=pph_population_by_borough_lbls)

2022-08-27 17:41:19,990 [DEBUG] __main__: min_borough_density_pph:44.87246023949719
2022-08-27 17:41:19,991 [DEBUG] __main__: max_borough_density_pph:215.0413051899063


### DISPLAY

In [187]:
## Sort on population density
pph_population_by_borough_year_df_sorted = pph_population_by_borough_year_df.sort_values(by=["DENSITY_PPH"], ascending=False).copy()
## Drop the Year
pph_population_by_borough_year_df_sorted = pph_population_by_borough_year_df_sorted.drop(["YEAR"], axis=1)

## Make a display version
borough_population_table = pph_population_by_borough_year_df_sorted.copy()
borough_population_table["DENSITY_PPH"] = round(borough_population_table["DENSITY_PPH"],2)
borough_population_table.columns = [["Borough", "Persons per Hectare", "Population Level"]]

## Create the shading
def borough_population_shading(row, cell_shading, match, hex_colour):
    
    cell_colour = "" 
    borough_name = row.iloc[0]
    
    if borough_name == match:
        cell_colour = hex_colour
    
    cell_shading.append([cell_colour, cell_colour , cell_colour])

borough_population_table_shading = []
borough_population_table.apply(lambda row:  borough_population_shading(row, borough_population_table_shading, borough, "#F9E79F"), axis=1)
# borough_population_table_shading


29    None
19    None
32    None
12    None
11    None
5     None
24    None
18    None
27    None
21    None
31    None
13    None
30    None
3     None
8     None
22    None
10    None
0     None
9     None
17    None
23    None
25    None
1     None
7     None
20    None
14    None
26    None
28    None
2     None
16    None
15    None
4     None
6     None
dtype: object

# CITY LEVEL

## GENDER

### DATA

In [188]:
## CITY LEVEL
population_by_city_year_df = population_by_borough_ward_year_df.groupby(["YEAR"], as_index=False)\
                                                                  .agg({"ALL":"sum", "MALE":"sum", "FEMALE":"sum", "STUDENT":"sum", "DENSITY_PPH":"mean" })

## PERCENTAGES MALE - FEMALE - STUDENT 
### make a copy so we can go back
pct_population_by_city_year_df = population_by_city_year_df[["YEAR", "ALL", "MALE", "FEMALE", "STUDENT"]].copy()
pct_population_by_city_year_df["MALE"] = population_by_city_year_df["MALE"]/population_by_city_year_df["ALL"].astype(float)
pct_population_by_city_year_df["FEMALE"] = population_by_city_year_df["FEMALE"]/population_by_city_year_df["ALL"].astype(float)
pct_population_by_city_year_df["STUDENT"] = population_by_city_year_df["STUDENT"]/population_by_city_year_df["ALL"].astype(float)

pct_population_by_city_year_df

Unnamed: 0,YEAR,ALL,MALE,FEMALE,STUDENT
0,2011.0,8355866.0,0.493662,0.506338,0.012217


### DISPLAY

In [189]:
gender_population_by_city_year_filtered = pct_population_by_city_year_df.copy()
gender_population_by_city_year_filtered = gender_population_by_city_year_filtered.drop(["YEAR","STUDENT"], axis=1)
gender_population_by_city_year_filtered.T

Unnamed: 0,0
ALL,8355866.0
MALE,0.4936621
FEMALE,0.5063379


### COMBINE GENDER PCT WARD - BOROUGH - CITY

In [190]:
def pct_fmt(val):
    val_fmt = "{:,.2f}%".format(round(val*100),2)
    return val_fmt

def gender_fmt(val):
    
    val_fmt = val.capitalize()
    return val_fmt

## WARD
gender_pct_ward_table = gender_population_by_borough_ward_year_filtered.T[3:]
gender_pct_ward_table.columns = [ward_name]
gender_pct_ward_table[ward_name] = gender_pct_ward_table[ward_name].apply(lambda val: pct_fmt(val))


## Borough
gender_pct_borough_table = gender_population_by_borough_year_filtered.T[2:]
gender_pct_borough_table.columns = [borough]
gender_pct_borough_table[borough] = gender_pct_borough_table[borough].apply(lambda val: pct_fmt(val))

## City
gender_pct_city_table = gender_population_by_city_year_filtered.T[1:]
gender_pct_city_table.columns = [city]
gender_pct_city_table[city] = gender_pct_city_table[city].apply(lambda val: pct_fmt(val))

gender_pct_city_table = pd.concat([gender_pct_ward_table, gender_pct_borough_table, gender_pct_city_table], axis=1)
gender_pct_city_table["Gender"] = gender_pct_city_table.index
gender_pct_city_table = gender_pct_city_table.reset_index(drop=True)

gender_pct_city_table = gender_pct_city_table[["Gender", ward_name, borough, city]].sort_values(["Gender"])
gender_pct_city_table["Gender"] = gender_pct_city_table["Gender"].apply(lambda val:gender_fmt(val))
gender_pct_city_table



Unnamed: 0,Gender,Bloomsbury,Westminster,London
1,Female,33.00%,49.00%,51.00%
0,Male,67.00%,51.00%,49.00%


In [193]:
def gender_pct_comparison_cell_shading(row, cell_shading):

  inc_shades =["", "#EAFAF1", "#D5F5E3", "#ABEBC6", "#82E0AA", "#58D68D"]
  dec_shades =["", "#F5EEF8", "#EBDEF0", "#D7BDE2", "#C39BD3", "#AF7AC5"]

  ## Borough to City Check
  ward_val    = float(row.iloc[1].split("%")[0].strip())
  borough_val = float(row.iloc[2].split("%")[0].strip())
  city_val    = float(row.iloc[3].split("%")[0].strip())

  diff_ward = city_val - ward_val
  if diff_ward < 0:
      ward_shades = inc_shades 
  else:
      ward_shades = dec_shades 

  diff_borough =   city_val - borough_val
  if diff_borough >= 0:
      borough_shades = inc_shades 
  else:
      borough_shades = dec_shades 

  ward_val_cell_col = ward_shades[0] if abs(diff_ward) <  5.0 else \
                      ward_shades[1] if abs(diff_ward) < 10.0 else \
                      ward_shades[2] if abs(diff_ward) < 15.0 else \
                      ward_shades[3] if abs(diff_ward) < 20.0 else \
                      ward_shades[4]

  borough_val_col   = borough_shades[0] if abs(diff_borough) <  5.0 else \
                      borough_shades[1] if abs(diff_borough) < 10.0 else \
                      borough_shades[2] if abs(diff_borough) < 15.0 else \
                      borough_shades[3] if abs(diff_borough) < 20.0 else \
                      borough_shades[4] 

  cell_shading.append(["", ward_val_cell_col, borough_val_col, ""])
    
gender_pct_city_table_shading = []
gender_pct_city_table.apply(lambda row: gender_pct_comparison_cell_shading(row, gender_pct_city_table_shading), axis=1)
gender_pct_city_table_shading

[['', '#D7BDE2', '', ''], ['', '#ABEBC6', '', '']]

# NARRATIVES

In [192]:
###
### POPULATON NARRATIVE 01 
###

population_narrative_01 = ""
population_in_not_in = "in" if ((population_year_to >= population_year_from_orig) &
                             (population_year_to <= population_year_to)) else "outside"

population_search_range = f"of {population_year_from_orig} to {population_year_to_orig}" if population_year_from_orig != population_year_to_orig else f"{population_year_to_orig}"
population_narrative_search_criters = f"Using the latest population data from {population_year_to} which is {population_in_not_in} your search range {population_search_range}"

ward_borough_population_level = pph_population_by_borough_ward_year_filtered["POPULATION_STATUS"].values[0]
population_narrative_01 = f"{population_narrative_search_criters}. The table {{}} shows that {ward_name} in {borough} is {ward_borough_population_level} compared to other {city} wards."

borough_population_level = pph_population_by_borough_year_df_sorted.loc[pph_population_by_borough_year_df_sorted["BOROUGH"] == borough]["POPULATION_STATUS"].values[0]
borough_to_ward_population_level = "are the same" if ward_borough_population_level == borough_population_level else "are at different levels"
population_narrative_02 = f"The table {{}} shows that {borough} is {borough_population_level} compared to other {city} boroughs and where it ranks in population per hectars." + \
                          f" The population level at ward and borough {borough_to_ward_population_level}."


log.debug(f"\n{population_narrative_01}")
log.debug("")
log.debug(f"\n{population_narrative_02}")

2022-08-27 18:12:04,168 [DEBUG] __main__: 
Using the latest population data from 2011 which is outside your search range of 2012 to 2021. The table {} shows that Bloomsbury in Westminster is Slightly Populated compared to other London wards.
2022-08-27 18:12:04,169 [DEBUG] __main__: 
2022-08-27 18:12:04,170 [DEBUG] __main__: 
The table {} shows that Westminster is Very Highly Populated compared to other London boroughs and where it ranks in population per hectars. The population level at ward and borough are at different levels.


# REPORT PAGE

# REPORT GENERATION

## ARTEFACTS

In [223]:
 ### Set up
%load_ext autoreload

%autoreload 3
from data.daos import dao_facade_local as dao_fac
from lib import masters_data_analytics_lib as mlib
from managers.sections import sd_report_section_02_Population as report_section_education

session_id = "SECTION_TEST_02_POPULATION"
report_context = {}
search_term = {
    "city"      : city
  , "borough"   : borough
  , "ward_name" : ward_name
  , "year_to"   : year_to
  , "year_from" : year_from
}

report_section_education.generate_report_section(session_id     = session_id
                                               , search_term    = search_term
                                               , report_context = report_context
                                               , properties     = properties 
                                               , dao_fac        = dao_fac)  

log.debug(report_context)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
2022-08-27 19:00:54,445 [INFO] lib.db_tools: DATABASE CONNECTIVITY
2022-08-27 19:00:54,446 [DEBUG] managers.sections.sd_report_section_02_Population: population orig_year_to    :2021
2022-08-27 19:00:54,447 [DEBUG] managers.sections.sd_report_section_02_Population: population search_year_to  :2011
2022-08-27 19:00:54,448 [DEBUG] managers.sections.sd_report_section_02_Population: {'year_from': 2011, 'year_to': 2011, 'borough': 'Westminster', 'ward_name': 'Bloomsbury'}
2022-08-27 19:00:54,458 [DEBUG] managers.sections.sd_report_section_02_Population: min_borough_ward_density_pph:7.766666666666667
2022-08-27 19:00:54,459 [DEBUG] managers.sections.sd_report_section_02_Population: max_borough_ward_density_pph:293.08796992481206
2022-08-27 19:00:54,460 [DEBUG] managers.sections.sd_report_section_02_Population: [7.766666666666667, 65.83092731829575, 123.89518796992483, 181.95944862155395, 240.0237092731830

## REPORT

In [224]:
%load_ext autoreload

%autoreload 3

from managers import sd_report_manager_new as report_man

report_context_new = {"template_processor_file_name":"./reports/processors/sd_test_generation_02_population.json",
                      "report_option":1} 

report_context.update(report_context_new)

log.debug("Started")
generated_report = report_man.generate_report(session_id
                                            , report_context = report_context
                                            , properties = properties)
log.debug("Finished")

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
2022-08-27 19:00:56,805 [DEBUG] managers.sd_report_manager_new: Started
2022-08-27 19:00:56,817 [DEBUG] managers.sd_report_manager_new: text_alignment:justify
2022-08-27 19:00:56,879 [DEBUG] managers.sd_report_manager_new: text_alignment:justify
2022-08-27 19:00:57,295 [DEBUG] managers.sd_report_manager_new: text_alignment:justify
2022-08-27 19:00:57,357 [DEBUG] managers.sd_report_manager_new: Finished
