In [1]:
# Importing libraries and basic settings for ease of use
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import altair as alt

%config InlineBackend.figure_format = "retina"
sns.set_context("talk")

import warnings
warnings.filterwarnings("ignore") # I suppressed the warnings

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_mode_interactivity = "all"

#from pandas import read_csv
#from pandas import datetime
#from matplotlib import pyplot
from pandas.plotting import autocorrelation_plot

from statsmodels.graphics.tsaplots import plot_acf, plot_pacf

from statsmodels.tsa.stattools import adfuller
from statsmodels.tsa.stattools import kpss
# pip install pmdarima
import pmdarima as pm
from statsmodels.tsa.arima_model import ARIMA


In [2]:
# Read csv files into a pandas dataframe
#df_BDM = pd.read_csv("D:/BDM_Ireland1960_2021.csv")
#df_B   = pd.read_csv("D:/BirthsByCounty1985_2020.csv",sep="\t")
#df_D   = pd.read_csv("D:/deaths-region-2007-2020.csv")

#df_P   = pd.read_csv("D:/population_est_1950-2021.csv") # Commented out to remove bug

In [3]:
# Correction to make the program run
df_P   = pd.read_csv("./../assets/2021-12Dec-11-population-estimates-1950-2021-pea01.csv")

In [4]:
def missing_values(df):
    if (df.isnull().values.any()): print (df.isnull().sum()) # how many missing values?
    else: print("no missing values")

# Augmented Dickey-Fuller Test - ADF Test
def adf_test(series):
    result = adfuller(series, autolag='AIC')
    print("Augmented Dickey-Fuller Test (ADF)\n==================================")
    print(f'T={series.count()}')
    print(f'- ADF the test statistic:     {result[0]}')
    print(f'- MacKinnons approx p-value:  {result[1]}')
    print(f'- number of lags used:        {result[2]}')

    print('\nCritial Values:\n---------------')
    for key, value in result[4].items():
        if (key=="10%"): print(f'{key}   {value}')
        else: print(f' {key}   {value}')
    print("\n")
# Kwiatkowski–Phillips–Schmidt–Shin Test - KPSS Test 
def kpss_test(series, **kw):
    statistic, p_value, n_lags, critical_values = kpss(series, **kw)
    print("Kwiatkowski–Phillips–Schmidt–Shin Test (KPSS)\n============================================")    
    print(f'T={series.count()}')
    print(f'- KPSS the test statistic:   {statistic}')
    print(f'- MacKinnons approx p-value: {p_value}')
    print(f'- number of lags used:       {n_lags}')

    print('\nCritical Values:\n---------------')
    for key, value in critical_values.items():
        if (key=="2.5%"): print(f'{key}   {value}')
        elif (key=="10%"): print(f' {key}   {value}')
        else: print(f'  {key}   {value}')
    print("\n")

In [5]:
# ==================================================

In [6]:
df_P.info # Structure and dimensions of the dataset

<bound method DataFrame.info of                                     Statistic  Year          Age Group  \
0     Population Estimates (Persons in April)  1950       Under 1 year   
1     Population Estimates (Persons in April)  1950       Under 1 year   
2     Population Estimates (Persons in April)  1950       Under 1 year   
3     Population Estimates (Persons in April)  1950        0 - 4 years   
4     Population Estimates (Persons in April)  1950        0 - 4 years   
...                                       ...   ...                ...   
5827  Population Estimates (Persons in April)  2021  85 years and over   
5828  Population Estimates (Persons in April)  2021  85 years and over   
5829  Population Estimates (Persons in April)  2021           All ages   
5830  Population Estimates (Persons in April)  2021           All ages   
5831  Population Estimates (Persons in April)  2021           All ages   

             Sex      UNIT   VALUE  
0     Both sexes  Thousand    61.1  
1    

In [7]:
df_P.head() # first view of the dataset

Unnamed: 0,Statistic,Year,Age Group,Sex,UNIT,VALUE
0,Population Estimates (Persons in April),1950,Under 1 year,Both sexes,Thousand,61.1
1,Population Estimates (Persons in April),1950,Under 1 year,Male,Thousand,31.4
2,Population Estimates (Persons in April),1950,Under 1 year,Female,Thousand,29.7
3,Population Estimates (Persons in April),1950,0 - 4 years,Both sexes,Thousand,
4,Population Estimates (Persons in April),1950,0 - 4 years,Male,Thousand,


In [8]:
# Dimension reduction:::
# Seems, Columns Statistic & UNIT cannot be used because they contain only one value.
# Examining the values in the Statistic column
pd.Series(df_P["Statistic"].unique()).sort_values()

0    Population Estimates (Persons in April)
dtype: object

In [9]:
# Examining the values in the UNIT column
pd.Series(df_P["UNIT"].unique()).sort_values()

0    Thousand
dtype: object

In [10]:
# Only one value - cannot be used for analysis
# Deleting the Statistic column 
# Deleting the UNIT column 
df_P = df_P.drop(["Statistic"], axis='columns')
df_P = df_P.drop(["UNIT"], axis='columns')

In [11]:
# rename the columns
new_titles = {"Year":"Year", "Age Group":"Age", "Sex":"Sex", "VALUE":"population"}
df_P = df_P.rename(columns=new_titles)
df_P.info  # Structure and dimensions of the dataset

<bound method DataFrame.info of       Year                Age         Sex  population
0     1950       Under 1 year  Both sexes        61.1
1     1950       Under 1 year        Male        31.4
2     1950       Under 1 year      Female        29.7
3     1950        0 - 4 years  Both sexes         NaN
4     1950        0 - 4 years        Male         NaN
...    ...                ...         ...         ...
5827  2021  85 years and over        Male        31.9
5828  2021  85 years and over      Female        52.7
5829  2021           All ages  Both sexes      5011.5
5830  2021           All ages        Male      2481.1
5831  2021           All ages      Female      2530.3

[5832 rows x 4 columns]>

In [12]:
# any missing values?
missing_values(df_P)

Year            0
Age             0
Sex             0
population    114
dtype: int64


In [13]:
df_P.head()

Unnamed: 0,Year,Age,Sex,population
0,1950,Under 1 year,Both sexes,61.1
1,1950,Under 1 year,Male,31.4
2,1950,Under 1 year,Female,29.7
3,1950,0 - 4 years,Both sexes,
4,1950,0 - 4 years,Male,


In [14]:
df_PO = df_P.drop(df_P[ (df_P["Age"] == "0 - 4 years") |
                        (df_P["Age"] == "0 - 14 years") |
                        (df_P["Age"] == "15 - 24 years") |
                        (df_P["Age"] == "15 years and over") |
                        (df_P["Age"] == "25 - 44 years") |
                        (df_P["Age"] == "45 - 64 years") |
                        (df_P["Age"] == "65 years and over") |
                        (df_P["Age"] == "All ages") ].index)
df_PO.info

<bound method DataFrame.info of       Year                Age         Sex  population
0     1950       Under 1 year  Both sexes        61.1
1     1950       Under 1 year        Male        31.4
2     1950       Under 1 year      Female        29.7
9     1950        1 - 4 years  Both sexes       249.1
10    1950        1 - 4 years        Male       127.5
...    ...                ...         ...         ...
5824  2021      80 - 84 years        Male        41.0
5825  2021      80 - 84 years      Female        50.5
5826  2021  85 years and over  Both sexes        84.6
5827  2021  85 years and over        Male        31.9
5828  2021  85 years and over      Female        52.7

[4104 rows x 4 columns]>

In [15]:
df_PO = df_PO.reset_index(drop=True)
df_PO.info

<bound method DataFrame.info of       Year                Age         Sex  population
0     1950       Under 1 year  Both sexes        61.1
1     1950       Under 1 year        Male        31.4
2     1950       Under 1 year      Female        29.7
3     1950        1 - 4 years  Both sexes       249.1
4     1950        1 - 4 years        Male       127.5
...    ...                ...         ...         ...
4099  2021      80 - 84 years        Male        41.0
4100  2021      80 - 84 years      Female        50.5
4101  2021  85 years and over  Both sexes        84.6
4102  2021  85 years and over        Male        31.9
4103  2021  85 years and over      Female        52.7

[4104 rows x 4 columns]>

In [16]:
missing_values(df_PO)

no missing values


In [17]:
df_PP = df_PO

In [18]:
pd.Series(df_PP["population"].where(df_PP["Sex"] == "Both sexes").unique()).sort_values()

19       13.3
56       14.0
75       14.5
93       15.1
111      15.7
        ...  
881     399.8
911     400.6
1076    402.1
898     408.3
1         NaN
Length: 1083, dtype: float64

In [19]:
# Counting the "Both sexes" rows
df_PP.where(df_PP["Sex"] == "Both sexes").count(axis=0)

Year          1368
Age           1368
Sex           1368
population    1368
dtype: int64

In [20]:
# "Both sexes" - not need for the population pyramids
# Deleting the "Both sexes" rows

In [21]:
df_PP = df_PP.drop(df_PP[(df_PP["Sex"] == "Both sexes")].index)
df_PP.info

<bound method DataFrame.info of       Year                Age     Sex  population
1     1950       Under 1 year    Male        31.4
2     1950       Under 1 year  Female        29.7
4     1950        1 - 4 years    Male       127.5
5     1950        1 - 4 years  Female       121.6
7     1950        5 - 9 years    Male       142.3
...    ...                ...     ...         ...
4097  2021      75 - 79 years  Female        74.5
4099  2021      80 - 84 years    Male        41.0
4100  2021      80 - 84 years  Female        50.5
4102  2021  85 years and over    Male        31.9
4103  2021  85 years and over  Female        52.7

[2736 rows x 4 columns]>

In [22]:
# Counting the rows
df_PP.where(df_PP["Sex"] == "Both sexes").count(axis=0)

Year          0
Age           0
Sex           0
population    0
dtype: int64

In [23]:
df_PP = df_PP.reset_index(drop=True)
df_PP.info

<bound method DataFrame.info of       Year                Age     Sex  population
0     1950       Under 1 year    Male        31.4
1     1950       Under 1 year  Female        29.7
2     1950        1 - 4 years    Male       127.5
3     1950        1 - 4 years  Female       121.6
4     1950        5 - 9 years    Male       142.3
...    ...                ...     ...         ...
2731  2021      75 - 79 years  Female        74.5
2732  2021      80 - 84 years    Male        41.0
2733  2021      80 - 84 years  Female        50.5
2734  2021  85 years and over    Male        31.9
2735  2021  85 years and over  Female        52.7

[2736 rows x 4 columns]>

In [24]:
df_PP.where(df_PP["Age"] == "Under 1 year").count(axis=0)

Year          144
Age           144
Sex           144
population    144
dtype: int64

In [25]:
# Examining the values in the Age Group column
pd.Series(df_PP["Age"].unique()) #.sort_values()
#df_PP["Age Group"].unique()

0          Under 1 year
1           1 - 4 years
2           5 - 9 years
3         10 - 14 years
4         15 - 19 years
5         20 - 24 years
6         25 - 29 years
7         30 - 34 years
8         35 - 39 years
9         40 - 44 years
10        45 - 49 years
11        50 - 54 years
12        55 - 59 years
13        60 - 64 years
14        65 - 69 years
15        70 - 74 years
16        75 - 79 years
17        80 - 84 years
18    85 years and over
dtype: object

In [26]:
df_PP=df_PP.replace("Under 1 year"," 0-")
df_PP=df_PP.replace("1 - 4 years"," 1-")
df_PP=df_PP.replace("5 - 9 years"," 5-")
df_PP=df_PP.replace("10 - 14 years","10-")
df_PP=df_PP.replace("15 - 19 years","15-")
df_PP=df_PP.replace("20 - 24 years","20-")
df_PP=df_PP.replace("25 - 29 years","25-")
df_PP=df_PP.replace("30 - 34 years","30-")
df_PP=df_PP.replace("35 - 39 years","35-")
df_PP=df_PP.replace("40 - 44 years","40-")
df_PP=df_PP.replace("45 - 49 years","45-")
df_PP=df_PP.replace("50 - 54 years","50-")
df_PP=df_PP.replace("55 - 59 years","55-")
df_PP=df_PP.replace("60 - 64 years","60-")
df_PP=df_PP.replace("65 - 69 years","65-")
df_PP=df_PP.replace("70 - 74 years","70-")
df_PP=df_PP.replace("75 - 79 years","75-")
df_PP=df_PP.replace("80 - 84 years","80-")
df_PP=df_PP.replace("85 years and over","85-")
#df_PP.where(df_PP["Age Group"] == "Under 1 year")["Age Group"]="0"
#df_PP = df_PP.drop(df_PP[(df_PP["Sex"] == "Both sexes")].index)

In [27]:
df_PP = df_PP.reset_index(drop=True)
df_PP.info

<bound method DataFrame.info of       Year  Age     Sex  population
0     1950   0-    Male        31.4
1     1950   0-  Female        29.7
2     1950   1-    Male       127.5
3     1950   1-  Female       121.6
4     1950   5-    Male       142.3
...    ...  ...     ...         ...
2731  2021  75-  Female        74.5
2732  2021  80-    Male        41.0
2733  2021  80-  Female        50.5
2734  2021  85-    Male        31.9
2735  2021  85-  Female        52.7

[2736 rows x 4 columns]>

In [28]:
df_PP.head()

Unnamed: 0,Year,Age,Sex,population
0,1950,0-,Male,31.4
1,1950,0-,Female,29.7
2,1950,1-,Male,127.5
3,1950,1-,Female,121.6
4,1950,5-,Male,142.3


In [29]:
# ========================================================================================
# interactive population pyramid code:

In [30]:
year_selector = alt.binding_range(min=1950, max=2021, step=1, name= "year selector")

In [31]:
select_year = alt.selection_single(name="population",fields=["Year"],
                                   bind=year_selector, init={"Year":2020} )

In [32]:
trunk = alt.Chart(df_PP, title="Age").add_selection(
    select_year).transform_filter(
    select_year).transform_calculate(
    gender=alt.datum.Sex).properties(width=300)

In [33]:
tree_color = alt.Scale(domain=['Male', 'Female'], range=['blue', 'violet'])

In [34]:
male_tree = trunk.transform_filter(alt.datum.gender == 'Male').encode(
    y=alt.Y('Age:O', axis=None, sort=alt.SortOrder('descending')),
    x=alt.X('sum(population):Q', scale=alt.Scale(domain=[0,300.0]),
    title='male population (1000)', sort=alt.SortOrder('descending')),
    color=alt.Color('gender:N', scale=tree_color)
    ).mark_bar().properties(title='Male')

In [35]:
y_trunk = trunk.encode(
    y=alt.Y('Age:O', axis=None, sort=alt.SortOrder('descending')),
        text=alt.Text('Age:O')).mark_text().properties(width=20)

In [36]:
female_tree = trunk.transform_filter(alt.datum.gender == 'Female').encode(
    y=alt.Y('Age:O', axis=None, sort=alt.SortOrder('descending')),
    x=alt.X('sum(population):Q', scale=alt.Scale(domain=[0,300.0]),
    title='female population (1000)'),
    color=alt.Color('gender:N', scale=tree_color)
    ).mark_bar().properties(title='Female')

In [37]:
alt.concat(male_tree, y_trunk, female_tree, spacing=2)