In [30]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import jupyterthemes as jtplot
from geopy.geocoders import Nominatim
import re

%config InlineBackend.figure_format = 'retina'                                    # so you can see plots in HD :) 
#jtplot.style(theme='grade3', context='notebook', ticks=True, grid=True)

In [2]:
df_2020 = pd.read_csv(r'data/raw/huie_surveyData_2020.csv')
df_2021 = pd.read_csv(r'data/raw/huie_surveyData_2021.csv')

print(df_2021.keys())

Index(['row hash', 'your role recoded', 'orgtype: charitable',
       'orgtype: inc soc', 'orgtype: voluntary', 'orgtype: māori',
       'orgtype: faith', 'orgtype: philanthropic', 'orgtype:other',
       'paid staff',
       ...
       'sector changes: Access to information and data in one place',
       'sector changes: Funding to cover salaries and operational costs',
       'sector changes: Other (please specify)', 'important learning',
       'comments', 'share', 'again: Yes, I am willing to be surveyed again',
       'again: Yes, I am willing to be interviewed',
       'again: Yes, I would like to see the survey results',
       'again: No, I do not wish to be contacted'],
      dtype='object', length=133)


In [3]:
print(len(df_2020))
print(len(df_2021))

REGIONS = np.array(
            ["Northland/Te Tai Tokerau"
            "Auckland/Tāmakimakaurau", 
            "Waikato",
            "Bay of Plenty/Te MoanaaToi",
            "Gisborne/Te Tai Rāwhiti",
            "Hawke's Bay/Te Mataua Māui",
            "Taranaki",
            "Manawatū",
            "Whanganui",
            "Wellington/Te WhanganuiaTara",
            "Tasman/Te TaioAorere",
            "Nelson/Whakatū",
            "Marlborough/Te Tauihuotewaka",
            "West Coast /Te Tai Poutini",
            "Canterbury/Waitaha",
            "Otago/Ōtākou",
            "Southland/Murihiku",
            "All regions of the North Island/Te IkaaMāui",
            "All regions of the South Island/Te Wai Pounamu",
            "All regions of Aotearoa/New Zealand"]
)



print(max(REGIONS, key=lambda s: (len(s), s)))

n_spaces = [a.count(' ') for a in REGIONS]
print(n_spaces)

362
961
Northland/Te Tai TokerauAuckland/Tāmakimakaurau
[2, 0, 3, 2, 3, 0, 0, 0, 1, 1, 0, 1, 4, 0, 0, 0, 6, 7, 4]


In [4]:
text = 'The COVID-19 pandemic has had multiple structural/operational/financial impacts upon our tangata whenua, community and voluntary organisations in Aotearoa/New Zealand. That considered, the pandemic may have also had impacts on the wellbeing and emotional/morale status and stress levels of people in these organisations. Can you tell us a short story about your experiences?'

if False:
    print(df_2021[text].values)

In [75]:
# define function to aggregate data based on the columns
def aggregate_regions(df_in, reg_cols):
    '''
    Takes raw df as input and outputs df with aggregated 
    regions for each entry in new location column
    '''
    df_out = df_in.copy()
    
    df_out = df_out.replace(np.nan, '', regex=True)                        # replace nans for easier agreggation
    df_out['location'] = df_out[reg_cols].T.agg(','.join)             # aggregation
    df_out = df_out.replace(',,', '', regex=True)                        # gets rid of double commas
    
    return df_out

 
# define function to get rid of first and last comma
def remove_comma(df_in, which):
    '''
    Inputs df from aggregate_regions output and removes the 
    '''
    df_out = df_in.copy()

    # track what comma to remove (index) based on which input
    ind = 0 if which == 'first'  else -1
    # remove comma with lambda function
    with_comma = [(lambda s: s[ind] == ',')(s) for s in df_in['location']]
    if which == 'first': 
        df_out['location'][with_comma] = df_out['location'][with_comma].map(lambda s: str(s)[1:])
    elif which == 'last':
        df_out['location'][with_comma] = df_out['location'][with_comma].map(lambda s: str(s)[:-1])
        
    return df_out

# define function to ensure that there's no entry with an empty location
def gis_quality_check(df_in):
    '''
    Checks for empty entries in the location column for input df_in
    '''
    
    print(f'='*60)
    print(f'Carrying out data quality check:')
    print(f'-'*35)
    
    is_loc_empty =  [(lambda s: s == '')(s) for s in df_in['location']]
    n_empty = len(df_in['location'][is_loc_empty])
                
    if n_empty == 0:
        print(f'The data has no empty entries in the location column')
        print(f'='*60)
    else:
        print(f'The data has {n_empty} empty entries in the location column')
        print(f'='*60)

# call region aggregator
region_cols = ['In which region(s) does your organisation operate (please choose all that apply)'] + [f'Unnamed: {j}' for j in range(14,32+1)]
df_2020 = aggregate_regions(df_2020, region_cols)

# call function to remove comma 
df_2020 = remove_comma(df_2020, which='first')
df_2020 = remove_comma(df_2020, which='last')

# call quality data function to check for empty entries
gis_quality_check(df_2020)
    
print(df_2020.loc[5, 'location'])
print(df_2020.loc[2, 'location'])

#[is_comma_first(val) for val in df_2020['location'].values]

Carrying out data quality check:
-----------------------------------
The data has no empty entries in the location column
Auckland/Tāmaki-makau-rau,Waikato,Manawatū-Whanganui,Wellington/Te Whanga-nui-a-TaraNelson/Whakatū,Canterbury/Waitaha,Otago/Ōtākou
Southland/Murihiku


## draw map of Aotearoa

In [None]:
import geopandas as gpd

map_df = gpd.read_file("data/gis/regional-council-2022-clipped-generalised.shx")
map_ref = gpd.read_file("data/gis/regional_council_2022_clipped_csv.csv")

map_ref