In [1]:
import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels as sm
import geopandas as gpd


In [2]:
#load data
child_data_raw = pd.read_csv("../data/raw/child_w5_anon_v1.0.0-stata11.csv", low_memory=False)
column_names = child_data_raw.columns
print(column_names)

child_data_raw.shape #should be 14994 households?

Index(['Unnamed: 0', 'w5_c_outcome', 'w5_hhid', 'pid', 'w5_c_intrv_c',
       'w5_c_intrv_d', 'w5_c_intrv_m', 'w5_c_intrv_y', 'w5_c_refexpl',
       'w5_c_refexpl_o',
       ...
       'w5_c_intlng9', 'w5_c_intlng10', 'w5_c_intlng11', 'w5_c_intresp',
       'w5_c_intrespact', 'w5_c_intresphear', 'w5_c_intrespque',
       'w5_c_intresppid1', 'w5_c_intresppid2', 'w5_c_intresppid3'],
      dtype='object', length=323)


(14993, 323)

In [34]:
#variables of interest

#w5_hhid	Household identifier
#pid = personal identifier
#w5_c_gen	b1 - Gender	
#w5_c_popgrp	b2 - Population group
#w5_c_lng	b3 - Home language

# w5_c_lvevoth	b6 - Child ever lived in another suburb/town/village	
# w5_c_brnprov	b7_3 - Province respondent born in	
# w5_c_brndc_2001	b7_5 - District Council of Birth (Coded using census 2001)	
# w5_c_brndc_2011	b7_7 - District Council of Birth (Coded using census 2011)	

# w5_c_hlser	d9 - The child has/had any illnesses or disabilities?	
# w5_c_hl1	d10_1 - What is the main serious illness or disability? 1st answer	
# w5_c_hl2	d10_2 - What is the main serious illness or disability? 2nd answer	
# w5_c_hl3	d10_3 - What is the main serious illness or disability? 3rd answer	
# w5_c_hl4	d10_4 - What is the main serious illness or disability? 4th answer	
# w5_c_hl_o	d10_11_o - Other: What is main illness or disability?
#w5_c_hlchckup	d11 - Number of times health professional was consulted in last 12 months

# w5_c_edatt	c3 - Has the child ever attended school?	
# w5_c_ednoenrolexp	c4 - What is the main reason that this child has never been enrolled in school?	
# w5_c_ednoenrolexp_o	c4_o - Other: Reason child never enrolled in school	


In [3]:
#create smaller dataset of relevant data and write to file
dev_data = child_data_raw[['w5_hhid', 'pid', 'w5_c_gen', 'w5_c_popgrp', 'w5_c_lng', 'w5_c_lvevoth', 'w5_c_brnprov', 
                              'w5_c_brndc_2001', 'w5_c_brndc_2011', 'w5_c_hlser', 'w5_c_hl1', 'w5_c_hl2', 'w5_c_hl3', 
                             'w5_c_hl4', 'w5_c_hl_o', 'w5_c_hlchckup','w5_c_edatt', 'w5_c_ednoenrolexp', 'w5_c_ednoenrolexp_o']]

assert dev_data.shape==(14993, 19)

dev_data = dev_data.rename(columns={'w5_c_gen': 'gender', 
                         'w5_c_popgrp': 'population_grp', 
                         'w5_c_lng': 'home_language', 
                         'w5_c_lvevoth': 'child_lived_elsewhere', 
                         'w5_c_brnprov': 'province_birth',
                         'w5_c_brndc_2001': 'district_birth_2001', 
                         'w5_c_brndc_2011': 'district_birth_2011', 
                         'w5_c_hlser': 'illness_disability', 
                         'w5_c_hl1': 'main_illness_1', 
                         'w5_c_hl2': 'main_illness_2', 
                         'w5_c_hl3': 'main_illness_3', 
                         'w5_c_hl4': 'main_illness_4', 
                         'w5_c_hl_o': 'main_illness_other', 
                         'w5_c_hlchckup': 'number_health_checkups_last_year',
                         'w5_c_edatt': 'ever_attended_school', 
                         'w5_c_ednoenrolexp': 'reason_nonenrollment_school', 
                         'w5_c_ednoenrolexp_o': 'reason_nonenrollment_school_other'})

path = "../data/processed"

if not os.path.exists(path):
    try:
        os.makedirs(path)
    except OSError:
        print (f"Creation of the directory {path} failed")

dev_data.to_csv("%s/w5_child_devdata.csv" % path, index=False)

In [4]:
dev_data.head()

Unnamed: 0,w5_hhid,pid,gender,population_grp,home_language,child_lived_elsewhere,province_birth,district_birth_2001,district_birth_2011,illness_disability,main_illness_1,main_illness_2,main_illness_3,main_illness_4,main_illness_other,number_health_checkups_last_year,ever_attended_school,reason_nonenrollment_school,reason_nonenrollment_school_other
0,500002,401746,Male,African,Sepedi,No,,,,No,,,,,,More than once,,,
1,500002,630858,Female,African,Sepedi,No,,,,No,,,,,,More than once,,,
2,500003,763879,Female,African,Setswana,No,,,,No,,,,,,Once,,,
3,500003,782520,Male,African,Setswana,No,,,,No,,,,,,More than once,,,
4,500006,624989,Male,African,IsiTsonga,No,,,,No,,,,,,More than once,,,


## Breakdown of children with disabilities/illnesses

In [97]:
print(dev_data.illness_disability.value_counts())

percent_disability = round(sum(dev_data['illness_disability']=='Yes')/len(dev_data),2)
print(f"\n children with any known disability/illness: {percent_disability} %")

No            13059
Yes             492
Refused          37
Don't know       24
Missing           7
Name: illness_disability, dtype: int64

 children with any known disability/illness: 0.03 %


### Number of children with mental problems and/or epilepsy

In [123]:
disability_cols = [col for col in dev_data.columns if 'main_illness' in col] 
disabilities_data = dev_data[disability_cols]

# for col in disability_cols:
#     print(f"{col}:\n {disabilities_data[col].value_counts()}\n")

#get list of mental and developmental problems
#assume 'palse' could be a number of different things, so not included in dev disorder list  
dev_disorder_list = ["Pdd Nos", 
                     "Down-Syndrome", 
                     "He Was Born With A Down-Syndrome And He Has Heart Problems As Well.", 
                     "Cerebral Palsy, Adhd, Autism."]    
dev_disorder_list.append('Mental problem')

#how many people report mental illneses?
mental_ilness_sum = disabilities_data.isin(dev_disorder_list).sum()
print(f"children with known mental problems:\n  {mental_ilness_sum}\n")

#how many children have epilepsy?
epilepsy_sum = disabilities_data.isin(['Epilepsy/fits']).sum()
print(f"children with epilepsy:\n {epilepsy_sum}\n")


#how many have primary mental problem and secondary epilepsy?
mental_problem_main = disabilities_data[
    disabilities_data.main_illness_1.isin(dev_disorder_list) & 
    disabilities_data.main_illness_2.isin(['Epilepsy/fits'])
]

print(f"how many have primary mental problem and secondary epilepsy?: {len(mental_problem_main)}")

#how many have primary epilepsy with secondary mental health problem?
mental_problem_main = disabilities_data[
    disabilities_data.main_illness_2.isin(dev_disorder_list) & 
    disabilities_data.main_illness_1.isin(['Epilepsy/fits'])
]

print(f"how many have primary epilepsy and secondary mental problem?: {len(mental_problem_main)}")

children with known mental problems:
  main_illness_1        33
main_illness_2        20
main_illness_3         8
main_illness_4         0
main_illness_other     4
dtype: int64

children with epilepsy:
 main_illness_1        22
main_illness_2        11
main_illness_3         2
main_illness_4         3
main_illness_other     0
dtype: int64

how many have primary mental problem and secondary epilepsy?: 5
how many have primary epilepsy and secondary mental problem?: 0


In [173]:
# TODO: get subset with only children with mental health problems (incl epilepsy) in any category

0    14960
1       33
Name: mental_problem, dtype: int64

## What is the breakdown by province?

In [20]:
def percent_missing(column):
    return round(dev_data[column].isna().sum()/len(dev_data)*100, 2)
    
print(f"missing by province: {percent_missing('province_birth')} %")
print(f"missing by district: {percent_missing('district_birth_2011')} %")


missing by province: 85.69 %
missing by district: 85.82 %


Most data for province and district are missing - will have to get secure data

## What is school access like for children with mental health problems?

In [None]:
# TODO: tabulate number of children who have never attended school for children with mental health problems and those without mental health problems (exclude children who are too young to go to school)
# Will eventually tabulate this by province and district too

## Is there a difference between population groups?


In [4]:
#TODO: what is the breakdown of children with mental disabilites by population group and home language? 
#TODO: are these numbers representative of size of population? to do this, find size of population groups and language groups in SA on statssa.gov.za

#TODO: compare count of children with and without mental disabilities by gender 

## How do families with mental disabilities access the healthcare system?


In [3]:
#TODO: compare number_health_checkups_last_year between children with and without mental disabilities
#should eventually split this by province    