# Contents

- [EDA and Data Cleaning](#EDA-and-Data-Cleaning)

# Load libraries

In [17]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd

In [89]:
# load csv

adult_mortality = pd.read_csv('../data/Adult mortality.csv')
maternal_mortality = pd.read_csv('../data/Maternal mortality.csv')
num_death = pd.read_csv('../data/Number of deaths (thousands).csv')
prob_dying = pd.read_csv('../data/Probability of dying per 1000 live births.csv')

# EDA & Data Cleaning

## Check contents of dataframes 


In [19]:
adult_mortality.head()

Unnamed: 0.1,Unnamed: 0,Unnamed: 1,Adult mortality rate (probability of dying between 15 and 60 years per 1000 population),Adult mortality rate (probability of dying between 15 and 60 years per 1000 population).1,Adult mortality rate (probability of dying between 15 and 60 years per 1000 population).2
0,Country,Year,Both sexes,Male,Female
1,Afghanistan,2016,245,272,216
2,Afghanistan,2015,233,254,210
3,Afghanistan,2014,234,254,213
4,Afghanistan,2013,235,254,215


In [20]:
adult_mortality.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3112 entries, 0 to 3111
Data columns (total 5 columns):
 #   Column                                                                                     Non-Null Count  Dtype 
---  ------                                                                                     --------------  ----- 
 0   Unnamed: 0                                                                                 3112 non-null   object
 1   Unnamed: 1                                                                                 3112 non-null   object
 2   Adult mortality rate (probability of dying between 15 and 60 years per 1000 population)    3112 non-null   object
 3   Adult mortality rate (probability of dying between 15 and 60 years per 1000 population).1  3112 non-null   object
 4   Adult mortality rate (probability of dying between 15 and 60 years per 1000 population).2  3112 non-null   object
dtypes: object(5)
memory usage: 121.7+ KB


Columns # 1,2,3,4 contains object likely because of the Row 0 as it contains gender as a `sub-header`


We can see that the column of interest is `Adult mortality rate (probability of dying between 15 and 60 years per 1000 population)` split into 3 categories based on gender. 

The cells below will show the flow of how a function is created to generate compressed dataframe, with additional column called 'Gender' 

In [118]:
col_of_interest = adult_mortality.columns[2]
col_of_interest

'Adult mortality rate (probability of dying between 15 and 60 years per 1000 population)'

In [124]:
gender_category = adult_mortality.iloc[0,2:].tolist()
gender_category

['Both sexes', 'Male', 'Female']

In [125]:
adult_mortality_compressed = adult_mortality.copy()

adult_mortality_compressed.columns = adult_mortality.iloc[0,:]

adult_mortality_compressed.drop(0, inplace = True )

In [126]:
adult_mortality_compressed.reset_index(inplace = True, drop = True)

In [127]:
adult_mortality_compressed

Unnamed: 0,Country,Year,Both sexes,Male,Female
0,Afghanistan,2016,245,272,216
1,Afghanistan,2015,233,254,210
2,Afghanistan,2014,234,254,213
3,Afghanistan,2013,235,254,215
4,Afghanistan,2012,242,262,221
...,...,...,...,...,...
3106,Zimbabwe,2004,694,725,670
3107,Zimbabwe,2003,697,728,671
3108,Zimbabwe,2002,695,728,667
3109,Zimbabwe,2001,688,723,656


In [128]:
def rename_columns(df, col_of_interest):

    col_list = ['Country', 'Year',col_of_interest, 'Sex']
    
    df.columns = col_list

    return df

In [129]:
def create_df(ori_df, which_sex):

    df = ori_df

    df['Sex'] = which_sex
    
    return df

In [130]:
create_df(adult_mortality_compressed[['Country','Year', 'Male']], 'Male')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


Unnamed: 0,Country,Year,Male,Sex
0,Afghanistan,2016,272,Male
1,Afghanistan,2015,254,Male
2,Afghanistan,2014,254,Male
3,Afghanistan,2013,254,Male
4,Afghanistan,2012,262,Male
...,...,...,...,...
3106,Zimbabwe,2004,725,Male
3107,Zimbabwe,2003,728,Male
3108,Zimbabwe,2002,728,Male
3109,Zimbabwe,2001,723,Male


In [142]:
def generateList_df(df, col_of_interest, gender_cat):

    empty_list = []

    for cat in gender_cat:
        df_new = create_df(df[['Country','Year', cat]], cat)
        empty_list.append(rename_columns(df_new, col_of_interest))

    return empty_list

In [143]:
adult_mortality_compressed =  pd.concat(generateList_df(adult_mortality_compressed,col_of_interest, gender_category))

adult_mortality_compressed

Unnamed: 0,Country,Year,Adult mortality rate (probability of dying between 15 and 60 years per 1000 population),Sex
0,Afghanistan,2016,245,Both sexes
1,Afghanistan,2015,233,Both sexes
2,Afghanistan,2014,234,Both sexes
3,Afghanistan,2013,235,Both sexes
4,Afghanistan,2012,242,Both sexes
...,...,...,...,...
3106,Zimbabwe,2004,670,Female
3107,Zimbabwe,2003,671,Female
3108,Zimbabwe,2002,667,Female
3109,Zimbabwe,2001,656,Female


In [185]:
# define a function that creates compressed table 
# assuming column 2 is the only main column of interest (already filtered by only One topic)


def rename_columns(df, col_of_interest):

    # establish list of columns to be your new header
    col_list = ['Country', 'Year',col_of_interest, 'Sex']

    # replace df columns with col_list
    df.columns = col_list

    return df

def create_df(ori_df, which_sex):

    # copy main df
    df = ori_df.copy()

    # add one more column to display and segregate by Sex column 
    df['Sex'] = which_sex
    
    return df

def generateList_df(df, col_of_interest, gender_cat):

    # instantiate empty list
    empty_list = []

    # for each gender, create separate table 
    # then append into empty list 
    for cat in gender_cat:
        df_new = create_df(df[['Country','Year', cat]], cat)
        empty_list.append(rename_columns(df_new, col_of_interest))

    return empty_list


def compress_df(df, col_of_interest):
    # establish gender category 
    gender_category = df.iloc[0,2:].tolist()

    # copy out original
    df_compress = df.copy()

    # make first row the header, and reset index
    df_compress.columns = df.iloc[0,:]
    df_compress.drop(0, inplace = True )
    df_compress.reset_index(inplace = True, drop = True)
    
    # use generateList_df to get new dataframe with only one column of interest segregated by sex category on a new column
    df_compress = pd.concat(generateList_df(df_compress,col_of_interest,gender_category))
    
    # convert all to lower caps for columns 
    df_compress.columns = [x.lower() for x in df_compress.columns.tolist()]
    
    return df_compress

In [186]:
adult_mortality_compressed = compress_df(adult_mortality, 'adult_mortality')

In [187]:
adult_mortality_compressed

Unnamed: 0,country,year,adult_mortality,sex
0,Afghanistan,2016,245,Both sexes
1,Afghanistan,2015,233,Both sexes
2,Afghanistan,2014,234,Both sexes
3,Afghanistan,2013,235,Both sexes
4,Afghanistan,2012,242,Both sexes
...,...,...,...,...
3106,Zimbabwe,2004,670,Female
3107,Zimbabwe,2003,671,Female
3108,Zimbabwe,2002,667,Female
3109,Zimbabwe,2001,656,Female


In [181]:
num_afterFiveDeath = compress_df(num_death[['Unnamed: 0', 'Unnamed: 1', 'Number of under-five deaths (thousands)',
       'Number of under-five deaths (thousands).1',
       'Number of under-five deaths (thousands).2']], 'no_afterfivedeath')

In [182]:
num_afterFiveDeath

Unnamed: 0,country,year,no_afterfivedeath,sex
0,Afghanistan,2018,74278,Both sexes
1,Afghanistan,2017,76877,Both sexes
2,Afghanistan,2016,79770,Both sexes
3,Afghanistan,2015,82918,Both sexes
4,Afghanistan,2014,86378,Both sexes
...,...,...,...,...
1159,Zimbabwe,2017,10100,Female
1160,Zimbabwe,2016,10459,Female
1161,Zimbabwe,2015,11432,Female
1162,Zimbabwe,2014,12192,Female


In [183]:
num_infantDeath = compress_df(num_death[['Unnamed: 0', 'Unnamed: 1','Number of infant deaths (thousands)',
       'Number of infant deaths (thousands).1',
       'Number of infant deaths (thousands).2']], 'no_infant_death')

In [184]:
num_infantDeath

Unnamed: 0,country,year,no_infant_death,sex
0,Afghanistan,2018,57182,Both sexes
1,Afghanistan,2017,58846,Both sexes
2,Afghanistan,2016,60673,Both sexes
3,Afghanistan,2015,62652,Both sexes
4,Afghanistan,2014,64808,Both sexes
...,...,...,...,...
1159,Zimbabwe,2017,7005,Female
1160,Zimbabwe,2016,7297,Female
1161,Zimbabwe,2015,7885,Female
1162,Zimbabwe,2014,8344,Female


In [None]:
adult_mortality['Unnamed: 1'].unique()

array(['Year', '2016', '2015', '2014', '2013', '2012', '2011', '2010',
       '2009', '2008', '2007', '2006', '2005', '2004', '2003', '2002',
       '2001', '2000'], dtype=object)

In [None]:
num_afterFiveDeath.Year.unique()

array(['2018', '2017', '2016', '2015', '2014', '2013'], dtype=object)

In [None]:
num_infantDeath.Year.unique()

array(['2018', '2017', '2016', '2015', '2014', '2013'], dtype=object)

In [None]:
adult_mortality.columns = adult_mortality.iloc[0,:]

In [None]:
adult_mortality.drop(0, inplace = True)
adult_mortality.reset_index(drop = True, inplace = True)

In [None]:
template_columns = ['country', 'year', 'both_sexes', 'male', 'female']
adult_mortality.columns = template_columns

In [None]:
adult_mortality

Unnamed: 0,country,year,both_sexes,male,female
0,Afghanistan,2016,245,272,216
1,Afghanistan,2015,233,254,210
2,Afghanistan,2014,234,254,213
3,Afghanistan,2013,235,254,215
4,Afghanistan,2012,242,262,221
...,...,...,...,...,...
3106,Zimbabwe,2004,694,725,670
3107,Zimbabwe,2003,697,728,671
3108,Zimbabwe,2002,695,728,667
3109,Zimbabwe,2001,688,723,656


How many unique countries?

In [65]:
len(adult_mortality.country.unique())

183

In [69]:
list_to_int = ['both_sexes', 'male', 'female']

for col in list_to_int:
    adult_mortality[col] = adult_mortality[col].astype(int)

In [70]:
adult_mortality.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3111 entries, 0 to 3110
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   country     3111 non-null   object
 1   year        3111 non-null   object
 2   both_sexes  3111 non-null   int32 
 3   male        3111 non-null   int32 
 4   female      3111 non-null   int32 
dtypes: int32(3), object(2)
memory usage: 85.2+ KB


Visualise on average across years, which are the top 5 worst countries 

In [80]:
top_5_adult_mortality_rate = adult_mortality.groupby(['country']).mean().sort_values(by = 'both_sexes', ascending = False).head(5)
top_5_adult_mortality_rate

Unnamed: 0_level_0,both_sexes,male,female
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Zimbabwe,549.352941,586.882353,518.117647
Lesotho,536.235294,568.176471,515.705882
Eswatini,529.0,587.058824,481.0
Central African Republic,491.647059,502.294118,481.823529
Sierra Leone,452.117647,461.117647,443.117647


In [86]:
top_5_adult_mortality_rate.index.tolist()

['Zimbabwe', 'Lesotho', 'Eswatini', 'Central African Republic', 'Sierra Leone']

Top 5 worst countries comes from 'Zimbabwe', 'Lesotho', 'Eswatini', 'Central African Republic', 'Sierra Leone' just for adult mortality itself

Visualise across years, what are the trends for both sexes

In [None]:
maternal_mortality.head()

Unnamed: 0,Country,Year,Maternal mortality ratio (per 100 000 live births),Number of maternal deaths
0,Afghanistan,2017,638 [ 427 - 1 010 ],7 700 [ 5 100 - 12 000 ]
1,Afghanistan,2016,673 [ 457 - 1 040 ],8 100 [ 5 500 - 12 000 ]
2,Afghanistan,2015,701 [ 501 - 1 020 ],8 400 [ 6 000 - 12 000 ]
3,Afghanistan,2014,786 [ 592 - 1 080 ],9 300 [ 7 000 - 13 000 ]
4,Afghanistan,2013,810 [ 617 - 1 080 ],9 600 [ 7 300 - 13 000 ]


In [None]:
num_death.head()

Unnamed: 0.1,Unnamed: 0,Unnamed: 1,Number of under-five deaths (thousands),Number of under-five deaths (thousands).1,Number of under-five deaths (thousands).2,Number of infant deaths (thousands),Number of infant deaths (thousands).1,Number of infant deaths (thousands).2,Number of neonatal deaths (thousands)
0,Country,Year,Both sexes,Male,Female,Both sexes,Male,Female,Both sexes
1,Afghanistan,2018,74278,40312,33966,57182,31394,25788,44725
2,Afghanistan,2017,76877,41631,35246,58846,32244,26602,45771
3,Afghanistan,2016,79770,43134,36636,60673,33222,27451,46963
4,Afghanistan,2015,82918,44733,38185,62652,34257,28395,48237


In [None]:
prob_dying.head()

Unnamed: 0.1,Unnamed: 0,Unnamed: 1,Infant mortality rate (probability of dying between birth and age 1 per 1000 live births),Infant mortality rate (probability of dying between birth and age 1 per 1000 live births).1,Infant mortality rate (probability of dying between birth and age 1 per 1000 live births).2,Neonatal mortality rate (per 1000 live births),Under-five mortality rate (probability of dying by age 5 per 1000 live births),Under-five mortality rate (probability of dying by age 5 per 1000 live births).1,Under-five mortality rate (probability of dying by age 5 per 1000 live births).2
0,Country,Year,Both sexes,Male,Female,Both sexes,Both sexes,Male,Female
1,Afghanistan,2018,47.9,51.1,44.5,37.1,62.3,65.7,58.7
2,Afghanistan,2017,49.5,52.7,46,38.1,64.7,68.1,61.1
3,Afghanistan,2016,51.2,54.5,47.7,39.3,67.5,70.9,63.7
4,Afghanistan,2015,53.1,56.5,49.6,40.5,70.4,73.8,66.7


We can see that we can likely to combine dataset via country and year. 

Also Row 0 is a subrow which splits Both sexes, Males and Females 

In [None]:
df_list = [adult_mortality, maternal_mortality, num_death, prob_dying]

for df in df_list:
    df.columns = [df.iloc[0], df.iloc[1]]
    df = df[2:]

## Check for missing values

In [None]:
adult_mortality.isnull().sum()

Unnamed: 0                                                                                   0
Unnamed: 1                                                                                   0
Adult mortality rate (probability of dying between 15 and 60 years per 1000 population)      0
Adult mortality rate (probability of dying between 15 and 60 years per 1000 population).1    0
Adult mortality rate (probability of dying between 15 and 60 years per 1000 population).2    0
dtype: int64

In [None]:
maternal_mortality.isnull().sum()

Country                                               0
Year                                                  0
Maternal mortality ratio (per 100 000 live births)    0
Number of maternal deaths                             0
dtype: int64

In [None]:
num_death.isnull().sum()

Unnamed: 0                                   0
Unnamed: 1                                   0
Number of under-five deaths (thousands)      0
Number of under-five deaths (thousands).1    0
Number of under-five deaths (thousands).2    0
Number of infant deaths (thousands)          0
Number of infant deaths (thousands).1        0
Number of infant deaths (thousands).2        0
Number of neonatal deaths (thousands)        0
dtype: int64

In [None]:
prob_dying.isnull().sum()

Unnamed: 0                                                                                     0
Unnamed: 1                                                                                     0
Infant mortality rate (probability of dying between birth and age 1 per 1000 live births)      0
Infant mortality rate (probability of dying between birth and age 1 per 1000 live births).1    0
Infant mortality rate (probability of dying between birth and age 1 per 1000 live births).2    0
Neonatal mortality rate (per 1000 live births)                                                 0
Under-five mortality rate (probability of dying by age 5 per 1000 live births)                 0
Under-five mortality rate (probability of dying by age 5 per 1000 live births).1               0
Under-five mortality rate (probability of dying by age 5 per 1000 live births).2               0
dtype: int64

No missing values 

## 