# Import 2 databases as dataframes 
## Merge 2 dataframes to become a 3rd working dataframe

- Check the structure of each dataframe before the merge - make sure both share a column to merge on, renaming a column where necessary so that the column names match
- Check the shape of the dataframe before & after the merge, e.g. columns, names, nulls, duplicates, etc.

In [1]:
import pandas as pd

In [2]:
# Load the first Excel workbook (the mortality-rates file named below) into df1.
df1 = pd.read_excel(
    io="Age standardised mortality rates 2009,2014, 2019 for regression.xlsx"
)

In [3]:
# Summarise df1: column names, non-null counts and dtypes.
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 38556 entries, 0 to 38555
Data columns (total 11 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   location_name  38556 non-null  object 
 1   sex_id         38556 non-null  int64  
 2   sex_name       38556 non-null  object 
 3   age_id         38556 non-null  int64  
 4   age_name       38556 non-null  object 
 5   cause_id       38556 non-null  int64  
 6   cause_name     38556 non-null  object 
 7   year           38556 non-null  int64  
 8   val            38556 non-null  float64
 9   upper          38556 non-null  float64
 10  lower          38556 non-null  float64
dtypes: float64(3), int64(4), object(4)
memory usage: 3.2+ MB


In [5]:
# Map the original df1 column names to the shorter names used from here on.
column_renames = {
    "location_name": "country",
    "sex_name": "sex",
    "cause_name": "cause",
    "val": "mortality_rate",
    "upper": "upper_rate_estimate",
    "lower": "lower_rate_estimate",
}
# Rename in place so df1 itself carries the new column names.
df1.rename(columns=column_renames, inplace=True)

In [6]:
# Remove the id columns and age_name from df1 in place.
# The labels are passed as a list rather than a set: `columns` is documented
# as a single label or list-like, and a list keeps the labels in a defined order.
df1.drop(columns=["sex_id", "age_id", "cause_id", "age_name"], inplace=True)

In [7]:
# Preview the first rows of df1 to check the renamed and remaining columns.
df1.head()

Unnamed: 0,country,sex,cause,year,mortality_rate,upper_rate_estimate,lower_rate_estimate
0,Islamic Republic of Pakistan,Male,Cardiovascular diseases,2009,413.147249,488.874963,340.744313
1,Islamic Republic of Pakistan,Female,Cardiovascular diseases,2009,363.865438,439.998403,305.686681
2,Islamic Republic of Pakistan,Both,Cardiovascular diseases,2009,389.67772,439.846769,343.996314
3,Islamic Republic of Pakistan,Male,Musculoskeletal disorders,2009,2.252683,2.905868,1.765492
4,Islamic Republic of Pakistan,Female,Musculoskeletal disorders,2009,3.202923,4.319955,2.332261


In [None]:
# Country names to rename
# "American Samoa": "Samoa", "Bermuda", ""

In [8]:
# Load the second Excel workbook (the health-expenditure file named below) into df2.
df2 = pd.read_excel(
    io="2. Global health expenditure 2019.xlsx"
)

In [None]:
# df3 = pd.read_csv("2. Global health expenditure 2019.csv") 
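# Reading the file as CSV fails with the error below: the CSV is not valid UTF-8,
# so pd.read_csv likely needs an explicit encoding (e.g. encoding="latin-1") to load it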

# UnicodeDecodeError: 'utf-8' codec can't decode byte 0xf4 in position 1699: invalid continuation byte

In [9]:
# Summarise df2: column names, non-null counts and dtypes.
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 192 entries, 0 to 191
Data columns (total 23 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   country       192 non-null    object 
 1   code          192 non-null    object 
 2   region        192 non-null    object 
 3   income        192 non-null    object 
 4   year          192 non-null    int64  
 5   che_gdp       189 non-null    float64
 6   che_pc_usd    189 non-null    float64
 7   che           189 non-null    float64
 8   gghed_pc_usd  189 non-null    float64
 9   pvtd_pc_usd   189 non-null    float64
 10  oop_pc_usd    189 non-null    float64
 11  ext_pc_usd    169 non-null    float64
 12  gdp_pc_usd    189 non-null    float64
 13  gdp           189 non-null    float64
 14  gge           189 non-null    float64
 15  ppp           189 non-null    float64
 16  xrt           192 non-null    float64
 17  gdpd          192 non-null    float64
 18  pop           189 non-null    

In [10]:
# Attach the df2 columns to every df1 row, keeping all df1 rows (left merge).
# The join key is 'country' only. Both frames also have a 'year' column, so
# pandas keeps the two apart as 'year_x' (from df1) and 'year_y' (from df2).
merged_df = pd.merge(df1, df2, on='country', how='left')

# A left merge leaves the df2 columns as NaN for every df1 row whose country
# spelling has no exact match in df2. Report those names so they can be
# harmonised before merging instead of being lost silently.
unmatched_mask = ~df1['country'].isin(df2['country'])
unmatched_countries = df1.loc[unmatched_mask, 'country'].unique().tolist()
if unmatched_countries:
    print(f"{len(unmatched_countries)} df1 countries have no exact match in df2:")
    print(unmatched_countries)

# Forward fill the NaN values in the additional columns from df2, country by
# country. This only changes values when a country has several merged rows
# with differing df2 values; it cannot fill rows of countries that did not
# match, because the fill never crosses from one country to another.
additional_columns = df2.columns.difference(df1.columns)  # Get additional columns from df2
merged_df[additional_columns] = merged_df.groupby('country')[additional_columns].ffill()

# https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.merge.html

In [11]:
# Display the merged dataframe to inspect the result of the merge.
merged_df

Unnamed: 0,country,sex,cause,year_x,mortality_rate,upper_rate_estimate,lower_rate_estimate,code,region,income,...,gdp,gge,ppp,xrt,gdpd,pop,che_usd,gghed_usd,pvtd_usd,ext_usd
0,Islamic Republic of Pakistan,Male,Cardiovascular diseases,2009,413.147249,488.874963,340.744313,,,,...,,,,,,,,,,
1,Islamic Republic of Pakistan,Female,Cardiovascular diseases,2009,363.865438,439.998403,305.686681,,,,...,,,,,,,,,,
2,Islamic Republic of Pakistan,Both,Cardiovascular diseases,2009,389.677720,439.846769,343.996314,,,,...,,,,,,,,,,
3,Islamic Republic of Pakistan,Male,Musculoskeletal disorders,2009,2.252683,2.905868,1.765492,,,,...,,,,,,,,,,
4,Islamic Republic of Pakistan,Female,Musculoskeletal disorders,2009,3.202923,4.319955,2.332261,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
38551,Republic of Sudan,Female,Substance use disorders,2019,0.989857,1.426803,0.651659,,,,...,,,,,,,,,,
38552,Republic of Sudan,Both,Substance use disorders,2019,1.573392,2.171834,1.051616,,,,...,,,,,,,,,,
38553,Republic of Sudan,Male,Diabetes and kidney diseases,2019,45.456185,70.969259,30.635657,,,,...,,,,,,,,,,
38554,Republic of Sudan,Female,Diabetes and kidney diseases,2019,45.222162,62.064179,32.898755,,,,...,,,,,,,,,,


In [12]:
# Summarise merged_df; the non-null counts show how many rows received df2 values.
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 38556 entries, 0 to 38555
Data columns (total 29 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   country              38556 non-null  object 
 1   sex                  38556 non-null  object 
 2   cause                38556 non-null  object 
 3   year_x               38556 non-null  int64  
 4   mortality_rate       38556 non-null  float64
 5   upper_rate_estimate  38556 non-null  float64
 6   lower_rate_estimate  38556 non-null  float64
 7   code                 7560 non-null   object 
 8   region               7560 non-null   object 
 9   income               7560 non-null   object 
 10  year_y               7560 non-null   float64
 11  che_gdp              7371 non-null   float64
 12  che_pc_usd           7371 non-null   float64
 13  che                  7371 non-null   float64
 14  gghed_pc_usd         7371 non-null   float64
 15  pvtd_pc_usd          7371 non-null  

In [14]:
# Save the merged dataframe as a CSV file, leaving out the row index.
output_path = '~/Desktop/Merged global age standardized mortality 2009, 2014,2019 & country info.csv'
merged_df.to_csv(output_path, index=False)