In [1]:
import pandas as pd
import functools as fn 

In [350]:
!git clone https://github.com/nghi-huynh/BigDataChallenge2022.git

Cloning into 'BigDataChallenge2022'...
remote: Enumerating objects: 63, done.[K
remote: Counting objects: 100% (63/63), done.[K
remote: Compressing objects: 100% (52/52), done.[K
remote: Total 63 (delta 17), reused 51 (delta 8), pack-reused 0[K
Unpacking objects: 100% (63/63), done.


In [352]:
!mv /content/BigDataChallenge2022/raw_data .
!mv /content/BigDataChallenge2022/merged_data .
!rm -r /content/BigDataChallenge2022

## Helper functions

In [191]:
# melt dataframes based on id columns
def melt(df,id_vars, var_name, value_name):
  tmp = df.melt(id_vars=id_vars,var_name=var_name,value_name=value_name)
  return tmp

In [192]:
# drop columns and melt dataframe 
def preprocessing(df, cols_drop, melt_vars):
  tmp = df.drop(cols_drop, axis=1)
  tmp = melt(tmp, melt_vars[0], melt_vars[1], melt_vars[2])
  return tmp

In [296]:
# merge data frames
# specify left on, right on, and how
def merge_data(dataframes, left_on, right_on, how):
  tmp = fn.reduce(lambda  left,right: pd.merge(left,right,left_on=left_on,
                                                   right_on=right_on,
                                                   how=how), dataframes).fillna('NaN')
                                            
  return tmp

Since our raw data have two types of structure, I want to separate them into 2 parts and merge them based on their current structure first. Then, I re-structured the merged dataframes and merged all of them together.

## Merge deaths related to mental disorders

In [216]:
mental_substance_death = pd.read_csv("/content/raw_data/death-rates-from-mental-and-substance-disorders-by-age.csv")

In [217]:
mental_substance_death.columns

Index(['Entity', 'Code', 'Year',
       'Deaths - Mental and substance use disorders - Sex: Both - Age: Under 5 (Rate)',
       'Deaths - Mental and substance use disorders - Sex: Both - Age: 70+ years (Rate)',
       'Deaths - Mental and substance use disorders - Sex: Both - Age: 5-14 years (Rate)',
       'Deaths - Mental and substance use disorders - Sex: Both - Age: 15-49 years (Rate)',
       'Deaths - Mental and substance use disorders - Sex: Both - Age: 50-69 years (Rate)',
       'Deaths - Mental and substance use disorders - Sex: Both - Age: Age-standardized (Rate)',
       'Deaths - Mental and substance use disorders - Sex: Both - Age: All Ages (Rate)'],
      dtype='object')

In [218]:
# Select columns we want to keep
cols = ['Entity','Year','Deaths - Mental and substance use disorders - Sex: Both - Age: Age-standardized (Rate)']

In [222]:
mental_substance_death_final = mental_substance_death[cols]

In [223]:
mental_substance_death_final

Unnamed: 0,Entity,Year,Deaths - Mental and substance use disorders - Sex: Both - Age: Age-standardized (Rate)
0,Afghanistan,1990,0.144067
1,Afghanistan,1991,0.140653
2,Afghanistan,1992,0.135279
3,Afghanistan,1993,0.133047
4,Afghanistan,1994,0.130073
...,...,...,...
6463,Zimbabwe,2013,1.379243
6464,Zimbabwe,2014,1.398058
6465,Zimbabwe,2015,1.413464
6466,Zimbabwe,2016,1.445058


In [228]:
mental_death = pd.read_csv("/content/raw_data/share-deaths-suicide.csv")
mental_death = mental_death.drop(["Code"], axis=1)
mental_death.head()

Unnamed: 0,Entity,Year,Deaths - Self-harm - Sex: Both - Age: All Ages (Percent)
0,Afghanistan,1990,0.381465
1,Afghanistan,1991,0.388646
2,Afghanistan,1992,0.409441
3,Afghanistan,1993,0.41803
4,Afghanistan,1994,0.409779


In [229]:
self_harm_death = pd.read_csv("/content/raw_data/share-with-mental-and-substance-disorders.csv")
self_harm_death = self_harm_death.drop(["Code"], axis=1)
self_harm_death.head()

Unnamed: 0,Entity,Year,Prevalence - Mental disorders - Sex: Both - Age: Age-standardized (Percent)
0,Afghanistan,1990,16.659229
1,Afghanistan,1991,16.765052
2,Afghanistan,1992,16.874469
3,Afghanistan,1993,16.99072
4,Afghanistan,1994,17.11273


**=>** Now, we have a set of dataframes with similar stucture to merge.

In [231]:
# compile the list of dataframes we want to merge
data_frames = [self_harm_death, mental_death, mental_substance_death_final]
left_on = ['Entity', 'Year']
right_on = ['Entity', 'Year']
how = 'left'

In [232]:
first_merged = merge_data(data_frames, left_on, right_on, how)

In [234]:
first_merged

Unnamed: 0,Entity,Year,Prevalence - Mental disorders - Sex: Both - Age: Age-standardized (Percent),Deaths - Self-harm - Sex: Both - Age: All Ages (Percent),Deaths - Mental and substance use disorders - Sex: Both - Age: Age-standardized (Rate)
0,Afghanistan,1990,16.659229,0.381465,0.144067
1,Afghanistan,1991,16.765052,0.388646,0.140653
2,Afghanistan,1992,16.874469,0.409441,0.135279
3,Afghanistan,1993,16.990720,0.41803,0.133047
4,Afghanistan,1994,17.112730,0.409779,0.130073
...,...,...,...,...,...
6895,Zimbabwe,2015,11.156429,1.676438,1.413464
6896,Zimbabwe,2016,11.164133,1.747906,1.445058
6897,Zimbabwe,2017,11.170427,1.804474,1.47002
6898,Zimbabwe,2018,11.158765,1.87043,


In [244]:
first_merged.dtypes

Entity                                                                                     object
Year                                                                                        int64
Prevalence - Mental disorders - Sex: Both - Age: Age-standardized (Percent)               float64
Deaths - Self-harm - Sex: Both - Age: All Ages (Percent)                                   object
Deaths - Mental and substance use disorders - Sex: Both - Age: Age-standardized (Rate)     object
dtype: object

In [345]:
# Save first merged to csv file
first_merged.to_csv('./first_merge_data.csv', index=False)

## Merge suicide rates, gdp, unemployment rate, and income

In [319]:
# read csv without index Unnamed column
# first merged csv
first_merged = pd.read_csv("/content/first_merged.csv")
first_merged.head()

Unnamed: 0,Entity,Year,Prevalence - Mental disorders - Sex: Both - Age: Age-standardized (Percent),Deaths - Self-harm - Sex: Both - Age: All Ages (Percent),Deaths - Mental and substance use disorders - Sex: Both - Age: Age-standardized (Rate)
0,Afghanistan,1990,16.659229,0.381465,0.144067
1,Afghanistan,1991,16.765052,0.388646,0.140653
2,Afghanistan,1992,16.874469,0.409441,0.135279
3,Afghanistan,1993,16.99072,0.41803,0.133047
4,Afghanistan,1994,17.11273,0.409779,0.130073


### Suicide rate

In [268]:
suicide_rate = pd.read_csv("/content/raw_data/suicide_rate_country_age_standardized.csv")

In [269]:
# filter only both sexes 
suicide_both_sexes = suicide_rate.loc[suicide_rate['Sex'] == 'Both sexes',:].drop(['Sex'], axis=1)

In [270]:
suicide_both_sexes.head()

Unnamed: 0,Country,2019,2018,2017,2016,2015,2014,2013,2012,2011,...,2009,2008,2007,2006,2005,2004,2003,2002,2001,2000
0,Afghanistan,6.0 [3.4-9.9],5.9 [3.4-9.8],6.0 [3.4-9.9],6.0 [3.5-9.9],6.0 [3.5-9.9],6.0 [3.5-9.9],6.2 [3.6-10.2],6.2 [3.6-10.2],6.4 [3.8-10.5],...,6.8 [4.0-11.1],7.2 [4.2-11.7],7.4 [4.3-12.1],7.6 [4.4-12.3],7.6 [4.4-12.3],7.8 [4.5-12.6],7.7 [4.5-12.5],7.9 [4.5-12.8],7.9 [4.5-12.8],7.7 [4.4-12.5]
3,Albania,3.7 [2.1-5.7],3.9 [2.2-6.0],4.1 [2.3-6.3],4.2 [2.3-6.3],4.2 [2.3-6.4],4.5 [2.5-6.7],4.8 [2.7-7.1],4.8 [2.8-7.0],7.6 [4.7-10.7],...,8.0 [4.8-10.8],8.1 [5.0-10.8],8.1 [5.0-10.6],7.8 [5.0-10.2],7.7 [4.9-10.0],4.9 [3.1-6.3],4.9 [3.3-6.4],4.8 [3.2-6.3],4.7 [3.2-6.2],5.2 [3.6-6.9]
6,Algeria,2.6 [1.4-4.4],2.6 [1.4-4.4],2.5 [1.4-4.4],2.6 [1.4-4.5],2.7 [1.5-4.7],2.8 [1.5-4.8],2.9 [1.5-5.0],2.9 [1.6-5.0],2.9 [1.6-5.1],...,3.2 [1.7-5.6],3.3 [1.8-5.8],3.5 [1.8-6.0],3.7 [2.0-6.4],3.8 [2.0-6.6],4.0 [2.1-6.9],4.1 [2.2-7.1],4.4 [2.4-7.6],4.6 [2.4-7.9],4.7 [2.5-8.0]
9,Angola,12.6 [7.7-19.4],12.4 [7.6-19.0],12.4 [7.6-18.9],12.9 [8.0-19.6],13.3 [8.3-20.1],13.1 [8.3-19.8],14.2 [9.0-21.2],14.2 [9.0-21.2],13.5 [8.6-20.2],...,13.6 [8.7-20.5],15.3 [9.7-23.0],15.0 [9.5-22.6],17.1 [10.8-25.6],16.3 [10.3-24.4],17.2 [10.8-25.9],17.5 [10.8-26.6],17.2 [10.7-26.3],17.5 [10.6-26.7],17.6 [10.5-27.0]
12,Antigua and Barbuda,0.3 [0.2-0.5],0.3 [0.2-0.5],0.0 [0.0-0.0],0.5 [0.3-0.7],0.4 [0.3-0.6],0.4 [0.3-0.6],0.0 [0.0-0.0],0.0 [0.0-0.0],0.2 [0.1-0.2],...,0.0 [0.0-0.0],0.2 [0.2-0.3],0.3 [0.2-0.5],0.4 [0.3-0.6],1.3 [0.9-1.8],2.0 [1.4-2.7],1.4 [1.0-2.0],1.3 [0.9-1.7],1.9 [1.3-2.6],2.0 [1.4-2.7]


In [271]:
cols_drop = []
melts_vars = [["Country"], "Year", "Suicide rate"]

In [272]:
suicide_both_sexes_final = preprocessing(suicide_both_sexes, cols_drop, melts_vars)

In [274]:
suicide_both_sexes_final.loc[suicide_both_sexes_final['Country'] == 'Albania',:]

Unnamed: 0,Country,Year,Suicide rate
1,Albania,2019,3.7 [2.1-5.7]
184,Albania,2018,3.9 [2.2-6.0]
367,Albania,2017,4.1 [2.3-6.3]
550,Albania,2016,4.2 [2.3-6.3]
733,Albania,2015,4.2 [2.3-6.4]
916,Albania,2014,4.5 [2.5-6.7]
1099,Albania,2013,4.8 [2.7-7.1]
1282,Albania,2012,4.8 [2.8-7.0]
1465,Albania,2011,7.6 [4.7-10.7]
1648,Albania,2010,7.6 [4.6-10.5]


In [275]:
suicide_both_sexes_final["Year"] = suicide_both_sexes_final["Year"].astype("int64")

In [276]:
suicide_both_sexes_final.dtypes

Country         object
Year             int64
Suicide rate    object
dtype: object

### GDP (current, per capita)

In [250]:
gdp_current = pd.read_csv("/content/raw_data/gdp_current.csv")

In [251]:
cols_drop = ['Country Code', 'Indicator Name', 'Indicator Code']
melts_var = [["Country Name"], "Year", "gdp_current"]

In [252]:
gdp_current_final = preprocessing(gdp_current, cols_drop, melts_var)

In [253]:
gdp_current_final

Unnamed: 0,Country Name,Year,gdp_current
0,Aruba,1960,
1,Africa Eastern and Southern,1960,2.008272e+10
2,Afghanistan,1960,5.377778e+08
3,Africa Western and Central,1960,1.040428e+10
4,Angola,1960,
...,...,...,...
16487,Kosovo,2021,
16488,"Yemen, Rep.",2021,
16489,South Africa,2021,
16490,Zambia,2021,


In [260]:
gdp_current_final['Year'] = gdp_current_final['Year'].astype("int64")

In [256]:
gdp_per_capita = pd.read_csv("/content/raw_data/gdp_per_capita.csv")

In [257]:
cols_drop = ['Country Code', 'Indicator Name', 'Indicator Code']
melts_var = [["Country Name"], "Year", "gdp_per_capita"]

In [258]:
gdp_per_capita_final = preprocessing(gdp_per_capita, cols_drop, melts_var)

In [259]:
gdp_per_capita_final

Unnamed: 0,Country Name,Year,gdp_per_capita
0,Aruba,1960,
1,Africa Eastern and Southern,1960,153.494439
2,Afghanistan,1960,59.773234
3,Africa Western and Central,1960,107.932233
4,Angola,1960,
...,...,...,...
16487,Kosovo,2021,
16488,"Yemen, Rep.",2021,
16489,South Africa,2021,
16490,Zambia,2021,


In [261]:
gdp_per_capita_final['Year'] = gdp_per_capita_final['Year'].astype("int64")

In [262]:
gdp_per_capita_final.dtypes

Country Name       object
Year                int64
gdp_per_capita    float64
dtype: object

In [263]:
gdp_current_final.dtypes

Country Name     object
Year              int64
gdp_current     float64
dtype: object

In [278]:
gdp_per_capita_final.rename(columns={'Country Name': 'Country'}, inplace=True)
gdp_current_final.rename(columns={'Country Name': 'Country'}, inplace=True)

### Unemployment

In [300]:
unemployment = pd.read_csv("/content/raw_data/unemployment_rate.csv")

In [301]:
cols_drop = ["Country Code", "Indicator Name", "Indicator Code"]
melts_var = [["Country Name"], "Year", "unemployment rate"]

In [303]:
unemployment_final = preprocessing(unemployment, cols_drop, melts_var)
unemployment_final

Unnamed: 0,Country Name,Year,unemployment rate
0,Aruba,1960,
1,Africa Eastern and Southern,1960,
2,Afghanistan,1960,
3,Africa Western and Central,1960,
4,Angola,1960,
...,...,...,...
16487,Kosovo,2021,
16488,"Yemen, Rep.",2021,13.574000
16489,South Africa,2021,33.558998
16490,Zambia,2021,13.026000


In [304]:
unemployment_final["Year"] = unemployment_final["Year"].astype("int64")

In [305]:
unemployment_final.dtypes

Country Name          object
Year                   int64
unemployment rate    float64
dtype: object

### Health Expenditure (gdp, per capita)

In [306]:
expenditure_gdp = pd.read_csv("/content/raw_data/health_expenditure_gdp.csv")

In [307]:
cols_drop = ["Country Code", "Indicator Name", "Indicator Code"]
melts_vars = [["Country Name"], "Year", "health expenditure (% GDP)"]

In [308]:
expenditure_gdp_final = preprocessing(expenditure_gdp,cols_drop,melts_vars)

In [309]:
expenditure_gdp_final.loc[expenditure_gdp_final["Country Name"] == "France"]

Unnamed: 0,Country Name,Year,health expenditure (% GDP)
77,France,1960,
343,France,1961,
609,France,1962,
875,France,1963,
1141,France,1964,
...,...,...,...
15239,France,2017,11.332762
15505,France,2018,11.185830
15771,France,2019,11.057472
16037,France,2020,


In [310]:
expenditure_gdp_final["Year"] = expenditure_gdp_final["Year"].astype("int64")

In [311]:
expenditure_capita = pd.read_csv("/content/raw_data/gdp_per_capita.csv")


In [312]:
cols_drop = ["Country Code", "Indicator Name", "Indicator Code"]
melts_vars = [["Country Name"], "Year", "health expenditure (per capita)"]

In [313]:
expenditure_capita_final = preprocessing(expenditure_capita, cols_drop, melts_vars)

In [314]:
expenditure_capita_final

Unnamed: 0,Country Name,Year,health expenditure (per capita)
0,Aruba,1960,
1,Africa Eastern and Southern,1960,153.494439
2,Afghanistan,1960,59.773234
3,Africa Western and Central,1960,107.932233
4,Angola,1960,
...,...,...,...
16487,Kosovo,2021,
16488,"Yemen, Rep.",2021,
16489,South Africa,2021,
16490,Zambia,2021,


In [315]:
expenditure_capita_final["Year"] = expenditure_capita_final["Year"].astype("int64")

In [316]:
unemployment_final.rename(columns={'Country Name': 'Country'}, inplace=True)
expenditure_capita_final.rename(columns={'Country Name': 'Country'}, inplace=True)
expenditure_gdp_final.rename(columns={'Country Name': 'Country'}, inplace=True)

### Income group

In [323]:
income = pd.read_csv("/content/raw_data/data_income.csv")

In [324]:
income.head()

Unnamed: 0,Country,Income group,Region,Lending category,1987,1988,1989,1990,1991,1992,...,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020
0,Aruba,High income,Latin America & Caribbean,,,10360.0,11760.0,12230.0,13190.0,13990.0,...,22450.0,23520.0,24510.0,25350.0,26560.0,26840.0,27120.0,,,
1,Afghanistan,Low income,South Asia,IDA,,,,,,,...,530.0,630.0,660.0,630.0,600.0,550.0,530.0,520.0,530.0,500.0
2,Angola,Lower middle income,Sub-Saharan Africa,IBRD,670.0,650.0,860.0,780.0,1380.0,1170.0,...,3410.0,4170.0,4780.0,5010.0,4520.0,3770.0,3450.0,3210.0,2970.0,2230.0
3,Albania,Upper middle income,Europe & Central Asia,IBRD,730.0,730.0,760.0,650.0,410.0,280.0,...,4410.0,4360.0,4540.0,4540.0,4390.0,4320.0,4290.0,4860.0,5220.0,5210.0
4,Andorra,High income,Europe & Central Asia,,,,,,,,...,,,,,,,,,,


In [325]:
cols_drop = ["Region", "Lending category"]
melts_vars = [["Country","Income group"], "Year", "Income"]

In [326]:
income_final = preprocessing(income, cols_drop, melts_vars)

In [327]:
income_final

Unnamed: 0,Country,Income group,Year,Income
0,Aruba,High income,1987,
1,Afghanistan,Low income,1987,
2,Angola,Lower middle income,1987,670.0
3,Albania,Upper middle income,1987,730.0
4,Andorra,High income,1987,
...,...,...,...,...
7373,Kosovo,Upper middle income,2020,4440.0
7374,"Yemen, Rep.",Low income,2020,
7375,South Africa,Upper middle income,2020,5410.0
7376,Zambia,Lower middle income,2020,1190.0


In [328]:
income_final["Year"] = income_final["Year"].astype("int64")

In [329]:
income_final.dtypes

Country          object
Income group     object
Year              int64
Income          float64
dtype: object

### Second merge

In [337]:
df_merge = [first_merged, gdp_current_final, gdp_per_capita_final, expenditure_capita_final,expenditure_gdp_final, unemployment_final, income_final, suicide_both_sexes_final]
left_on=['Entity', 'Year']
right_on=['Country','Year']
how='left'

In [338]:
second_merge = merge_data(df_merge, left_on, right_on, how)

  


In [339]:
second_merge

Unnamed: 0,Entity,Year,Prevalence - Mental disorders - Sex: Both - Age: Age-standardized (Percent),Deaths - Self-harm - Sex: Both - Age: All Ages (Percent),Deaths - Mental and substance use disorders - Sex: Both - Age: Age-standardized (Rate),Country_x,gdp_current,Country_y,gdp_per_capita,Country_x.1,health expenditure (per capita),Country_y.1,health expenditure (% GDP),Country_x.2,unemployment rate,Country_y.2,Income group,Income,Country,Suicide rate
0,Afghanistan,1990,16.659229,0.381465,0.144067,Afghanistan,,Afghanistan,,Afghanistan,,Afghanistan,,Afghanistan,,Afghanistan,Low income,,,
1,Afghanistan,1991,16.765052,0.388646,0.140653,Afghanistan,,Afghanistan,,Afghanistan,,Afghanistan,,Afghanistan,10.649,Afghanistan,Low income,,,
2,Afghanistan,1992,16.874469,0.409441,0.135279,Afghanistan,,Afghanistan,,Afghanistan,,Afghanistan,,Afghanistan,10.821,Afghanistan,Low income,,,
3,Afghanistan,1993,16.990720,0.41803,0.133047,Afghanistan,,Afghanistan,,Afghanistan,,Afghanistan,,Afghanistan,10.723,Afghanistan,Low income,,,
4,Afghanistan,1994,17.112730,0.409779,0.130073,Afghanistan,,Afghanistan,,Afghanistan,,Afghanistan,,Afghanistan,10.726,Afghanistan,Low income,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6895,Zimbabwe,2015,11.156429,1.676438,1.413464,Zimbabwe,19963120600.0,Zimbabwe,1445.069702,Zimbabwe,1445.069702,Zimbabwe,7.452066,Zimbabwe,4.778,Zimbabwe,Lower middle income,1280.0,Zimbabwe,30.7 [16.6-48.3]
6896,Zimbabwe,2016,11.164133,1.747906,1.445058,Zimbabwe,20548678100.0,Zimbabwe,1464.588957,Zimbabwe,1464.588957,Zimbabwe,7.675163,Zimbabwe,4.788,Zimbabwe,Lower middle income,1290.0,Zimbabwe,28.7 [15.4-45.3]
6897,Zimbabwe,2017,11.170427,1.804474,1.47002,Zimbabwe,17584890937.0,Zimbabwe,1235.189032,Zimbabwe,1235.189032,Zimbabwe,7.469752,Zimbabwe,4.785,Zimbabwe,Lower middle income,1390.0,Zimbabwe,25.9 [13.8-40.9]
6898,Zimbabwe,2018,11.158765,1.87043,,Zimbabwe,18115543791.0,Zimbabwe,1254.642265,Zimbabwe,1254.642265,Zimbabwe,8.680062,Zimbabwe,4.796,Zimbabwe,Lower middle income,1410.0,Zimbabwe,23.9 [12.7-37.8]


In [340]:
cols_drop = ['Country_x', 'Country_y', 'Country']

In [341]:
second_merge.drop(columns=cols_drop, axis=1, inplace=True)

In [347]:
second_merge.head()

Unnamed: 0,Entity,Year,Prevalence - Mental disorders - Sex: Both - Age: Age-standardized (Percent),Deaths - Self-harm - Sex: Both - Age: All Ages (Percent),Deaths - Mental and substance use disorders - Sex: Both - Age: Age-standardized (Rate),gdp_current,gdp_per_capita,health expenditure (per capita),health expenditure (% GDP),unemployment rate,Income group,Income,Suicide rate
0,Afghanistan,1990,16.659229,0.381465,0.144067,,,,,,Low income,,
1,Afghanistan,1991,16.765052,0.388646,0.140653,,,,,10.649,Low income,,
2,Afghanistan,1992,16.874469,0.409441,0.135279,,,,,10.821,Low income,,
3,Afghanistan,1993,16.99072,0.41803,0.133047,,,,,10.723,Low income,,
4,Afghanistan,1994,17.11273,0.409779,0.130073,,,,,10.726,Low income,,


In [344]:
second_merge.to_csv("second_merge_data.csv", index=False)

## Merge primary care expenditure, disability

In [349]:
second_merge.head()

Unnamed: 0,Entity,Year,Prevalence - Mental disorders - Sex: Both - Age: Age-standardized (Percent),Deaths - Self-harm - Sex: Both - Age: All Ages (Percent),Deaths - Mental and substance use disorders - Sex: Both - Age: Age-standardized (Rate),gdp_current,gdp_per_capita,health expenditure (per capita),health expenditure (% GDP),unemployment rate,Income group,Income,Suicide rate
0,Afghanistan,1990,16.659229,0.381465,0.144067,,,,,,Low income,,
1,Afghanistan,1991,16.765052,0.388646,0.140653,,,,,10.649,Low income,,
2,Afghanistan,1992,16.874469,0.409441,0.135279,,,,,10.821,Low income,,
3,Afghanistan,1993,16.99072,0.41803,0.133047,,,,,10.723,Low income,,
4,Afghanistan,1994,17.11273,0.409779,0.130073,,,,,10.726,Low income,,


In [357]:
disability = pd.read_csv("/content/raw_data/mental-and-substance-use-as-share-of-disease.csv")
disability = disability.drop(columns=['Code'], axis=1)
disability.head()

Unnamed: 0,Entity,Year,DALYs (Disability-Adjusted Life Years) - Mental disorders - Sex: Both - Age: All Ages (Percent)
0,Afghanistan,1990,1.69667
1,Afghanistan,1991,1.734281
2,Afghanistan,1992,1.791189
3,Afghanistan,1993,1.776779
4,Afghanistan,1994,1.712986


In [358]:
disability.dtypes

Entity                                                                                              object
Year                                                                                                 int64
DALYs (Disability-Adjusted Life Years) - Mental disorders - Sex: Both - Age: All Ages (Percent)    float64
dtype: object

In [361]:
data_frames = [second_merge, disability]
left_on = ['Entity', 'Year']
right_on = ['Entity', 'Year']
how = 'left'

In [362]:
final_merge = merge_data(data_frames, left_on, right_on, how)

In [365]:
final_merge.columns

Index(['Entity', 'Year',
       'Prevalence - Mental disorders - Sex: Both - Age: Age-standardized (Percent)',
       'Deaths - Self-harm - Sex: Both - Age: All Ages (Percent)',
       'Deaths - Mental and substance use disorders - Sex: Both - Age: Age-standardized (Rate)',
       'gdp_current', 'gdp_per_capita', 'health expenditure (per capita)',
       'health expenditure (% GDP)', 'unemployment rate', 'Income group',
       'Income', 'Suicide rate',
       'DALYs (Disability-Adjusted Life Years) - Mental disorders - Sex: Both - Age: All Ages (Percent)'],
      dtype='object')

In [366]:
# reorder columns
cols_order = ['Entity', 'Year',
       'Prevalence - Mental disorders - Sex: Both - Age: Age-standardized (Percent)',
       'Deaths - Self-harm - Sex: Both - Age: All Ages (Percent)',
       'Deaths - Mental and substance use disorders - Sex: Both - Age: Age-standardized (Rate)',
       'gdp_current', 'gdp_per_capita', 'health expenditure (per capita)',
       'health expenditure (% GDP)', 'unemployment rate', 'Income group',
       'Income',
       'DALYs (Disability-Adjusted Life Years) - Mental disorders - Sex: Both - Age: All Ages (Percent)',
       'Suicide rate']

In [367]:
final_merge = final_merge[cols_order]

In [368]:
final_merge

Unnamed: 0,Entity,Year,Prevalence - Mental disorders - Sex: Both - Age: Age-standardized (Percent),Deaths - Self-harm - Sex: Both - Age: All Ages (Percent),Deaths - Mental and substance use disorders - Sex: Both - Age: Age-standardized (Rate),gdp_current,gdp_per_capita,health expenditure (per capita),health expenditure (% GDP),unemployment rate,Income group,Income,DALYs (Disability-Adjusted Life Years) - Mental disorders - Sex: Both - Age: All Ages (Percent),Suicide rate
0,Afghanistan,1990,16.659229,0.381465,0.144067,,,,,,Low income,,1.69667,
1,Afghanistan,1991,16.765052,0.388646,0.140653,,,,,10.649,Low income,,1.734281,
2,Afghanistan,1992,16.874469,0.409441,0.135279,,,,,10.821,Low income,,1.791189,
3,Afghanistan,1993,16.990720,0.41803,0.133047,,,,,10.723,Low income,,1.776779,
4,Afghanistan,1994,17.112730,0.409779,0.130073,,,,,10.726,Low income,,1.712986,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6895,Zimbabwe,2015,11.156429,1.676438,1.413464,19963120600.0,1445.069702,1445.069702,7.452066,4.778,Lower middle income,1280.0,2.193166,30.7 [16.6-48.3]
6896,Zimbabwe,2016,11.164133,1.747906,1.445058,20548678100.0,1464.588957,1464.588957,7.675163,4.788,Lower middle income,1290.0,2.279813,28.7 [15.4-45.3]
6897,Zimbabwe,2017,11.170427,1.804474,1.47002,17584890937.0,1235.189032,1235.189032,7.469752,4.785,Lower middle income,1390.0,2.364265,25.9 [13.8-40.9]
6898,Zimbabwe,2018,11.158765,1.87043,,18115543791.0,1254.642265,1254.642265,8.680062,4.796,Lower middle income,1410.0,2.472949,23.9 [12.7-37.8]


In [370]:
final_merge.to_csv("./final_merge_data.csv")