#import libraries

In [346]:
import pandas as pd 
import numpy as np
from scipy import stats #for outlier detection based on standard deviations
import matplotlib.pyplot as plt #likley won't be used much as i'm experimenting with plotly 
import plotly.graph_objects as go #you will be learning how go and px work with me! 
import plotly.express as px 
import seaborn as sns

#Import data, describe,preview; and check nulls

In [347]:
# Load the WHO dataset. The CSV is read relative to the notebook's working
# directory (originally Users\GDA\Portfolio\WHO_Stats).
DATA_FILE = 'WHO_MHExp_and_Deaths.csv'
df = pd.read_csv(DATA_FILE)
df.shape  # (number of rows, number of columns)


(531, 9)

In [348]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
summary_stats = df.describe()
summary_stats

Unnamed: 0,Year,Population,Deaths_All_Types,Deaths_Suicides,HExp_Pctage_Y,MHExp_Pctage_2011,Dep_Num_2015,Suicide_p100
count,531.0,531.0,531.0,531.0,531.0,531.0,531.0,531.0
mean,2009.694915,25351480.0,432276.3,259.659134,7.555631,5.731205,4.890603,10.832768
std,5.572266,32854630.0,585679.9,1075.051391,2.103642,2.95233,0.605882,5.491425
min,2000.0,281154.0,836.0,0.0,1.99,0.39,2.9,0.82
25%,2005.0,3553031.0,57336.0,0.0,6.1,3.86,4.57,6.415
50%,2010.0,9696110.0,184150.0,2.0,7.9,5.08,5.06,10.24
75%,2014.0,38115940.0,741164.0,42.5,9.07,7.64,5.19,14.495
max,2019.0,210147100.0,2762186.0,20031.0,13.68,12.91,5.94,29.63


In [349]:
# Preview the first five rows to sanity-check column names and value formats.
df.head(n=5)

Unnamed: 0,Country_Name,Year,Population,Deaths_All_Types,Deaths_Suicides,HExp_Pctage_Y,MHExp_Pctage_2011,Dep_Num_2015,Suicide_p100
0,Armenia,2006,3219235.0,54404,78,5.85,3.38,4.99,5.36
1,Armenia,2007,3107395.5,53660,0,5.5,3.38,4.99,5.02
2,Armenia,2008,3234031.0,54824,63,6.64,3.38,4.99,4.23
3,Armenia,2009,3066044.0,55120,53,8.44,3.38,4.99,4.06
4,Armenia,2012,3024127.0,55200,79,9.13,3.38,4.99,6.39


In [350]:
# Share of missing values per column, expressed as a fraction between 0 and 1
# (the mean of the boolean null mask equals null count / row count).
df.isna().mean()

Country_Name         0.0
Year                 0.0
Population           0.0
Deaths_All_Types     0.0
Deaths_Suicides      0.0
HExp_Pctage_Y        0.0
MHExp_Pctage_2011    0.0
Dep_Num_2015         0.0
Suicide_p100         0.0
dtype: float64

#Plotly and Graphs

In [413]:
# Compare pairs of indicators country by country. Every figure overlays two
# semi-transparent bar traces, each showing the per-country average of one
# indicator. The four figures share the same construction, so it lives in a
# single helper instead of being repeated.

def overlay_country_averages(data, first, second, title):
    """Build an overlaid bar figure of two indicators averaged per country.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding a 'Country_Name' column and both indicator columns.
    first, second : tuple of (str, str)
        (column name, legend label) for each of the two traces.
    title : str
        Text shown as the figure title.

    Returns
    -------
    plotly.graph_objects.Figure
        The figure, not yet displayed.
    """
    figure = go.Figure()
    for column, label in (first, second):
        # histfunc='avg' aggregates the y values sharing the same country
        # into their average, giving one bar per country.
        figure.add_trace(go.Histogram(x=data['Country_Name'], y=data[column],
                                      histfunc='avg', name=label))
    figure.update_layout(barmode='overlay', title=title,
                         xaxis_title='Country', yaxis_title='Average per country')
    figure.update_traces(opacity=0.75)
    return figure


# (column, legend label) pairs for the four indicators being compared.
MH_EXP = ('MHExp_Pctage_2011', 'MHExp_2011')
DEPRESSION = ('Dep_Num_2015', 'Dep_Prev_2015')
HEALTH_EXP = ('HExp_Pctage_Y', 'HExp_Avg_Pctage')
SUICIDE_RATE = ('Suicide_p100', 'Suicide_Avg_rate')

# Mental health (MH) expenditure according to the 2011 index against the
# depression estimate of 2015. The frame is sorted first, presumably so the
# countries are laid out by MH expenditure on the x-axis.
df = df.sort_values('MHExp_Pctage_2011', ascending=True)
fig = overlay_country_averages(
    df, MH_EXP, DEPRESSION,
    'Mental health expenditure (2011) vs. depression estimate (2015)')
fig.show()

# Relation between discrete and continuous variables:
# MHExp_2011 and HExp_Pctage | Dep_Num_2015 and suicide rate per 100,000 population.
fig2 = overlay_country_averages(
    df, MH_EXP, HEALTH_EXP,
    'Mental health expenditure (2011) vs. health expenditure')
fig2.show()

# Re-sort by the depression estimate for the remaining two figures.
df = df.sort_values('Dep_Num_2015', ascending=True)
fig3 = overlay_country_averages(
    df, DEPRESSION, SUICIDE_RATE,
    'Depression estimate (2015) vs. suicide rate per 100,000')
fig3.show()

fig4 = overlay_country_averages(
    df, SUICIDE_RATE, HEALTH_EXP,
    'Suicide rate per 100,000 vs. health expenditure')
fig4.show()

## There is no clear relation between the two discrete indexes: the 2015 depression prevalence looks stable regardless of the MH expenditure in 2011.
# There is also no clear relation between each discrete index and its continuous counterpart,
# but we do find a relation between Health Expenditure and Suicide rates.

In [414]:
# Heatmap of the two continuous indexes: Health Expenditure percentage on the
# x-axis against the suicide rate (standardized per 100,000 population) on the y-axis.
fig5 = px.density_heatmap(data_frame=df, x='HExp_Pctage_Y', y='Suicide_p100')
fig5.show()

In [520]:
# Next we want to check the MHExp index against deaths by ICD codes (a broad
# definition of deaths by self-harm). Before that, we verify that the deaths
# by ICD codes behave like the Suicide Rate reported by WHO.

# Standardize the ICD-10 count of deaths by intentional self-harm to a rate
# per 100,000 population. Plain column arithmetic is used instead of a
# row-wise apply: it computes the same values without a Python call per row.
df['Deaths_Suicides_p100'] = df['Deaths_Suicides'] * 100000 / df['Population']

# Compare it with the WHO suicide rate, keeping only rows with at least
# 1 death per 100,000. NOTE: this drops the rows with 0 ICD deaths and also
# any row whose rate lies between 0 and 1.
df2 = df.loc[df['Deaths_Suicides_p100'] >= 1]

fig6 = px.density_heatmap(df2, x='Deaths_Suicides_p100', y='Suicide_p100')
fig6.show()

# 2x2 correlation matrix; the off-diagonal entry is the Pearson coefficient
# between the ICD-10 based rate and the WHO suicide rate.
r = np.corrcoef(df2['Deaths_Suicides_p100'], df2['Suicide_p100'])
r


array([[1.        , 0.96612906],
       [0.96612906, 1.        ]])

In [476]:
# We see a correlation between our self-constructed ICD-10 index and the WHO Suicide Rate.

# Next: ICD-10 deaths by self-harm per 100,000 population (y-axis) against
# Health Expenditure (x-axis), on the filtered frame df2.
fig7 = px.density_heatmap(data_frame=df2, x='HExp_Pctage_Y', y='Deaths_Suicides_p100')
fig7.show()

In [480]:
# Condense the relation into single numbers: correlation matrices between the
# ICD-10 suicide rate and each expenditure index, using the original dataframe.
icd_suicide_rate = df['Deaths_Suicides_p100']
r = np.corrcoef(icd_suicide_rate, df['HExp_Pctage_Y'])       # vs. Health Expenditure
r2 = np.corrcoef(icd_suicide_rate, df['MHExp_Pctage_2011'])  # vs. Mental Health Expenditure
(r, r2)

(array([[ 1.       , -0.0935651],
        [-0.0935651,  1.       ]]),
 array([[ 1.       , -0.0407515],
        [-0.0407515,  1.       ]]))

In [419]:
#Neither correlation is strong, but both are negative: lower Health Expenditure and lower Mental Health Expenditure are associated with higher rates of death by suicide (r = -0.09 and r = -0.04, respectively)

In [481]:
# Let's see the relation in a graph: one marker per row of df, pairing that
# row's ICD-10 suicide rate with its Health Expenditure value.
# Both axes take the columns directly so x and y stay row-aligned and have the
# same length (value_counts().index would yield only the unique rates, ordered
# by frequency, and would no longer match the y values).
figr = go.Figure(go.Scattergl(x=df['Deaths_Suicides_p100'],
                              y=df['HExp_Pctage_Y'],
                              mode='markers'))
figr.update_layout(title="Level of deaths by suicide in comparison to Health Expenditure",
                   xaxis_title='Deaths_Suicides_p100',
                   yaxis_title='HExp_Pctage_Y')
figr.show()

In [599]:
# Simple linear regression between both indexes.
import statsmodels.api as sm

# Expand the Country_Name and Year columns into indicator (dummy) columns.
model_dummies = pd.get_dummies(data=df, columns=['Country_Name', 'Year'])

# Response variable of the model: the ICD-10 suicide rate per 100,000 population.
Y = model_dummies['Deaths_Suicides_p100']

def inums(indexes, dataframe):
    """Return the columns of `dataframe` whose labels are listed in `indexes`.

    Parameters
    ----------
    indexes : iterable
        Column labels to select, in the desired output order. Labels that are
        not columns of `dataframe` are skipped silently.
    dataframe : pandas.DataFrame
        Frame to select the columns from.

    Returns
    -------
    pandas.DataFrame
        All rows of `dataframe`, restricted to the requested columns that exist.
    """
    available = dataframe.columns
    # Membership test on the column index replaces the nested loop over every
    # (requested label, column) pair; each requested label is kept once, so a
    # duplicated column label is not selected several times over.
    selected = [label for label in indexes if label in available]
    return dataframe.loc[:, selected]
        
# Explanatory variables of the model. Only Health Expenditure is used:
# Population and Deaths_All_Types are left out because, per the author's note,
# they have multicollinearity with deaths by suicide.
predictor_names = ['HExp_Pctage_Y']
questions = inums(predictor_names, model_dummies)

# Prepend a constant column so the regression estimates an intercept.
X = sm.add_constant(questions)

# Fit ordinary least squares of Y on X and display the summary tables.
model = sm.OLS(endog=Y, exog=X)
results = model.fit()
results.summary()

0,1,2,3
Dep. Variable:,Deaths_Suicides_p100,R-squared:,0.009
Model:,OLS,Adj. R-squared:,0.007
Method:,Least Squares,F-statistic:,4.672
Date:,"Fri, 02 Sep 2022",Prob (F-statistic):,0.0311
Time:,18:01:05,Log-Likelihood:,-1800.8
No. Observations:,531,AIC:,3606.0
Df Residuals:,529,BIC:,3614.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,5.7358,1.166,4.919,0.000,3.445,8.027
HExp_Pctage_Y,-0.3214,0.149,-2.161,0.031,-0.613,-0.029

0,1,2,3
Omnibus:,225.203,Durbin-Watson:,0.789
Prob(Omnibus):,0.0,Jarque-Bera (JB):,664.067
Skew:,2.148,Prob(JB):,6.3100000000000004e-145
Kurtosis:,6.4,Cond. No.,29.7
