In [1]:
%matplotlib inline
import warnings
warnings.simplefilter('ignore')
import pandas as pd
import matplotlib.pyplot as plt

I began my analysis of the depression dataset by importing it and running some basic investigations.

In [2]:
depression_df = pd.read_csv('../data/Raw/with-depression.csv')
depression_df.shape

(6468, 4)

In [3]:
depression_df.sample()

Unnamed: 0,Entity,Code,Year,Prevalence - Depressive disorders - Sex: Both - Age: Age-standardized (Percent) (%)
3658,Mauritius,MUS,2008,3.576573


In [4]:
depression_df.columns

Index(['Entity', 'Code', 'Year',
       'Prevalence - Depressive disorders - Sex: Both - Age: Age-standardized (Percent) (%)'],
      dtype='object')

I renamed a column to make it easier to manipulate.

In [5]:
depression_df = depression_df.rename(columns={'Prevalence - Depressive disorders - Sex: Both - Age: Age-standardized (Percent) (%)':'Depression_percent'})

I continued my basic investigations.

In [6]:
depression_df.head()

Unnamed: 0,Entity,Code,Year,Depression_percent
0,Afghanistan,AFG,1990,4.071831
1,Afghanistan,AFG,1991,4.079531
2,Afghanistan,AFG,1992,4.088358
3,Afghanistan,AFG,1993,4.09619
4,Afghanistan,AFG,1994,4.099582


In [7]:
depression_df['Year'].unique()

array([1990, 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000,
       2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011,
       2012, 2013, 2014, 2015, 2016, 2017])

In [8]:
depression_df['Entity'].nunique()

231

I imported my continents dataset.

In [9]:
continents_df = pd.read_csv('../data/Raw/countries_continents2.csv')

Then I merged it with the depression dataset. 191 of the 231 entities in the original depression dataset remain in the merged one. However, since this is close to the number of countries in the world, I am satisfied. I care less about the territories listed in the depression dataset.

In [10]:
merged_df = depression_df.merge(continents_df, left_on='Entity', right_on='Country')
merged_df['Entity'].nunique()

191

In [11]:
merged_df['Continent'].unique()

array(['Asia', 'Europe', 'Africa', 'Oceania', 'North America',
       'South America'], dtype=object)

I then pulled the mean, max, and min values of depression percent by continent.

In [12]:
merged_df.groupby('Continent')['Depression_percent'].agg(['mean', 'min','max'])

Unnamed: 0_level_0,mean,min,max
Continent,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Africa,3.771239,2.885115,5.744704
Asia,3.376109,2.233269,5.116345
Europe,3.585069,2.139903,5.512826
North America,3.130038,2.535758,6.602754
Oceania,3.321411,2.91334,4.929038
South America,3.27859,2.194091,4.401565


In [13]:
import seaborn as sns
import plotly_express as px
import math

In [14]:
pop_df = pd.read_csv('../data/Cleaned/world_pop_long')
pop_df['Location'].unique()

array(['World', 'More developed regions', 'Less developed regions',
       'Least developed countries',
       'Less developed regions, excluding least developed countries',
       'Less developed regions, excluding China',
       'Land-locked Developing Countries (LLDC)',
       'Small Island Developing States (SIDS)', 'High-income countries',
       'Middle-income countries', 'Upper-middle-income countries',
       'Lower-middle-income countries', 'Low-income countries',
       'No income group available', 'Africa', 'Asia', 'Europe',
       'Latin America and the Caribbean', 'Northern America', 'Oceania',
       'Sub-Saharan Africa', 'Eastern Africa', 'Burundi', 'Comoros',
       'Djibouti', 'Eritrea', 'Ethiopia', 'Kenya', 'Madagascar', 'Malawi',
       'Mauritius', 'Mayotte', 'Mozambique', 'Réunion', 'Rwanda',
       'Seychelles', 'Somalia', 'South Sudan', 'Uganda', 'Tanzania',
       'Zambia', 'Zimbabwe', ' Middle Africa', 'Angola', 'Cameroon',
       'Central African Republic', 'C

After importing the long population dataset I developed in (data_analysis_Anxiety Investigation), I merged it with my depression dataset.

In [15]:
dp_df = merged_df.merge(pop_df, left_on=['Year', 'Entity'], right_on=['variable', 'Location'])
dp_df.shape

(5320, 13)

I successfully merged 189 countries into this new dataset.

In [16]:
dp_df['Entity'].nunique()

189

I then created a new variable that represented each country's actual population, since the original value was by thousand.

In [17]:
dp_df['Population']=dp_df['value']*1000

In [18]:
dp_df.head()

Unnamed: 0.1,Entity,Code,Year,Depression_percent,Unnamed: 0,Country,Region,Continent,code,Location,Average,variable,value,Population
0,Afghanistan,AFG,1990,4.071831,0,Afghanistan,Southern Asia,Asia,4,Afghanistan,24737.62069,1990,12412.0,12412000.0
1,Afghanistan,AFG,1991,4.079531,0,Afghanistan,Southern Asia,Asia,4,Afghanistan,24737.62069,1991,13299.0,13299000.0
2,Afghanistan,AFG,1992,4.088358,0,Afghanistan,Southern Asia,Asia,4,Afghanistan,24737.62069,1992,14486.0,14486000.0
3,Afghanistan,AFG,1993,4.09619,0,Afghanistan,Southern Asia,Asia,4,Afghanistan,24737.62069,1993,15817.0,15817000.0
4,Afghanistan,AFG,1994,4.099582,0,Afghanistan,Southern Asia,Asia,4,Afghanistan,24737.62069,1994,17076.0,17076000.0


Next, I created a variable to figure out the number of individuals with depression for each datapoint.

In [19]:
dp_df['Depression_population']=dp_df['Depression_percent']/100 * dp_df['Population']
dp_df.head()

Unnamed: 0.1,Entity,Code,Year,Depression_percent,Unnamed: 0,Country,Region,Continent,code,Location,Average,variable,value,Population,Depression_population
0,Afghanistan,AFG,1990,4.071831,0,Afghanistan,Southern Asia,Asia,4,Afghanistan,24737.62069,1990,12412.0,12412000.0,505395.685697
1,Afghanistan,AFG,1991,4.079531,0,Afghanistan,Southern Asia,Asia,4,Afghanistan,24737.62069,1991,13299.0,13299000.0,542536.819099
2,Afghanistan,AFG,1992,4.088358,0,Afghanistan,Southern Asia,Asia,4,Afghanistan,24737.62069,1992,14486.0,14486000.0,592239.575077
3,Afghanistan,AFG,1993,4.09619,0,Afghanistan,Southern Asia,Asia,4,Afghanistan,24737.62069,1993,15817.0,15817000.0,647894.356027
4,Afghanistan,AFG,1994,4.099582,0,Afghanistan,Southern Asia,Asia,4,Afghanistan,24737.62069,1994,17076.0,17076000.0,700044.550287


I dropped the replicated variables in the data.

In [20]:
dp3_df = dp_df.drop(['Country', 'Location','value', 'variable', 'code'], axis=1)

I renamed the average column to 'Average Pop' to make it more clear that it represented the average population of a country between 1990 and 2017.

In [21]:
dp3_df.rename(columns={'Average':'Average_pop'})

Unnamed: 0.1,Entity,Code,Year,Depression_percent,Unnamed: 0,Region,Continent,Average_pop,Population,Depression_population
0,Afghanistan,AFG,1990,4.071831,0,Southern Asia,Asia,24737.620690,12412000.0,5.053957e+05
1,Afghanistan,AFG,1991,4.079531,0,Southern Asia,Asia,24737.620690,13299000.0,5.425368e+05
2,Afghanistan,AFG,1992,4.088358,0,Southern Asia,Asia,24737.620690,14486000.0,5.922396e+05
3,Afghanistan,AFG,1993,4.096190,0,Southern Asia,Asia,24737.620690,15817000.0,6.478944e+05
4,Afghanistan,AFG,1994,4.099582,0,Southern Asia,Asia,24737.620690,17076000.0,7.000446e+05
5,Afghanistan,AFG,1995,4.104207,0,Southern Asia,Asia,24737.620690,18111000.0,7.433130e+05
6,Afghanistan,AFG,1996,4.107500,0,Southern Asia,Asia,24737.620690,18853000.0,7.743871e+05
7,Afghanistan,AFG,1997,4.110834,0,Southern Asia,Asia,24737.620690,19357000.0,7.957341e+05
8,Afghanistan,AFG,1998,4.114438,0,Southern Asia,Asia,24737.620690,19738000.0,8.121077e+05
9,Afghanistan,AFG,1999,4.117633,0,Southern Asia,Asia,24737.620690,20171000.0,8.305679e+05


I imported this cleaned depression dataset to a CSV.

In [22]:
dp3_df.to_csv('../data/Cleaned/depression_clean', index=False)