In [1]:
%matplotlib inline
import warnings
warnings.simplefilter('ignore')
import pandas as pd
import matplotlib.pyplot as plt

I imported my disorders dataset and did some initial investigations.

In [2]:
disorders_df = pd.read_csv('../data/Raw/mental-and-substance-disorders.csv')
disorders_df.shape

(6468, 4)

In [3]:
disorders_df.head()

Unnamed: 0,Country,Code,Year,Prevalence - Mental and substance use disorders - Sex: Both - Age: Age-standardized (Percent) (%)
0,Afghanistan,AFG,1990,17.553463
1,Afghanistan,AFG,1991,17.837032
2,Afghanistan,AFG,1992,18.092542
3,Afghanistan,AFG,1993,18.294931
4,Afghanistan,AFG,1994,18.428908


I renamed a column to make it easier to manipulate.

In [4]:
d2_df = disorders_df.rename(columns={'Prevalence - Mental and substance use disorders - Sex: Both - Age: Age-standardized (Percent) (%)':'Disorders(%)'})

I then imported the continents dataset and merged it with the disorders dataset.

In [5]:
continents = pd.read_csv('../data/Raw/countries_continents2.csv')
continents.shape

(249, 4)

In [6]:
merged_df = d2_df.merge(continents, left_on='Country', right_on='Country')

I then ran some investigations on this merged dataset, and changed some column names to make them easier to manipulate.

In [7]:
merged_df.shape

(5348, 7)

In [8]:
merged_df.sample(12)

Unnamed: 0.1,Country,Code,Year,Disorders(%),Unnamed: 0,Region,Continent
1969,Guinea,GIN,1999,11.587689,97,Western Africa,Africa
5120,United States Virgin Islands,VIR,2014,13.829884,238,Caribbean,North America
219,Argentina,ARG,2013,15.709712,10,South America,South America
4019,Saint Vincent and the Grenadines,VCT,2005,13.355522,191,Caribbean,North America
2454,Kazakhstan,KAZ,2008,11.275321,118,Central Asia,Asia
2852,Malawi,MWI,2014,12.224887,133,Eastern Africa,Africa
2249,Iraq,IRQ,1999,14.0439,109,Western Asia,Asia
104,American Samoa,ASM,2010,10.875523,4,Polynesia,Oceania
797,Burkina Faso,BFA,2003,11.923754,36,Western Africa,Africa
2087,Honduras,HND,2005,10.641833,103,Central America,North America


In [9]:
merged_df.rename(columns={'Country_x':'Continent',
                         'Prevalence - Mental and substance use disorders - Sex: Both - Age: Age-standardized (Percent) (%)':'Percent_of_Population',
                         'Country_y':'Country'})

Unnamed: 0.1,Country,Code,Year,Disorders(%),Unnamed: 0,Region,Continent
0,Afghanistan,AFG,1990,17.553463,0,Southern Asia,Asia
1,Afghanistan,AFG,1991,17.837032,0,Southern Asia,Asia
2,Afghanistan,AFG,1992,18.092542,0,Southern Asia,Asia
3,Afghanistan,AFG,1993,18.294931,0,Southern Asia,Asia
4,Afghanistan,AFG,1994,18.428908,0,Southern Asia,Asia
5,Afghanistan,AFG,1995,18.483440,0,Southern Asia,Asia
6,Afghanistan,AFG,1996,18.481194,0,Southern Asia,Asia
7,Afghanistan,AFG,1997,18.463648,0,Southern Asia,Asia
8,Afghanistan,AFG,1998,18.439805,0,Southern Asia,Asia
9,Afghanistan,AFG,1999,18.402616,0,Southern Asia,Asia


In [10]:
merged_df['Year'].nunique()

28

There are data from 28 years in this dataset.

In [11]:
merged_df['Year'].unique()
#Each year between 1990 and 2017 is accounted for

array([1990, 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000,
       2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011,
       2012, 2013, 2014, 2015, 2016, 2017])

In [12]:
merged_df['Continent'].nunique()

6

In [13]:
merged_df.sample(12)

Unnamed: 0.1,Country,Code,Year,Disorders(%),Unnamed: 0,Region,Continent
1197,Cote d'Ivoire,CIV,2011,11.395123,56,Western Africa,Africa
2395,Japan,JPN,2005,12.037519,115,Eastern Asia,Asia
1797,Ghana,GHA,1995,11.526194,88,Western Africa,Africa
3338,Netherlands,NLD,1996,16.591215,155,Western Europe,Europe
4324,Solomon Islands,SLB,2002,11.462637,205,Melanesia,Oceania
1576,Ethiopia,ETH,1998,13.585206,75,Eastern Africa,Africa
4551,Suriname,SUR,2005,14.301534,214,South America,South America
2539,Kuwait,KWT,2009,13.965331,121,Western Asia,Asia
3731,Peru,PER,1997,12.136136,172,South America,South America
3197,Morocco,MAR,1995,16.46001,149,Northern Africa,Africa


I then imported the long population dataset I created in (data_analysis/Anxiety Investigation) and merged it with the merged dataset.

In [14]:
pop_df = pd.read_csv('../data/Cleaned/world_pop_long')
pop_df.head()

Unnamed: 0,code,Location,Average,variable,value
0,900,World,6472563.0,1990,5327231.0
1,901,More developed regions,1207273.0,1990,1145508.0
2,902,Less developed regions,5265290.0,1990,4181723.0
3,941,Least developed countries,737016.9,1990,506276.0
4,934,"Less developed regions, excluding least develo...",4528273.0,1990,3675448.0


In [15]:
d3_df = merged_df.merge(pop_df, left_on=['Year', 'Country'], right_on=['variable', 'Location'])

In [16]:
d3_df.head()

Unnamed: 0.1,Country,Code,Year,Disorders(%),Unnamed: 0,Region,Continent,code,Location,Average,variable,value
0,Afghanistan,AFG,1990,17.553463,0,Southern Asia,Asia,4,Afghanistan,24737.62069,1990,12412.0
1,Afghanistan,AFG,1991,17.837032,0,Southern Asia,Asia,4,Afghanistan,24737.62069,1991,13299.0
2,Afghanistan,AFG,1992,18.092542,0,Southern Asia,Asia,4,Afghanistan,24737.62069,1992,14486.0
3,Afghanistan,AFG,1993,18.294931,0,Southern Asia,Asia,4,Afghanistan,24737.62069,1993,15817.0
4,Afghanistan,AFG,1994,18.428908,0,Southern Asia,Asia,4,Afghanistan,24737.62069,1994,17076.0


I added a new variable that reflected the actual population, since it was originally only represented per 1000.

In [17]:
d3_df['Population']=d3_df['value']*1000

I created another variable that represents the number of individuals in the population afficted with disorders.

In [18]:
d3_df['Disorders_population']=d3_df['Disorders(%)']/100 * d3_df['Population']

In [19]:
d3_df.head()

Unnamed: 0.1,Country,Code,Year,Disorders(%),Unnamed: 0,Region,Continent,code,Location,Average,variable,value,Population,Disorders_population
0,Afghanistan,AFG,1990,17.553463,0,Southern Asia,Asia,4,Afghanistan,24737.62069,1990,12412.0,12412000.0,2178736.0
1,Afghanistan,AFG,1991,17.837032,0,Southern Asia,Asia,4,Afghanistan,24737.62069,1991,13299.0,13299000.0,2372147.0
2,Afghanistan,AFG,1992,18.092542,0,Southern Asia,Asia,4,Afghanistan,24737.62069,1992,14486.0,14486000.0,2620886.0
3,Afghanistan,AFG,1993,18.294931,0,Southern Asia,Asia,4,Afghanistan,24737.62069,1993,15817.0,15817000.0,2893709.0
4,Afghanistan,AFG,1994,18.428908,0,Southern Asia,Asia,4,Afghanistan,24737.62069,1994,17076.0,17076000.0,3146920.0


I then dropped repeated or irrelevant columns.

In [20]:
d4_df = d3_df.drop(['Location','code', 'variable', 'value'], axis=1)

In [21]:
d4_df.head()

Unnamed: 0.1,Country,Code,Year,Disorders(%),Unnamed: 0,Region,Continent,Average,Population,Disorders_population
0,Afghanistan,AFG,1990,17.553463,0,Southern Asia,Asia,24737.62069,12412000.0,2178736.0
1,Afghanistan,AFG,1991,17.837032,0,Southern Asia,Asia,24737.62069,13299000.0,2372147.0
2,Afghanistan,AFG,1992,18.092542,0,Southern Asia,Asia,24737.62069,14486000.0,2620886.0
3,Afghanistan,AFG,1993,18.294931,0,Southern Asia,Asia,24737.62069,15817000.0,2893709.0
4,Afghanistan,AFG,1994,18.428908,0,Southern Asia,Asia,24737.62069,17076000.0,3146920.0


I renamed a column.

In [22]:
d5_df = d4_df.rename(columns={'Average':'Average_pop'})
d5_df.head()

Unnamed: 0.1,Country,Code,Year,Disorders(%),Unnamed: 0,Region,Continent,Average_pop,Population,Disorders_population
0,Afghanistan,AFG,1990,17.553463,0,Southern Asia,Asia,24737.62069,12412000.0,2178736.0
1,Afghanistan,AFG,1991,17.837032,0,Southern Asia,Asia,24737.62069,13299000.0,2372147.0
2,Afghanistan,AFG,1992,18.092542,0,Southern Asia,Asia,24737.62069,14486000.0,2620886.0
3,Afghanistan,AFG,1993,18.294931,0,Southern Asia,Asia,24737.62069,15817000.0,2893709.0
4,Afghanistan,AFG,1994,18.428908,0,Southern Asia,Asia,24737.62069,17076000.0,3146920.0


I finally imported the clean dataset for further use.

In [23]:
d5_df.to_csv('../data/Cleaned/disorders_clean', index=False)