The purpose of this notebook is to merge together all of my datasets to manipulate and compare variables.

In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import plotly_express as px
import math
import geopandas as gpd
from shapely.geometry import Point
import json

In [2]:
anxiety_df = pd.read_csv('../data/Cleaned/anxiety_clean')
anxiety_df['Entity'].nunique()

189

In [3]:
countries_df = pd.read_csv('../data/Cleaned/countries.geojson')
countries_df.columns

Index(['Unnamed: 0', 'ADMIN', 'ISO_A3', 'ISO_A2', 'geometry'], dtype='object')

In [4]:
depression_df = pd.read_csv('../data/Cleaned/depression_clean')
depression_df['Entity'].nunique()

189

I merged together my anxiety and depression datasets.

In [5]:
ad_df = anxiety_df.merge(depression_df, left_on=['Entity', 'Year'], right_on=['Entity', 'Year'])         


In [6]:
anxiety_df.shape

(5320, 10)

In [7]:
depression_df.shape

(5320, 10)

In [8]:
ad_df.shape

(5376, 18)

No data was lost in the merge. I then dropped an irrelevant column.

In [9]:
ad_df.drop(['Average'], axis=1)

Unnamed: 0,Entity,Code_x,Year,Anxiety_percent,Unnamed: 0_x,Region_x,Continent_x,Average_pop,Population_x,Anxiety_population,Code_y,Depression_percent,Unnamed: 0_y,Region_y,Continent_y,Population_y,Depression_population
0,Afghanistan,AFG,1990,4.828830,0,Southern Asia,Asia,24737.620690,12412000.0,5.993543e+05,AFG,4.071831,0,Southern Asia,Asia,12412000.0,5.053957e+05
1,Afghanistan,AFG,1991,4.829740,0,Southern Asia,Asia,24737.620690,13299000.0,6.423072e+05,AFG,4.079531,0,Southern Asia,Asia,13299000.0,5.425368e+05
2,Afghanistan,AFG,1992,4.831108,0,Southern Asia,Asia,24737.620690,14486000.0,6.998344e+05,AFG,4.088358,0,Southern Asia,Asia,14486000.0,5.922396e+05
3,Afghanistan,AFG,1993,4.830864,0,Southern Asia,Asia,24737.620690,15817000.0,7.640977e+05,AFG,4.096190,0,Southern Asia,Asia,15817000.0,6.478944e+05
4,Afghanistan,AFG,1994,4.829423,0,Southern Asia,Asia,24737.620690,17076000.0,8.246723e+05,AFG,4.099582,0,Southern Asia,Asia,17076000.0,7.000446e+05
5,Afghanistan,AFG,1995,4.828337,0,Southern Asia,Asia,24737.620690,18111000.0,8.744602e+05,AFG,4.104207,0,Southern Asia,Asia,18111000.0,7.433130e+05
6,Afghanistan,AFG,1996,4.828083,0,Southern Asia,Asia,24737.620690,18853000.0,9.102385e+05,AFG,4.107500,0,Southern Asia,Asia,18853000.0,7.743871e+05
7,Afghanistan,AFG,1997,4.827726,0,Southern Asia,Asia,24737.620690,19357000.0,9.345029e+05,AFG,4.110834,0,Southern Asia,Asia,19357000.0,7.957341e+05
8,Afghanistan,AFG,1998,4.826971,0,Southern Asia,Asia,24737.620690,19738000.0,9.527476e+05,AFG,4.114438,0,Southern Asia,Asia,19738000.0,8.121077e+05
9,Afghanistan,AFG,1999,4.826413,0,Southern Asia,Asia,24737.620690,20171000.0,9.735357e+05,AFG,4.117633,0,Southern Asia,Asia,20171000.0,8.305679e+05


In [10]:
disorders_df = pd.read_csv('../data/Cleaned/disorders_clean')
disorders_df['Country'].nunique()

189

In [11]:
disorders_df.shape

(5320, 10)

I then merged the anxiety-depression dataset with the disorders dataset.

In [12]:
add_df = ad_df.merge(disorders_df, left_on=['Entity', 'Year'], right_on=['Country', 'Year'])

In [13]:
add_df['Entity'].nunique()

189

In [14]:
add_df.columns

Index(['Entity', 'Code_x', 'Year', 'Anxiety_percent', 'Unnamed: 0_x',
       'Region_x', 'Continent_x', 'Average_pop_x', 'Population_x',
       'Anxiety_population', 'Code_y', 'Depression_percent', 'Unnamed: 0_y',
       'Region_y', 'Continent_y', 'Average', 'Population_y',
       'Depression_population', 'Country', 'Code', 'Disorders(%)',
       'Unnamed: 0', 'Region', 'Continent', 'Average_pop_y', 'Population',
       'Disorders_population'],
      dtype='object')

I dropped irrelevant columns.

In [15]:
add2_df = add_df.drop(['Code_y', 'Region_y'], axis=1)

In [16]:
add_df.shape

(5488, 27)

In [17]:
add_df.head()

Unnamed: 0.1,Entity,Code_x,Year,Anxiety_percent,Unnamed: 0_x,Region_x,Continent_x,Average_pop_x,Population_x,Anxiety_population,...,Depression_population,Country,Code,Disorders(%),Unnamed: 0,Region,Continent,Average_pop_y,Population,Disorders_population
0,Afghanistan,AFG,1990,4.82883,0,Southern Asia,Asia,24737.62069,12412000.0,599354.342985,...,505395.685697,Afghanistan,AFG,17.553463,0,Southern Asia,Asia,24737.62069,12412000.0,2178736.0
1,Afghanistan,AFG,1991,4.82974,0,Southern Asia,Asia,24737.62069,13299000.0,642307.172127,...,542536.819099,Afghanistan,AFG,17.837032,0,Southern Asia,Asia,24737.62069,13299000.0,2372147.0
2,Afghanistan,AFG,1992,4.831108,0,Southern Asia,Asia,24737.62069,14486000.0,699834.357854,...,592239.575077,Afghanistan,AFG,18.092542,0,Southern Asia,Asia,24737.62069,14486000.0,2620886.0
3,Afghanistan,AFG,1993,4.830864,0,Southern Asia,Asia,24737.62069,15817000.0,764097.693301,...,647894.356027,Afghanistan,AFG,18.294931,0,Southern Asia,Asia,24737.62069,15817000.0,2893709.0
4,Afghanistan,AFG,1994,4.829423,0,Southern Asia,Asia,24737.62069,17076000.0,824672.301042,...,700044.550287,Afghanistan,AFG,18.428908,0,Southern Asia,Asia,24737.62069,17076000.0,3146920.0


In [18]:
add_df.shape

(5488, 27)

In [19]:
add2_df.drop(['Average'], axis=1)
add2_df.shape

(5488, 25)

In [20]:
add_df.head()

Unnamed: 0.1,Entity,Code_x,Year,Anxiety_percent,Unnamed: 0_x,Region_x,Continent_x,Average_pop_x,Population_x,Anxiety_population,...,Depression_population,Country,Code,Disorders(%),Unnamed: 0,Region,Continent,Average_pop_y,Population,Disorders_population
0,Afghanistan,AFG,1990,4.82883,0,Southern Asia,Asia,24737.62069,12412000.0,599354.342985,...,505395.685697,Afghanistan,AFG,17.553463,0,Southern Asia,Asia,24737.62069,12412000.0,2178736.0
1,Afghanistan,AFG,1991,4.82974,0,Southern Asia,Asia,24737.62069,13299000.0,642307.172127,...,542536.819099,Afghanistan,AFG,17.837032,0,Southern Asia,Asia,24737.62069,13299000.0,2372147.0
2,Afghanistan,AFG,1992,4.831108,0,Southern Asia,Asia,24737.62069,14486000.0,699834.357854,...,592239.575077,Afghanistan,AFG,18.092542,0,Southern Asia,Asia,24737.62069,14486000.0,2620886.0
3,Afghanistan,AFG,1993,4.830864,0,Southern Asia,Asia,24737.62069,15817000.0,764097.693301,...,647894.356027,Afghanistan,AFG,18.294931,0,Southern Asia,Asia,24737.62069,15817000.0,2893709.0
4,Afghanistan,AFG,1994,4.829423,0,Southern Asia,Asia,24737.62069,17076000.0,824672.301042,...,700044.550287,Afghanistan,AFG,18.428908,0,Southern Asia,Asia,24737.62069,17076000.0,3146920.0


I imported the suicide dataset and merged it with the add dataset.

In [21]:
suicide_df = pd.read_csv('../data/Cleaned/suicide-clean')
suicide_df.head()

Unnamed: 0,Country,Sex,Region,Continent,Year,Suicides_per100000,Average_pop,Population,#suicides
0,Afghanistan,Both sexes,Southern Asia,Asia,2016,4.7,24737.62069,35383000.0,1663.001
1,Albania,Both sexes,Southern Europe,Europe,2016,6.3,3055.275862,2886000.0,181.818
2,Algeria,Both sexes,Northern Africa,Africa,2016,3.2,33273.448276,40551000.0,1297.632
3,Angola,Both sexes,Middle Africa,Africa,2016,4.7,19765.931034,28842000.0,1355.574
4,Antigua and Barbuda,Both sexes,Caribbean,North America,2016,0.5,80.137931,95000.0,0.475


In [22]:
suicide_df.shape

(895, 9)

In [23]:
adds = add_df.merge(suicide_df, left_on=['Entity', 'Year', 'Region', 'Continent', 'Population'], right_on=['Country', 'Year', 'Region', 'Continent', 'Population'])

In [24]:
adds['Entity'].unique()

array(['Afghanistan', 'Albania', 'Algeria', 'Angola',
       'Antigua and Barbuda', 'Argentina', 'Armenia', 'Australia',
       'Austria', 'Azerbaijan', 'Bahamas', 'Bahrain', 'Bangladesh',
       'Barbados', 'Belarus', 'Belgium', 'Belize', 'Benin', 'Bhutan',
       'Bolivia', 'Bosnia and Herzegovina', 'Botswana', 'Brazil',
       'Brunei', 'Bulgaria', 'Burkina Faso', 'Burundi', 'Cambodia',
       'Cameroon', 'Canada', 'Cape Verde', 'Central African Republic',
       'Chad', 'Chile', 'China', 'Colombia', 'Comoros', 'Congo',
       'Costa Rica', 'Croatia', 'Cuba', 'Cyprus', 'Denmark', 'Djibouti',
       'Dominican Republic', 'Ecuador', 'Egypt', 'El Salvador',
       'Equatorial Guinea', 'Eritrea', 'Estonia', 'Ethiopia', 'Fiji',
       'Finland', 'France', 'Gabon', 'Gambia', 'Georgia', 'Germany',
       'Ghana', 'Greece', 'Grenada', 'Guatemala', 'Guinea',
       'Guinea-Bissau', 'Guyana', 'Haiti', 'Honduras', 'Hungary',
       'Iceland', 'India', 'Indonesia', 'Iran', 'Iraq', 'Ireland',
  

In [25]:
adds.shape

(925, 32)

In [26]:
adds.columns

Index(['Entity', 'Code_x', 'Year', 'Anxiety_percent', 'Unnamed: 0_x',
       'Region_x', 'Continent_x', 'Average_pop_x', 'Population_x',
       'Anxiety_population', 'Code_y', 'Depression_percent', 'Unnamed: 0_y',
       'Region_y', 'Continent_y', 'Average', 'Population_y',
       'Depression_population', 'Country_x', 'Code', 'Disorders(%)',
       'Unnamed: 0', 'Region', 'Continent', 'Average_pop_y', 'Population',
       'Disorders_population', 'Country_y', 'Sex', 'Suicides_per100000',
       'Average_pop', '#suicides'],
      dtype='object')

In [27]:
add_df['Entity'].nunique()

189

In [28]:
adds['Entity'].nunique()

178

Only 8 countries were lost in this large merge.

In [29]:
adds.shape

(925, 32)

I exported this combination of four datasets into a new CSV.

In [30]:
adds.to_csv('../data/Cleaned/suicideanxietydepressiondisorders', index=False)

In [31]:
adds['Entity'].nunique()

178

I then imported the resources dataset, the least geographically representative one and then I merged it with the previous large one to create the "megaset."

In [32]:
resources_df = pd.read_csv('../data/Cleaned/resources_clean')
resources_df['Country'].nunique()

151

In [33]:
the_megaset = adds.merge(resources_df, left_on=['Entity', 'Year'], right_on=['Country', 'Year'])

In [34]:
the_megaset['Entity'].nunique()

123

In [35]:
the_megaset.columns

Index(['Entity', 'Code_x', 'Year', 'Anxiety_percent', 'Unnamed: 0_x',
       'Region_x', 'Continent_x', 'Average_pop_x', 'Population_x',
       'Anxiety_population', 'Code_y', 'Depression_percent', 'Unnamed: 0_y',
       'Region_y', 'Continent_y', 'Average', 'Population_y',
       'Depression_population', 'Country_x', 'Code', 'Disorders(%)',
       'Unnamed: 0', 'Region_x', 'Continent_x', 'Average_pop_y',
       'Population_x', 'Disorders_population', 'Country_y', 'Sex',
       'Suicides_per100000', 'Average_pop', '#suicides', 'Country',
       'Psychiatrists_per100000', 'Nurses_per100000',
       'Social_workers_per100000', 'Psychologists_per100000', 'Region_y',
       'Continent_y', 'Population_y', '#psychiatrists', '#nurses',
       '#social_workers', '#psychologists', 'Total_resources'],
      dtype='object')

I then dropped irrelevant columns and renamed the relevant ones.

In [36]:
the_megaset2 = the_megaset.drop(['Average', 'Average_pop_y', 'Country_x', 'Country_y', 'Country', 'Unnamed: 0_y', 'Code_y', 'Unnamed: 0_x'], axis=1)

In [37]:
the_megaset3 = the_megaset2.rename(columns={'Average_pop_x':'Average_pop', 'Year_y':'Year', 'Continent_x':'Continent'})

In [38]:
the_megaset3.shape

(138, 37)

I merged this dataset with a GeoJson file that contained coordinated for each country, and dropped irrelevant columns.

In [39]:
the_megaset4 = the_megaset3.merge(countries_df, left_on=['Entity', 'Code'], right_on=['ADMIN', 'ISO_A3'])
the_megaset4.head()

Unnamed: 0,Entity,Code_x,Year,Anxiety_percent,Region_x,Continent,Average_pop,Population_x,Anxiety_population,Depression_percent,...,#psychiatrists,#nurses,#social_workers,#psychologists,Total_resources,Unnamed: 0_y,ADMIN,ISO_A3,ISO_A2,geometry
0,Afghanistan,AFG,2016,4.878875,Southern Asia,Asia,24737.62069,35383000.0,1726292.0,4.135694,...,81.73473,34.67534,,104.73368,,1,Afghanistan,AFG,AF,MULTIPOLYGON (((71.04980228700009 38.408664450...
1,Albania,ALB,2016,3.386891,Southern Europe,Europe,3055.275862,2886000.0,97745.69,2.206507,...,42.45306,198.44136,30.5916,35.52666,307.01268,4,Albania,ALB,AL,MULTIPOLYGON (((19.74776574700007 42.578900859...
2,Angola,AGO,2016,3.294534,Middle Africa,Africa,19765.931034,28842000.0,950209.5,4.157841,...,16.43994,190.3572,6.34524,51.62718,264.76956,2,Angola,AGO,AO,MULTIPOLYGON (((11.73751945100014 -16.69257798...
3,Antigua and Barbuda,ATG,2016,4.607747,Caribbean,North America,80.137931,95000.0,4377.36,2.551108,...,0.95095,6.65475,3.80285,,,14,Antigua and Barbuda,ATG,AG,MULTIPOLYGON (((-61.77301998599992 17.12653229...
4,Argentina,ARG,2016,6.280932,South America,South America,38496.068966,43508000.0,2732708.0,3.661312,...,9443.4114,,,96836.62576,,8,Argentina,ARG,AR,MULTIPOLYGON (((-68.65412350199998 -54.8862443...


In [40]:
the_megaset5=the_megaset4.drop(['ADMIN', 'ISO_A3', 'ISO_A2'], axis=1)
the_megaset5.head()

Unnamed: 0,Entity,Code_x,Year,Anxiety_percent,Region_x,Continent,Average_pop,Population_x,Anxiety_population,Depression_percent,...,Region_y,Continent_y,Population_y,#psychiatrists,#nurses,#social_workers,#psychologists,Total_resources,Unnamed: 0_y,geometry
0,Afghanistan,AFG,2016,4.878875,Southern Asia,Asia,24737.62069,35383000.0,1726292.0,4.135694,...,Southern Asia,Asia,35383000.0,81.73473,34.67534,,104.73368,,1,MULTIPOLYGON (((71.04980228700009 38.408664450...
1,Albania,ALB,2016,3.386891,Southern Europe,Europe,3055.275862,2886000.0,97745.69,2.206507,...,Southern Europe,Europe,2886000.0,42.45306,198.44136,30.5916,35.52666,307.01268,4,MULTIPOLYGON (((19.74776574700007 42.578900859...
2,Angola,AGO,2016,3.294534,Middle Africa,Africa,19765.931034,28842000.0,950209.5,4.157841,...,Middle Africa,Africa,28842000.0,16.43994,190.3572,6.34524,51.62718,264.76956,2,MULTIPOLYGON (((11.73751945100014 -16.69257798...
3,Antigua and Barbuda,ATG,2016,4.607747,Caribbean,North America,80.137931,95000.0,4377.36,2.551108,...,Caribbean,North America,95000.0,0.95095,6.65475,3.80285,,,14,MULTIPOLYGON (((-61.77301998599992 17.12653229...
4,Argentina,ARG,2016,6.280932,South America,South America,38496.068966,43508000.0,2732708.0,3.661312,...,South America,South America,43508000.0,9443.4114,,,96836.62576,,8,MULTIPOLYGON (((-68.65412350199998 -54.8862443...


In [41]:
the_megaset5.columns

Index(['Entity', 'Code_x', 'Year', 'Anxiety_percent', 'Region_x', 'Continent',
       'Average_pop', 'Population_x', 'Anxiety_population',
       'Depression_percent', 'Region_y', 'Continent_y', 'Population_y',
       'Depression_population', 'Code', 'Disorders(%)', 'Unnamed: 0_x',
       'Region_x', 'Continent', 'Population_x', 'Disorders_population', 'Sex',
       'Suicides_per100000', 'Average_pop', '#suicides',
       'Psychiatrists_per100000', 'Nurses_per100000',
       'Social_workers_per100000', 'Psychologists_per100000', 'Region_y',
       'Continent_y', 'Population_y', '#psychiatrists', '#nurses',
       '#social_workers', '#psychologists', 'Total_resources', 'Unnamed: 0_y',
       'geometry'],
      dtype='object')

I renamed some columns.

In [42]:
the_megaset6=the_megaset5.rename(columns={'Code_x':'Code', 'Year_x':'Year', 'Region_x':'Region', 'Continent_x':'Continent'})

In [53]:
the_megaset6.shape

(120, 39)

In [55]:
the_megaset6['Entity'].value_counts()

South Africa    1
Rwanda          1
Cyprus          1
Syria           1
Mozambique      1
Belarus         1
Tajikistan      1
Romania         1
Kyrgyzstan      1
Sudan           1
Luxembourg      1
Hungary         1
Vanuatu         1
Japan           1
Peru            1
Samoa           1
New Zealand     1
Colombia        1
Grenada         1
Cuba            1
Angola          1
Panama          1
Argentina       1
Afghanistan     1
Togo            1
Haiti           1
Australia       1
Albania         1
Burkina Faso    1
Thailand        1
               ..
Guyana          1
Chile           1
Guinea          1
Philippines     1
Barbados        1
Moldova         1
Slovenia        1
Mexico          1
Bolivia         1
Ecuador         1
Saint Lucia     1
Georgia         1
Armenia         1
Fiji            1
Seychelles      1
Mongolia        1
Mali            1
Liberia         1
Netherlands     1
Zambia          1
Latvia          1
Egypt           1
Guatemala       1
Brazil          1
Poland    

Complete, I exported this new "megaset." I recognize this data lacks much of my other data and is flawed, but it will still be helpful.

In [56]:
the_megaset6.to_csv('../data/Cleaned/megaset', index=False)

In [44]:
resources_df.shape

(152, 14)

In [45]:
suicide_df.head()

Unnamed: 0,Country,Sex,Region,Continent,Year,Suicides_per100000,Average_pop,Population,#suicides
0,Afghanistan,Both sexes,Southern Asia,Asia,2016,4.7,24737.62069,35383000.0,1663.001
1,Albania,Both sexes,Southern Europe,Europe,2016,6.3,3055.275862,2886000.0,181.818
2,Algeria,Both sexes,Northern Africa,Africa,2016,3.2,33273.448276,40551000.0,1297.632
3,Angola,Both sexes,Middle Africa,Africa,2016,4.7,19765.931034,28842000.0,1355.574
4,Antigua and Barbuda,Both sexes,Caribbean,North America,2016,0.5,80.137931,95000.0,0.475


I returned to the subset of the data that only included anxiety, depression, and disorders, as this was the most geographically representative and comprehensive subset. I then merged it with the geographical data.

In [46]:
add_df['Entity'].nunique()

189

In [47]:
addc_df = add_df.merge(countries_df, left_on='Entity', right_on='ADMIN')

In [48]:
addc_df.columns

Index(['Entity', 'Code_x', 'Year', 'Anxiety_percent', 'Unnamed: 0_x',
       'Region_x', 'Continent_x', 'Average_pop_x', 'Population_x',
       'Anxiety_population', 'Code_y', 'Depression_percent', 'Unnamed: 0_y',
       'Region_y', 'Continent_y', 'Average', 'Population_y',
       'Depression_population', 'Country', 'Code', 'Disorders(%)',
       'Unnamed: 0_x', 'Region', 'Continent', 'Average_pop_y', 'Population',
       'Disorders_population', 'Unnamed: 0_y', 'ADMIN', 'ISO_A3', 'ISO_A2',
       'geometry'],
      dtype='object')

I renamed the columns.

In [49]:
addc3_df = addc_df.rename(columns={'Code_x':'Code',
                                   'Region_x':'Region',
                                   'Continent_x':'Continent',
                                   'Average_pop_x':'Average_pop',
                                   'Population_x':'Population'})

In [50]:
addc3_df['Entity'].nunique()

186

I exported this subset as a CSV.

In [51]:
addc3_df.to_csv('../data/Cleaned/anxietydepressiondisorders', index=False)