In [1]:
%matplotlib inline
import pandas as pd
import matplotlib.pyplot as plt

First, I imported the suicide dataset and ran some initial analyses.

In [2]:
# Load the raw per-country suicide-rate table; the path is relative to the working directory.
raw_suicide_csv = '../data/Raw/suicide_crude_by_country.csv'
suicide_df = pd.read_csv(raw_suicide_csv)

In [3]:
# Preview the first five rows of the raw table.
suicide_df.head(n=5)

Unnamed: 0,Country,Sex,2016 (per 100000),2015 (per 100000),2010 (per 100000),2005 (per 100000),2000 (per 100000)
0,Afghanistan,Both sexes,4.7,4.8,5.1,6.3,5.7
1,Afghanistan,Male,7.6,7.8,8.6,10.8,10.0
2,Afghanistan,Female,1.5,1.5,1.4,1.5,1.0
3,Albania,Both sexes,6.3,6.0,7.8,6.7,5.5
4,Albania,Male,7.9,7.6,9.5,7.7,7.4


I renamed the columns to make them easier to manipulate.

In [4]:
# Positional rename: two identifier columns followed by one rate column per year,
# with the " (per 100000)" suffix stripped from the year labels.
year_labels = [str(year) for year in (2016, 2015, 2010, 2005, 2000)]
suicide_df.columns = ['Country', 'Sex'] + year_labels

In [5]:
# List every distinct country name, to spot spellings that need harmonising.
pd.unique(suicide_df['Country'])

array(['Afghanistan', 'Albania', 'Algeria', 'Angola',
       'Antigua and Barbuda', 'Argentina', 'Armenia', 'Australia',
       'Austria', 'Azerbaijan', 'Bahamas', 'Bahrain', 'Bangladesh',
       'Barbados', 'Belarus', 'Belgium', 'Belize', 'Benin', 'Bhutan',
       'Bolivia (Plurinational State of)', 'Bosnia and Herzegovina',
       'Botswana', 'Brazil', 'Brunei Darussalam', 'Bulgaria',
       'Burkina Faso', 'Burundi', 'Cabo Verde', 'Cambodia', 'Cameroon',
       'Canada', 'Central African Republic', 'Chad', 'Chile', 'China',
       'Colombia', 'Comoros', 'Congo', 'Costa Rica', "Côte d'Ivoire",
       'Croatia', 'Cuba', 'Cyprus', 'Czechia',
       "Democratic People's Republic of Korea",
       'Democratic Republic of the Congo', 'Denmark', 'Djibouti',
       'Dominican Republic', 'Ecuador', 'Egypt', 'El Salvador',
       'Equatorial Guinea', 'Eritrea', 'Estonia', 'Eswatini', 'Ethiopia',
       'Fiji', 'Finland', 'France', 'Gabon', 'Gambia', 'Georgia',
       'Germany', 'Ghana', 'Gree

I renamed some of the country names so that they would match those in my other datasets.

In [6]:
# Shorten long official country names so they match the names used in the other datasets.
# Côte d'Ivoire is keyed with both a straight (') and a typographic (’) apostrophe:
# the raw data uses the straight form, which the typographic-only key never matched,
# so the country was silently left un-renamed.
country_renames = {
    'Bolivia (Plurinational State of)': 'Bolivia',
    'Brunei Darussalam': 'Brunei',
    'Cabo Verde': 'Cape Verde',
    "Côte d'Ivoire": "Cote d'Ivoire",
    'Côte d’Ivoire': "Cote d'Ivoire",
    'Czechia': 'Czech Republic',
    "Democratic People's Republic of Korea": 'North Korea',
}
suicide_df['Country'] = suicide_df['Country'].replace(country_renames)

In [7]:
# Second batch of country-name harmonisations (Eswatini through Russian Federation).
renames_e_to_r = {
    'Eswatini': 'Swaziland',
    'Iran (Islamic Republic of)': 'Iran',
    "Lao People's Democratic Republic": 'Laos',
    'Micronesia (Federated States of)': 'Micronesia (country)',
    'Republic of Korea': 'South Korea',
    'Republic of Moldova': 'Moldova',
    'Russian Federation': 'Russia',
}
suicide_df['Country'] = suicide_df['Country'].replace(renames_e_to_r)

In [8]:
# Third batch of country-name harmonisations (State of Palestine through Viet Nam).
renames_s_to_v = {
    'State of Palestine': 'Palestine',
    'Syrian Arab Republic': 'Syria',
    'Timor-Leste': 'Timor',
    'United Kingdom of Great Britain and Northern Ireland': 'United Kingdom',
    'United Republic of Tanzania': 'Tanzania',
    'United States of America': 'United States',
    'Venezuela (Bolivarian Republic of)': 'Venezuela',
    'Viet Nam': 'Vietnam',
}
suicide_df['Country'] = suicide_df['Country'].replace(renames_s_to_v)

I imported the continents dataset and merged it with the suicide dataset.

In [9]:
# Load the country -> region/continent lookup table and preview it.
continents_csv = '../data/Raw/countries_continents2.csv'
continents_df = pd.read_csv(continents_csv)
continents_df.head(n=5)

Unnamed: 0.1,Unnamed: 0,Country,Region,Continent
0,0,Afghanistan,Southern Asia,Asia
1,1,Åland Islands,Northern Europe,Europe
2,2,Albania,Southern Europe,Europe
3,3,Algeria,Northern Africa,Africa
4,4,American Samoa,Polynesia,Oceania


In [10]:
# Attach region/continent to every suicide row. This is an inner join, so countries
# whose names do not appear in both tables are dropped from the result.
merged_df = pd.merge(
    suicide_df,
    continents_df,
    how='inner',
    left_on='Country',
    right_on='Country',
)
merged_df.head(n=5)

Unnamed: 0.1,Country,Sex,2016,2015,2010,2005,2000,Unnamed: 0,Region,Continent
0,Afghanistan,Both sexes,4.7,4.8,5.1,6.3,5.7,0,Southern Asia,Asia
1,Afghanistan,Male,7.6,7.8,8.6,10.8,10.0,0,Southern Asia,Asia
2,Afghanistan,Female,1.5,1.5,1.4,1.5,1.0,0,Southern Asia,Asia
3,Albania,Both sexes,6.3,6.0,7.8,6.7,5.5,2,Southern Europe,Europe
4,Albania,Male,7.9,7.6,9.5,7.7,7.4,2,Southern Europe,Europe


In [11]:
# Count the countries that survived the inner join with the continents table.
merged_df['Country'].nunique(dropna=True)

180

I split the data into three subsets: male suicides, female suicides, and both sexes combined.

In [12]:
# Keep only the rows reporting male rates.
male_suicide_df = merged_df.loc[merged_df['Sex'].eq('Male')]
male_suicide_df.head(n=5)

Unnamed: 0.1,Country,Sex,2016,2015,2010,2005,2000,Unnamed: 0,Region,Continent
1,Afghanistan,Male,7.6,7.8,8.6,10.8,10.0,0,Southern Asia,Asia
4,Albania,Male,7.9,7.6,9.5,7.7,7.4,2,Southern Europe,Europe
7,Algeria,Male,4.7,4.8,4.9,5.3,5.6,3,Northern Africa,Africa
10,Angola,Male,7.0,7.5,8.7,11.0,11.9,6,Middle Africa,Africa
13,Antigua and Barbuda,Male,0.0,0.0,0.5,2.4,4.2,9,Caribbean,North America


In [13]:
# Keep only the rows reporting female rates.
female_suicide_df = merged_df.loc[merged_df['Sex'].eq('Female')]
female_suicide_df.head(n=5)

Unnamed: 0.1,Country,Sex,2016,2015,2010,2005,2000,Unnamed: 0,Region,Continent
2,Afghanistan,Female,1.5,1.5,1.4,1.5,1.0,0,Southern Asia,Asia
5,Albania,Female,4.7,4.4,6.1,5.7,3.5,2,Southern Europe,Europe
8,Algeria,Female,1.7,1.7,1.8,2.2,2.5,3,Northern Africa,Africa
11,Angola,Female,2.5,2.6,2.8,3.7,4.0,6,Middle Africa,Africa
14,Antigua and Barbuda,Female,0.9,1.5,0.0,0.0,0.1,9,Caribbean,North America


In [14]:
# Keep only the rows reporting the combined (both sexes) rates.
both_sexes_suicide_df = merged_df.loc[merged_df['Sex'].eq('Both sexes')]
both_sexes_suicide_df.head(n=5)

Unnamed: 0.1,Country,Sex,2016,2015,2010,2005,2000,Unnamed: 0,Region,Continent
0,Afghanistan,Both sexes,4.7,4.8,5.1,6.3,5.7,0,Southern Asia,Asia
3,Albania,Both sexes,6.3,6.0,7.8,6.7,5.5,2,Southern Europe,Europe
6,Algeria,Both sexes,3.2,3.2,3.3,3.8,4.1,3,Northern Africa,Africa
9,Angola,Both sexes,4.7,5.0,5.7,7.2,7.9,6,Middle Africa,Africa
12,Antigua and Barbuda,Both sexes,0.5,0.8,0.3,1.2,2.0,9,Caribbean,North America


I then melted the both-sexes dataset so that, instead of each row holding data for five different years, there would be one row per country per year.

In [15]:
# Reshape wide -> long: one row per country per year.
# value_vars is listed explicitly so that only the year columns are unpivoted.
# Without it, every non-id column is melted, including the leftover 'Unnamed: 0'
# column carried over from the continents table, which would add bogus rows whose
# "year" is 'Unnamed: 0' and whose "rate" is a row number.
year_columns = ['2016', '2015', '2010', '2005', '2000']
suicide_long_df = both_sexes_suicide_df.melt(
    id_vars=['Country', 'Sex', 'Region', 'Continent'],
    value_vars=year_columns,
)
suicide_long_df.head()

Unnamed: 0,Country,Sex,Region,Continent,variable,value
0,Afghanistan,Both sexes,Southern Asia,Asia,2016,4.7
1,Albania,Both sexes,Southern Europe,Europe,2016,6.3
2,Algeria,Both sexes,Northern Africa,Africa,2016,3.2
3,Angola,Both sexes,Middle Africa,Africa,2016,4.7
4,Antigua and Barbuda,Both sexes,Caribbean,North America,2016,0.5


I then renamed the columns to explain what they represent.

In [16]:
# Positional rename: the four identifier columns keep their names, while melt's
# generic 'variable'/'value' columns become 'Year' and 'Suicides_per100000'.
id_columns = ['Country', 'Sex', 'Region', 'Continent']
suicide_long_df.columns = id_columns + ['Year', 'Suicides_per100000']
suicide_long_df.head(n=5)

Unnamed: 0,Country,Sex,Region,Continent,Year,Suicides_per100000
0,Afghanistan,Both sexes,Southern Asia,Asia,2016,4.7
1,Albania,Both sexes,Southern Europe,Europe,2016,6.3
2,Algeria,Both sexes,Northern Africa,Africa,2016,3.2
3,Angola,Both sexes,Middle Africa,Africa,2016,4.7
4,Antigua and Barbuda,Both sexes,Caribbean,North America,2016,0.5


In [17]:
# Load the previously cleaned long-format population table and preview it.
population_path = '../data/Cleaned/world_pop_long'
pop_df = pd.read_csv(population_path)
pop_df.head(n=5)

Unnamed: 0,code,Location,Average,variable,value
0,900,World,6472563.0,1990,5327231.0
1,901,More developed regions,1207273.0,1990,1145508.0
2,902,Less developed regions,5265290.0,1990,4181723.0
3,941,Least developed countries,737016.9,1990,506276.0
4,934,"Less developed regions, excluding least develo...",4528273.0,1990,3675448.0


After importing the population dataset, I converted its year column to strings so that it would match the string-typed year column in the suicide data. I then merged the two to create a dataset that includes both suicide rates and population.

In [18]:
# Cast the year column to strings so it compares equal to the string-typed 'Year'
# column of the suicide table when the two are merged.
pop_df['variable'] = pop_df['variable'].astype(str)

In [19]:
# Join suicide rates to population on (country, year). This is an inner join, so
# rows without a matching population record are dropped.
sp_df = pd.merge(
    suicide_long_df,
    pop_df,
    how='inner',
    left_on=['Country', 'Year'],
    right_on=['Location', 'variable'],
)

I renamed two columns to make them clearer.

In [20]:
# Give the population columns self-explanatory names.
clearer_names = {
    'value': 'Pop1000s',
    'Average': 'Average_pop',
}
suicide2_pop_df = sp_df.rename(columns=clearer_names)

I dropped several irrelevant columns.

In [21]:
# Remove the join keys and the code column that came from the population table.
redundant_columns = ['Location', 'code', 'variable']
suicide3_pop_df = suicide2_pop_df.drop(columns=redundant_columns)
suicide3_pop_df.head(n=5)

Unnamed: 0,Country,Sex,Region,Continent,Year,Suicides_per100000,Average_pop,Pop1000s
0,Afghanistan,Both sexes,Southern Asia,Asia,2016,4.7,24737.62069,35383.0
1,Albania,Both sexes,Southern Europe,Europe,2016,6.3,3055.275862,2886.0
2,Algeria,Both sexes,Northern Africa,Africa,2016,3.2,33273.448276,40551.0
3,Angola,Both sexes,Middle Africa,Africa,2016,4.7,19765.931034,28842.0
4,Antigua and Barbuda,Both sexes,Caribbean,North America,2016,0.5,80.137931,95.0


In [22]:
# Count the countries remaining after the join with the population table.
suicide3_pop_df['Country'].nunique(dropna=True)

178

I added a variable holding the actual population (the original figures are expressed in thousands, so they are multiplied by 1000), and then deleted the original column.

In [23]:
# 'Pop1000s' is expressed in thousands; scale it to an absolute head count.
suicide3_pop_df['Population'] = suicide3_pop_df['Pop1000s'].mul(1000)
suicide3_pop_df.head(n=5)

Unnamed: 0,Country,Sex,Region,Continent,Year,Suicides_per100000,Average_pop,Pop1000s,Population
0,Afghanistan,Both sexes,Southern Asia,Asia,2016,4.7,24737.62069,35383.0,35383000.0
1,Albania,Both sexes,Southern Europe,Europe,2016,6.3,3055.275862,2886.0,2886000.0
2,Algeria,Both sexes,Northern Africa,Africa,2016,3.2,33273.448276,40551.0,40551000.0
3,Angola,Both sexes,Middle Africa,Africa,2016,4.7,19765.931034,28842.0,28842000.0
4,Antigua and Barbuda,Both sexes,Caribbean,North America,2016,0.5,80.137931,95.0,95000.0


In [24]:
# The in-thousands column is redundant now that 'Population' exists.
suicide4_df = suicide3_pop_df.drop(columns=['Pop1000s'])
suicide4_df.head(n=5)

Unnamed: 0,Country,Sex,Region,Continent,Year,Suicides_per100000,Average_pop,Population
0,Afghanistan,Both sexes,Southern Asia,Asia,2016,4.7,24737.62069,35383000.0
1,Albania,Both sexes,Southern Europe,Europe,2016,6.3,3055.275862,2886000.0
2,Algeria,Both sexes,Northern Africa,Africa,2016,3.2,33273.448276,40551000.0
3,Angola,Both sexes,Middle Africa,Africa,2016,4.7,19765.931034,28842000.0
4,Antigua and Barbuda,Both sexes,Caribbean,North America,2016,0.5,80.137931,95000.0


I then created a variable calculating the total number of suicides (the rate per 100,000 multiplied by the population and divided by 100,000). The suicides per 100,000 column is kept alongside it.

In [25]:
# Estimated absolute number of suicides: rate per 100,000 people times population,
# divided by 100,000 (multiplication first, then division, as in the formula).
rate_per_100k = suicide4_df['Suicides_per100000']
suicide4_df['#suicides'] = rate_per_100k.mul(suicide4_df['Population']).div(100000)
suicide4_df.head(n=5)

Unnamed: 0,Country,Sex,Region,Continent,Year,Suicides_per100000,Average_pop,Population,#suicides
0,Afghanistan,Both sexes,Southern Asia,Asia,2016,4.7,24737.62069,35383000.0,1663.001
1,Albania,Both sexes,Southern Europe,Europe,2016,6.3,3055.275862,2886000.0,181.818
2,Algeria,Both sexes,Northern Africa,Africa,2016,3.2,33273.448276,40551000.0,1297.632
3,Angola,Both sexes,Middle Africa,Africa,2016,4.7,19765.931034,28842000.0,1355.574
4,Antigua and Barbuda,Both sexes,Caribbean,North America,2016,0.5,80.137931,95000.0,0.475


In [26]:
# Final name for the cleaned table. This binds a second name to the same DataFrame
# object (no copy is made), so changes through either name affect both.
scp_df = suicide4_df

In [27]:
# Sanity check: (rows, columns) of the final table.
scp_df.shape

(895, 9)

In [28]:
# Sanity check: the distinct year labels present in the final table.
pd.unique(scp_df['Year'])

array(['2016', '2015', '2010', '2005', '2000'], dtype=object)

In [29]:
# Sanity check: the column names that will be written to the output file.
scp_df.columns

Index(['Country', 'Sex', 'Region', 'Continent', 'Year', 'Suicides_per100000',
       'Average_pop', 'Population', '#suicides'],
      dtype='object')

Finally, I exported the cleaned dataset to a CSV.

In [30]:
# Write the cleaned table as CSV, omitting the row index.
cleaned_output_path = '../data/Cleaned/suicide-clean'
scp_df.to_csv(cleaned_output_path, index=False)