# EDA Codes
for the Study of Suicide Rates by Year and Country

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

plt.style.use('fivethirtyeight')
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.options.display.float_format = '{:.2f}'.format

%matplotlib inline

In [None]:
# Load the suicide dataset from its CSV file into `df`, the frame every later cell reads.
# NOTE(review): this is an absolute, machine-specific path — the notebook only runs
# where this exact file exists; consider a path relative to the notebook.
csv_path = r'C:\Users\maggi\Desktop\suicide_ready.csv'
df = pd.read_csv(csv_path, low_memory=False)

In [None]:
# First look at the data: one histogram per quantitative column of the dataset.
hist_options = dict(grid=True, figsize=(10, 15), color='blue')
df.hist(**hist_options)
plt.show()

The distributions of 'HID per year' and 'year' are negatively skewed, whilst the others are all positively skewed.

In [None]:
# Scatterplot grid of suicides vs. population: one row per sex, one column per age band.
# numpy, seaborn and matplotlib are already imported in the notebook's first cell,
# so they are not imported again here.
sns.set(style="darkgrid")

# Explicit column order so the age bands read youngest -> oldest rather than in
# the order they first appear in the data.
age_band_order = ['5-14 years', '15-24 years', '25-34 years',
                  '35-54 years', '55-74 years', '75+ years']

g = sns.FacetGrid(df, row="sex", col="age", col_order=age_band_order, margin_titles=True)

# x = suicides, y = population in every panel; white marker edges separate overlapping points.
g.map(plt.scatter, "suicides", "population", edgecolor="w")
plt.show()

The panel that stands out with the highest number of suicides is males aged 35-54 years.

In [None]:
# Mean suicide count for every (sex, age) combination, displayed as a table.
sex_age_groups = df.loc[:, ['sex', 'age', 'suicides']].groupby(['sex', 'age'])
suicides_sex_age = sex_age_groups.mean()
suicides_sex_age

In [None]:
# Re-arrange the age categories into ascending order and strip the " years" suffix.
# The cell is safe to re-run: the index is only flattened while 'sex'/'age' are
# still index levels, and the frame is rebuilt rather than mutated in place.
if 'age' not in suicides_sex_age.columns:
    suicides_sex_age = suicides_sex_age.reset_index()

# Rank of each age band, youngest first; used only as a temporary sort key.
age_sort = {'5-14': 0, '15-24': 1, '25-34': 2, '35-54': 3, '55-74': 4, '75+': 5}

suicides_sex_age = (
    suicides_sex_age
    # regex=False: ' years' is a literal suffix, not a pattern.
    .assign(age=lambda frame: frame['age'].str.replace(' years', '', regex=False))
    .assign(sort=lambda frame: frame['age'].map(age_sort))
    # A stable sort keeps the row order within each age band deterministic.
    .sort_values(by='sort', kind='mergesort')
    .drop(columns='sort')
)

# Bars grouped by sex, one colour per age band (hue follows the sorted row order).
sns.barplot(x='sex', y='suicides', hue='age', data=suicides_sex_age)
plt.show()

In [None]:
# Grouped bar chart of the mean suicide count per age group, split by sex.
age_groups = suicides_sex_age['age'].unique()
male_suicides = suicides_sex_age[suicides_sex_age['sex'] == 'male']['suicides']
female_suicides = suicides_sex_age[suicides_sex_age['sex'] == 'female']['suicides']

# The two series are offset around each tick so the bars sit side by side;
# drawing both at the same x position would paint the female bars over the male ones.
bar_width = 0.4
x_positions = np.arange(len(age_groups))

fig, ax = plt.subplots()
ax.bar(x_positions - bar_width / 2, male_suicides.values, width=bar_width, label='Male')
ax.bar(x_positions + bar_width / 2, female_suicides.values, width=bar_width, label='Female')

# One tick, labelled with the age group, under each pair of bars.
ax.set_xticks(x_positions)
ax.set_xticklabels(age_groups)

ax.set_title('Global average number of suicides by age')
ax.set_xlabel('Age Group')
ax.set_ylabel('Suicides')
ax.legend()
plt.show()

- This chart shows male suicide counts are significantly higher than female counts across all age groups.
- The 35-54 age group has the highest number of suicides for both males and females.
- Male suicides are about 3 times higher than female suicides for most age groups, and about double for the 2 extreme age groups (5-14 and 75+).

In [None]:
# Split the dataset into one frame per age band so that each band's suicide
# numbers can be plotted over the years.
age_column = df['age']

age_5 = df[age_column == '5-14 years']
age_15 = df[age_column == '15-24 years']
age_25 = df[age_column == '25-34 years']
age_35 = df[age_column == '35-54 years']
age_55 = df[age_column == '55-74 years']
age_75 = df[age_column == '75+ years']

In [None]:
# Suicide numbers over the years, one line per age band on a shared axis.
# Each line carries its own label, so the legend cannot drift out of step with
# the order in which the lines are drawn.
age_band_frames = [
    ('5-14 years', age_5),
    ('15-24 years', age_15),
    ('25-34 years', age_25),
    ('35-54 years', age_35),
    ('55-74 years', age_55),
    ('75+ years', age_75),
]

fig, ax = plt.subplots()
for band_label, band_frame in age_band_frames:
    sns.lineplot(x='year', y='suicides', data=band_frame, label=band_label, ax=ax)

# Single-letter handles kept so any later cell that references them still finds the Axes.
p = q = r = s = t = ax

ax.set_title('Suicides by year and age group')
ax.set_xlabel('Year')
ax.set_ylabel('Suicides')
ax.legend()
plt.show()

The 35-54 age group dominates this chart and shows the largest changes over time.

In [None]:
# Separate frames for the male and female records, used to compare the suicide
# numbers of the two sexes over the years.
sex_column = df['sex']
male_population = df[sex_column == 'male']
female_population = df[sex_column == 'female']

In [None]:
# Suicide numbers over the years for males and females on a shared axis.
# Labels are attached to each line directly rather than matched up by draw order.
fig, ax = plt.subplots()
p = sns.lineplot(x='year', y='suicides', data=male_population, label='males', ax=ax)
q = sns.lineplot(x='year', y='suicides', data=female_population, label='females', ax=ax)

ax.set_title('Suicides by year and sex')
ax.set_xlabel('Year')
ax.set_ylabel('Suicides')
ax.legend()
plt.show()

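- On average, male suicide numbers are 3 times higher than female numbers.
- It seems suicide numbers were declining from 1994 until a sharp spike in 2015. Note that the Global Financial Crisis is usually dated to 2007-2009, not 2015, so it is unlikely to explain this spike; the cause still needs to be investigated (for example, a change in which countries reported data).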

In [None]:
# Number of records available for each year.
y = df['year']
year_count_ax = sns.countplot(data=df, x='year')
# Year labels are rotated vertically so they do not overlap.
plt.setp(year_count_ax.get_xticklabels(), rotation=90)
year_count_ax.set_title("record count by year")
plt.show()

In [None]:
# Bokeh provides the interactive scatter plot drawn in the following cell.
from bokeh.io import output_file, show, output_notebook
from bokeh.plotting import figure
# NOTE(review): matplotlib and pandas are already imported in the first cell;
# these two repeats are redundant.
import matplotlib.pyplot as plt
import pandas as pd
# Send Bokeh output to the notebook so show() renders inline.
output_notebook()

# Render matplotlib figures inline (already enabled in the first cell).
%matplotlib inline

In [None]:
# Interactive Bokeh scatter of every record's suicide count against its year.
# `width` is used instead of `plot_width`, which Bokeh deprecated in 2.4 and
# removed in 3.0.
plot = figure(width=400, tools='pan, box_zoom',
              x_axis_label='year', y_axis_label='suicides')

# One circle glyph per row of the dataset.
plot.circle(df['year'], df['suicides'])
output_file('suicide_by_years.html')  # show() will also save the plot to this HTML file

# Display the figure.
show(plot)

There appears to be very little data for the year 2016.

In [None]:
# Yearly means of the suicide rate (per 100k population) and of GDP per capita,
# with 'year' as an ordinary column rather than the index.
suicides_vs_gpd = (
    df.loc[:, ['suicides_100kpop', 'year', 'gdp_per_capita']]
    .groupby('year')
    .mean()
    .reset_index()
)
suicides_vs_gpd

In [None]:
# Dual-axis line chart: mean suicides per 100k population (left axis) and mean
# GDP per capita (right axis) for each year.
fig, ax1 = plt.subplots()

# Suicide rate on the left-hand axis.
lns1 = ax1.plot(suicides_vs_gpd['year'], suicides_vs_gpd['suicides_100kpop'], 'C0', label='Suicides')

# GDP is on a very different scale, so it gets a twin y-axis sharing the same x-axis.
ax2 = ax1.twinx()
lns2 = ax2.plot(suicides_vs_gpd['year'], suicides_vs_gpd['gdp_per_capita'], 'C1', label='GDP')

# The two lines live on different axes; collect them so one legend box shows both.
lns = lns1 + lns2
labs = [line.get_label() for line in lns]
ax1.legend(lns, labs, loc=2)  # loc=2 is the upper-left corner

# Set the labels.
ax1.set_ylabel('Suicides per 100,000 population')
ax2.set_ylabel('GDP per Capita')
ax1.set_xlabel('Years')

plt.tight_layout()
plt.show()

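- This chart shows that from 1995 onwards, as GDP per capita increases, the suicide rate (per 100,000 population) decreases.
- Note: the US Financial Crisis described at https://www.cfr.org/timeline/us-financial-crisis is usually dated to 2007-2008 rather than 1995, so it does not explain the turning point around 1995.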

In [None]:
# Total number of suicides per country (rows) and year (columns); 2016 is left out.
not_2016 = df['year'] != 2016
suicides_by_year = df[not_2016].pivot_table(
    values='suicides',
    index=['year'],
    columns='country',
    aggfunc='sum',
)
# Transpose so that countries become the rows and years the columns.
year_country = suicides_by_year.T

year_country

- From the pivot table above, there appear to be quite a few countries with very few years of data.
- I would remove those with fewer than 3 years of data if I were to clean this data again.

In [None]:
# Mean suicide count per country and year, drawn as one small line chart per country.
suicides_country = df[['country', 'year', 'suicides']].groupby(['country', 'year']).mean()

# The grouped frame already holds exactly one row per (country, year), so it only
# needs its index flattened into columns before plotting. No `row` variable is
# faceted on, so no `row_order` is passed.
g = sns.FacetGrid(suicides_country.reset_index(), col='country', col_wrap=6)
g = g.map(plt.plot, "year", "suicides", marker=".")

- Not every country has recorded data consistently across the 2 decades: some have data from the beginning (e.g. Sri Lanka), others have data for a few years in the middle (e.g. Aruba), and some only recorded towards the end of the reporting period (e.g. Turkey). It is therefore fairer to compare the average suicide count per year for each country.
- The top 10 countries with the highest average suicide count, in descending order, are: Russian Federation, United States, Japan, Ukraine, Germany, France, Republic of Korea, Brazil, Poland and Sri Lanka.
- The countries that continue to trend upwards are Brazil and the United States.