Import all necessary libraries:

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import matplotlib.patches as mpatches
import seaborn as sns
from pandas import DataFrame
from matplotlib import rcParams
from ipywidgets import interact
from scipy import stats
import statsmodels.api as sm

Download the `master` dataset from [here](https://drive.google.com/drive/folders/1LuGD09bpxmIhOnQVYuT1NUKnBcvSIdBc) and place it in the data folder.
Load and save your dataset in a variable called `data`. 

In [None]:
# Load the raw CSV, drop two columns that the cells below do not use and
# strip stray whitespace from the remaining column names.
data = (
    pd.read_csv('../data/master.csv')
    .drop(columns=['country-year', 'HDI for year'])
    .rename(columns=lambda x: x.strip())
)
data['gdp_per_capita ($)'] = data['gdp_per_capita ($)'].astype(np.float64)
# Remove the thousands separators; regex=False makes the replacement a plain
# literal substitution. The column is still text after this step.
data['gdp_for_year ($)'] = data['gdp_for_year ($)'].str.replace(',', '', regex=False)
# Exclude 2016. The explicit copy makes `data` an independent frame, so that
# adding columns to it later does not raise SettingWithCopyWarning.
data = data[data.year != 2016].copy()
data


Explore the dataset.

In [None]:
# Print the column names, non-null counts and dtypes of the table.
data.info()

In [None]:
# Total number of suicides over all rows of the table.
data['suicides_no'].sum()

In [None]:
# Numeric version of the text column 'gdp_for_year ($)'; values that cannot be
# parsed become NaN (errors='coerce').
# `assign` returns a new frame instead of writing into a filtered slice, which
# avoids SettingWithCopyWarning.
# NOTE: the new column name ends in '(¢)', not '($)'; the regression cells
# select it under exactly this name, so it must not be changed here alone.
data = data.assign(**{
    'gdp_for_year (¢)': pd.to_numeric(data['gdp_for_year ($)'], errors='coerce'),
})

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the table.
data.describe()

In [None]:
# Check for number of suicides per generation
# Total suicides per generation, largest first.
generations = (
    data.groupby('generation')['suicides_no']
    .sum()
    .sort_values(ascending=False)
)
plt.figure(figsize=(5, 5))
# Six separate colours. Every literal needs its own comma: adjacent string
# literals are silently concatenated ('orange' 'red' -> 'orangered'), which
# would leave only five colours in the list.
generations.plot.bar(color=['pink', 'orange', 'red', 'green', 'blue', 'cyan'])
plt.xlabel("Generation", size=15)
plt.ylabel("Number of Suicides", size=15)
plt.title("Number of Suicides per Generation", size=18)
plt.tight_layout()

In [None]:
# Bar chart of the total number of suicides in every age group,
# ordered from the largest total to the smallest.
age = (
    data.groupby('age')['suicides_no']
    .sum()
    .sort_values(ascending=False)
)
plt.figure(figsize=(5, 5))
age.plot(kind='bar', color=['pink', 'orange', 'red', 'green', 'blue', 'cyan'])
plt.title("Number of Suicides per Age Group", size=18)
plt.xlabel("Age Group", size=15)
plt.ylabel("Number of Suicides", size=15)
plt.tight_layout()

In [None]:
# Same age-group breakdown as above, restricted to the rows whose
# generation is 'Boomers'.
age = (
    data.loc[data['generation'] == 'Boomers']
    .groupby('age')['suicides_no']
    .sum()
    .sort_values(ascending=False)
)
plt.figure(figsize=(5, 5))
age.plot(kind='bar', color=['pink', 'orange', 'red', 'green', 'blue', 'cyan'])
plt.title("Number of Suicides per Age Group for Boomers", size=18)
plt.xlabel("Age Group", size=15)
plt.ylabel("Number of Suicides", size=15)
plt.tight_layout()

In [None]:
# Check for number of suicides per generation for the 35-54 years age group
generations = (
    data.loc[data['age'] == '35-54 years']
    .groupby('generation')['suicides_no']
    .sum()
    .sort_values(ascending=False)
)
plt.figure(figsize=(5, 5))
# Six separate colours. Every literal needs its own comma: adjacent string
# literals are silently concatenated ('orange' 'red' -> 'orangered'), which
# would leave only five colours in the list.
generations.plot.bar(color=['pink', 'orange', 'red', 'green', 'blue', 'cyan'])
plt.xlabel("Generation", size=15)
plt.ylabel("Number of Suicides", size=15)
plt.title("Number of Suicides per Generation for 35-54 years' old", size=18)
plt.tight_layout()

In [None]:
# Check for distribution of suicides between genders
# aggfunc is given by name so the result has a single plain 'suicides_no' column.
gender = data.pivot_table(index='sex', values='suicides_no', aggfunc='sum')
# Create the axes explicitly and hand them to pandas: DataFrame.plot opens its
# own figure unless it is given `ax`, so a separate plt.figure() call would
# only leave an empty extra figure behind and its figsize would be ignored.
fig, ax = plt.subplots(figsize=(5, 5))
gender.plot.bar(color=['yellow'], ax=ax)
ax.set_xlabel("Genders", fontsize=15)
ax.set_ylabel("Number of Suicides", fontsize=15)
ax.set_title("Number of Suicides per Gender", fontsize=18)
fig.tight_layout()

In [None]:
# Check for number of suicides committed by males and females within the different age groups.
# Rows: age group, columns: sex, cells: summed number of suicides.
age = data.pivot_table(index='age', values='suicides_no', columns='sex', aggfunc='sum')
# Create the axes explicitly and hand them to pandas: DataFrame.plot opens its
# own figure unless it is given `ax`, so a separate plt.figure() call would
# only leave an empty extra figure behind and its figsize would be ignored.
fig, ax = plt.subplots(figsize=(5, 5))
age.plot.bar(ax=ax)
ax.set_xlabel("Age Group", fontsize=15)
ax.set_ylabel("Number of Suicides", fontsize=15)
ax.set_title("Number of Suicides per Age Group and Gender", fontsize=18)
fig.tight_layout()

In [None]:
# Check for number of suicides committed by males and females within the different generations.
# Rows: generation, columns: sex, cells: summed number of suicides.
generation = data.pivot_table(index='generation', values='suicides_no', columns='sex', aggfunc='sum')
# Create the axes explicitly and hand them to pandas: DataFrame.plot opens its
# own figure unless it is given `ax`, so a separate plt.figure() call would
# only leave an empty extra figure behind and its figsize would be ignored.
fig, ax = plt.subplots(figsize=(5, 5))
generation.plot.bar(ax=ax)
ax.set_xlabel("Generation", fontsize=15)
ax.set_ylabel("Number of Suicides", fontsize=15)
ax.set_title("Number of Suicides per Generation and Gender", fontsize=18)
fig.tight_layout()

In [None]:
# Check for number of suicides per year.
year = data.pivot_table(index='year', values='suicides_no', aggfunc='sum')
# Create the axes explicitly and hand them to pandas: DataFrame.plot opens its
# own figure unless it is given `ax`, so a separate plt.figure() call would
# only leave an empty extra figure behind and its figsize would be ignored.
# A wide, moderately sized canvas leaves room for one tick label per year.
fig, ax = plt.subplots(figsize=(12, 6))
year.plot.bar(ax=ax)
ax.set_xlabel("Year", fontsize=15)
ax.set_ylabel("Number of Suicides", fontsize=15)
ax.set_title("Number of Suicides per Year", fontsize=18)
fig.tight_layout()

In [None]:
# Check for number of suicides per year for males and females.
# Rows: year, columns: sex, cells: summed number of suicides.
year = data.pivot_table(index='year', values='suicides_no', columns='sex', aggfunc='sum')
# Create the axes explicitly and hand them to pandas: DataFrame.plot opens its
# own figure unless it is given `ax`, so a separate plt.figure() call would
# only leave an empty extra figure behind and its figsize would be ignored.
fig, ax = plt.subplots(figsize=(5, 5))
year.plot(ax=ax)
ax.set_xlabel("Year", fontsize=15)
ax.set_ylabel("Number of Suicides", fontsize=15)
ax.set_title("Number of Suicides per Year and Gender", fontsize=18)
fig.tight_layout()

In [None]:
# Horizontal bar chart of the thirty countries with the highest
# total number of suicides.
country = (
    data.groupby('country')['suicides_no']
    .sum()
    .sort_values(ascending=False)
    .head(30)
)
plt.figure(figsize=(8, 8))
country.plot.barh()
plt.title("Number of Suicides per country", size=18)
plt.xlabel("Number of Suicides", size=15)
plt.ylabel("Top 30 Countries", size=15)
plt.tight_layout()

In [None]:
# Horizontal bar chart of the thirty countries with the highest mean
# value of the 'suicides/100k pop' column.
country = (
    data.groupby('country')['suicides/100k pop']
    .mean()
    .sort_values(ascending=False)
    .head(30)
)
plt.figure(figsize=(8, 8))
country.plot.barh()
plt.title("Ratio of Suicides per country", size=18)
plt.xlabel("Ratio of Suicides", size=15)
plt.ylabel("Top 30 Countries", size=15)
plt.tight_layout()

In [None]:
# Check for number of suicides per generation for the Top 5 countries.
# Keep only the five selected countries, then sum suicides per generation
# (rows) and country (columns).
top_5 = data[data['country'].isin(['Russian Federation','United States','Japan','France', 'Ukraine'])]
top_5 = top_5.pivot_table(values='suicides_no', columns='country', index='generation', aggfunc='sum')
top_5.plot(kind='barh')

plt.xlabel("Number of Suicides", size=15)
plt.ylabel("Generation", size=15)
plt.title("Number of Suicides per Generation for Top 5 Countries", size=18)
plt.tight_layout()
# Place the legend outside the axes, to the right of the plot; the trailing
# semicolon keeps the Legend object's repr out of the cell output.
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.);

In [None]:
# Check for ratio of suicides per generation for the Top 5 countries.
# Keep only the five selected countries, then average 'suicides/100k pop' per
# generation (rows) and country (columns).
top_5 = data[data['country'].isin(['Lithuania','Sri Lanka','Russian Federation','Hungary','Belarus'])]
top_5 = (
    top_5.pivot_table(values='suicides/100k pop', columns='country', index='generation', aggfunc='mean')
    # 'generation' is the index of the pivot, so order the rows via the index.
    .sort_index(ascending=False)
)
top_5.plot(kind='barh')
plt.xlabel("Ratio of Suicides", size=15)
plt.ylabel("Generation", size=15)
plt.title("Ratio of Suicides per Generation for Top 5 Countries", size=18)
plt.tight_layout()
# Place the legend outside the axes, to the right of the plot; the trailing
# semicolon keeps the Legend object's repr out of the cell output.
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.);

In [None]:
# Check for number of suicides per year for top 5 countries
# Keep only the five selected countries, then sum suicides per year (rows)
# and country (columns); each country becomes one line in the plot.
top_5 = data[data['country'].isin(['Russian Federation','United States','Japan','France', 'Ukraine'])]
top_5 = top_5.pivot_table(values='suicides_no', columns='country', index='year', aggfunc='sum')
top_5.plot()
plt.xlabel("Year", size=15)
plt.ylabel("Number of Suicides", size=15)
plt.title("Number of Suicides per Year for Top 5 Countries", size=18)
plt.tight_layout()
# Place the legend outside the axes, to the right of the plot; the trailing
# semicolon keeps the Legend object's repr out of the cell output.
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.);

In [None]:
# Check for ratio of suicides per year for top 5 countries
# Keep only the five selected countries, then average 'suicides/100k pop' per
# year (rows) and country (columns); each country becomes one line in the plot.
top_5 = data[data['country'].isin(['Lithuania','Sri Lanka','Russian Federation','Hungary','Belarus'])]
top_5 = top_5.pivot_table(values='suicides/100k pop', columns='country', index='year', aggfunc='mean')
top_5.plot()
plt.xlabel("Year", size=15)
plt.ylabel("Ratio of Suicides", size=15)
plt.title("Ratio of Suicides per Year for Top 5 Countries", size=18)
plt.tight_layout()
# Place the legend outside the axes, to the right of the plot; the trailing
# semicolon keeps the Legend object's repr out of the cell output.
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.);

In [None]:
# Mean GDP per capita by year (rows) and country (columns) for the five
# selected countries, limited to 1985-1992 (both bounds included).
top_5_1 = data[
    data['country'].isin(['Lithuania','Sri Lanka','Russian Federation','Hungary','Belarus'])
    & data['year'].between(1985, 1992)
]

top_5_1 = top_5_1.pivot_table(values='gdp_per_capita ($)', columns='country', index='year', aggfunc='mean')
top_5_1

In [None]:
# Check for GDP per Capita and Ratio of Suicides for top 5 between 1985 and 1992
# Rows of the selected countries inside the year window (both bounds included).
top_5_1 = data[
    data['country'].isin(['Lithuania','Sri Lanka','Russian Federation','Hungary','Belarus'])
    & data['year'].between(1985, 1992)
]
# Wide table: index = GDP per capita, one column per country, cells = mean
# 'suicides/100k pop'. The table is passed to seaborn in this wide form.
top_5_1 = top_5_1.pivot_table(values='suicides/100k pop', columns='country', index='gdp_per_capita ($)', aggfunc='mean')
sns.scatterplot(data=top_5_1)
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
plt.xlabel("GDP per Capita for 1985-1992", size=15)
plt.ylabel("Ratio of Suicides", size=15)
plt.title("Comparison for Top 5 countries 1985-1992", size=18)
plt.show()


In [None]:
# Check for GDP per Capita and Ratio of Suicides for top 5 between 1993 and 2000
# Rows of the selected countries inside the year window (both bounds included).
top_5_1 = data[
    data['country'].isin(['Lithuania','Sri Lanka','Russian Federation','Hungary','Belarus'])
    & data['year'].between(1993, 2000)
]
# Wide table: index = GDP per capita, one column per country, cells = mean
# 'suicides/100k pop'. The table is passed to seaborn in this wide form.
top_5_1 = top_5_1.pivot_table(values='suicides/100k pop', columns='country', index='gdp_per_capita ($)', aggfunc='mean')
sns.scatterplot(data=top_5_1)
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
plt.xlabel("GDP per Capita for 1993-2000", size=15)
plt.ylabel("Ratio of Suicides", size=15)
plt.title("Comparison for Top 5 countries 1993-2000", size=18)
plt.show()

In [None]:
# Check for GDP per Capita and Ratio of Suicides for top 5 between 2001 and 2009
# Rows of the selected countries inside the year window (both bounds included).
top_5_1 = data[
    data['country'].isin(['Lithuania','Sri Lanka','Russian Federation','Hungary','Belarus'])
    & data['year'].between(2001, 2009)
]
# Wide table: index = GDP per capita, one column per country, cells = mean
# 'suicides/100k pop'. The table is passed to seaborn in this wide form.
top_5_1 = top_5_1.pivot_table(values='suicides/100k pop', columns='country', index='gdp_per_capita ($)', aggfunc='mean')
sns.scatterplot(data=top_5_1)
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
plt.xlabel("GDP per Capita for 2001-2009", size=15)
plt.ylabel("Ratio of Suicides", size=15)
plt.title("Comparison for Top 5 countries 2001-2009", size=18)
plt.show()

In [None]:
# Check for GDP per Capita and Ratio of Suicides for top 5 between 2010 and 2015
# Rows of the selected countries inside the year window (both bounds included).
top_5_1 = data[
    data['country'].isin(['Lithuania','Sri Lanka','Russian Federation','Hungary','Belarus'])
    & data['year'].between(2010, 2015)
]
# Wide table: index = GDP per capita, one column per country, cells = mean
# 'suicides/100k pop'. The table is passed to seaborn in this wide form.
top_5_1 = top_5_1.pivot_table(values='suicides/100k pop', columns='country', index='gdp_per_capita ($)', aggfunc='mean')
sns.scatterplot(data=top_5_1)
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
plt.xlabel("GDP per Capita for 2010-2015", size=15)
plt.ylabel("Ratio of Suicides", size=15)
plt.title("Comparison for Top 5 countries 2010-2015", size=18)
plt.show()

In [None]:
# Check for GDP per Capita and Ratio of Suicides for top 5
# All rows of the five selected countries, without any year restriction.
top_5_1 = data[data['country'].isin(['Lithuania','Sri Lanka','Russian Federation','Hungary','Belarus'])]

# Wide table: index = GDP per capita, one column per country, cells = mean
# 'suicides/100k pop'. The table is passed to seaborn in this wide form.
top_5_1 = top_5_1.pivot_table(values='suicides/100k pop', columns='country', index='gdp_per_capita ($)', aggfunc='mean')
sns.scatterplot(data=top_5_1)
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
plt.xlabel("GDP per Capita", size=15)
plt.ylabel("Ratio of Suicides", size=15)
plt.title("Comparison of GDP per Capita and Ratio of Suicides for Top 5 countries", size=18)
plt.show()

In [None]:
# Yearly totals: one row per year holding the summed 'suicides_no'.
data_2 = data.pivot_table(
    index='year',
    values='suicides_no',
    aggfunc='sum',
)
data_2

In [None]:
# Check for correlations
# Pairwise correlations between the numeric columns only. The table also holds
# text columns, and DataFrame.corr() raises on those in pandas >= 2.0 instead
# of skipping them, so the numeric columns are selected explicitly.
corr = data.select_dtypes(include='number').corr()
corr

In [None]:
# Annotated heatmap of the correlation matrix `corr`, drawn on a
# 25 x 20 inch figure.
plt.figure(figsize=(25, 20))
sns.heatmap(data=corr, annot=True)

In [None]:
# Run regression
# Ordinary least squares of the number of suicides on the selected features.
# statsmodels' OLS does not add an intercept by itself; without the explicit
# constant column the fit would be forced through the origin.
X = sm.add_constant(
    data[['year', 'population', 'suicides/100k pop', 'gdp_per_capita ($)', 'gdp_for_year (¢)']]
)
Y = data['suicides_no']
model = sm.OLS(Y, X).fit()
# In-sample predictions, shown next to the observed values.
predictions = model.predict(X)
pd.DataFrame({'observed': Y, 'predicted': predictions})

In [None]:
# Full statsmodels report for the fitted `model`.
model.summary()

In [None]:
# Correlations between the numeric columns for the five selected countries.
top_5_1 = data[data['country'].isin(['Lithuania','Sri Lanka','Russian Federation','Hungary','Belarus'])]
# DataFrame.corr() raises on text columns in pandas >= 2.0, so the numeric
# columns are selected explicitly.
corr_2 = top_5_1.select_dtypes(include='number').corr()
plt.figure(figsize=(25, 20))
sns.heatmap(corr_2, annot=True)

In [None]:
# Ordinary least squares of the number of suicides on the selected features,
# fitted on `top_5_1` only.
# statsmodels' OLS does not add an intercept by itself; without the explicit
# constant column the fit would be forced through the origin.
X = sm.add_constant(
    top_5_1[['year', 'population', 'suicides/100k pop', 'gdp_per_capita ($)', 'gdp_for_year (¢)']]
)
Y = top_5_1['suicides_no']
model = sm.OLS(Y, X).fit()
# In-sample predictions, shown next to the observed values.
predictions = model.predict(X)
pd.DataFrame({'observed': Y, 'predicted': predictions})

In [None]:
# Full statsmodels report for the fitted `model`.
model.summary()

### Feedback Paolo

Eva, clear plots and a well-structured notebook, with good guiding comments. I also think you chose a very relevant issue that is not often talked about. I find your plots showing the suicide/GDP relationship insightful. A few comments/questions for future work if you decide to come back to it:
- What do you mean by `ratio of suicides`?  
- I would have liked to see at least an Introduction and a Conclusion in markdown. These two parts are very important for yourself in the future (without that it will be hard to remember why you did what you did) or for someone else who would want to work with you on this project. I know you changed topic at the last minute so you did not have time for this.
- What is your goal with the linear regression, what were you looking for?
- How could you get an idea visually of the goodness of your lin. reg. model?
- The correlations, do you see anything interesting there?