In [1]:
import pandas as pd
import pycountry_convert as pc
from pycountry import countries

## Model
suicide ~ population \
suicide ~ population + gdp_per_capita \
group by country (or year) (or age) suicide ~ population + gdp_per_capita

In [2]:
suicide_df = pd.read_csv('data/suicide.csv')
suicide_df = suicide_df.rename(columns={'suicides_no':'suicide', ' gdp_for_year ($) ':'total_gdp', 'gdp_per_capita ($)':'gdp_per_capita', 'HDI for year':'hdi'})
suicide_df = suicide_df.drop(columns=['suicides/100k pop', 'hdi', 'country-year', 'total_gdp', 'generation'])
suicide_df      

Unnamed: 0,country,year,sex,age,suicide,population,gdp_per_capita
0,Albania,1987,male,15-24 years,21,312900,796
1,Albania,1987,male,35-54 years,16,308000,796
2,Albania,1987,female,15-24 years,14,289700,796
3,Albania,1987,male,75+ years,1,21800,796
4,Albania,1987,male,25-34 years,9,274300,796
...,...,...,...,...,...,...,...
27815,Uzbekistan,2014,female,35-54 years,107,3620833,2309
27816,Uzbekistan,2014,female,75+ years,9,348465,2309
27817,Uzbekistan,2014,male,5-14 years,60,2762158,2309
27818,Uzbekistan,2014,female,5-14 years,44,2631600,2309


## Get the rows belong to European countries

In [3]:
# Get the country names since some countries are not recognized by pycountry
countries_name_list = [country.name for country in list(countries)]

# Function to extract continent code
def get_continent(country):
    country_code = pc.country_name_to_country_alpha2(country)
    return pc.country_alpha2_to_continent_code(country_code)

suicide_df = suicide_df[suicide_df['country'].isin(countries_name_list)]   # Filter out invalid countries
suicide_df['continent'] = suicide_df['country'].apply(get_continent)       # Extract continent code
suicide_df = suicide_df[suicide_df['continent'] == 'EU']                   # Get the countries in EU
suicide_df.head(5)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  suicide_df['continent'] = suicide_df['country'].apply(get_continent)       # Extract continent code


Unnamed: 0,country,year,sex,age,suicide,population,gdp_per_capita,continent
0,Albania,1987,male,15-24 years,21,312900,796,EU
1,Albania,1987,male,35-54 years,16,308000,796,EU
2,Albania,1987,female,15-24 years,14,289700,796,EU
3,Albania,1987,male,75+ years,1,21800,796,EU
4,Albania,1987,male,25-34 years,9,274300,796,EU


## Group data by year and country

In [4]:
suicide_df_grouped = suicide_df.groupby(['year', 'country']).aggregate({
    'suicide':'sum', 'population':'sum', 'gdp_per_capita':'mean'})
df = suicide_df_grouped.unstack().reset_index()
df = df[(1991 <= df['year']) & (df['year'] <= 2016)].dropna(axis=1).set_index('year').stack()
df


Unnamed: 0_level_0,Unnamed: 1_level_0,gdp_per_capita,population,suicide
year,country,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1991,Austria,23808.0,7299728.0,1769.0
1991,Hungary,3561.0,9757766.0,3993.0
1991,Iceland,28857.0,235900.0,37.0
1991,Netherlands,22906.0,14114800.0,1611.0
1991,Romania,1351.0,21458000.0,2161.0
...,...,...,...,...
2016,Hungary,13448.0,8390370.0,1761.0
2016,Iceland,64708.0,268773.0,40.0
2016,Netherlands,48108.0,14239554.0,1886.0
2016,Romania,10020.0,16644905.0,1953.0


## Generate new columns

-   The dataframe is tranposed and stacked so that it have a better form
-   Columns `suicides_per_100k` and `gdp_per_capita` are generated using these data
-   The dataframe is exported to a csv file for future processes

In [5]:
output_df = df.copy()
output_df['suicides_per_100k'] = (output_df['suicide']/output_df['population']) * 100000
output_df.to_csv("data/suicide_cleaned.csv")