In [70]:
import numpy as np
import pandas as pd

from functools import reduce

import matplotlib.pyplot as plt

## Data Loading

Load all datasets. The source files vary in format, so each one is loaded and converted to a common layout (a `Country` column plus one indicator column) and then combined into a single dataframe.

In [71]:
# Directory holding the raw CSV files. The trailing slash is required because
# the load cells build each path by appending the file name directly.
DATA_PATH = 'data/'

# Accumulator for the per-indicator dataframes: every load cell adds its frame
# here, and the combine step merges them on the 'Country' column.
datasets = list()

#### Internet Usage Dataset

In [72]:
# Read the internet-usage CSV. The file's first line is skipped and the
# notebook's own column names are applied instead.
file_name = 'EU2019_internet_usage.csv'
internet_usage_df = pd.read_csv(
    DATA_PATH + file_name,
    skiprows=1,
    header=None,
    names=['Country', 'internet_usage'],
)
# Register the frame so the combine step picks it up.
datasets += [internet_usage_df]

print(internet_usage_df)

        Country  internet_usage
0       Austria         87.7522
1       Belgium         90.2754
2      Bulgaria         67.9470
3        Cyprus         86.0636
4       Czechia         80.8669
5       Germany         88.1345
6       Denmark         98.0464
7         Spain         90.7187
8       Estonia         90.2289
9       Finland         89.6074
10       France         83.3397
11       Greece         75.6712
12      Croatia         79.0798
13      Hungary         80.3717
14      Ireland         87.0001
15        Italy         67.8507
16    Lithuania         81.5819
17   Luxembourg         97.1206
18       Latvia         86.1355
19        Malta         85.7786
20  Netherlands         93.2886
21       Poland         80.4359
22     Portugal         75.3464
23      Romania         73.6575
24     Slovenia         83.1084
25       Sweden         94.4934


#### Average Age of Leaving Parental Home Dataset

In [73]:
# Read the average-age-of-leaving-parental-home CSV. The file's first line is
# skipped and the notebook's own column names are applied instead.
file_name = 'EU2019_avg_age_of_leaving_parental_home.csv'
avg_age_leaving_home_df = pd.read_csv(
    DATA_PATH + file_name,
    skiprows=1,
    header=None,
    names=['Country', 'avg_age_leaving_home'],
)
# Register the frame so the combine step picks it up.
datasets += [avg_age_leaving_home_df]

print(avg_age_leaving_home_df)

        Country  avg_age_leaving_home
0       Austria                  25.4
1       Belgium                  25.2
2      Bulgaria                  30.0
3        Cyprus                  27.1
4       Czechia                  25.8
5       Germany                  23.7
6       Denmark                  21.1
8       Estonia                  22.6
10        Spain                  29.5
12      Finland                  21.8
13       France                  23.6
14      Croatia                  31.8
15      Hungary                  27.2
16      Ireland                  26.8
17        Italy                  30.1
18    Lithuania                  25.9
19   Luxembourg                  26.5
20       Latvia                  26.6
23        Malta                  29.3
24  Netherlands                  23.7
25       Poland                  27.1
26     Portugal                  29.0
27      Romania                  28.1
29       Sweden                  17.8
30     Slovenia                  27.7
31     Slova

#### Unemployment Rate Dataset

In [74]:
# Read the unemployment-rate CSV. Unlike the other files this one is
# semicolon-separated; its first line is skipped and the notebook's own
# column names are applied instead.
file_name = 'EU2019_unemployment_rate.csv'
unemployment_rate_df = pd.read_csv(
    DATA_PATH + file_name,
    sep=';',
    skiprows=1,
    header=None,
    names=['Country', 'unemployment_rate'],
).loc[
    # Keep only rows whose rate is not -99 (presumably the file's marker for
    # a missing value — TODO confirm against the data source).
    lambda frame: frame['unemployment_rate'] != -99
]
# Register the filtered frame so the combine step picks it up.
datasets += [unemployment_rate_df]

print(unemployment_rate_df)

        Country  unemployment_rate
0        Greece          17.883330
1         Spain          14.108330
2         Italy           9.933333
3        France           8.433333
4        Sweden           6.908333
5       Finland           6.741667
6      Portugal           6.650000
7        Latvia           6.316667
8     Lithuania           6.283333
9      Slovakia           5.716667
10   Luxembourg           5.591667
11      Belgium           5.366667
12      Denmark           5.025000
13      Ireland           5.008333
14      Austria           4.841667
15      Estonia           4.483333
16     Slovenia           4.441667
17  Netherlands           4.433333
18  Switzerland           4.394452
19       Norway           3.875000
20      Hungary           3.250000
21       Poland           3.233333
22      Germany           2.975000
23      Czechia           2.016667


#### Gini Index Dataset

In [75]:
# Read the Gini-index CSV using its own header row, keep only the country
# name and the 2019 value, and rename both to the notebook's common names.
file_name = 'EU2019_Gini_Index.csv'
gini_index_df = (
    pd.read_csv(DATA_PATH + file_name)
    .loc[:, ['Country Name', 'Gini Index 2019']]
    .rename(columns={'Country Name': 'Country', 'Gini Index 2019': 'gini_index'})
)
# Register the frame so the combine step picks it up.
datasets += [gini_index_df]

print(gini_index_df)

        Country  gini_index
0       Austria        30.2
1       Belgium        27.2
2      Bulgaria        40.3
3        Cyprus        31.2
4       Czechia        25.3
5       Germany        31.8
6       Denmark        27.7
7         Spain        34.3
8       Estonia        30.8
9       Finland        27.7
10       France        31.2
11       Greece        33.1
12      Croatia        28.9
13      Hungary        30.0
14      Ireland        30.8
15        Italy        34.6
16    Lithuania        35.3
17   Luxembourg        34.2
18       Latvia        34.5
19        Malta        31.0
20  Netherlands        29.2
21       Poland        28.8
22     Portugal        32.8
23      Romania        34.8
24     Slovakia        23.2
25     Slovenia        24.4
26       Sweden        29.3


#### Suicide Rate Dataset

In [76]:
# Read the suicide-rate CSV. The file's first line is skipped and the
# notebook's own column names are applied instead.
file_name = 'EU2019_suicide_rates.csv'
suicide_rate_df = pd.read_csv(
    DATA_PATH + file_name,
    skiprows=1,
    header=None,
    names=['Country', 'suicide_rate'],
)
# Register the frame so the combine step picks it up.
datasets += [suicide_rate_df]

print(suicide_rate_df)

        Country  suicide_rate
0       Ireland        9.6200
1        Greece        5.0900
2      Slovenia       19.7829
3        France       13.7586
4     Lithuania       26.1436
5        Poland       11.3017
6    Luxembourg       11.2655
7       Hungary       16.6483
8       Belgium       18.2944
9       Croatia       16.3697
10     Portugal       11.4600
11  Netherlands       11.8436
12      Austria       14.5966
13       Latvia       20.1248
14       Sweden       14.7410
15      Estonia       14.8911
16     Bulgaria        9.7300
17      Germany       12.3139
18        Spain        7.7200
19      Denmark       10.6956
20      Romania        9.7400
21     Slovakia       12.0899
22        Italy        6.6800
23      Finland       15.2946
24       Cyprus        3.6400
25        Malta        6.0600
26      Czechia       12.1774


### Combine datasets

In [78]:
# `datasets` lives for the whole kernel session, so re-running a load cell
# appends a second copy of its frame. Merging two frames that share an
# indicator column produces suffixed duplicates (`<col>_x`, `<col>_y`).
# Keep only the most recently appended frame for each column layout; the dict
# preserves first-insertion order, so a fresh top-to-bottom run is unchanged.
latest_by_columns = {tuple(frame.columns): frame for frame in datasets}

# Outer-join everything on 'Country' so a country missing from one dataset is
# kept, with NaN for that indicator. `validate='one_to_one'` makes pandas raise
# a MergeError if any dataset lists a country more than once, instead of
# silently multiplying rows in the combined frame.
data_df = reduce(
    lambda left, right: pd.merge(
        left, right, on='Country', how='outer', validate='one_to_one'
    ),
    latest_by_columns.values(),
)

print(data_df.shape)
print(data_df)

(29, 6)
        Country  internet_usage  avg_age_leaving_home  unemployment_rate  \
0       Austria         87.7522                  25.4           4.841667   
1       Belgium         90.2754                  25.2           5.366667   
2      Bulgaria         67.9470                  30.0                NaN   
3       Croatia         79.0798                  31.8                NaN   
4        Cyprus         86.0636                  27.1                NaN   
5       Czechia         80.8669                  25.8           2.016667   
6       Denmark         98.0464                  21.1           5.025000   
7       Estonia         90.2289                  22.6           4.483333   
8       Finland         89.6074                  21.8           6.741667   
9        France         83.3397                  23.6           8.433333   
10      Germany         88.1345                  23.7           2.975000   
11       Greece         75.6712                   NaN          17.883330   
12  

## Data Pre-Processing