In [1]:
import pandas as pd
from collections import Counter

In [13]:
country_full = {
    'AT':'Austria', 'BE':'Belgium', 'BG':'Bulgaria', 'HR':'Croatia', 'CY': 'Cyprus', 
    'CZ':'Czech Republic', 'DK':'Denmark', 'EE':'Estonia','FI':'Finland', 'FR':'France',
    'DE':'Germany', 'GR':'Greece', 'HU':'Hungary', 'IE':'Ireland','IT':'Italy', 
    'LV':'Latvia', 'LT':'Lithuania','LU':'Luxembourg', 'MT':'Malta', 'NL':'Netherlands', 
    'PL':'Poland', 'PT':'Portugal', 'RO':'Romania', 'SK':'Slovakia', 'SI':'Slovenia',
    'ES':'Spain', 'SE':'Sweden'}

### country job vacancies

In [165]:
file = 'data/job_vacancies.tsv'
df = pd.read_table(file)

In [166]:
df.columns = [col.strip() for col in df.columns]

In [167]:
df[['s_adj','nace_r2','sizeclas','index', 'country']] = df[df.columns[0]].str.split(',', expand=True)
df = df.drop(df.columns[0], axis=1)

In [163]:
# replacing with full names of the countries
df['country'] = df['country'].replace(country_full)

In [168]:
df['country'].values

array(['AT', 'BE', 'BG', 'CH', 'CY', 'CZ', 'DE', 'EA19', 'EA20', 'EE',
       'EL', 'ES', 'EU27_2020', 'FI', 'HR', 'HU', 'IE', 'IS', 'IT', 'LT',
       'LU', 'LV', 'MK', 'MT', 'NL', 'NO', 'PL', 'PT', 'RO', 'SE', 'SI',
       'SK', 'AT', 'BE', 'BG', 'CH', 'CY', 'CZ', 'DE', 'EA19', 'EA20',
       'EE', 'EL', 'ES', 'EU27_2020', 'FI', 'HR', 'HU', 'IE', 'IS', 'IT',
       'LT', 'LU', 'LV', 'MK', 'MT', 'NL', 'NO', 'PL', 'PT', 'RO', 'SE',
       'SI', 'SK'], dtype=object)

In [144]:
# leaving only the required countries
df = df[df['country'].isin(list(country_full.values()))]

In [145]:
# leaving only the year 2022 and the required columns
df = df[['2022Q1', '2022Q2','2022Q3','2022Q4','2023Q1', '2023Q2', 'index','country']]

In [146]:
# removing Job Vacancy because it has null values
df = df[(df['index']=='JOBRATE')]

In [147]:
columns_to_process = ['2022Q1', '2022Q2','2022Q3','2022Q4','2023Q1', '2023Q2']
for column in columns_to_process:
    df[column] = df[column].str.replace(' p', '').astype(float)

In [148]:
df = df.drop(columns=['index'])

In [149]:
df = df.sort_values(by='country')

In [150]:
df

Unnamed: 0,2022Q1,2022Q2,2022Q3,2022Q4,2023Q1,2023Q2,country
0,4.7,4.8,5.0,4.6,4.7,4.4,Austria
1,4.8,5.0,4.9,4.5,4.7,4.6,Belgium
2,0.9,0.9,0.8,0.8,0.9,0.8,Bulgaria
14,2.1,1.7,1.2,1.4,2.1,1.7,Croatia
4,3.2,2.7,2.2,1.9,2.8,2.9,Cyprus
5,5.3,4.9,4.5,4.2,3.8,3.7,Czech Republic
9,2.1,2.1,2.1,1.8,1.9,1.8,Estonia
13,3.9,3.0,2.5,2.0,3.3,2.2,Finland
6,4.1,4.5,4.3,4.4,4.1,4.1,Germany
15,2.7,3.0,2.8,2.5,2.6,2.5,Hungary


In [151]:
df.to_csv('data/job_vacancies.csv', index=False)

### country healthcare

In [25]:
file = 'data/country_healthcare.csv'
df = pd.read_csv(file)

In [26]:
df = df[df['name'].isin(list(country_full.values()))]

In [27]:
df

Unnamed: 0,skill_and_competency,cost,responsiveness_waitings,index_healthcare,speed,accuracy_and_completeness,friendliness_and_courtesy,modern_equipment,name,monthLastUpdate,location,contributors,yearLastUpdate,percentage_insurance_employer,percentage_insurance_private,percentage_insurance_public,percentage_insurance_none
0,1.113565,1.127796,0.444444,76.423305,1.016026,1.113333,0.838608,1.436893,Austria,9,1.29393,318,2023,13.207547,8.176101,77.044025,1.572327
1,1.155894,0.911538,0.532567,75.303587,0.908397,1.062016,0.938697,1.474903,Belgium,10,1.21374,263,2023,24.334601,20.532319,53.231939,1.901141
2,0.41039,0.46114,-0.288714,57.150219,0.412533,0.149051,-0.098958,0.112,Bulgaria,10,0.955497,389,2023,25.449871,9.768638,57.840617,6.940874
3,0.813896,1.022388,-0.45679,64.812469,0.064516,0.6075,0.57284,0.621554,Croatia,10,1.064198,408,2023,15.686275,7.352941,75.735294,1.22549
4,0.149606,0.28,-0.162602,56.007964,0.137097,0.108333,0.377953,0.358333,Cyprus,10,0.634146,128,2023,21.875,27.34375,37.5,13.28125
5,1.1939,1.419426,0.194323,75.371738,0.842451,0.948956,0.479212,1.298441,Czech Republic,10,1.337691,463,2023,13.174946,7.775378,77.537797,1.511879
6,1.074561,1.554545,0.292237,78.303079,0.65,1.013953,1.283186,1.533937,Denmark,10,1.232143,230,2023,7.826087,5.652174,78.26087,8.26087
7,1.081731,1.310345,0.11165,75.84366,0.786408,1.155,0.729469,1.485437,Estonia,9,1.333333,209,2023,7.177033,3.827751,80.382775,8.61244
8,1.192053,1.170068,0.42953,77.380389,0.886667,1.176471,0.996644,1.562712,Finland,10,1.272727,304,2023,19.078947,11.513158,56.578947,12.828947
9,1.30625,1.345912,0.454737,78.521358,1.018789,1.160944,0.880503,1.43617,France,10,1.318471,484,2023,29.752066,14.669421,53.099174,2.479339


In [28]:
df = df[['name','percentage_insurance_employer', 'percentage_insurance_private', 'percentage_insurance_public', 'percentage_insurance_none']]

In [30]:
df.columns

Index(['name', 'percentage_insurance_employer', 'percentage_insurance_private',
       'percentage_insurance_public', 'percentage_insurance_none'],
      dtype='object')

In [32]:
df = df.rename(columns={'percentage_insurance_employer':"by_employer", 'percentage_insurance_private':"private",
       'percentage_insurance_public':"by_public", 'percentage_insurance_none':"others"})

In [7]:
# Reshape the DataFrame using melt
df_melted = pd.melt(df, id_vars=['name'], var_name='insurance_source', value_name='proportion')

# Map the insurance sources to desired names
source_mapping = {
    'percentage_insurance_employer': 'employer',
    'percentage_insurance_private': 'private',
    'percentage_insurance_public': 'public',
    'percentage_insurance_none': 'others'
}

df_melted['insurance_source'] = df_melted['insurance_source'].map(source_mapping)

# Format the proportion column
df_melted['proportion'] = df_melted['proportion'].apply(lambda x: f'{x:.3f}')

# Print the result
print(df_melted)

         name insurance_source proportion
0     Austria         employer     13.208
1     Belgium         employer     24.335
2    Bulgaria         employer     25.450
3     Croatia         employer     15.686
4      Cyprus         employer     21.875
..        ...              ...        ...
103   Romania           others      5.855
104  Slovakia           others      1.807
105  Slovenia           others      1.081
106     Spain           others      6.342
107    Sweden           others      8.061

[108 rows x 3 columns]


In [8]:
df_melted

Unnamed: 0,name,insurance_source,proportion
0,Austria,employer,13.208
1,Belgium,employer,24.335
2,Bulgaria,employer,25.450
3,Croatia,employer,15.686
4,Cyprus,employer,21.875
...,...,...,...
103,Romania,others,5.855
104,Slovakia,others,1.807
105,Slovenia,others,1.081
106,Spain,others,6.342


In [23]:
df = df.iloc[:2]

In [33]:
df.to_csv('data/insurance_proportion.csv', index=False)