In [27]:
import pandas as pd

**Employment by age and economic acitivity**

---

The following format is defined:

* geo — analyzed country
* In thousand persons — number of workers in the given sector (in thousands)
* Sex — gender
* Sector — area of employment
* workers_per_population — percentage of the total population working in the given sector


In [28]:
df = pd.read_csv('../../data/employment_by_sex_and_economic_activity.csv')


In [29]:
df.columns

Index(['STRUCTURE', 'STRUCTURE_ID', 'STRUCTURE_NAME', 'freq', 'Time frequency',
       'unit', 'Unit of measure', 'sex', 'Sex', 'age', 'Age class', 'nace_r2',
       'Statistical classification of economic activities in the European Community (NACE Rev. 2)',
       'geo', 'Geopolitical entity (reporting)', 'TIME_PERIOD', 'Time',
       'OBS_VALUE', 'Observation value', 'OBS_FLAG',
       'Observation status (Flag) V2 structure', 'CONF_STATUS',
       'Confidentiality status (flag)'],
      dtype='object')

In [30]:
df = df[["Geopolitical entity (reporting)", "OBS_VALUE", "TIME_PERIOD", "Sex", "Statistical classification of economic activities in the European Community (NACE Rev. 2)"]]


In [31]:
df.head()

Unnamed: 0,Geopolitical entity (reporting),OBS_VALUE,TIME_PERIOD,Sex,Statistical classification of economic activities in the European Community (NACE Rev. 2)
0,Austria,85.3,2008,Females,"Agriculture, forestry and fishing"
1,Austria,85.1,2009,Females,"Agriculture, forestry and fishing"
2,Austria,85.0,2010,Females,"Agriculture, forestry and fishing"
3,Austria,79.6,2011,Females,"Agriculture, forestry and fishing"
4,Austria,72.5,2012,Females,"Agriculture, forestry and fishing"


In [32]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36029 entries, 0 to 36028
Data columns (total 5 columns):
 #   Column                                                                                     Non-Null Count  Dtype  
---  ------                                                                                     --------------  -----  
 0   Geopolitical entity (reporting)                                                            36029 non-null  object 
 1   OBS_VALUE                                                                                  33746 non-null  float64
 2   TIME_PERIOD                                                                                36029 non-null  int64  
 3   Sex                                                                                        36029 non-null  object 
 4   Statistical classification of economic activities in the European Community (NACE Rev. 2)  36029 non-null  object 
dtypes: float64(1), int64(1), object(3)
memory usage

In [33]:
df.rename(columns={'Geopolitical entity (reporting)': 'geo', 'OBS_VALUE': 'In thousand persons', 'TIME_PERIOD': 'year', 'Statistical classification of economic activities in the European Community (NACE Rev. 2)': 'Sector'}, inplace=True)

In [34]:
df[~df['In thousand persons'].isna()]\
    .groupby('geo')\
    .size()\
    .sort_values(ascending=False)


geo
Italy                     1071
Spain                     1067
France                    1061
Switzerland               1052
Czechia                   1051
Germany                   1049
Belgium                   1043
Austria                   1027
Greece                    1026
Romania                   1015
Netherlands               1007
Cyprus                    1007
Türkiye                   1007
Poland                    1006
Bulgaria                   998
Finland                    994
Croatia                    994
Denmark                    985
Portugal                   978
Norway                     977
Hungary                    970
Sweden                     967
Slovakia                   965
Ireland                    958
Latvia                     955
Slovenia                   954
Lithuania                  938
Estonia                    935
Luxembourg                 934
Serbia                     897
Malta                      893
Iceland                    865
Unit

In [35]:
def fill_neighbors_only(group):
    vals = group['In thousand persons'].copy()
    for i in range(1, len(vals)-1):
        if pd.isna(vals.iloc[i]) and pd.notna(vals.iloc[i-1]) and pd.notna(vals.iloc[i+1]):
            vals.iloc[i] = (vals.iloc[i-1] + vals.iloc[i+1]) / 2
    group['In thousand persons'] = vals
    return group

df = df.sort_values(['geo', 'Sex', 'year'])

df = df.groupby(['geo', 'Sex'], group_keys=False).apply(fill_neighbors_only)


  df = df.groupby(['geo', 'Sex'], group_keys=False).apply(fill_neighbors_only)


In [36]:
df.info()


<class 'pandas.core.frame.DataFrame'>
Index: 36029 entries, 0 to 36028
Data columns (total 5 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   geo                  36029 non-null  object 
 1   In thousand persons  35012 non-null  float64
 2   year                 36029 non-null  int64  
 3   Sex                  36029 non-null  object 
 4   Sector               36029 non-null  object 
dtypes: float64(1), int64(1), object(3)
memory usage: 1.6+ MB


In [37]:
df[df['In thousand persons'].isna()]\
    .groupby('Sex')\
    .size()\
    .sort_values(ascending=False)


Sex
Females    425
Males      405
Total      187
dtype: int64

usunięcie nulli tam gdzie Sex nie Total

In [38]:
df = df[~((df['Sex'] != 'Total') & df.isna().any(axis=1))]

In [39]:
df[df['In thousand persons'].isna()]\
    .groupby('year')\
    .size()\
    .sort_values(ascending=False)


year
2024    29
2023    18
2022    18
2021    16
2020    14
2019    14
2015    10
2008     8
2009     8
2011     8
2012     8
2017     8
2016     8
2018     8
2010     6
2013     4
2014     2
dtype: int64

In [40]:
df.dropna(inplace=True)

In [41]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 35012 entries, 0 to 36028
Data columns (total 5 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   geo                  35012 non-null  object 
 1   In thousand persons  35012 non-null  float64
 2   year                 35012 non-null  int64  
 3   Sex                  35012 non-null  object 
 4   Sector               35012 non-null  object 
dtypes: float64(1), int64(1), object(3)
memory usage: 1.6+ MB


In [42]:
pop_cit = pd.read_csv('../../processed_data/country_population.csv')

In [43]:
merged = pd.merge(df, pop_cit, on=['geo', 'year'], how='left')
merged.head()

Unnamed: 0,geo,In thousand persons,year,Sex,Sector,Population number
0,Austria,85.3,2008,Females,"Agriculture, forestry and fishing",8307989.0
1,Austria,122.55,2008,Females,Mining and quarrying,8307989.0
2,Austria,159.8,2008,Females,Manufacturing,8307989.0
3,Austria,4.3,2008,Females,"Electricity, gas, steam and air conditioning s...",8307989.0
4,Austria,4.1,2008,Females,"Water supply; sewerage, waste management and r...",8307989.0


In [44]:
merged['workers_per_population'] = merged['In thousand persons'] * 1000 / merged['Population number'] * 100

In [45]:
merged.head()

Unnamed: 0,geo,In thousand persons,year,Sex,Sector,Population number,workers_per_population
0,Austria,85.3,2008,Females,"Agriculture, forestry and fishing",8307989.0,1.026723
1,Austria,122.55,2008,Females,Mining and quarrying,8307989.0,1.475086
2,Austria,159.8,2008,Females,Manufacturing,8307989.0,1.92345
3,Austria,4.3,2008,Females,"Electricity, gas, steam and air conditioning s...",8307989.0,0.051757
4,Austria,4.1,2008,Females,"Water supply; sewerage, waste management and r...",8307989.0,0.04935


In [46]:
merged.drop(columns=['Population number'], inplace=True)

In [47]:
merged.to_csv("../../processed_data/employmnet_economic_activity.csv", index=False)