In [71]:
import pandas as pd

**Employment rates by citizenship and sex**

---

**Employment rate** – a statistical ratio that measures the proportion of a country's working-age population that is employed (statistics are often given for ages 15 to 64).

- Citizenship: Foreign country, Reporting country, Total
- Sex: Female, Male, Total

In [72]:
# Load the raw quarterly employment-rate export. The path is relative, so it
# resolves against the directory the notebook kernel is started from.
df = pd.read_csv('../../data/employment_rates_by_sex_and_citizenship.csv')

In [73]:
# Preview the first rows to see which columns the raw export provides.
df.head()

Unnamed: 0,STRUCTURE,STRUCTURE_ID,STRUCTURE_NAME,freq,Time frequency,unit,Unit of measure,sex,Sex,age,...,geo,Geopolitical entity (reporting),TIME_PERIOD,Time,OBS_VALUE,Observation value,OBS_FLAG,Observation status (Flag) V2 structure,CONF_STATUS,Confidentiality status (flag)
0,dataflow,ESTAT:LFSQ_ERGAN(1.0),"Employment rates by sex, age and citizenship (%)",Q,Quarterly,PC,Percentage,F,Females,Y15-64,...,AT,Austria,1998-Q1,,57.6,,,,,
1,dataflow,ESTAT:LFSQ_ERGAN(1.0),"Employment rates by sex, age and citizenship (%)",Q,Quarterly,PC,Percentage,F,Females,Y15-64,...,AT,Austria,1999-Q1,,55.7,,,,,
2,dataflow,ESTAT:LFSQ_ERGAN(1.0),"Employment rates by sex, age and citizenship (%)",Q,Quarterly,PC,Percentage,F,Females,Y15-64,...,AT,Austria,2000-Q1,,58.6,,,,,
3,dataflow,ESTAT:LFSQ_ERGAN(1.0),"Employment rates by sex, age and citizenship (%)",Q,Quarterly,PC,Percentage,F,Females,Y15-64,...,AT,Austria,2000-Q2,,57.5,,,,,
4,dataflow,ESTAT:LFSQ_ERGAN(1.0),"Employment rates by sex, age and citizenship (%)",Q,Quarterly,PC,Percentage,F,Females,Y15-64,...,AT,Austria,2001-Q1,,57.4,,,,,


In [74]:
# Column names, dtypes and non-null counts of the raw quarterly frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28473 entries, 0 to 28472
Data columns (total 23 columns):
 #   Column                                  Non-Null Count  Dtype  
---  ------                                  --------------  -----  
 0   STRUCTURE                               28473 non-null  object 
 1   STRUCTURE_ID                            28473 non-null  object 
 2   STRUCTURE_NAME                          28473 non-null  object 
 3   freq                                    28473 non-null  object 
 4   Time frequency                          28473 non-null  object 
 5   unit                                    28473 non-null  object 
 6   Unit of measure                         28473 non-null  object 
 7   sex                                     28473 non-null  object 
 8   Sex                                     28473 non-null  object 
 9   age                                     28473 non-null  object 
 10  Age class                               28473 non-null  ob

In [75]:
# Keep only the dimensions and the measure needed downstream; the remaining
# columns of the raw export are dropped.
quarterly_columns = [
    "Country of citizenship",
    "Geopolitical entity (reporting)",
    "OBS_VALUE",
    "TIME_PERIOD",
    "Sex",
]
df = df[quarterly_columns]


In [76]:
# Sanity-check the reduced frame after column selection.
df.head()

Unnamed: 0,Country of citizenship,Geopolitical entity (reporting),OBS_VALUE,TIME_PERIOD,Sex
0,Foreign country,Austria,57.6,1998-Q1,Females
1,Foreign country,Austria,55.7,1999-Q1,Females
2,Foreign country,Austria,58.6,2000-Q1,Females
3,Foreign country,Austria,57.5,2000-Q2,Females
4,Foreign country,Austria,57.4,2001-Q1,Females


In [77]:
# Non-null counts show which of the retained columns contain missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28473 entries, 0 to 28472
Data columns (total 5 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   Country of citizenship           28473 non-null  object 
 1   Geopolitical entity (reporting)  28473 non-null  object 
 2   OBS_VALUE                        27267 non-null  float64
 3   TIME_PERIOD                      28473 non-null  object 
 4   Sex                              28473 non-null  object 
dtypes: float64(1), object(4)
memory usage: 1.1+ MB


In [78]:
# Replace the verbose export headers with short snake_case names.
# "Sex" is intentionally left untouched.
quarterly_column_names = {
    "Country of citizenship": "citizenship",
    "Geopolitical entity (reporting)": "geo",
    "OBS_VALUE": "employment_rate",
    "TIME_PERIOD": "quarter",
}
df = df.rename(columns=quarterly_column_names)

In [79]:
def quarter_to_datetime(q):
    """Convert a quarter label such as ``"1998-Q1"`` to a timestamp.

    The first four characters are taken as the year and the last character
    as the quarter number. The result is the first day of the quarter's
    first month (Q1 -> January, Q2 -> April, Q3 -> July, Q4 -> October).

    If the last character is not one of ``'1'``-``'4'``, the month silently
    falls back to January instead of raising.

    Parameters
    ----------
    q : str
        Quarter label; only ``q[:4]`` and ``q[-1]`` are inspected.

    Returns
    -------
    pandas.Timestamp
        Midnight on the first day of the resolved month.
    """
    first_month_of_quarter = {'1': 1, '2': 4, '3': 7, '4': 10}
    year_part = q[:4]
    quarter_digit = q[-1]
    # Unknown quarter digits map to January (month 1).
    month_number = first_month_of_quarter.get(quarter_digit, 1)
    return pd.to_datetime(f"{year_part}-{month_number:02d}-01")

# One time series per (country, citizenship group, sex).
quarterly_series_keys = ['geo', 'citizenship', 'Sex']


def _fill_quarterly_gaps(rates):
    """Linearly interpolate missing rates, then pad the edges of the series."""
    # Interpolation works on row position within the group, not on the
    # calendar distance between quarters. ffill/bfill then copy the nearest
    # observation into any gaps left at the end/start of the series.
    return rates.interpolate(method='linear').ffill().bfill()


# Sortable timestamp for every quarter label.
df['date'] = df['quarter'].apply(quarter_to_datetime)

# Each series must be in chronological order before gaps are filled.
df = df.sort_values(by=quarterly_series_keys + ['date'])
df['employment_rate'] = (
    df.groupby(quarterly_series_keys)['employment_rate']
      .transform(_fill_quarterly_gaps)
)

In [80]:
# Re-check non-null counts to see how many employment_rate gaps remain after filling.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 28473 entries, 0 to 28472
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   citizenship      28473 non-null  object        
 1   geo              28473 non-null  object        
 2   employment_rate  28333 non-null  float64       
 3   quarter          28473 non-null  object        
 4   Sex              28473 non-null  object        
 5   date             28473 non-null  datetime64[ns]
dtypes: datetime64[ns](1), float64(1), object(4)
memory usage: 1.5+ MB


In [81]:
# Drop the rows that are still missing a value after gap filling (a group
# stays NaN only when it has no observation to interpolate from).
# Rebinding instead of `inplace=True` avoids mutating a frame that earlier
# cells already displayed and keeps the step chainable.
df = df.dropna()

In [82]:
# Final look at the cleaned quarterly frame before it is written out.
df.head()

Unnamed: 0,citizenship,geo,employment_rate,quarter,Sex,date
0,Foreign country,Austria,57.6,1998-Q1,Females,1998-01-01
1,Foreign country,Austria,55.7,1999-Q1,Females,1999-01-01
2,Foreign country,Austria,58.6,2000-Q1,Females,2000-01-01
3,Foreign country,Austria,57.5,2000-Q2,Females,2000-04-01
4,Foreign country,Austria,57.4,2001-Q1,Females,2001-01-01


In [83]:
# Persist the cleaned quarterly data without the pandas index.
# NOTE(review): "employmnet" in the file name looks like a typo of
# "employment"; other code may already read this exact name, so confirm
# before renaming.
df.to_csv("../../processed_data/employmnet_rates.csv", index=False)

**Employment rates by citizenship and sex**

---
Annual data

In [85]:
# Load the raw annual employment-rate export (path relative to the kernel's
# working directory).
an_df = pd.read_csv("../../data/employment_rates_yearly.csv")

In [86]:
# Preview the first rows of the raw annual export.
an_df.head()

Unnamed: 0,STRUCTURE,STRUCTURE_ID,STRUCTURE_NAME,freq,Time frequency,unit,Unit of measure,sex,Sex,age,...,geo,Geopolitical entity (reporting),TIME_PERIOD,Time,OBS_VALUE,Observation value,OBS_FLAG,Observation status (Flag) V2 structure,CONF_STATUS,Confidentiality status (flag)
0,dataflow,ESTAT:LFSA_ERGAN(1.0),"Employment rates by sex, age and citizenship (%)",A,Annual,PC,Percentage,F,Females,Y15-64,...,AT,Austria,1995,,59.3,,,,,
1,dataflow,ESTAT:LFSA_ERGAN(1.0),"Employment rates by sex, age and citizenship (%)",A,Annual,PC,Percentage,F,Females,Y15-64,...,AT,Austria,1996,,55.4,,,,,
2,dataflow,ESTAT:LFSA_ERGAN(1.0),"Employment rates by sex, age and citizenship (%)",A,Annual,PC,Percentage,F,Females,Y15-64,...,AT,Austria,1997,,57.7,,,,,
3,dataflow,ESTAT:LFSA_ERGAN(1.0),"Employment rates by sex, age and citizenship (%)",A,Annual,PC,Percentage,F,Females,Y15-64,...,AT,Austria,1998,,57.6,,,,,
4,dataflow,ESTAT:LFSA_ERGAN(1.0),"Employment rates by sex, age and citizenship (%)",A,Annual,PC,Percentage,F,Females,Y15-64,...,AT,Austria,1999,,55.7,,,,,


In [87]:
# Column names, dtypes and non-null counts of the raw annual frame.
an_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7983 entries, 0 to 7982
Data columns (total 23 columns):
 #   Column                                  Non-Null Count  Dtype  
---  ------                                  --------------  -----  
 0   STRUCTURE                               7983 non-null   object 
 1   STRUCTURE_ID                            7983 non-null   object 
 2   STRUCTURE_NAME                          7983 non-null   object 
 3   freq                                    7983 non-null   object 
 4   Time frequency                          7983 non-null   object 
 5   unit                                    7983 non-null   object 
 6   Unit of measure                         7983 non-null   object 
 7   sex                                     7983 non-null   object 
 8   Sex                                     7983 non-null   object 
 9   age                                     7983 non-null   object 
 10  Age class                               7983 non-null   obje

In [88]:
# Keep the same five columns as for the quarterly data; everything else in
# the raw export is dropped.
annual_columns = [
    "Country of citizenship",
    "Geopolitical entity (reporting)",
    "OBS_VALUE",
    "TIME_PERIOD",
    "Sex",
]
an_df = an_df[annual_columns]


In [89]:
# Count missing values per retained column before any gap filling.
an_df.isna().sum()

Country of citizenship               0
Geopolitical entity (reporting)      0
OBS_VALUE                          188
TIME_PERIOD                          0
Sex                                  0
dtype: int64

In [90]:
# Sanity-check the reduced annual frame.
an_df.head()

Unnamed: 0,Country of citizenship,Geopolitical entity (reporting),OBS_VALUE,TIME_PERIOD,Sex
0,Foreign country,Austria,59.3,1995,Females
1,Foreign country,Austria,55.4,1996,Females
2,Foreign country,Austria,57.7,1997,Females
3,Foreign country,Austria,57.6,1998,Females
4,Foreign country,Austria,55.7,1999,Females


In [91]:
# Replace the verbose export headers with short snake_case names; the time
# column is called "year" here because the annual export has one row per year.
annual_column_names = {
    "Country of citizenship": "citizenship",
    "Geopolitical entity (reporting)": "geo",
    "OBS_VALUE": "employment_rate",
    "TIME_PERIOD": "year",
}
an_df = an_df.rename(columns=annual_column_names)

In [93]:
# One time series per (country, citizenship group, sex), ordered by year so
# that gap filling runs chronologically.
annual_series_keys = ['geo', 'citizenship', 'Sex']
an_df = an_df.sort_values(by=annual_series_keys + ['year'])

# Interpolate linearly inside each series, then copy the nearest observation
# into gaps left at the end/start. A series with no observation stays NaN.
annual_rates_by_series = an_df.groupby(annual_series_keys)['employment_rate']
an_df['employment_rate'] = annual_rates_by_series.transform(
    lambda series: series.interpolate(method='linear').ffill().bfill()
)

In [94]:
# Count the missing values that remain after gap filling.
an_df.isna().sum()

citizenship         0
geo                 0
employment_rate    39
year                0
Sex                 0
dtype: int64

In [95]:
# Countries to inspect more closely.
selected_countries = [
    "Poland", "Bulgaria", "Romania",
    "Hungary", "France", "Italy",
    "Germany", "Sweden", "Spain",
]

# Restrict the annual data to those countries, from 2005 onwards.
in_selected_country = an_df['geo'].isin(selected_countries)
from_2005_onwards = an_df['year'].ge(2005)
temp = an_df[in_selected_country & from_2005_onwards]

In [96]:
# Foreign-citizen rows for Bulgaria. Rows that are still NaN after gap filling
# belong to series that had no observation to interpolate from.
temp[temp['citizenship'].eq('Foreign country') & temp['geo'].eq('Bulgaria')]

Unnamed: 0,citizenship,geo,employment_rate,year,Sex
68,Foreign country,Bulgaria,,2005,Females
69,Foreign country,Bulgaria,,2006,Females
70,Foreign country,Bulgaria,,2007,Females
71,Foreign country,Bulgaria,,2008,Females
72,Foreign country,Bulgaria,,2009,Females
73,Foreign country,Bulgaria,,2010,Females
74,Foreign country,Bulgaria,,2011,Females
75,Foreign country,Bulgaria,,2012,Females
76,Foreign country,Bulgaria,,2013,Females
77,Foreign country,Bulgaria,,2014,Females


For `Sex == Total`, data is available for almost all years.

In [97]:
# Foreign-citizen rows for Romania.
# NOTE(review): an identical value repeated over many consecutive years
# suggests the series was padded from very few real observations by the
# ffill/bfill step - confirm before treating these values as measurements.
temp[temp['citizenship'].eq('Foreign country') & temp['geo'].eq('Romania')]

Unnamed: 0,citizenship,geo,employment_rate,year,Sex
724,Foreign country,Romania,56.5,2005,Females
725,Foreign country,Romania,56.5,2006,Females
726,Foreign country,Romania,56.5,2007,Females
727,Foreign country,Romania,56.5,2008,Females
728,Foreign country,Romania,56.5,2009,Females
729,Foreign country,Romania,56.5,2010,Females
730,Foreign country,Romania,56.5,2011,Females
731,Foreign country,Romania,56.5,2012,Females
732,Foreign country,Romania,56.5,2013,Females
733,Foreign country,Romania,56.5,2014,Females


In [98]:
# Persist the gap-filled annual data without the pandas index.
# NOTE(review): the null check after gap filling still reports missing
# employment_rate values, and no dropna is applied to an_df in the cells
# shown, so those rows appear to be written to the file - confirm this is
# intended.
# NOTE(review): "employmnet" in the file name looks like a typo of
# "employment"; other code may already read this exact name, so confirm
# before renaming.
an_df.to_csv("../../processed_data/employmnet_rates_yearly.csv", index=False)