In [1]:
import pandas as pd

**Unemployment rates by citizenship and sex**

---

Unemployment rate (Eurostat) is calculated according to the International Labour Organization (ILO) definition as:

Unemployment rate = (Number of unemployed people / Labour force )× 100 %

Unemployed people: Those who are not working, actively seeking work in the last 4 weeks, and available to start within 2 weeks.

Labour force: Sum of employed and unemployed people.

In [18]:
df = pd.read_csv('../../data/unemployment_rates_by_sex_and_citizenship.csv')

In [19]:
df.head()

Unnamed: 0,STRUCTURE,STRUCTURE_ID,STRUCTURE_NAME,freq,Time frequency,unit,Unit of measure,sex,Sex,age,...,geo,Geopolitical entity (reporting),TIME_PERIOD,Time,OBS_VALUE,Observation value,OBS_FLAG,Observation status (Flag) V2 structure,CONF_STATUS,Confidentiality status (flag)
0,dataflow,ESTAT:LFSQ_URGAN(1.0),"Unemployment rates by sex, age and citizenship...",Q,Quarterly,PC,Percentage,F,Females,Y15-64,...,AT,Austria,1998-Q1,,8.9,,,,,
1,dataflow,ESTAT:LFSQ_URGAN(1.0),"Unemployment rates by sex, age and citizenship...",Q,Quarterly,PC,Percentage,F,Females,Y15-64,...,AT,Austria,1999-Q1,,9.2,,,,,
2,dataflow,ESTAT:LFSQ_URGAN(1.0),"Unemployment rates by sex, age and citizenship...",Q,Quarterly,PC,Percentage,F,Females,Y15-64,...,AT,Austria,2000-Q1,,9.0,,,,,
3,dataflow,ESTAT:LFSQ_URGAN(1.0),"Unemployment rates by sex, age and citizenship...",Q,Quarterly,PC,Percentage,F,Females,Y15-64,...,AT,Austria,2000-Q2,,5.7,,u,low reliability,,
4,dataflow,ESTAT:LFSQ_URGAN(1.0),"Unemployment rates by sex, age and citizenship...",Q,Quarterly,PC,Percentage,F,Females,Y15-64,...,AT,Austria,2001-Q1,,7.9,,,,,


In [20]:
df = df[["Country of citizenship", "Geopolitical entity (reporting)", "OBS_VALUE", "TIME_PERIOD", "Sex"]]


In [21]:
df.head()

Unnamed: 0,Country of citizenship,Geopolitical entity (reporting),OBS_VALUE,TIME_PERIOD,Sex
0,Foreign country,Austria,8.9,1998-Q1,Females
1,Foreign country,Austria,9.2,1999-Q1,Females
2,Foreign country,Austria,9.0,2000-Q1,Females
3,Foreign country,Austria,5.7,2000-Q2,Females
4,Foreign country,Austria,7.9,2001-Q1,Females


In [22]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28473 entries, 0 to 28472
Data columns (total 5 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   Country of citizenship           28473 non-null  object 
 1   Geopolitical entity (reporting)  28473 non-null  object 
 2   OBS_VALUE                        25355 non-null  float64
 3   TIME_PERIOD                      28473 non-null  object 
 4   Sex                              28473 non-null  object 
dtypes: float64(1), object(4)
memory usage: 1.1+ MB


In [23]:
df = df.rename(columns={
    "Geopolitical entity (reporting)": "geo",
    "TIME_PERIOD": "quarter",
    "OBS_VALUE": "unemployment_rate", 
    "Country of citizenship": "citizenship"
})

In [25]:
def quarter_to_datetime(q):
    year, qtr = q[:4], q[-1]
    month = {'1': '01', '2': '04', '3': '07', '4': '10'}.get(qtr, '01')
    return pd.to_datetime(f"{year}-{month}-01")

df['date'] = df['quarter'].apply(quarter_to_datetime)

df = df.sort_values(by=['geo', 'citizenship', 'Sex', 'date'])
df['unemployment_rate'] = df.groupby(['geo', 'citizenship', 'Sex'])['unemployment_rate']\
                                 .transform(lambda group: group.interpolate(method='linear'))

In [26]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 28473 entries, 0 to 28472
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   citizenship        28473 non-null  object        
 1   geo                28473 non-null  object        
 2   unemployment_rate  26096 non-null  float64       
 3   quarter            28473 non-null  object        
 4   Sex                28473 non-null  object        
 5   date               28473 non-null  datetime64[ns]
dtypes: datetime64[ns](1), float64(1), object(4)
memory usage: 1.5+ MB


In [27]:
df.dropna(inplace=True)

In [28]:
df.to_csv("../../processed_data/unemploymnet_rates.csv", index=False)

**Unemployment rates by citizenship and sex**

---

Yearly

In [2]:
an_df = pd.read_csv("../../data/unemployment_rates_yearly.csv")


In [3]:
an_df.head()

Unnamed: 0,STRUCTURE,STRUCTURE_ID,STRUCTURE_NAME,freq,Time frequency,unit,Unit of measure,sex,Sex,age,...,geo,Geopolitical entity (reporting),TIME_PERIOD,Time,OBS_VALUE,Observation value,OBS_FLAG,Observation status (Flag) V2 structure,CONF_STATUS,Confidentiality status (flag)
0,dataflow,ESTAT:LFSA_URGAN(1.0),"Unemployment rates by sex, age and citizenship...",A,Annual,PC,Percentage,F,Females,Y15-64,...,AT,Austria,1995,,7.8,,,,,
1,dataflow,ESTAT:LFSA_URGAN(1.0),"Unemployment rates by sex, age and citizenship...",A,Annual,PC,Percentage,F,Females,Y15-64,...,AT,Austria,1996,,9.4,,,,,
2,dataflow,ESTAT:LFSA_URGAN(1.0),"Unemployment rates by sex, age and citizenship...",A,Annual,PC,Percentage,F,Females,Y15-64,...,AT,Austria,1997,,9.4,,,,,
3,dataflow,ESTAT:LFSA_URGAN(1.0),"Unemployment rates by sex, age and citizenship...",A,Annual,PC,Percentage,F,Females,Y15-64,...,AT,Austria,1998,,8.9,,,,,
4,dataflow,ESTAT:LFSA_URGAN(1.0),"Unemployment rates by sex, age and citizenship...",A,Annual,PC,Percentage,F,Females,Y15-64,...,AT,Austria,1999,,9.2,,,,,


In [4]:
an_df = an_df[["Country of citizenship", "Geopolitical entity (reporting)", "OBS_VALUE", "TIME_PERIOD", "Sex"]]

In [5]:
an_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7983 entries, 0 to 7982
Data columns (total 5 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   Country of citizenship           7983 non-null   object 
 1   Geopolitical entity (reporting)  7983 non-null   object 
 2   OBS_VALUE                        7253 non-null   float64
 3   TIME_PERIOD                      7983 non-null   int64  
 4   Sex                              7983 non-null   object 
dtypes: float64(1), int64(1), object(3)
memory usage: 312.0+ KB


In [6]:
an_df = an_df.rename(columns={
    "Geopolitical entity (reporting)": "geo",
    "TIME_PERIOD": "year",
    "OBS_VALUE": "employment_rate", 
    "Country of citizenship": "citizenship"
})


In [9]:
an_df[(an_df['Sex'] == 'Total') & an_df['employment_rate'].isna()]

Unnamed: 0,citizenship,geo,employment_rate,year,Sex
5352,Foreign country,Bosnia and Herzegovina,,2021,Total
5353,Foreign country,Bosnia and Herzegovina,,2022,Total
5354,Foreign country,Bosnia and Herzegovina,,2023,Total
5355,Foreign country,Bosnia and Herzegovina,,2024,Total
5386,Foreign country,Bulgaria,,2001,Total
...,...,...,...,...,...
6151,Foreign country,Slovakia,,2022,Total
6152,Foreign country,Slovakia,,2023,Total
6153,Foreign country,Slovakia,,2024,Total
6296,Reporting country,Cyprus,,1999,Total


In [None]:
selected_countries = ["Poland", "Bulgaria", "Romania", "Hungary", "France", "Italy", "Germany", "Sweden", "Spain"]
temp = an_df[(an_df['geo'].isin(selected_countries)) & (an_df['year'] >= 2005)]

In [12]:
temp[(temp['citizenship'] == 'Foreign country') & (temp['geo'] == 'Bulgaria')]


Unnamed: 0,citizenship,geo,employment_rate,year,Sex
68,Foreign country,Bulgaria,,2005,Females
69,Foreign country,Bulgaria,,2006,Females
70,Foreign country,Bulgaria,,2007,Females
71,Foreign country,Bulgaria,,2008,Females
72,Foreign country,Bulgaria,,2009,Females
73,Foreign country,Bulgaria,,2010,Females
74,Foreign country,Bulgaria,,2011,Females
75,Foreign country,Bulgaria,,2012,Females
76,Foreign country,Bulgaria,,2013,Females
77,Foreign country,Bulgaria,,2014,Females


In [13]:
temp[(temp['citizenship'] == 'Foreign country') & (temp['geo'] == 'Romania')]


Unnamed: 0,citizenship,geo,employment_rate,year,Sex
724,Foreign country,Romania,,2005,Females
725,Foreign country,Romania,,2006,Females
726,Foreign country,Romania,,2007,Females
727,Foreign country,Romania,,2008,Females
728,Foreign country,Romania,,2009,Females
729,Foreign country,Romania,,2010,Females
730,Foreign country,Romania,,2011,Females
731,Foreign country,Romania,,2012,Females
732,Foreign country,Romania,,2013,Females
733,Foreign country,Romania,,2014,Females


In [14]:
temp[(temp['citizenship'] == 'Foreign country') & (temp['geo'] == 'Poland')]


Unnamed: 0,citizenship,geo,employment_rate,year,Sex
677,Foreign country,Poland,,2005,Females
678,Foreign country,Poland,,2006,Females
679,Foreign country,Poland,,2007,Females
680,Foreign country,Poland,,2008,Females
681,Foreign country,Poland,,2009,Females
682,Foreign country,Poland,,2010,Females
683,Foreign country,Poland,,2011,Females
684,Foreign country,Poland,,2012,Females
685,Foreign country,Poland,,2013,Females
686,Foreign country,Poland,,2014,Females


In [15]:
temp[(temp['citizenship'] == 'Foreign country') & (temp['geo'] == 'Hungary')]


Unnamed: 0,citizenship,geo,employment_rate,year,Sex
401,Foreign country,Hungary,,2005,Females
402,Foreign country,Hungary,,2006,Females
403,Foreign country,Hungary,,2007,Females
404,Foreign country,Hungary,,2008,Females
405,Foreign country,Hungary,,2009,Females
406,Foreign country,Hungary,,2010,Females
407,Foreign country,Hungary,,2011,Females
408,Foreign country,Hungary,,2012,Females
409,Foreign country,Hungary,,2013,Females
410,Foreign country,Hungary,,2014,Females


Tych których brakowało w danych kwartalnych - brakuje również w danych rocznych

In [16]:
an_df.dropna(inplace=True)

In [18]:
an_df.to_csv("../../processed_data/unemploymnet_rates_yearly.csv", index=False)