In [15]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pycountry

Cel: Przygotowanie danych dotyczących migracji
* Przygotowanie datasetu używanego do analiz korelacji pewnych współczynników z liczbą migrantów:
    * Porównanie informacji POPULATION BY CITIZENSHIP, ALL VALID PERMITS, MIGRATION BY CITIZENSHIP
    * Dodanie do wyjściowego datasetu informacji na temat Ukraińskich uchodźcach
    * Dodanie informacji dotyczących pobytów tymczasowych

# POPULATION BY CITIZENSHIP

selected_countries = ["Poland", "Bulgaria", "Romania", "Hungary", "France", "Italy", "Germany", "Sweden", "Spain", "Greece", "Slovenia"]


Skondensowana tabela przedstawia dane dotyczące ilości populacji w wybranych krajach w następującym formacie:
* NAZWA KRAJU 
* ROK 
* LICZBA OBYWATELI Z UE 
* LICZBA OBYWATELI Z DANEGO KRAJU
* ŁĄCZNA LICZBA OBYWATELI OBCOKRAJOWCÓW

In [55]:
pop_cit = pd.read_csv('../processed_data/population_by_citizenship.csv')

In [56]:
pop_cit.head()

Unnamed: 0,citizen,geo,year,number of citizens in country
0,Andorra,Austria,2015,4
1,Andorra,Austria,2016,3
2,Andorra,Austria,2017,4
3,Andorra,Austria,2018,5
4,Andorra,Austria,2019,4


In [57]:
pop_cit.head()

Unnamed: 0,citizen,geo,year,number of citizens in country
0,Andorra,Austria,2015,4
1,Andorra,Austria,2016,3
2,Andorra,Austria,2017,4
3,Andorra,Austria,2018,5
4,Andorra,Austria,2019,4


In [58]:
eu_countries = [
    "Austria", "Belgium", "Bulgaria", "Croatia", "Cyprus", "Czechia",
    "Denmark", "Estonia", "Finland", "France", "Germany", "Greece",
    "Hungary", "Ireland", "Italy", "Latvia", "Lithuania", "Luxembourg",
    "Malta", "Netherlands", "Poland", "Portugal", "Romania", "Slovakia",
    "Slovenia", "Spain", "Sweden"
]

In [59]:
pop_cit = pop_cit[
    ~pop_cit["geo"].str.contains("European Union|Total", case=False, na=False) &
    ~pop_cit["citizen"].str.contains("Asia|America|Africa|Europe|Oceanian|Candidate|European Union|Unknown|EU|European|Total", case=False, na=False)
]

In [60]:
def classify_citizen(row):
    if row['citizen'] == 'Reporting country':
        return 'national'
    elif row['citizen'] in eu_countries:
        return 'EU'
    elif row['citizen'] == 'Ukraine':
        return 'from_Ukraine'
    else:
        return 'non_EU'
    
pop_cit['citizen_type'] = pop_cit.apply(classify_citizen, axis=1)

In [61]:
summary = pop_cit.groupby(['geo', 'year', 'citizen_type'])['number of citizens in country'].sum().reset_index()

In [62]:
pivot = summary.pivot_table(
    index=['geo', 'year'],
    columns='citizen_type',
    values='number of citizens in country',
    fill_value=0
).reset_index()

In [65]:
pivot[pivot['geo'] == 'Poland']  # checking the pivot table for Poland

citizen_type,geo,year,EU,from_Ukraine,national,non_EU
230,Poland,2015,0.0,0.0,37891051.0,110001.0
231,Poland,2016,37834800.0,0.0,37811676.0,152122.0
232,Poland,2017,37783928.0,0.0,37756724.0,213118.0
233,Poland,2018,37759287.0,0.0,37731606.0,242205.0
234,Poland,2019,37712047.0,0.0,37683061.0,293009.0
235,Poland,2020,37633096.0,0.0,37599899.0,362239.0
236,Poland,2021,0.0,0.0,36623052.0,448078.0
237,Poland,2022,0.0,0.0,36445251.0,442296.0
238,Poland,2023,0.0,0.0,36314890.0,436595.0
239,Poland,2024,0.0,0.0,36187908.0,430864.0


In [13]:
pivot['foreigners'] = pivot['non_EU'] + pivot['EU']

In [14]:
pivot.head()

citizen_type,geo,year,EU,national,non_EU,foreigners
0,Bulgaria,2015,0.0,6963308.0,63879.0,63879.0
1,Bulgaria,2016,0.0,6863112.0,71491.0,71491.0
2,Bulgaria,2017,0.0,6767835.0,76562.0,76562.0
3,Bulgaria,2018,0.0,6668711.0,82370.0,82370.0
4,Bulgaria,2019,0.0,6565004.0,91243.0,91243.0


In [15]:
def get_iso3(country_name):
    try:
        return pycountry.countries.lookup(country_name).alpha_3
    except LookupError:
        return None

pivot['iso_a3'] = pivot['geo'].apply(get_iso3)

In [16]:
pivot.to_csv('../processed_data/population_by_citizenship_combined_data.csv', index=False)

# MIGRRANTS BY CITIZENSHIP

Uchodźcy wojenni z Ukrainy dla Szwecji i Polski nie są wliczani -> wliczono ich na podstawie zbioru danych UKRAINE TEMPORARY PROTECTION
Osoby ubiegające się o azyl, które regularnie zamieszkują w danym miejscu przez co najmniej 12 miesięcy dla Bułgarii, Węgier, Polski, Rumuni, Szwecji -> wliczono ich na podstawie zbioru danych XYZ

Skondensowana tabela przedstawia dane dotyczące ilości populacji w wybranych krajach w następującym formacie:
* geo - NAZWA KRAJU
* year - ROK
* EU - LICZBA IMIGRANTÓW Z UE
* Ukraine - LICZBA MIGRANTÓW Z UKRAINY
* national - LICZBA IMMIGRANTÓW Z TEGO KRAJU (obywatele powracający z emigracji)
* non_EU - LICZBA IMIGRANTÓW SPOZA UE
* foreigners - LICZBA IMIGRANTÓW OBCOKRAJOWCÓW -> UE + NON UE
* foreigners_pop_share- PROCENT JAKI STANOWIĄ IMMIGRANCI ZAGRANICZNI PRZYJĘCI W DANYM ROKU W CAŁEJ POPULACJI
* non_EU_pop_share - PROCENT JAKI STANOWIĄ IMMIGRANCI SPOZA UE PRZYJĘCI W DANYM ROKU W CAŁEJ POPULACJI
* ukraine_pop_share - PROCENT JAKI STANOWIĄ UKRAIŃSCY IMMIGRANCJI PRZYJĘCI W DANYM ROKU W CAŁEJ POPULACJI
* ukraine_for_share - PROCENT JAKI STANOWIĄ UKRAIŃSCY IMMIGRANCI PRZYJĘCI W DANYM ROKU WŚRÓD WSZYSTKICH OBCOKRAJOWCÓW
* non_EU_for_share - PROCENT JAKI STANOWIĄ IMMIGRANCI SPOZA EU WŚRÓD WSZYSTKICH OBCOKRAJOWCÓW (trzeba uważać z tym, bo dane dotyczące imigrantów z EU wydają się wątpliwe)

In [105]:
migr_cit = pd.read_csv('../processed_data/immigration_by_citizenship.csv')
migr_cit = migr_cit[migr_cit['geo'] != migr_cit['Country of citizenship']]

In [106]:
migr_cit.head()

Unnamed: 0,Country of citizenship,geo,year,Migrants number
0,Andorra,Austria,1998,1.0
1,Andorra,Austria,1999,1.0
2,Andorra,Austria,2001,0.0
3,Andorra,Austria,2002,0.0
4,Andorra,Austria,2003,0.0


In [107]:
eu_countries = [
    "Austria", "Belgium", "Bulgaria", "Croatia", "Cyprus", "Czechia",
    "Denmark", "Estonia", "Finland", "France", "Germany", "Greece",
    "Hungary", "Ireland", "Italy", "Latvia", "Lithuania", "Luxembourg",
    "Malta", "Netherlands", "Poland", "Portugal", "Romania", "Slovakia",
    "Slovenia", "Spain", "Sweden"
]

migr_cit = migr_cit[
    (~migr_cit["geo"].str.contains("Asia|America|Africa|Europe|Oceanian|Candidate|European Union|Unknown|EU|European|Total", case=False, na=False)) &
    (~migr_cit["Country of citizenship"].str.contains("Asia|America|Africa|Europe|Oceanian|Candidate|European Union|Unknown|EU|European|Total", case=False, na=False))
]

In [108]:
def classify_citizen(row):
    if row['Country of citizenship'] == 'Ukraine':
        return 'Ukraine'
    if row['Country of citizenship'] == 'Reporting country':
        return 'national'
    elif row['Country of citizenship'] in eu_countries:
        return 'EU'
    else:
        return 'non_EU'
    
migr_cit.loc[:, 'citizen_type'] = migr_cit.apply(classify_citizen, axis=1)

summary = migr_cit.groupby(['geo', 'year', 'citizen_type'])['Migrants number'].sum().reset_index()

pivot = summary.pivot_table(
    index=['geo', 'year'],
    columns='citizen_type',
    values='Migrants number',
    fill_value=0
).reset_index()

In [109]:
pivot.head()

citizen_type,geo,year,EU,Ukraine,national,non_EU
0,Austria,1998,27468.0,410.0,13494.0,100957.0
1,Austria,1999,30266.0,582.0,14331.0,127645.0
2,Austria,2001,36068.0,777.0,15142.0,133331.0
3,Austria,2002,34008.0,1103.0,21981.0,136829.0
4,Austria,2003,37727.0,1332.0,18528.0,147347.0


In [110]:
pivot['non_EU'] = pivot['non_EU'] + pivot['Ukraine']

In [111]:
pivot['foreigners'] = pivot['non_EU'] + pivot['EU']

In [112]:
pivot[pivot['geo'] == 'France']

citizen_type,geo,year,EU,Ukraine,national,non_EU,foreigners
198,France,2006,0.0,0.0,130649.0,170895.0,170895.0
199,France,2007,0.0,0.0,125746.0,168234.0,168234.0
200,France,2008,0.0,0.0,125377.0,171231.0,171231.0
201,France,2009,0.0,0.0,125268.0,171702.0,171702.0
202,France,2010,0.0,0.0,114241.0,192870.0,192870.0
203,France,2011,0.0,0.0,119654.0,200162.0,200162.0
204,France,2012,0.0,0.0,115752.0,211679.0,211679.0
205,France,2013,80299.0,0.0,120644.0,228557.0,308856.0
206,France,2014,74303.0,0.0,126345.0,223379.0,297682.0
207,France,2015,74515.0,0.0,131260.0,242721.0,317236.0


### Dodanie uchodźców wojennych z Ukrainy do danych o imigracji dla Polski i Szwecji

In [113]:
ukr = pd.read_csv('../processed_data/ukraine_temporary_protection.csv')

In [114]:
selected_countries = ['Poland', 'Sweden']

In [115]:
ukr = ukr[ukr['geo'].isin(selected_countries)]

In [116]:
ukr.head()

Unnamed: 0,Country of citizenship,geo,OBS_VALUE,month
913,Ukraine,Poland,675085.0,2022-03
914,Ukraine,Poland,1046815.0,2022-04
915,Ukraine,Poland,1142375.0,2022-05
916,Ukraine,Poland,1202190.0,2022-06
917,Ukraine,Poland,1258235.0,2022-07


In [117]:
ukr['month'] = pd.to_datetime(ukr['month'], format='%Y-%m')

In [118]:
ukr.info()

<class 'pandas.core.frame.DataFrame'>
Index: 74 entries, 913 to 1057
Data columns (total 4 columns):
 #   Column                  Non-Null Count  Dtype         
---  ------                  --------------  -----         
 0   Country of citizenship  74 non-null     object        
 1   geo                     74 non-null     object        
 2   OBS_VALUE               74 non-null     float64       
 3   month                   74 non-null     datetime64[ns]
dtypes: datetime64[ns](1), float64(1), object(2)
memory usage: 2.9+ KB


In [119]:
ukr_dec = ukr[ukr['month'].dt.month == 12]

In [120]:
ukr_dec['year'] = ukr_dec['month'].dt.year

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ukr_dec['year'] = ukr_dec['month'].dt.year


In [121]:
ukr_dec = ukr_dec[ukr_dec['year']==2022]

In [122]:
df_merged = pd.merge(pivot, ukr_dec, on=['geo', 'year'], how='left')

In [123]:
df_merged.head()

Unnamed: 0,geo,year,EU,Ukraine,national,non_EU,foreigners,Country of citizenship,OBS_VALUE,month
0,Austria,1998,27468.0,410.0,13494.0,101367.0,128835.0,,,NaT
1,Austria,1999,30266.0,582.0,14331.0,128227.0,158493.0,,,NaT
2,Austria,2001,36068.0,777.0,15142.0,134108.0,170176.0,,,NaT
3,Austria,2002,34008.0,1103.0,21981.0,137932.0,171940.0,,,NaT
4,Austria,2003,37727.0,1332.0,18528.0,148679.0,186406.0,,,NaT


In [124]:
df_merged['Ukraine'] = df_merged['Ukraine'].fillna(0) + df_merged['OBS_VALUE'].fillna(0)
df_merged['non_EU'] = df_merged['non_EU'].fillna(0) + df_merged['OBS_VALUE'].fillna(0)
df_merged['foreigners'] = df_merged['foreigners'].fillna(0) + df_merged['OBS_VALUE'].fillna(0)

In [125]:
df_merged = df_merged.drop(columns=['OBS_VALUE'])

In [126]:
df_merged.drop(columns=['month', 'Country of citizenship'], inplace=True)

In [127]:
df_merged[df_merged['geo'] == 'Poland']

Unnamed: 0,geo,year,EU,Ukraine,national,non_EU,foreigners
542,Poland,1999,0.0,0.0,7052.0,473.0,473.0
543,Poland,2000,0.0,0.0,6921.0,410.0,410.0
544,Poland,2001,0.0,0.0,6270.0,355.0,355.0
545,Poland,2002,0.0,0.0,6328.0,259.0,259.0
546,Poland,2003,0.0,0.0,6548.0,500.0,500.0
547,Poland,2004,0.0,0.0,8253.0,1242.0,1242.0
548,Poland,2005,0.0,0.0,8228.0,1136.0,1136.0
549,Poland,2006,367.0,609.0,8978.0,3251.0,3618.0
550,Poland,2007,185.0,678.0,13384.0,3029.0,3214.0
551,Poland,2009,0.0,0.0,142348.0,46730.0,46730.0


### Osoby ubiegające się o azyl 

In [128]:
as_df = pd.read_csv('../processed_data/asylum_applicants.csv')

In [129]:
as_df = as_df[(as_df['geo'].isin(selected_countries)) & (as_df['Country of citizenship'] == 'Total')]

In [130]:
df_merged2 = pd.merge(df_merged, as_df, on=['geo', 'year'], how='left')

In [131]:
df_merged2.head()

Unnamed: 0,geo,year,EU,Ukraine,national,non_EU,foreigners,Country of citizenship,Number
0,Austria,1998,27468.0,410.0,13494.0,101367.0,128835.0,,
1,Austria,1999,30266.0,582.0,14331.0,128227.0,158493.0,,
2,Austria,2001,36068.0,777.0,15142.0,134108.0,170176.0,,
3,Austria,2002,34008.0,1103.0,21981.0,137932.0,171940.0,,
4,Austria,2003,37727.0,1332.0,18528.0,148679.0,186406.0,,


In [132]:
df_merged2['non_EU'] = df_merged2['non_EU'].fillna(0) + df_merged2['Number'].fillna(0)
df_merged2['foreigners'] = df_merged2['foreigners'].fillna(0) + df_merged2['Number'].fillna(0)

In [133]:
df_merged2.drop(columns=['Number', 'Country of citizenship'], inplace=True)

### Jaki procent populacji to przyjęci migranci?

In [46]:
pop_cit = pd.read_csv('../processed_data/country_population.csv')

In [47]:
pop_cit[pop_cit['geo'] == 'Austria']

Unnamed: 0,geo,year,Population number
60,Austria,2003,8100273
61,Austria,2004,8142573
62,Austria,2005,8201359
63,Austria,2006,8254298
64,Austria,2007,8282984
65,Austria,2008,8307989
66,Austria,2009,8335003
67,Austria,2010,8351643
68,Austria,2011,8375164
69,Austria,2012,8408121


In [48]:
merged3 = pd.merge(df_merged2, pop_cit, on=['geo', 'year'], how='left')

In [49]:
merged3.head(20)

Unnamed: 0,geo,year,EU,Ukraine,national,non_EU,foreigners,Population number
0,Austria,1998,27468.0,410.0,13494.0,101367.0,128835.0,
1,Austria,1999,30266.0,582.0,14331.0,128227.0,158493.0,
2,Austria,2001,36068.0,777.0,15142.0,134108.0,170176.0,
3,Austria,2002,34008.0,1103.0,21981.0,137932.0,171940.0,
4,Austria,2003,37727.0,1332.0,18528.0,148679.0,186406.0,8100273.0
5,Austria,2004,45107.0,1390.0,18301.0,163140.0,208247.0,8142573.0
6,Austria,2005,45534.0,1300.0,16470.0,150223.0,195757.0,8201359.0
7,Austria,2006,44632.0,1013.0,15636.0,121176.0,165808.0,8254298.0
8,Austria,2007,38066.0,589.0,8477.0,90818.0,128884.0,8282984.0
9,Austria,2008,39541.0,604.0,8636.0,90810.0,130351.0,8307989.0


In [50]:
merged3['foreigners_pop_share'] = merged3['foreigners'] / merged3['Population number'] * 100
merged3['non_EU_pop_share'] = merged3['non_EU'] / merged3['Population number'] * 100
merged3['ukraine_pop_share'] = merged3['Ukraine'] / merged3['Population number'] * 100
merged3['ukraine_for_share'] = merged3['Ukraine'] / merged3['foreigners'] * 100
merged3['non_EU_for_share'] = merged3['non_EU'] / merged3['foreigners'] * 100
merged3.drop(columns=['Population number'], inplace=True)

In [51]:
merged3[merged3['geo']=='Austria']

Unnamed: 0,geo,year,EU,Ukraine,national,non_EU,foreigners,foreigners_pop_share,non_EU_pop_share,ukraine_pop_share,ukraine_for_share,non_EU_for_share
0,Austria,1998,27468.0,410.0,13494.0,101367.0,128835.0,,,,0.318237,78.679707
1,Austria,1999,30266.0,582.0,14331.0,128227.0,158493.0,,,,0.367209,80.903888
2,Austria,2001,36068.0,777.0,15142.0,134108.0,170176.0,,,,0.456586,78.805472
3,Austria,2002,34008.0,1103.0,21981.0,137932.0,171940.0,,,,0.641503,80.221007
4,Austria,2003,37727.0,1332.0,18528.0,148679.0,186406.0,2.301231,1.835481,0.016444,0.714569,79.760845
5,Austria,2004,45107.0,1390.0,18301.0,163140.0,208247.0,2.557509,2.003544,0.017071,0.667477,78.339664
6,Austria,2005,45534.0,1300.0,16470.0,150223.0,195757.0,2.386885,1.831684,0.015851,0.664089,76.739529
7,Austria,2006,44632.0,1013.0,15636.0,121176.0,165808.0,2.008747,1.468035,0.012272,0.610948,73.082119
8,Austria,2007,38066.0,589.0,8477.0,90818.0,128884.0,1.556009,1.096441,0.007111,0.457,70.464914
9,Austria,2008,39541.0,604.0,8636.0,90810.0,130351.0,1.568984,1.093044,0.00727,0.463364,69.665749


In [52]:
def get_iso3(country_name):
    try:
        return pycountry.countries.lookup(country_name).alpha_3
    except LookupError:
        return None

merged3['iso_a3'] = merged3['geo'].apply(get_iso3)

In [53]:
merged3.to_csv('../processed_data/immigration_by_citizenship_combined_data.csv', index=False)

In [54]:
merged3[merged3['geo']=='Poland']

Unnamed: 0,geo,year,EU,Ukraine,national,non_EU,foreigners,foreigners_pop_share,non_EU_pop_share,ukraine_pop_share,ukraine_for_share,non_EU_for_share,iso_a3
542,Poland,1999,0.0,0.0,7052.0,473.0,473.0,,,,0.0,100.0,POL
543,Poland,2000,0.0,0.0,6921.0,410.0,410.0,,,,0.0,100.0,POL
544,Poland,2001,0.0,0.0,6270.0,355.0,355.0,,,,0.0,100.0,POL
545,Poland,2002,0.0,0.0,6328.0,259.0,259.0,,,,0.0,100.0,POL
546,Poland,2003,0.0,0.0,6548.0,500.0,500.0,0.001308,0.001308,0.0,0.0,100.0,POL
547,Poland,2004,0.0,0.0,8253.0,1242.0,1242.0,0.003252,0.003252,0.0,0.0,100.0,POL
548,Poland,2005,0.0,0.0,8228.0,1136.0,1136.0,0.002976,0.002976,0.0,0.0,100.0,POL
549,Poland,2006,367.0,609.0,8978.0,3251.0,3618.0,0.009482,0.00852,0.001596,16.832504,89.856274,POL
550,Poland,2007,185.0,678.0,13384.0,3029.0,3214.0,0.00843,0.007945,0.001778,21.095208,94.243933,POL
551,Poland,2009,0.0,0.0,142348.0,57325.0,57325.0,0.150318,0.150318,0.0,0.0,100.0,POL
