In [1]:
import numpy as np
import pandas as pd

### Load and clean UNHCR data

In [3]:
# The raw UNHCR export is expected to sit in the same directory as this notebook.
# skiprows=3 makes the parser skip the first three lines of the file.
raw_csv_name = "unhcr_popstats_export_asylum_seekers_all_data(raw_data).csv"
data = pd.read_csv(raw_csv_name, skiprows=3)

  interactivity=interactivity, compiler=compiler, result=result)


In [4]:
# Display the column labels of the loaded frame to see which fields are available
data.columns

Index(['Year', 'Country / territory of asylum/residence', 'Origin',
       'RSD procedure type / level', 'Tota pending start-year',
       'of which UNHCR-assisted', 'Applied during year',
       'statistics.filter.decisions_recognized',
       'statistics.filter.decisions_other', 'Rejected', 'Otherwise closed',
       'Total decisions', 'Total pending end-year',
       'of which UNHCR-assisted.1'],
      dtype='object')

In [5]:
# Report the smallest value in the 'Year' column (the data does not go back before 2000)
earliest_year = data['Year'].min()
print(f"Earliest year of data: {earliest_year}")

Earliest year of data: 2000


In [6]:
# The data uses both NaN and a literal '*' for missing values. Convert '*' to NaN so
# that every missing entry is picked up by isna() below (and can later be dropped).
# The assignment goes through a single data.loc[rows, column] call so that it writes
# into `data` itself; chained indexing (data[col].loc[mask] = ...) raises
# SettingWithCopyWarning and is not guaranteed to modify the original frame.
is_star = data['Applied during year'] == '*'
data.loc[is_star, 'Applied during year'] = np.nan  # np.nan: the np.NaN alias is gone in NumPy 2.0

# Number of missing values in "Applied during year"
num_missing_values = int(data['Applied during year'].isna().sum())
print("Number of missing values in 'Applied during year': {}".format(num_missing_values))
print("Fraction of missing values in 'Applied during year': {}".format(num_missing_values/data.shape[0]))

Number of missing values in 'Applied during year': 11875
Fraction of missing values in 'Applied during year': 0.07870493107104984


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


In [9]:
# Count how often each origin occurs among the rows whose 'Applied during year' is missing
missing_applied = data['Applied during year'].isna()
data.loc[missing_applied, 'Origin'].value_counts()

Cameroon                            168
Congo                               166
Nigeria                             153
Guinea                              153
Ethiopia                            149
Sierra Leone                        148
Liberia                             148
Mali                                148
Côte d'Ivoire                       145
Eritrea                             144
Palestinian                         140
Sudan                               137
Lebanon                             135
Dem. Rep. of the Congo              133
Syrian Arab Rep.                    132
Ghana                               131
Egypt                               131
Togo                                131
Jordan                              126
Somalia                             126
Senegal                             121
India                               120
Gambia                              119
Central African Rep.                117
Afghanistan                         117


In [6]:
# For the time being, rows without an 'Applied during year' value are simply discarded;
# the remaining values in that column are then cast to 32-bit integers.
filtered_data = (
    data
    .dropna(subset=['Applied during year'])
    .astype({'Applied during year': 'int32'})
)

In [7]:
# Total of 'Applied during year' for every (Year, Origin) combination
group_keys = ['Year', 'Origin']
grouped_data = filtered_data.groupby(group_keys)['Applied during year'].sum()

In [8]:
# Grouping must not change the overall total: print both sums so they can be compared
grouped_total = grouped_data.sum()
print(f"Grouped sum: {grouped_total}")
raw_total = filtered_data['Applied during year'].sum()
print(f"Raw sum: {raw_total}")

Grouped sum: 22615322
Raw sum: 22615322


In [20]:
# Convert the grouped Series to a DataFrame so we can add columns
target_df = pd.DataFrame(grouped_data)
# Store the (Year, Origin) index as a regular column as well. The index is read from
# target_df itself: output_df is only created later in the notebook, so referencing
# it here raises NameError on a fresh kernel (Restart & Run All).
target_df['year_origin'] = target_df.index

### Join with ISO codes

In [10]:
# Read the ISO country-code table from csv and preview its first rows
iso_csv_name = "ISO_codes.csv"
iso = pd.read_csv(iso_csv_name)
iso.head()

Unnamed: 0,English_short_name,French_short_name,Alpha_2_code,Alpha_3_code,Numeric,Origin
0,Afghanistan,Afghanistan (l'),AF,AFG,4,Afghanistan
1,Albania,Albanie (l'),AL,ALB,8,Albania
2,Algeria,Algérie (l'),DZ,DZA,12,Algeria
3,American Samoa,Samoa américaines (les),AS,ASM,16,American Samoa
4,Andorra,Andorre (l'),AD,AND,20,Andorra


In [23]:
# Left-join the ISO codes onto the per-year/per-origin totals using 'Origin' as the key,
# then re-attach the (Year, Origin) index. The index is taken from target_df because
# output_df does not exist yet the first time this cell runs; reading output_df.index
# on the right-hand side only works with leftover kernel state.
output_df = pd.merge(target_df, iso, how='left', on='Origin').set_index(target_df.index)
# Drop the 'Origin' column; the origin is still present as an index level
output_df = output_df.drop(['Origin'], axis=1)
output_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Applied during year,year_origin,English_short_name,French_short_name,Alpha_2_code,Alpha_3_code,Numeric
Year,Origin,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2000,Afghanistan,291283,"(2000, Afghanistan)",Afghanistan,Afghanistan (l'),AF,AFG,4.0
2000,Albania,9765,"(2000, Albania)",Albania,Albanie (l'),AL,ALB,8.0
2000,Algeria,10312,"(2000, Algeria)",Algeria,Algérie (l'),DZ,DZA,12.0
2000,Andorra,1,"(2000, Andorra)",Andorra,Andorre (l'),AD,AND,20.0
2000,Angola,7160,"(2000, Angola)",Angola,Angola (l'),AO,AGO,24.0


In [24]:
# Write the joined table to a csv file next to the notebook
output_df.to_csv(path_or_buf="Asylum_seekers_per_country_per_year.csv")