In [1]:
# Initialisations
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

In [3]:
df = pd.read_csv("../Data/unhcr_query2.csv", low_memory = False, header = 14) # UNHCR query data
df.head()

Unnamed: 0,Country / territory of asylum/residence,Origin,Year,Month,Value
0,Australia,Afghanistan,1999,January,8
1,Australia,Afghanistan,1999,February,10
2,Australia,Afghanistan,1999,March,25
3,Australia,Afghanistan,1999,April,25
4,Australia,Afghanistan,1999,May,7


In [4]:
df.shape

(332189, 5)

In [5]:
# Rename and reshape for ease of analysis and relevance
df = df.rename(columns ={'Country / territory of asylum/residence':'Destination'})
df = df.drop(['Month'], axis = 1)
df['Value'] = pd.to_numeric(df['Value'], errors='coerce')
df.head()

Unnamed: 0,Destination,Origin,Year,Value
0,Australia,Afghanistan,1999,8.0
1,Australia,Afghanistan,1999,10.0
2,Australia,Afghanistan,1999,25.0
3,Australia,Afghanistan,1999,25.0
4,Australia,Afghanistan,1999,7.0


In [7]:
df2 = pd.read_csv("../Data/country_centroids_all.csv", sep = '\t')
df2 = df2[['LAT', 'LONG','SHORT_NAME']]
df2 = df2.rename(columns ={'SHORT_NAME':'Destination', 'LAT':'dest_lat', 'LONG':'dest_lon'})
df2.head()

Unnamed: 0,dest_lat,dest_lon,Destination
0,33.0,66.0,Afghanistan
1,41.0,20.0,Albania
2,28.0,3.0,Algeria
3,-14.333333,-170.0,American Samoa
4,42.5,1.5,Andorra


In [8]:
df2.head() 
df2.to_csv("../Data/country_centroids_use.csv", index = False)

In [9]:
# Palestine and Tibet do not exist in the GNS as the US Government does not
# acknowledge them as countries. However, they are present in the UNHCR data
# so it is important to acknowledge them in this context. 
df2 = df2.replace('West Bank','Palestine')
df2.loc[len(df2.index)] = [29.6472, 91.1174, 'Tibet'] 

In [10]:
df3 = df2.copy()
df3 = df3.rename(columns ={'Destination':'Origin', 'dest_lat':'origin_lat', 'dest_lon':'origin_lon'})
df3['Origin'].unique() # Find confusing country names that need simplification

array(['Afghanistan', 'Albania', 'Algeria', 'American Samoa', 'Andorra',
       'Angola', 'Anguilla', 'Antigua and Barbuda', 'Argentina',
       'Armenia', 'Aruba', 'Ascension', 'Ashmore and Cartier Islands',
       'Australia', 'Austria', 'Azerbaijan', 'Bahamas', 'Bahrain',
       'Bangladesh', 'Barbados', 'Bassas da India', 'Belarus', 'Belgium',
       'Belize', 'Benin', 'Bermuda', 'Bhutan', 'Bolivia', 'Bonaire',
       'Bosnia and Herzegovina', 'Botswana', 'Bouvet Island', 'Brazil',
       'British Indian Ocean Territory', 'British Virgin Islands',
       'Brunei', 'Bulgaria', 'Burkina Faso', 'Burma', 'Burundi',
       'Cambodia', 'Cameroon', 'Canada', 'Cape Verde', 'Cayman Islands',
       'Central African Republic', 'Chad', 'Chile', 'China',
       'Christmas Island', 'Clipperton Island', 'Cocos (Keeling) Islands',
       'Colombia', 'Comoros', 'Cook Islands', 'Coral Sea Islands',
       'Costa Rica', "Cote d'Ivoire", 'Croatia', 'Cuba', 'Curaçao',
       'Cyprus', 'Czech Republic'

In [11]:
# Create a dictionary to simplify long country names so that the user entry 
# portion of the program is easier and more intuitive to use

dic = pd.Series(index = df.Destination.unique()).to_dict()
dic2 = pd.Series(index = df.Origin.unique()).to_dict()
dic.update(dic2)

for key, value in dic.items():
    if (key in df2.Destination.values):
        dic[key] = key
        
dic['Bolivia (Plurinational State of)'] = 'Bolivia'
dic['Brunei Darussalam'] ='Brunei'
dic['Cabo Verde'] = 'Cape Verde'
dic['Central African Rep.'] = 'Central African Republic'
dic['China, Hong Kong SAR'] = 'Hong Kong'
dic['China, Macao SAR'] = 'Macau'
dic['Congo'] = 'Democratic Republic of the Congo'
dic['Czech Rep.'] = 'Czech Republic'
dic["C\xc3\xb4te d'Ivoire"] = "Cote d'Ivoire"
dic["Dem. People's Rep. of Korea"] = 'North Korea'
dic['Dem. Rep. of the Congo'] = 'Democratic Republic of the Congo'
dic['Dominican Rep.'] = 'Dominican Republic'
dic['Iran (Islamic Rep. of)'] = 'Iran'
dic["Lao People's Dem. Rep."] = 'Laos'
dic['Micronesia (Federated States of)'] = 'Federated States of Micronesia'
dic['Myanmar'] = 'Burma'
dic['Palestinian'] = 'Palestine'
dic['Rep. of Korea'] = 'South Korea'
dic['Rep. of Moldova'] = 'Moldova'
dic['Russian Federation'] = 'Russia'
dic['Serbia and Kosovo: S/RES/1244 (1999)'] = 'Serbia'
dic['Stateless'] = 'Stateless'
dic['Syrian Arab Rep.'] = 'Syria'
dic['The former Yugoslav Rep. of Macedonia'] = 'Macedonia'
dic['Tibetan'] = 'Tibet'
dic['USA (EOIR)'] = 'United States'
dic['USA (INS/DHS)'] = 'United States'
dic['United Kingdom of Great Britain and Northern Ireland'] = 'United Kingdom'
dic['United Rep. of Tanzania'] = 'Tanzania'
dic['United States of America'] = 'United States'
dic['Various/unknown'] = 'Various/unknown'
dic['Venezuela (Bolivarian Republic of)'] = 'Venezuela'
dic['Viet Nam'] = 'Vietnam'

  dic = pd.Series(index = df.Destination.unique()).to_dict()
  dic2 = pd.Series(index = df.Origin.unique()).to_dict()


In [12]:
# Pass the Destination and Origin columns through the dictionary to apply the changes
df['Destination'] = df['Destination'].replace(dic)
df['Origin'] = df['Origin'].replace(dic)
df.head()

Unnamed: 0,Destination,Origin,Year,Value
0,Australia,Afghanistan,1999,8.0
1,Australia,Afghanistan,1999,10.0
2,Australia,Afghanistan,1999,25.0
3,Australia,Afghanistan,1999,25.0
4,Australia,Afghanistan,1999,7.0


In [13]:
df3.head()

Unnamed: 0,origin_lat,origin_lon,Origin
0,33.0,66.0,Afghanistan
1,41.0,20.0,Albania
2,28.0,3.0,Algeria
3,-14.333333,-170.0,American Samoa
4,42.5,1.5,Andorra


In [14]:
# These cannot be plotted on a journey map so need to be removed
df = df[df.Destination != 'Stateless']
df = df[df.Origin != 'Stateless']

df = df[df.Destination != 'Various/unknown']
df = df[df.Origin != 'Various/unknown']

df = df[df.Origin != df.Destination]

df.head()

Unnamed: 0,Destination,Origin,Year,Value
0,Australia,Afghanistan,1999,8.0
1,Australia,Afghanistan,1999,10.0
2,Australia,Afghanistan,1999,25.0
3,Australia,Afghanistan,1999,25.0
4,Australia,Afghanistan,1999,7.0


In [15]:
# Merge the datasets to create a new dataset describing journeys

df = df.merge(df2, how='inner')
df = df.merge(df3, how='inner')

df = df.dropna(subset=['dest_lat','dest_lon', 'origin_lat', 'origin_lon'])
df.head()

Unnamed: 0,Destination,Origin,Year,Value,dest_lat,dest_lon,origin_lat,origin_lon
0,Australia,Afghanistan,1999,8.0,-25.0,135.0,33.0,66.0
1,Australia,Afghanistan,1999,10.0,-25.0,135.0,33.0,66.0
2,Australia,Afghanistan,1999,25.0,-25.0,135.0,33.0,66.0
3,Australia,Afghanistan,1999,25.0,-25.0,135.0,33.0,66.0
4,Australia,Afghanistan,1999,7.0,-25.0,135.0,33.0,66.0


In [16]:
# Group rows so that journeys in the same year, from the same location and to the same location are shown together
# They need to be grouped this way so that the line density on the geospatial connection map is of a representative opacity
df = pd.DataFrame(df.groupby(['Year', 'Origin','origin_lat', 'origin_lon', 'Destination', 'dest_lat', 'dest_lon'])['Value'].sum())
df = df.reset_index()
df.head()

Unnamed: 0,Year,Origin,origin_lat,origin_lon,Destination,dest_lat,dest_lon,Value
0,1999,Afghanistan,33.0,66.0,Australia,-25.0,135.0,609.0
1,1999,Afghanistan,33.0,66.0,Austria,47.333333,13.333333,2209.0
2,1999,Afghanistan,33.0,66.0,Belgium,50.833333,4.0,401.0
3,1999,Afghanistan,33.0,66.0,Bulgaria,43.0,25.0,277.0
4,1999,Afghanistan,33.0,66.0,Canada,60.0,-96.0,539.0


In [17]:
# Read the processed data out to a csv to be analysed and visualised elsewhere
df.to_csv("../Data/migratory_patterns_compiled.csv", index = False)