In [1]:
import pandas as pd
df = pd.read_csv('../data/bronze/european_visa_database/visa_practice_eu.csv')
df

Unnamed: 0,visaPracticeEuID,rcID,receivingCountryName,receivengCountryCode,scCityID,sendingCountryName,sendingCountryCode,sendingCityName,dYear,shortStayAppliedFor,...,issuedABCDVTL,issuedABCDDCVTL,appliedA,appliedB,appliedC,appliedABC,notIssuedA,notIssuedB,notIssuedC,notIssuedABC
0,31797,173,Belgium,BE,7,Algeria,DZ,Algiers,2005,5361.0,...,,4372.0,,,5070.0,5075.0,,,,1340.0
1,31798,173,Belgium,BE,11,Angola,AO,Luanda,2005,1910.0,...,,1140.0,,,1699.0,1910.0,,,,144.0
2,31799,173,Belgium,BE,13,Argentina,AR,Buenos Aires,2005,17.0,...,,75.0,,,17.0,17.0,,,,4.0
3,31800,173,Belgium,BE,21,Australia,AU,Canberra,2005,387.0,...,,253.0,,,274.0,387.0,,,,273.0
4,31801,173,Belgium,BE,281,Austria,AT,Vienna,2005,2.0,...,,9.0,,,2.0,2.0,,,,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19013,50810,185,Italy,IT,265,Zimbabwe,ZW,Harare,2012,669.0,...,,,0.0,,665.0,665.0,0.0,,0.0,
19014,50811,190,Netherlands,NL,265,Zimbabwe,ZW,Harare,2012,391.0,...,,,,,391.0,391.0,,,,
19015,50812,170,Norway,NO,265,Zimbabwe,ZW,Harare,2012,596.0,...,,,0.0,,524.0,524.0,0.0,,40.0,40.0
19016,50813,192,Portugal,PT,265,Zimbabwe,ZW,Harare,2012,136.0,...,,,,,136.0,136.0,,,,


In [3]:
df.columns

Index(['visaPracticeEuID', 'rcID', 'receivingCountryName',
       'receivengCountryCode', 'scCityID', 'sendingCountryName',
       'sendingCountryCode', 'sendingCityName', 'dYear', 'shortStayAppliedFor',
       'shortStayIssued', 'shortStayRefused', 'shortStayRefusalRate',
       'issuedA_All', 'issuedA_Mev', 'issuedB', 'issuedC_All', 'issuedC_Mev',
       'issuedD', 'issuedDC', 'issuedVTL', 'issuedADS', 'issuedABC',
       'issuedABCVTL', 'issuedABCDVTL', 'issuedABCDDCVTL', 'appliedA',
       'appliedB', 'appliedC', 'appliedABC', 'notIssuedA', 'notIssuedB',
       'notIssuedC', 'notIssuedABC'],
      dtype='object')

Clean-up raw Schengen input file and save as silver statistics 

In [5]:
import numpy as np

column_mapping = {"receivingCountryName": "schengen_state"
                    ,"sendingCountryName": "origin_country"
                    ,"sendingCityName": "origin_consulate"
                    ,"appliedC": "visas_applied"
                    ,"issuedC_All": "visas_issued"
                    ,"notIssuedC": "visas_not_issued"
                    ,"dYear": "year"}

df = df.rename(columns = column_mapping)
print('Renamed columns')

df = df[["schengen_state", "origin_country","origin_consulate","visas_applied", "visas_issued", "visas_not_issued"]]
print('Selected columns')

df["visas_applied"].replace(np.nan, 0, inplace=True)
df["visas_issued"].replace(np.nan, 0, inplace=True)
df["visas_not_issued"].replace(np.nan, 0, inplace=True)
print('Replaced applied, issued and not issued null values with 0')

df["visas_issued"].loc[df["visas_issued"]<0] = 0
print('Replaced negative entries for visas issued with 0')

df["visa_refusal_rate"] = df["visas_not_issued"] / (df["visas_issued"] + df["visas_not_issued"])
print('Calculate refusal rate as the not issued share of the total issued and not issued')

df.to_csv('../data/silver/schengen-visa-evd.csv')

Renamed columns
Selected columns
Replaced applied, issued and not issued null values with 0
Replaced negative entries for visas issued with 0
Calculate refusal rate as the not issued share of the total issued and not issued


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["visas_issued"].loc[df["visas_issued"]<0] = 0


Data quality checks

In [6]:
df[df["visas_applied"] != df["visas_issued"] + df["visas_not_issued"]]

Unnamed: 0,schengen_state,origin_country,origin_consulate,visas_applied,visas_issued,visas_not_issued,visa_refusal_rate
0,Belgium,Algeria,Algiers,5070.0,3735.0,0.0,0.000000
1,Belgium,Angola,Luanda,1699.0,944.0,0.0,0.000000
2,Belgium,Argentina,Buenos Aires,17.0,13.0,0.0,0.000000
3,Belgium,Australia,Canberra,274.0,113.0,0.0,0.000000
4,Belgium,Austria,Vienna,2.0,3.0,0.0,0.000000
...,...,...,...,...,...,...,...
19002,Spain,Yemen,Sana 'a,684.0,627.0,1.0,0.001592
19003,Denmark,Zambia,Lusaka,489.0,256.0,6.0,0.022901
19008,Netherlands,Zambia,Lusaka,251.0,243.0,7.0,0.028000
19009,Sweden,Zambia,Lusaka,989.0,929.0,18.0,0.019007


In [7]:
df[df["visa_refusal_rate"]>1]

Unnamed: 0,schengen_state,origin_country,origin_consulate,visas_applied,visas_issued,visas_not_issued,visa_refusal_rate


In [8]:
df[df["visa_refusal_rate"].isna()]

Unnamed: 0,schengen_state,origin_country,origin_consulate,visas_applied,visas_issued,visas_not_issued,visa_refusal_rate
6,Belgium,Bolivia,La Paz,0.0,0.0,0.0,
51,Belgium,Luxembourg,Luxemburg,0.0,0.0,0.0,
97,Denmark,Austria,Vienna,1.0,0.0,0.0,
99,Denmark,Belgium,Brussels,1.0,0.0,0.0,
115,Denmark,Macedonia,Skopje,1.0,0.0,0.0,
...,...,...,...,...,...,...,...
18357,Sweden,Portugal,Lisbon,3.0,0.0,0.0,
18450,Netherlands,Rwanda,Kigali,1.0,0.0,0.0,
18517,Sweden,Singapore,Singapore,14.0,0.0,0.0,
18572,Sweden,Spain,Madrid,1.0,0.0,0.0,


In [9]:
df.describe()

Unnamed: 0,visas_applied,visas_issued,visas_not_issued,visa_refusal_rate
count,19018.0,19018.0,19018.0,18230.0
mean,5118.433,4941.748,78.019613,0.0239
std,25297.66,24775.14,628.178803,0.091372
min,0.0,0.0,0.0,0.0
25%,29.0,41.0,0.0,0.0
50%,392.0,396.0,0.0,0.0
75%,2348.75,2163.5,0.0,0.0
max,1030968.0,1022443.0,28329.0,1.0
