In [526]:
import pandas as pd
import seaborn as sns
import numpy as np
import collections
import matplotlib.pyplot as plt

In [527]:
name_recodes = {
    "USA (EOIR)":"United States",
    "USA (INS/DHS)":"United States",
    "Venezuela, Bolivarian Republic of":"Venezuela",
    "Venezuela (Bolivarian Republic of)":"Venezuela"
}

country_codes = pd.read_csv("country_codes.csv")
country_codes['Name'] = country_codes['Name'].replace(name_recodes)

In [528]:
####### UNHCR

unhcr_raw = pd.read_csv("unhcr_popstats_export_asylum_seekers_monthly_all_data.csv", skiprows=3)

# replacing names
unhcr_raw['Origin'] = unhcr_raw['Origin'].replace(name_recodes)
unhcr_raw['Country / territory of asylum/residence'] = unhcr_raw['Country / territory of asylum/residence'].replace(name_recodes)

unhcr_temp_1 = pd.merge(unhcr_raw, country_codes, left_on="Country / territory of asylum/residence", right_on="Name", how="left")
unhcr_temp_2 = pd.merge(unhcr_temp_1, country_codes, left_on='Origin', right_on='Name', suffixes=('_destination','_origin'), how="left")
unhcr_temp_3 = unhcr_temp_2[['Code_destination','Code_origin','Year','Month','Value','Name_destination','Name_origin']].drop_duplicates()

# since 2014
unhcr = unhcr_temp_3[(unhcr_temp_3['Year'] >= 2014) & (unhcr_temp_3['Year'] <= 2017)]

# weekly
unhcr['date_recode'] = pd.to_datetime(unhcr['Month'] + ' ' + unhcr['Year'].astype(str))
unhcr['Value'] = pd.to_numeric(unhcr['Value'], errors='coerce')
unhcr = unhcr.groupby(['Name_origin','Name_destination','Code_destination','Code_origin','date_recode']).aggregate({"Value":"sum"}).reset_index()


# countries after 2014 to csv
unhcr_temp_2_out = unhcr_temp_2[unhcr_temp_2['Year'] >= 2014][['Origin','Code_origin']].drop_duplicates()
unhcr_temp_2_out.to_csv('origin_codes.csv', index=False)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [572]:
# insheet google trends
gtrends_raw = pd.read_csv("gtrends_all.csv")
kw_list = list(gtrends_raw['word'].unique())
renames_ = {str(kw):kw.replace("+","_") for kw in kw_list} # can't have +s in variable names
new_kw_list = list(renames_.values())
aggregation = {kw:"mean" for kw in new_kw_list}
gtrends = gtrends_raw.rename(columns=renames_)
gtrends['date_recode'] = pd.to_datetime(pd.to_datetime(gtrends['index']).dt.strftime('%Y-%m-01'))
gtrends_monthly = gtrends.groupby(['Country Code','date_recode']).aggregate(aggregation).reset_index()

In [573]:
##### GTRENDS + UNHCR

refugees_raw = pd.merge(gtrends_monthly, unhcr, left_on=['date_recode','Country Code'], right_on=['date_recode','Code_origin'])
refugees = refugees_raw[((refugees_raw['Code_destination'] == "ES") | (refugees_raw['Code_destination'] == "US"))][['Value','date_recode','Code_origin','Code_destination','Name_origin','Name_destination']+new_kw_list]
refugees['Value'] = pd.to_numeric(refugees['Value'], errors='coerce')

# fill NAs with 0 in keyword columns
refugees = refugees.fillna(0)


In [574]:
refugees = refugees.drop_duplicates().sort_values(['Code_origin','Code_destination','date_recode']).dropna(subset=['Value'])
refugees.to_csv("refugees.csv", index=False)


In [577]:
# first keep only pairs where we have full years of data
refugees['year'] = refugees['date_recode'].dt.year
refugees['num_in_year'] = refugees.groupby(['Code_origin','Code_destination','year'])['date_recode'].transform('count')
refugees_fullyears = refugees[refugees.num_in_year == 12]
refugees_fullyears = refugees_fullyears.drop_duplicates().sort_values(['Code_origin','Code_destination','date_recode']).dropna(subset=['Value'])

refugees_wlags = refugees_fullyears
refugees_wlags['Value_lag1'] = refugees_wlags.groupby(['Code_origin','Code_destination']).Value.shift(1)
refugees_wlags['Value_lag2'] = refugees_wlags.groupby(['Code_origin','Code_destination']).Value.shift(2)
refugees_wlags['Value_diff'] = refugees_wlags['Value'] - refugees_wlags['Value_lag1']

# creating some new variables
# creating all two_way interactions in a loop
new_kw_list_w_interactions = new_kw_list.copy()
for kw in new_kw_list:
    for kw2 in [x for x in new_kw_list if x != new_kw_list]:
        refugees_wlags[kw +'_x_'+kw2] = refugees_wlags[kw]*refugees_wlags[kw2]
        new_kw_list_w_interactions.append("{}_x_{}".format(kw,kw2))

for kw in new_kw_list_w_interactions:
    refugees_wlags[kw+'_lag1'] = refugees_wlags.groupby(['Code_origin','Code_destination'])[kw].shift(1)
    refugees_wlags[kw+'_lag2'] = refugees_wlags.groupby(['Code_origin','Code_destination'])[kw].shift(2)
    refugees_wlags[kw+'_diff'] = refugees_wlags[kw] - refugees_wlags[kw+'_lag1']
    refugees_wlags[kw+'_diff_lag1'] = refugees_wlags[kw+'_lag1'] - refugees_wlags[kw+'_lag2']

    
# differences here: we want to predict next month's change in asylum applications by searches this month


refugees_wlags.to_csv('refugees_wlags.csv', index=False)
    

In [580]:
#visualizing gtrends by origin country -- in this case just "inmigrar+EEUU"
plt.figure(figsize=(15,10))
sns.set_context("paper",font_scale=1.2)
sns.lineplot(x="date_recode", y="inmigrar_EEUU", hue="Country Code", data=gtrends_monthly).set_title("Monthly Searches by Country")



ValueError: Could not interpret input 'inmigrar_EEUU'

<Figure size 1080x720 with 0 Axes>

In [582]:
print(",".join(new_kw_list_w_interactions))

llegada,asilo,ciudadania,consulado,aduana,deportacion,diaspora,embajada,emigrante,emigrar,emigracion,extranjero,ilegal,inmigrante,inmigrar,inmigracion,legalizacion,migrante,migrar,migracion,nacionalidad,naturalizacion,pasaporte,cuota,refugiado,traficante,turista,visa,EEUU,españa,solicitante,indocumentado,llegada_x_llegada,llegada_x_asilo,llegada_x_ciudadania,llegada_x_consulado,llegada_x_aduana,llegada_x_deportacion,llegada_x_diaspora,llegada_x_embajada,llegada_x_emigrante,llegada_x_emigrar,llegada_x_emigracion,llegada_x_extranjero,llegada_x_ilegal,llegada_x_inmigrante,llegada_x_inmigrar,llegada_x_inmigracion,llegada_x_legalizacion,llegada_x_migrante,llegada_x_migrar,llegada_x_migracion,llegada_x_nacionalidad,llegada_x_naturalizacion,llegada_x_pasaporte,llegada_x_cuota,llegada_x_refugiado,llegada_x_traficante,llegada_x_turista,llegada_x_visa,llegada_x_EEUU,llegada_x_españa,llegada_x_solicitante,llegada_x_indocumentado,asilo_x_llegada,asilo_x_asilo,asilo_x_ciudadania,asilo_x_consulado,a

In [583]:
len(new_kw_list_w_interactions)

1056

In [585]:
len(refugees_wlags)

564