In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from os import listdir
import geopy.distance
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, precision_score, recall_score


In [2]:
conflict_df = pd.read_csv('../data/conflict_data_irq.csv')
conflict_df.drop(0, inplace=True)
conflict_df['date_start'] = pd.to_datetime(conflict_df['date_start'])
conflict_df['date_end'] = pd.to_datetime(conflict_df['date_start'])

In [3]:
conflict_df.replace(to_replace={'Al Anbār province':'Anbar',
                            'Nīnawá province':'Ninewa',
                           'Baghdād province':'Baghdad', 
                            'Dahūk province':'Dahuk', 
                          'Diyālá province':'Diyala',
                           'Kirkūk province':'Kirkuk',
                           'Şalāḩ ad Dīn province':'Salah al-Din'}, inplace=True)

In [4]:
conflict_df.replace(to_replace={'Abū Ghurayb district':'Abu Ghraib',
'Al Ba‘āj district':"Al-Ba'aj",
'Al Qā’im district':"Al-Ka'im",
'Al-Faris district':'Al-Fares',
'Hīt district':'Heet',
'Qaḑā’ ‘Ānah':'Ana',
'Qaḑā’ ad Dawr':'Al-Daur',
'Qaḑā’ al Fallūjah':'Falluja',
'Qaḑā’ al Ḩaḑr':'Hatra',
'Qaḑā’ al Ḩamdānīyah':'Al-Hamdaniya',
'Qaḑā’ al Khāliş':'Al-Khalis',
'Qaḑā’ al Maḩmūdīyah':'Mahmoudiya',
'Qaḑā’ al Mawşil':'Mosul',
'Qaḑā’ al Miqdādīyah':'Al-Muqdadiya',
'Qaḑā’ ar Ramādī':'Ramadi',
'Qaḑā’ ar Ruţbah':'Al-Rutba',
'Qaḑā’ ash Shaykhān':'Al-Shikhan',
'Qaḑā’ Balad':'Balad',
'Qaḑā’ Bayjī':'Baiji',
'Qaḑā’ Haditha':'Haditha',
'Qaḑā’ Khānaqīn':'Khanaqin',
'Qaḑā’ Kifrī':'Kifri',
'Qaḑā’ Sāmarrā':'Samarra',
'Qaḑā’ Sharqāţ':'Al-Shirqat',
'Qaḑā’ Tall ‘Afar':'Telafar',
'Qaḑā’ Tikrīt':'Tikrit',
'Qaḑā’ Zākhū':'Zakho',
'Sinjār district':'Sinjar',
'Tallkayf district':'Tilkaif',
'Tooz district':'Tuz Khurmatu',
'Zakho district':'Zakho'}, inplace=True)

In [5]:
gov_names = ['Anbar', 'Ninewa', 'Baghdad', 'Dahuk', 'Diyala', 'Salah al-Din']

In [6]:
district_names = ['Abu Ghraib', "Al-Ba'aj", "Al-Ka'im", 'Al-Fares', 'Heet', 'Ana', 'Al-Daur',
'Falluja', 'Hatra', 'Al-Hamdaniya', 'Al-Khalis', 'Mahmoudiya', 'Mosul', 'Al-Muqdadiya', 'Ramadi',
'Al-Rutba', 'Al-Shikhan', 'Balad', 'Baiji', 'Haditha', 'Khanaqin', 'Kifri', 'Samarra', 
'Al-Shirqat', 'Telafar', 'Tikrit', 'Zakho', 'Sinjar', 'Tilkaif', 'Tuz Khurmatu','Zakho']

In [7]:
conflict_df = conflict_df[conflict_df['adm_1'].isin(gov_names)]
conflict_df = conflict_df[conflict_df['adm_2'].isin(district_names)]

In [8]:
%%time
#reads in data on outflow of refugees, concatenates into big dataframe
outflow_filepaths = [s for s in listdir("../data/out/")]
out_df = pd.concat((pd.read_csv("../data/out/"+s) for s in outflow_filepaths), ignore_index=True) 


CPU times: user 3.44 s, sys: 762 ms, total: 4.2 s
Wall time: 5.01 s


In [9]:
out_df.columns

Index(['Unnamed: 0', 'Location ID', 'Place ID', 'Governorate', 'District',
       'Location Name', 'Arabic Name', 'Latitude', 'Longitude', 'Families',
       'Individuals', 'Anbar', 'Babylon', 'Baghdad', 'Basrah', 'Dahuk',
       'Diyala', 'Erbil', 'Kerbala', 'Kirkuk', 'Missan', 'Muthanna', 'Najaf',
       'Ninewa', 'Qadissiya', 'Salahal Din', 'Sulaymaniyah', 'Thi Qar',
       'Wassit', 'Camp', 'Hostfamilies', 'Hotel Motel', 'Informalsettlements',
       'Own Property', 'Other', 'Religiousbuilding', 'Rented pre Apr 2019',
       'Rented Habitable', 'Rented Uninhabitable', 'Schoolbuilding',
       'Unfinishedbuilding', 'Unknownsheltertype',
       'Pre June14 Period of displacement',
       'June July14 Period of displacement', 'August14 Period of displacement',
       'Post September 14 Period of displacement',
       'Post April15 Period of displacement',
       'Post March 16 Period of displacement',
       'Post 17 October 16 Period of displacement',
       'July 17 Period of displa

In [10]:
out_df.dropna(how='any', inplace=True)

In [11]:
#cleanup of data types 
out_df['date'] = pd.to_datetime(out_df['date'])
out_df.rename(columns={'Location Name':'Location_name'}, inplace=True)
out_df['Location ID'] = out_df['Location ID'].astype(int)
out_df['Place ID'] = out_df['Place ID'].astype(int)
out_df['Families'] = out_df['Families'].astype(int)
out_df['Individuals'] = out_df['Individuals'].astype(int)


In [12]:
#filters only for districts on which there exists return data 
out_df = out_df[out_df['District'].isin(district_names)]

In [36]:
%%time
#reads in data on returning refugees, concatenates into large dataframe 
returnee_filepaths = [f for f in listdir("../data/inflow/")]
ret_df = pd.concat((pd.read_csv("../data/inflow/"+f)
                  for f in returnee_filepaths), ignore_index=True)

CPU times: user 13.2 ms, sys: 6.61 ms, total: 19.8 ms
Wall time: 22 ms


In [14]:
diplacement_dict = {'Pre June14 Period of displacement':'disp_preJun14',
                      'June July14 Period of displacement':'disp_JunJuly14',
                      'August14 Period of displacement':'disp_Aug14',
                      'Post September 14 Period of displacement':'disp_postSep14',
                      'Post April15 Period of displacement':'disp_postApr15',
                      'Post March 16 Period of displacement':'disp_postMar16',
                      'Post 17 October 16 Period of displacement': 'disp_post17Oct16',
                      'July 17 Period of displacement':'disp_Jul17',
                      'Jan19':'disp_Jan19'}

In [15]:
ret_df.rename(columns=displacement_dict, inplace=True)
out_df.rename(columns=diplacement_dict, inplace=True)

In [16]:
ret_df.dropna(how='all', inplace=True)
ret_df['date'] = pd.to_datetime(ret_df['date'])

In [17]:
def trim_all_columns(df):
    """
    Trim whitespace from ends of each value across all series in dataframe
    """
    trim_strings = lambda x: x.strip() if isinstance(x, str) else x
    return df.applymap(trim_strings)



In [18]:
ret_df = trim_all_columns(ret_df)
out_df = trim_all_columns(out_df)

In [19]:
ret_df.sort_values(['Location ID', 'date'], inplace=True)

In [20]:
ret_df.dropna(how='any', inplace=True)

In [21]:
ret_df['ret_delta'] = ret_df.groupby(['Location ID'])['Returnee Families'].transform(lambda x: x.diff()) 



In [22]:
ret_df.sort_values(['Location ID', 'date'], inplace=True)
ret_df.reset_index(inplace=True)
ret_df.drop(columns='index', axis=1, inplace=True)

In [34]:
ret_df.columns

Index(['Location ID', 'Place_ID', 'Governorate', 'District', 'Location_name',
       'Arabic_name', 'Latitude', 'Longitude', 'Returnee Families',
       'Returnee Individuals', 'Anbar', 'Babylon', 'Baghdad', 'Basrah',
       'Dahuk', 'Diyala', 'Erbil', 'Kerbala', 'Kirkuk', 'Missan', 'Muthanna',
       'Najaf', 'Ninewa', 'Qadissiya', 'Salahal Din', 'Sulaymaniyah',
       'Thi Qar', 'Wassit', 'Camp', 'Habitual Pre_31_October2018',
       'Habitual Residence (Habitable)', 'Habitual Residence (Uninhabitable)',
       'Host_families', 'Hotel_Motel', 'Informal_settlements', 'Other',
       'Religious_building', 'Rented_houses', 'School_building',
       'Unfinished_Abandoned_building', 'Unknown_shelter_type',
       'Pre June14 Period of displacement',
       'June July14 Period of displacement', 'August14 Period of displacement',
       'Post September 14 Period of displacement',
       'Post April15 Period of displacement',
       'Post March 16 Period of displacement',
       'Post 17 Oct

In [23]:
#should be functionized 
for i in range(len(ret_df)):
    if np.isnan(ret_df.at[i, 'ret_delta']):
        ret_df.at[i, 'ret_delta'] = ret_df.at[i, 'Returnee Families']
    else:
        None

In [24]:
out_df.sort_values(['Location ID', 'date'], inplace=True)
out_df.reset_index(inplace=True)
out_df.drop(columns='index', axis=1, inplace=True)

In [25]:
master_df = ret_df.merge(out_df, how='outer', on=['Location ID', 'date', 'Governorate',
                                                 'District', 'Location_name'])

In [26]:
master_df.rename(columns={'Families':'outflow', 'ret_delta':'inflow'}, inplace=True)

In [28]:
master_df.columns

Index(['Location ID', 'Place_ID', 'Governorate', 'District', 'Location_name',
       'Arabic_name', 'Latitude_x', 'Longitude_x', 'Returnee Families',
       'Returnee Individuals', 'Anbar_x', 'Babylon_x', 'Baghdad_x', 'Basrah_x',
       'Dahuk_x', 'Diyala_x', 'Erbil_x', 'Kerbala_x', 'Kirkuk_x', 'Missan_x',
       'Muthanna_x', 'Najaf_x', 'Ninewa_x', 'Qadissiya_x', 'Salahal Din_x',
       'Sulaymaniyah_x', 'Thi Qar_x', 'Wassit_x', 'Camp_x',
       'Habitual Pre_31_October2018', 'Habitual Residence (Habitable)',
       'Habitual Residence (Uninhabitable)', 'Host_families', 'Hotel_Motel',
       'Informal_settlements', 'Other_x', 'Religious_building',
       'Rented_houses', 'School_building', 'Unfinished_Abandoned_building',
       'Unknown_shelter_type', 'Pre June14 Period of displacement',
       'June July14 Period of displacement', 'August14 Period of displacement',
       'Post September 14 Period of displacement',
       'Post April15 Period of displacement',
       'Post March 16 

In [None]:
master_df['date'] = master_df['date'].apply(lambda x: x.toordinal())
master_df.fillna(0, inplace=True)

In [None]:
X = master_df.drop(columns=['inflow'])
y = master_df['inflow']

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [None]:
rf = RandomForestRegressor()
rf.fit(X_train, y_train)

In [None]:
print("accuracy score:", rf.score(X_test, y_test))

In [None]:

print("\n10. precision:", precision_score(y_test, y_predict))
print("    recall:", recall_score(y_test, y_predict))



In [None]:
for ix, coord in enumerate(zip(conflict_df['latitude'], conflict_df['longitude'])):
    for idx, rcoord in enumerate(zip(ret_df['Latitude'], ret_df['Longitude'])):
        if geopy.distance.distance(coord, rcoord).km > 10:
            print(idx)
            ser[idx] += 1
        else:
            print(idx)