In [14]:
import pandas as pd
import requests, zipfile, io
from zipfile import ZipFile

Load the main UCDP Georeferenced Event Dataset (GED), version 22.1, which covers 1989-2021.

In [15]:
# Read the GED events CSV straight out of the zip archive. Both the archive
# and the member stream are opened in context managers so the file handles
# are released as soon as the frame has been loaded.
with ZipFile("data/input/ged221-csv.zip") as xy_file:
    with xy_file.open('GEDEvent_v22_1.csv') as csv_member:
        # low_memory=False makes pandas infer one dtype per column over the
        # whole file rather than chunk by chunk, which avoids the mixed-type
        # DtypeWarning that large CSVs can trigger.
        df = pd.read_csv(csv_member, low_memory=False)

# Show the last few rows as a quick visual check of the load.
df.tail()


  exec(code_obj, self.user_global_ns, self.user_ns)


Unnamed: 0,id,relid,year,active_year,code_status,type_of_violence,conflict_dset_id,conflict_new_id,conflict_name,dyad_dset_id,...,date_end,deaths_a,deaths_b,deaths_civilians,deaths_unknown,best,high,low,gwnoa,gwnob
293629,15525,MZM-1989-3-1347-4,1989,1,Clear,3,498,562,Renamo - Civilians,498,...,1989-03-24 00:00:00.000,0,0,9,0,9,9,9,,
293630,15524,MZM-1989-3-1347-16,1989,1,Clear,3,498,562,Renamo - Civilians,498,...,1989-07-15 00:00:00.000,0,0,9,0,9,9,9,,
293631,15245,MZM-1990-3-1347-18,1990,1,Clear,3,498,562,Renamo - Civilians,498,...,1990-06-10 00:00:00.000,0,0,7,0,7,7,7,,
293632,15516,MZM-1990-3-1347-28,1990,1,Clear,3,498,562,Renamo - Civilians,498,...,1990-11-23 00:00:00.000,0,0,1,0,1,1,1,,
293633,15493,MZM-1990-3-1347-29,1990,1,Clear,3,498,562,Renamo - Civilians,498,...,1990-11-25 00:00:00.000,0,0,6,0,6,6,6,,


### Sense check 

Note that `id` is unique.

In [16]:
# Show every row whose `id` occurs more than once (keep=False flags all copies).
df.loc[df.duplicated(subset=['id'], keep=False)]

Unnamed: 0,id,relid,year,active_year,code_status,type_of_violence,conflict_dset_id,conflict_new_id,conflict_name,dyad_dset_id,...,date_end,deaths_a,deaths_b,deaths_civilians,deaths_unknown,best,high,low,gwnoa,gwnob


Note that `relid` is unique.

In [17]:
# Show every row whose `relid` occurs more than once (keep=False flags all copies).
df.loc[df.duplicated(subset=['relid'], keep=False)]

Unnamed: 0,id,relid,year,active_year,code_status,type_of_violence,conflict_dset_id,conflict_new_id,conflict_name,dyad_dset_id,...,date_end,deaths_a,deaths_b,deaths_civilians,deaths_unknown,best,high,low,gwnoa,gwnob


Note that there are some rows where the death estimates don't make sense.

... where the `high` estimate is lower than the `low` estimate:

In [18]:
# Preview rows whose reported upper bound is below the reported lower bound.
df.query('high < low').head()

Unnamed: 0,id,relid,year,active_year,code_status,type_of_violence,conflict_dset_id,conflict_new_id,conflict_name,dyad_dset_id,...,date_end,deaths_a,deaths_b,deaths_civilians,deaths_unknown,best,high,low,gwnoa,gwnob
89,134354,AFG-1990-1-412-3.1,1990,1,Clear,1,333,333,Afghanistan: Government,726,...,1990-10-12 00:00:00.000,0,36,0,100,136,135,136,700,
206,145886,AFG-1993-1-412-52,1993,1,Clear,1,333,333,Afghanistan: Government,726,...,1993-10-14 00:00:00.000,0,0,0,53,53,33,53,700,
277,146345,AFG-1995-1-412-13.1,1995,1,Clear,1,333,333,Afghanistan: Government,726,...,1995-08-22 00:00:00.000,0,2,0,0,2,1,2,700,
527,186176,AFG-1989-1-413-1000,1989,1,Clear,1,333,333,Afghanistan: Government,732,...,1989-12-31 00:00:00.000,0,0,0,21,21,0,21,700,
4077,133321,AFG-2008-1-327-361.1,2008,1,Clear,1,333,333,Afghanistan: Government,735,...,2008-07-17 00:00:00.000,0,8,0,0,8,7,8,700,


I create amended death estimates (`high_amend` and `low_amend`) in which the `high` and `low` values are swapped wherever `high` < `low`.

In [19]:
# Seed the amended upper bound with the reported `high` estimate.
df['high_amend'] = df['high'].copy()

In [20]:
# Where the reported bounds are inverted (high < low), the reported `low`
# is the larger value, so it becomes the amended upper bound.
bounds_inverted = df['high'].lt(df['low'])
df.loc[bounds_inverted, 'high_amend'] = df.loc[bounds_inverted, 'low']

In [21]:
# Seed the amended lower bound with the reported `low` estimate.
df['low_amend'] = df['low'].copy()

In [22]:
# Where the reported bounds are inverted (high < low), the reported `high`
# is the smaller value, so it becomes the amended lower bound.
bounds_inverted = df['high'].lt(df['low'])
df.loc[bounds_inverted, 'low_amend'] = df.loc[bounds_inverted, 'high']

There are also rows where the `best` estimate falls outside the `low`-`high` range.

In [23]:
# Preview rows whose `best` estimate lies outside the amended [low, high] range.
df.query('best < low_amend or best > high_amend').head()

Unnamed: 0,id,relid,year,active_year,code_status,type_of_violence,conflict_dset_id,conflict_new_id,conflict_name,dyad_dset_id,...,deaths_b,deaths_civilians,deaths_unknown,best,high,low,gwnoa,gwnob,high_amend,low_amend
437,133990,AFG-1989-1-760-41.1,1989,1,Clear,1,333,333,Afghanistan: Government,727,...,18,3,0,22,21,0,700,,21,0
458,134076,AFG-1989-1-760-57.3,1989,1,Clear,1,333,333,Afghanistan: Government,727,...,0,0,78,85,84,0,700,,84,0
7696,168597,AFG-2010-1-327-2484.1,2010,1,Clear,1,333,333,Afghanistan: Government,735,...,3,0,0,3,2,2,700,,2,2
9791,186137,AFG-2011-1-327-3078.3,2011,1,Clear,1,333,333,Afghanistan: Government,735,...,3,0,0,3,4,4,700,,4,4
14967,163028,AFG-2014-1-327-1655.2,2014,1,Clear,1,333,333,Afghanistan: Government,735,...,1,0,0,4,11,5,700,,11,5


I extend the (amended) `high` or `low` values to include the `best` estimate.

In [24]:
# Pull the amended lower bound down to `best` wherever `best` falls below it.
best_below_range = df['best'].lt(df['low_amend'])
df.loc[best_below_range, 'low_amend'] = df.loc[best_below_range, 'best']

In [25]:
# Push the amended upper bound up to `best` wherever `best` exceeds it.
best_above_range = df['best'].gt(df['high_amend'])
df.loc[best_above_range, 'high_amend'] = df.loc[best_above_range, 'best']

### Output as csv

In [26]:
# Write the tidied frame (including the amended bounds) to a gzip-compressed
# CSV, leaving out the positional index.
df.to_csv(
    'data/manipulation/tidied_geolocated_data.csv.gz',
    compression="gzip",
    index=False,
)