In [25]:
import pandas as pd
import gzip



In [56]:
# Read the gzip-compressed CSV inside a context manager so the file handle is
# closed as soon as pandas has finished parsing it.
with gzip.open('data/manipulation/ucdp_with_country.csv.gz', 'rb') as gz_file:
    df = pd.read_csv(gz_file)

# Show the last rows as a quick check of what was loaded.
df.tail()

  exec(code_obj, self.user_global_ns, self.user_ns)


Unnamed: 0,index,relid,year,geom_wkt,longitude,latitude,type_of_violence,best,low_amend,high_amend,conflict_new_id,country,country_id,region,OWID_country_name,method,flag
280250,1683,NIG-2022-2-2-XXX475-9,2022,POINT (3.395833 6.453056),3.395833,6.453056,2,2,2,2,119,Nigeria,475,Africa,Nigeria,Nearest,
280251,1684,UKR-2022-1-14117-20,2022,POINT (30.203448 45.254159),30.203448,45.254159,1,0,0,13,13243,Ukraine,369,Europe,Ukraine,Nearest,
280252,1685,UKR-2022-1-14117-99,2022,POINT (31.54505 46.61472),31.54505,46.61472,1,2,2,2,13243,Ukraine,369,Europe,Ukraine,Nearest,
280253,1686,UKR-2022-1-14117-32,2022,POINT (32.61458 46.63695),32.61458,46.63695,1,2,2,2,13243,Ukraine,369,Europe,Ukraine,Nearest,
280254,1687,UKR-2022-1-14117-62.2,2022,POINT (32.6151 46.63737),32.6151,46.63737,1,4,4,4,13243,Ukraine,369,Europe,Ukraine,Nearest,


### Aggregate 

A function that aggregates specified columns by a specified group and tidies up the indexes/column names

In [99]:
def my_aggregation(df, group_by_list, aggregate_over_list, agg_fun):
    """Group ``df``, aggregate selected columns and flatten the result.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to aggregate.
    group_by_list : str or list of str
        Column(s) to group by; they come back as ordinary columns.
    aggregate_over_list : str or list of str
        Column(s) to aggregate. A single column name is accepted as well.
    agg_fun : str, callable, or list of these
        Aggregation(s) handed to ``GroupBy.agg`` (for example ``'sum'``).

    Returns
    -------
    pandas.DataFrame
        One row per group with a default integer index. Every aggregated
        column is named ``"<column> <function>"``, e.g. ``"best sum"``.
    """
    # A bare column name would make pandas return single-level columns, and the
    # join below would then split that label into characters ('sum' -> 's u m').
    if isinstance(aggregate_over_list, str):
        aggregate_over_list = [aggregate_over_list]

    # Always hand .agg a list so the columns come back as (column, function) pairs.
    agg_funs = list(agg_fun) if isinstance(agg_fun, (list, tuple)) else [agg_fun]

    df_agg = df.groupby(group_by_list)[aggregate_over_list].agg(agg_funs)

    # Flatten the two-level column index; str() keeps non-string labels joinable.
    df_agg.columns = [' '.join(map(str, col)).strip() for col in df_agg.columns.values]

    # Turn the group keys back into regular columns.
    return df_agg.reset_index()

In [102]:
# Total the best / low / high death estimates for every
# country x violence type x year combination using the shared helper.
df_country_agg_deaths = my_aggregation(
    df,
    ['OWID_country_name', 'type_of_violence', 'year'],
    ['best', 'low_amend', 'high_amend'],
    'sum',
)

df_country_agg_deaths.head()

Unnamed: 0,OWID_country_name,type_of_violence,year,best count,low_amend count,high_amend count
0,Afghanistan,1,1989,137,137,137
1,Afghanistan,1,1990,39,39,39
2,Afghanistan,1,1991,60,60,60
3,Afghanistan,1,1992,69,69,69
4,Afghanistan,1,1993,105,105,105


Aggregate high, low and best death estimates by type of violence, year...

...and by country

In [27]:
# Sum the three death estimates per country, violence type and year, then
# expose the country name under the shared 'entity' label.
df_country_agg_deaths = (
    df.groupby(['OWID_country_name', 'type_of_violence', 'year'])[['best', 'low_amend', 'high_amend']]
    .sum()
    .reset_index()
    .rename(columns={"OWID_country_name": "entity"})
)

df_country_agg_deaths.head()

Unnamed: 0,entity,type_of_violence,year,best,low_amend,high_amend
0,Afghanistan,1,1989,5174,1883,17074
1,Afghanistan,1,1990,1478,1322,2276
2,Afghanistan,1,1991,3302,3156,3802
3,Afghanistan,1,1992,4287,4270,5471
4,Afghanistan,1,1993,4071,4043,8205


...by region (according to UCDP region definitions)

In [28]:
# Same aggregation as for countries, grouped by region instead; the region
# name is exposed under the shared 'entity' label.
df_region_agg_deaths = (
    df.groupby(['region', 'type_of_violence', 'year'])[['best', 'low_amend', 'high_amend']]
    .sum()
    .reset_index()
    .rename(columns={"region": "entity"})
)

df_region_agg_deaths.head()

Unnamed: 0,entity,type_of_violence,year,best,low_amend,high_amend
0,Africa,1,1989,34546,34516,47458
1,Africa,1,1990,64847,62499,78847
2,Africa,1,1991,25510,23653,35629
3,Africa,1,1992,6922,6814,23260
4,Africa,1,1993,15091,15084,27250


... World aggregate

In [29]:
# Global totals: group only by violence type and year, then label every row
# with the constant entity "World".
df_world_agg_deaths = (
    df.groupby(['type_of_violence', 'year'])[['best', 'low_amend', 'high_amend']]
    .sum()
    .reset_index()
    .assign(entity="World")
)

df_world_agg_deaths.head()

Unnamed: 0,type_of_violence,year,best,low_amend,high_amend,entity
0,1,1989,54414,50204,84565,World
1,1,1990,80023,76816,97591,World
2,1,1991,70364,68075,91497,World
3,1,1992,50160,46702,75324,World
4,1,1993,37147,36204,59363,World


Append aggregates together

In [31]:
# Stack the country, region and world aggregates into one long table.
# Columns are aligned by name, so the different position of 'entity' in the
# world frame does not matter.
frames = [df_country_agg_deaths, df_region_agg_deaths, df_world_agg_deaths]

# ignore_index=True renumbers the rows; without it each frame keeps its own
# 0..n index and the combined table ends up with duplicate index labels.
df_agg = pd.concat(frames, ignore_index=True)
df_agg.head()

Unnamed: 0,entity,type_of_violence,year,best,low_amend,high_amend
0,Afghanistan,1,1989,5174,1883,17074
1,Afghanistan,1,1990,1478,1322,2276
2,Afghanistan,1,1991,3302,3156,3802
3,Afghanistan,1,1992,4287,4270,5471
4,Afghanistan,1,1993,4071,4043,8205


### Aggregate the number of conflicts

Count conflict ids by type of violence, year...

...and by country

In [None]:
# Number of distinct conflicts per country, violence type and year.
# nunique() counts each conflict id once per group. Summing the ids would give
# a meaningless number, and df.groupby(...).count() would count rows rather
# than distinct conflicts.
# The result gets its own name so the deaths aggregate is not overwritten.
df_country_agg_conflicts = (
    df.groupby(['OWID_country_name', 'type_of_violence', 'year'])[['conflict_new_id']]
    .nunique()
    .reset_index()
    .rename(columns={"OWID_country_name": "entity"})
)

df_country_agg_conflicts.head()


Add an 'all types of violence' category, summing over the violence types.