In [43]:
import numpy as np
import pandas as pd
import seaborn as sns
import xarray as xr

% matplotlib inline

In [12]:
import os
os.listdir()

ing_file = 'conflict_ing_adm0_adm1_list_of_countries.csv' # ING for intergroup conflict
inp_adm0_file = 'conflict_inp_adm0_list_of_countries.csv' # INP for interpersonal conflict
inp_adm1_file = 'conflict_inp_adm1_list_of_countries.csv' # INP for interpersonal conflict

#filepath = os.getcwd() + "/{0}".format(filename)
_filepath = "."+ "/{0}"

# Conflict 
#### Sector lead- Jingyuan

**Task** Determine list of countries and years we absolutely need socioeconomic data for **<span style="color:black; background-color: lightgreen">DONE</span>**

    i.e. years and countries we have outcome data for

Columns # notes directly from Jingyuan
* `country_id`: unique id for each country. There may be duplicates. If a country changes to another country due to revolution, two countries have the same `country_id`. Ex. Soviet Union and Russia have the same `country_id`. NOT Uniform across sectors.

* `iso`: unique country code. All sectors use the same `iso` (International Organization for Standardization) country code. Ex. Soviet Union and Russia have different `iso` codes, as do other countries with similar political changes (e.g. Germany, Sudan). UNIFORM across sectors.

* `name_engli`: [Str] country name in English 
* `year_adm0covariates`: years we need for country level covariates (gdp_pc, population, ...) 
* `year_adm1covariates`

**Note:**

`gdppc` = GDP per capita (total GDP / population for each country, i.e. GDP per unit of analysis, where the unit is a person living in the country)

**<span style="color:darkgreen; background-color: yellow">Conclusion</span>**

* Intergroup conflict

    * countries(iso): 261 (To get values: `ing_data.iso`)
    * countries (country_id): 257 + 4 duplicates
        Duplicates have `country_id` of (iso in parens):
        * 60: Germany (DEU), West Germany (FDE)
        * 72: Ethiopia (ETH), Ethiopia (FET)
        * 104: Indonesia (IDN), Indonesia (FID)
        * 191: Soviet Union (SUN), Russia (RUS)
                  
    * years (ADM0/national): `1946-2016` for all countries
    * years (ADM1/subnational): `1989-2016` for all countries

* Interpersonal conflict (ADM0 level only)

    * countries: 58
    * years: 1995-2012

**Questions**
1. Are there any country-level covariates needed besides `gdppc` and population?



### A-Intergroup Conflict

In [57]:
# Open data 
def open_data(filename):
    """Read a CSV that lives next to the notebook into a DataFrame.

    Parameters
    ----------
    filename : str
        File name to substitute into the `_filepath` template.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of the file, as returned by `pd.read_csv`.
    """
    return pd.read_csv(_filepath.format(filename))


# Open the intergroup-conflict data through the shared helper
ing_filepath = _filepath.format(ing_file)
ing_data = open_data(ing_file)

In [58]:
# Exploratory data analysis
ing_data = open_data(ing_file)
ing_data

# Column labels (a pandas Index):
# 'country_id', 'iso', 'name_engli', 'year_adm0covariates', 'year_adm1covariates'
ing_data.columns

# A column can be reached either as df['col_name'] or as df.col_name;
# both forms give the same values, so this evaluates to True.
ing_data['country_id'].eq(ing_data.country_id).all()

# Summary statistics (count, mean, std, quartiles, max) are available via:
# ing_data.describe()


True

In [87]:
# Distinct ISO codes vs. distinct country_id values
num_iso = ing_data.iso.nunique()  # 261
num_country = ing_data.country_id.nunique()  # 257

# 4 ISO codes reuse a country_id that already belongs to another ISO code
num_duplicated_ids = num_iso - num_country

# Rows whose country_id already appeared earlier in the table.
# A boolean mask is used instead of np.where + .loc: np.where yields
# positions while .loc looks up labels, which only agree on a default index.
ing_data[ing_data.country_id.duplicated()]

Unnamed: 0,country_id,iso,name_engli,year_adm0covariates,year_adm1covariates
60,60,FDE,West Germany,1946-2016,1989-2016
73,72,FET,Ethiopia,1946-2016,1989-2016
106,104,FID,Indonesia,1946-2016,1989-2016
194,191,SUN,Soviet Union,1946-2016,1989-2016


In [125]:
# Rows sharing country_id 60 (boolean mask, independent of the index labels)
ing_data[ing_data['country_id'] == 60]
# 59 - Germany (ISO-DEU)
# 60 - West Germany (ISO-FDE)

Unnamed: 0,country_id,iso,name_engli,year_adm0covariates,year_adm1covariates
59,60,DEU,Germany,1946-2016,1989-2016
60,60,FDE,West Germany,1946-2016,1989-2016


In [128]:
# Rows sharing country_id 72 (boolean mask, independent of the index labels)
ing_data[ing_data['country_id'] == 72]
# 72 - Ethiopia (ISO-ETH)
# 73 - Ethiopia (ISO-FET)


Unnamed: 0,country_id,iso,name_engli,year_adm0covariates,year_adm1covariates
72,72,ETH,Ethiopia,1946-2016,1989-2016
73,72,FET,Ethiopia,1946-2016,1989-2016


In [127]:
# Rows sharing country_id 104, selected by value rather than by hard-coded row labels
ing_data[ing_data['country_id'] == 104]
# 105 - Indonesia (ISO-IDN)
# 106 - Indonesia (ISO-FID)



Unnamed: 0,country_id,iso,name_engli,year_adm0covariates,year_adm1covariates
105,104,IDN,Indonesia,1946-2016,1989-2016
106,104,FID,Indonesia,1946-2016,1989-2016


In [123]:
# Soviet Union and Russia share country_id 191; rows are selected by index label,
# listed as 194 then 193 so the Soviet Union row is displayed first.
ing_data.loc[[194,193]]
# 194 - Soviet Union (ISO-SUN)
# 193 - Russia (ISO-RUS)

Unnamed: 0,country_id,iso,name_engli,year_adm0covariates,year_adm1covariates
194,191,SUN,Soviet Union,1946-2016,1989-2016
193,191,RUS,Russia,1946-2016,1989-2016


In [None]:
# NOTE(review): this cell repeats the earlier Indonesia lookup and has no
# execution count (In [None]) — looks like a leftover; consider deleting it.
ing_data.loc[[105,106]]
# 105 - Indonesia (ISO-IDN)
# 106 - Indonesia (ISO-FID)


In [59]:
# determine # unique values
ing_data.year_adm0covariates.nunique()
ing_data.year_adm1covariates.nunique() 

1

In [60]:
# determine unique values
print(ing_data.year_adm0covariates.unique())
print(ing_data.year_adm1covariates.unique())

['1946-2016']
['1989-2016']


### B - Interpersonal Conflict

**<span style="color:darkgreen; background-color: yellow">Conclusion</span>**

* countries: 58
* years: 1995-2012

In [62]:
# Interpersonal conflict is analysed at ADM0 level only, so load the ADM0 list.
# (`inp_file` was never defined; the configured names are inp_adm0_file / inp_adm1_file.)
inp_data = open_data(inp_adm0_file)


In [65]:
# Summary statistics; not rendered because it is not the cell's last expression
inp_data.describe()
# -> 58 unique countries
# -> a single unique year_adm0covariates value

# The one year range shared by every country
inp_data['year_adm0covariates'].unique()
# -> ['1995-2012']

array(['1995-2012'], dtype=object)

In [130]:
# Display the interpersonal-conflict table (columns: iso, countryname, year_adm0covariates).
# NOTE(review): this renders every row; inp_data.head() may be enough — confirm.
inp_data

Unnamed: 0,iso,countryname,year_adm0covariates
0,AUS,Australia,1995-2012
1,AUT,Austria,1995-2012
2,BEL,Belgium,1995-2012
3,BGD,Bangladesh,1995-2012
4,BHS,"Bahamas, The",1995-2012
5,BLZ,Belize,1995-2012
6,BRB,Barbados,1995-2012
7,CAN,Canada,1995-2012
8,CHE,Switzerland,1995-2012
9,COL,Colombia,1995-2012
