In [None]:
#importing libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import chart_studio.plotly as py
import cufflinks as cf
from ipywidgets import interact

# Switch cufflinks to offline plotting so figures are rendered locally in the
# notebook. NOTE(review): presumably this avoids needing Chart Studio
# credentials - confirm against the cufflinks documentation.
cf.go_offline()

In [None]:
# Load the raw inputs: three regional event workbooks plus one workbook of
# country-level demographic indicators. Paths are relative to the notebook.
africa = pd.read_excel(
    '../data/Africa_1997-2020_Mar21.xlsx'
)
latinamerica = pd.read_excel(
    '../data/LatinAmerica_2018-2020_Mar21.xlsx'
)
middleeast = pd.read_excel(
    '../data/MiddleEast_2015-2020_Mar21-1.xlsx'
)
demographics = pd.read_excel(
    '../data/country_profile_variables.xlsx'
)

# Quick sanity checks (uncomment one at a time to inspect a frame):
# africa.head()
# latinamerica.head()
# middleeast.head()
# demographics.head()
# demographics.dtypes

# Missing-value check on the demographics table:
# null_cols = demographics.isna().sum()
# null_cols[null_cols > 0]

# Author's note: there are many missing values, but since the set of countries
# to analyse is not decided yet, they are left untouched at this stage.

In [None]:
# Keep only the events recorded in 2019 for each regional table.
# .copy() turns every selection into an independent frame, so adding a column
# below does not write into a slice of the raw data (this is what triggers
# pandas' SettingWithCopyWarning when the copy is omitted).
year_africa = africa.loc[africa.YEAR == 2019].copy()
year_latinamerica = latinamerica.loc[latinamerica.YEAR == 2019].copy()
year_middleeast = middleeast.loc[middleeast.YEAR == 2019].copy()

# Tag every event with the regional file it came from.
year_africa['continent'] = 'Africa'
year_latinamerica['continent'] = 'LatinAmerica'
year_middleeast['continent'] = 'MiddleEast'

# Stack the three regional tables into a single event-level frame
# (one row per event). Demographics are NOT part of this frame.
# The original row labels of each table are kept as-is by concat.
percountry_df = pd.concat((year_africa, year_latinamerica, year_middleeast))

# Display the available columns as the cell output.
percountry_df.columns

In [None]:
# Number of events of each type per (continent, country).
# FATALITIES is only used as "a column to count rows of": aggfunc=len returns
# the number of event rows in each group, not a fatality total.
eventpercountry_df = percountry_df.pivot_table(
    index=['continent', 'COUNTRY'],
    columns='EVENT_TYPE',
    values='FATALITIES',
    aggfunc=len,
)

# A country with no event of a given type comes out of the pivot as NaN;
# treat that as zero events.
eventpercountry_df = eventpercountry_df.fillna(0)

# Total number of events and total fatalities per country.
# Named aggregation sets the final column names directly instead of relying on
# the implicit 'len'/'sum' labels, and the string aliases avoid the pandas
# FutureWarning raised when the builtin `sum` is passed as an aggregator.
fatalitiespercountry_df = percountry_df.groupby('COUNTRY')['FATALITIES'].agg(
    **{'total conflicts 2019': 'size', 'total fatalities 2019': 'sum'}
)

# Combine event counts, demographic indicators and totals into one table.
# Both merges are inner joins, so countries whose name does not match exactly
# between the event data and the demographics table are dropped.
df = pd.merge(eventpercountry_df, demographics, left_on="COUNTRY", right_on="country")
df = pd.merge(df, fatalitiespercountry_df, left_on="country", right_on="COUNTRY")

# Attach the continent label. percountry_df holds one row per EVENT, so it is
# reduced to the distinct (continent, COUNTRY) pairs first; merging it directly
# would repeat every country row once per event in that country.
df = pd.merge(
    df,
    percountry_df[['continent', 'COUNTRY']].drop_duplicates(),
    left_on="country",
    right_on="COUNTRY",
)

# Columns that still contain missing values after the merges.
null_cols = df.isna().sum()
null_cols[null_cols > 0]
# Author's note: still a lot of NaN values in the health and education columns.

df.columns

In [None]:
# Add population-normalised versions of the main counts.
# The denominator is the 'Population in thousands (2017)' column, so - going by
# that column's name - the resulting values are per 1,000 inhabitants even
# though the new columns are labelled "per capita".
per_capita_sources = {
    'total conflicts 2019 per capita': 'total conflicts 2019',
    'total fatalities 2019 per capita': 'total fatalities 2019',
    'total Protests 2019 per capita': 'Protests',
    'total Explosions/Remote violence 2019 per capita': 'Explosions/Remote violence',
    'total Battles 2019 per capita': 'Battles',
}

# Dict order is insertion order, so the new columns are appended in the same
# sequence as listed above.
for target_col, source_col in per_capita_sources.items():
    df[target_col] = df[source_col] / df['Population in thousands (2017)']



In [None]:
# Persist the results as CSV files next to the raw data:
#   - df.csv            : the merged per-country table
#   - percountry_df.csv : the event-level table for 2019
# The frames' row index is written out as well (to_csv default).
df.to_csv(path_or_buf="../data/df.csv")
percountry_df.to_csv(path_or_buf="../data/percountry_df.csv")
