In [None]:
#importing libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import chart_studio.plotly as py
import cufflinks as cf
from ipywidgets import interact

# Switch cufflinks to offline plotting so the .iplot() calls in later cells render inside the notebook
# (presumably without needing a Plotly / Chart Studio account - confirm against the cufflinks docs).
cf.go_offline()

In [None]:
# Load the source workbooks: three regional event tables plus the
# country-level demographics table.
africa = pd.read_excel(io='../data/Africa_1997-2020_Mar21.xlsx')
latinamerica = pd.read_excel(io='../data/LatinAmerica_2018-2020_Mar21.xlsx')
middleeast = pd.read_excel(io='../data/MiddleEast_2015-2020_Mar21-1.xlsx')
demographics = pd.read_excel(io='../data/country_profile_variables.xlsx')

# Sanity checks used while developing: call .head() on each frame above and
# look at demographics.dtypes.

# Number of missing values in every demographics column
# (inspect the affected columns with null_cols[null_cols > 0]).
null_cols = demographics.isnull().sum()

# Many columns have gaps; they are left untouched for now because the set of
# countries that will actually be used is not known yet.

In [None]:
# Keep only the 2019 events of each region and tag every row with its region.
# .assign() returns a brand-new frame, so the label is never written into a
# filtered slice of the raw tables (assigning a column onto such a slice
# triggers pandas' SettingWithCopyWarning and may not stick).
year_africa = africa.loc[africa.YEAR == 2019].assign(continent='Africa')
year_latinamerica = latinamerica.loc[latinamerica.YEAR == 2019].assign(continent='LatinAmerica')
year_middleeast = middleeast.loc[middleeast.YEAR == 2019].assign(continent='MiddleEast')

# Stack the three regional event tables into a single event-level frame
percountry_df = pd.concat((year_africa, year_latinamerica, year_middleeast))

# Show the available columns as the cell output
percountry_df.columns

In [None]:
# First exploratory chart: how many events of each type every country had
df_cont_event = percountry_df.loc[:, ['continent', 'COUNTRY', 'EVENT_TYPE', 'EVENT_DATE']]


@interact(Continent=list(df_cont_event['continent'].unique()))
def dyn_linec(Continent):
    """Draw a bar chart of event counts per country for the chosen continent.

    Continent: one of the values of the 'continent' column; ipywidgets turns
    the list passed to @interact into a dropdown and calls this function on
    every change.
    """
    selected = df_cont_event[df_cont_event['continent'] == Continent]

    # No `values` given, so len is applied to the remaining column
    # (EVENT_DATE), i.e. each cell is the number of event rows.
    counts = selected.pivot_table(index=['COUNTRY'],
                                  columns=['continent', 'EVENT_TYPE'],
                                  aggfunc=len)
    counts.iplot(kind='bar', xTitle='Country', yTitle='Total Events',
                 title='Events per country')


In [None]:
# Number of events of each type per (continent, country). aggfunc is len, so
# FATALITIES is only the column whose rows get counted.
eventpercountry_df = percountry_df.pivot_table(
    index=('continent', 'COUNTRY'), columns='EVENT_TYPE',
    values='FATALITIES', aggfunc=(len))

# A country with no event of a given type gets NaN from the pivot -> zero events
eventpercountry_df = eventpercountry_df.fillna(0)

# Total number of events (len) and total fatalities (sum) per country
fatalitiespercountry_df = percountry_df.pivot_table(index='COUNTRY', values='FATALITIES', aggfunc=(len, sum))
fatalitiespercountry_df = fatalitiespercountry_df.rename(
    columns={'len': 'total conflicts 2019', 'sum': 'total fatalities 2019'})

# One (continent, COUNTRY) pair per country. percountry_df holds one row per
# event, so merging it as-is would repeat every country row once per event and
# silently weight all later statistics and plots by event count.
continent_lookup = percountry_df[['continent', 'COUNTRY']].drop_duplicates()

# Merge everything into one country-level frame.
# NOTE: these are inner joins on the country name, so countries whose spelling
# differs between the event data and the demographics table are dropped.
df = pd.merge(eventpercountry_df, demographics, left_on="COUNTRY", right_on="country")
df = pd.merge(df, fatalitiespercountry_df, left_on="country", right_on="COUNTRY")
df = pd.merge(df, continent_lookup, left_on="country", right_on="COUNTRY")

# Checking for NaN values: still a lot of gaps in the health and education columns
null_cols = df.isna().sum()
null_cols[null_cols > 0]

df.columns

In [None]:
# Add population-normalised versions of the event and fatality counts.
# NOTE: the denominator is 'Population in thousands (2017)', so despite the
# "per capita" column names the values are per 1,000 inhabitants. Correlations
# are unaffected by that constant factor.
population_thousands = df['Population in thousands (2017)']
df['total conflicts 2019 per capita'] = df['total conflicts 2019'] / population_thousands
df['total fatalities 2019 per capita'] = df['total fatalities 2019'] / population_thousands
df['total Protests 2019 per capita'] = df['Protests'] / population_thousands
df['total Explosions/Remote violence 2019 per capita'] = df['Explosions/Remote violence'] / population_thousands
df['total Battles 2019 per capita'] = df['Battles'] / population_thousands

# Look for correlations across all the data. df also holds text columns
# (e.g. 'country', 'continent'); DataFrame.corr() raises on those from
# pandas 2.0 onwards, so restrict to numeric columns explicitly.
corr = df.select_dtypes(include='number').corr()
corr

In [None]:
# Mask the diagonal and the upper triangle so each pair of variables is drawn once.
# The builtin `bool` is used as dtype: the `np.bool` alias was deprecated in
# NumPy 1.20 and removed in 1.24.
mask = np.triu(np.ones_like(corr, dtype=bool))

# Set up the matplotlib figure
f, ax = plt.subplots(figsize=(20, 15))

# Diverging colormap so negative and positive correlations get different hues
cmap = sns.diverging_palette(220, 10, as_cmap=True)

# Draw the heatmap on the axes created above. vmax=.3 saturates the colour
# scale at 0.3; the trailing ';' suppresses the Axes repr in the cell output.
sns.heatmap(corr, mask=mask, cmap=cmap, vmax=.3, center=0,
            square=True, linewidths=.5, cbar_kws={"shrink": .5}, ax=ax);

In [None]:
# things I found interesting from above correlation graph:
# 1 - negative correlation between GDP growth and total conflicts and fatalities, decreases for protest
# 2 - Education has little to no correlation with fatalities, but the correlation increases for protests, i.e. more education goes with more protests
# 3 - all Armed conflict lines are negatively correlated with labor force participation female pop %


In [None]:
# 1 - negative correlation between GDP growth and total conflicts and fatalities, decreases for protest
# Only numeric columns go into the correlation table: the text column
# 'country' is left out and select_dtypes guards against any other
# non-numeric column, which DataFrame.corr() rejects from pandas 2.0 onwards.
df[[
    'total conflicts 2019 per capita',
    'total fatalities 2019 per capita',
    'total Protests 2019 per capita',
    'total Explosions/Remote violence 2019 per capita',
    'total Battles 2019 per capita',
    'GDP growth rate (annual %, const. 2005 prices)',
]].select_dtypes(include='number').corr()

In [None]:
# Finding 1, visualised: explosions / remote violence per capita against
# yearly GDP growth, one trace per country.
explpergdp = df.loc[:, ['country',
                        'total Explosions/Remote violence 2019 per capita',
                        'GDP growth rate (annual %, const. 2005 prices)']]

explpergdp.iplot(
    x='total Explosions/Remote violence 2019 per capita',
    y='GDP growth rate (annual %, const. 2005 prices)',
    categories='country',
    xTitle='total Explosions/Remote violence per capita',
    yTitle='GDP growth per year',
    title='Impact of war on GDP growth',
)


In [None]:
# 2 - Education has an inverse correlation to fatalities, but increases for protests,
# thus more education more protests & less fatalities?
# Only numeric columns go into the correlation table: the text column
# 'country' is left out and select_dtypes guards against any other
# non-numeric column, which DataFrame.corr() rejects from pandas 2.0 onwards.
# NOTE: this expression is not the last statement of its cell, so Jupyter does
# not render the table; move it to its own cell to look at it.
df[[
    'total conflicts 2019 per capita',
    'total fatalities 2019 per capita',
    'total Protests 2019 per capita',
    'total Explosions/Remote violence 2019 per capita',
    'total Battles 2019 per capita',
    'Education: Primary gross enrol. ratio (f per 100 pop.)',
    'Education: Primary gross enrol. ratio (m per 100 pop.)',
    'Education: Secondary gross enrol. ratio (f per 100 pop.)',
    'Education: Secondary gross enrol. ratio (m per 100 pop.)',
    'Education: Tertiary gross enrol. ratio (f per 100 pop.)',
    'Education: Tertiary gross enrol. ratio (m per 100 pop.)',
]].select_dtypes(include='number').corr()

# Country-level frame for comparing female tertiary enrolment with protests / fatalities
eduvsprotes = df[['country',
                  'total Protests 2019 per capita',
                  'total fatalities 2019 per capita',
                  'Education: Tertiary gross enrol. ratio (f per 100 pop.)']]


@interact(Selection=['total Protests 2019 per capita', 'total fatalities 2019 per capita'])
def linechart(Selection):
    """Scatter female tertiary enrolment against the selected per-capita metric.

    Selection: name of the column plotted on the x axis; ipywidgets turns the
    list passed to @interact into a dropdown and calls this function on every
    change.
    """
    eduvsprotes.iplot(kind='scatter', x=Selection, xTitle=Selection.title(),
                      y='Education: Tertiary gross enrol. ratio (f per 100 pop.)',
                      yTitle='Tertiary education (f per 100 pop.)',
                      categories='country',
                      # Title follows the dropdown, so the fatalities view is
                      # no longer labelled "Protests".
                      title='Education vs ' + Selection.title())

In [None]:
# Finding 3: every armed-conflict metric correlates negatively with female
# labour force participation. Collect those columns, plus the continent, for plotting.
labvscon = df.loc[:, [
    'continent',
    'total conflicts 2019 per capita',
    'total fatalities 2019 per capita',
    'total Protests 2019 per capita',
    'total Explosions/Remote violence 2019 per capita',
    'total Battles 2019 per capita',
    'Labour force participation (female pop. %)',
]]

#sns.distplot(labvscon['Labour force participation (female pop. %)'])


#one histogram, one box plot