In [93]:
import requests
import json
import time
from datetime import date
from datetime import datetime
from helpers.history_fetcher import HistoryFetcher
import dateutil.parser
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import gdelt # pip install gdelt
import folium
import os
from tqdm import tqdm_notebook # conda install tqdm
import operator
from helpers.wiki_helpers import get_stability_for_country, make_folium_map, wiki_change_factor

import warnings
# Silence every Python warning for the rest of the session.
# NOTE(review): this also hides pandas/NumPy deprecation warnings - consider narrowing the filter.
warnings.filterwarnings('ignore')

import itertools

# Load IPython's autoreload extension; mode 2 re-imports modules before each cell runs,
# so edits to the local `helpers` package are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Turn off pandas' chained-assignment (SettingWithCopy) warning for the whole notebook.
pd.options.mode.chained_assignment = None

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


### Gather GDELT data
#### EXECUTE ONLY ON THE SERVER (memory issues)

In [None]:
### This code is executed on the big server to fetch the results for 2011 through 2017, split into 3 files per year
### to keep each file at a manageable size.

import numpy as np
import pandas as pd
import gdelt # pip install gdelt
import gc

# Download the GDELT 1.0 event table year by year, in three date ranges per year,
# keeping only the columns needed later and pickling each chunk to its own file.
gd1 = gdelt.gdelt(version=1)
for i in range(11, 18):
    year = '20' + str(i)

    # The third chunk runs up to 1 January of the following year,
    # except for 2017 where it stops on 1 December.
    if i != 17:
        last_stop = '20' + str(i + 1) + ' January 01'
    else:
        last_stop = year + ' December 01'

    # NOTE(review): consecutive ranges share their boundary day (May 01, August 01,
    # January 01). If gd1.Search treats both ends as inclusive those days are
    # downloaded twice - confirm against the gdelt package documentation.
    date_ranges = [
        [year + ' January 01', year + ' May 01'],
        [year + ' May 01', year + ' August 01'],
        [year + ' August 01', last_stop],
    ]

    for part, date_range in enumerate(date_ranges, start=1):
        print('year : ', i, part)
        results = gd1.Search(date_range, table='events')
        cols_to_keep = ['SQLDATE', 'ActionGeo_CountryCode', 'QuadClass']
        results = results[cols_to_keep]
        pd.to_pickle(results, 'gdelt_filtered_QuadClass_20' + str(i) + '_' + str(part) + '.pckl')
        # Free the chunk before fetching the next one (the cell is memory-bound).
        del results
        gc.collect()

### Process GDELT Data (aggregate) 
#### EXECUTE ONLY ON THE SERVER (memory issues)

In [None]:
from os import listdir
from os.path import isfile, join
import gc

def process_files():
    """Aggregate every readable GDELT pickle in the current directory.

    Each regular file in '.' is loaded with ``pd.read_pickle``; a ``Counter``
    column of 1.0 is added and the rows are summed per
    (SQLDATE, ActionGeo_CountryCode, QuadClass). The per-file aggregates are
    stacked, and after every successfully processed file the cumulative result
    is written to ``'<i>_QuadClass.pckl'``, where ``i`` is the file's position
    in the directory listing (skipped files still consume an index).

    Files that cannot be unpickled, or that lack the expected columns, are
    reported and skipped.

    Returns:
        The cumulative aggregated DataFrame (an empty DataFrame when no file
        could be processed).
    """
    # NOTE(review): read_pickle executes whatever the pickle contains - only run
    # this in a directory whose files you produced yourself.
    onlyfiles = [f for f in listdir('.') if isfile(join('.', f))]
    key_cols = ['SQLDATE', 'ActionGeo_CountryCode', 'QuadClass']

    grouped = None
    for i, file in enumerate(onlyfiles):
        print('processing ', file)
        gc.collect()
        try:
            data = pd.read_pickle(file)
            data['Counter'] = 1.0
            data = data[key_cols + ['Counter']]
        except Exception as err:
            # Not a pickle, not a DataFrame, or missing columns: skip it, but say why.
            print('skipped ', file, ':', repr(err))
            continue
        part = data.groupby(key_cols).sum()
        # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
        grouped = part if grouped is None else pd.concat([grouped, part])
        pd.to_pickle(grouped, str(i) + '_QuadClass.pckl')
        del data

    return grouped if grouped is not None else pd.DataFrame()

### Show the gathered data, save to the aggregated pickle file

In [83]:
# Human-readable labels for the QuadClass codes found in the aggregated data.
# np.nan is used instead of np.NaN, which was removed in NumPy 2.0.
QuadClass_event_dic = {1: 'Verbal Cooperation',
                       2: 'Material Cooperation',
                       3: 'Verbal Conflict',
                       4: 'Material Conflict',
                       np.nan: '',
                       0: 'UNKNOWN'}


aggregated_gdelt = pd.read_pickle('data/20_QuadClass.pckl')
# Turn the (SQLDATE, ActionGeo_CountryCode, QuadClass) index back into columns.
aggregated_gdelt = aggregated_gdelt.reset_index()
# Replace the numeric QuadClass codes with their labels.
aggregated_gdelt['QuadClass'] = aggregated_gdelt['QuadClass'].map(QuadClass_event_dic)
aggregated_gdelt = aggregated_gdelt.sort_values('SQLDATE')
# Keep only events dated after 1 January 2011.
# NOTE(review): the strict '>' also drops 2011-01-01 itself - confirm whether '>=' was intended.
# The explicit copy makes the column assignment below act on an independent frame.
aggregated_gdelt = aggregated_gdelt[aggregated_gdelt['SQLDATE'] > 20110101].copy()
# Convert the YYYYMMDD integers to timestamps in one vectorized call
# (replaces two row-by-row apply() passes that produced the same datetime column).
aggregated_gdelt['SQLDATE'] = pd.to_datetime(aggregated_gdelt['SQLDATE'].astype(str), format='%Y%m%d')
pd.to_pickle(aggregated_gdelt, 'data/aggregated_gdelt_QuadClass.pckl')
aggregated_gdelt.head(20)

Unnamed: 0,SQLDATE,ActionGeo_CountryCode,QuadClass,Counter
151776,2011-01-02,NL,Material Conflict,27.0
151778,2011-01-02,NO,Material Cooperation,4.0
151779,2011-01-02,NO,Verbal Conflict,11.0
151777,2011-01-02,NO,Verbal Cooperation,41.0
151780,2011-01-02,NO,Material Conflict,12.0
151781,2011-01-02,NP,Verbal Cooperation,140.0
151782,2011-01-02,NP,Material Cooperation,18.0
151783,2011-01-02,NP,Verbal Conflict,39.0
151757,2011-01-02,MX,Verbal Conflict,33.0
151785,2011-01-02,NS,Verbal Cooperation,2.0


### Combining all the UNdata frames together (basic info about the countries)

In [91]:

# --- Share of internet users per country, indexed by country name ---
cols_to_keep = ['Country or Area', 'Value']
nr_internet_people = pd.read_csv('data/nr_people_internet.csv')[cols_to_keep]
nr_internet_people.columns = ['Country', 'Internet Users%']
nr_internet_people = nr_internet_people.set_index('Country')

# --- Total population per country (rows of the 'Medium' variant only) ---
nr_all_people = pd.read_csv('data/nr_people_all.csv')
nr_all_people = nr_all_people[nr_all_people['Variant'] == 'Medium'][cols_to_keep]
nr_all_people.columns = ['Country', 'Population[k]']
nr_all_people = nr_all_people.set_index('Country')

# Keep only countries present in both tables, then derive the absolute number of internet users.
population_data = pd.concat([nr_all_people, nr_internet_people], axis=1, join='inner')
population_data['Internet users[k]'] = population_data['Population[k]'] * population_data['Internet Users%'] / 100


# --- Country names, ISO codes and region (the first data row of the CSV is dropped) ---
countries_data = pd.read_csv('data/country-codes.csv')[1:]
cols_to_keep = ['official_name_en', 'ISO3166-1-Alpha-3', 'ISO3166-1-Alpha-2', 'Region Name']
countries_data = countries_data[cols_to_keep]
countries_data.columns = ['Country', 'Code', 'Code2', 'Region']
# Placeholders that later cells fill in; np.nan replaces np.NaN (removed in NumPy 2.0).
countries_data['Wiki Stability MLE'] = np.nan
countries_data['Wiki Instability old'] = np.nan
countries_data['Wiki mean change day'] = np.nan
countries_data = countries_data.set_index('Country')

# Inner-join on the country name: countries missing from any table are dropped.
countries_data = pd.concat([countries_data, population_data], axis=1, join='inner')
# Flag countries with more than 1000 in the 'Internet users[k]' column
# (presumably thousands, i.e. more than one million users - confirm the unit).
countries_data['Big Internet'] = countries_data['Internet users[k]'] > 1000

# --- GDP ---
gdp_data = pd.read_csv('data/UNdata_GDP.csv')
keep_cols = ['Country or Area', 'Value']
gdp_data = gdp_data[keep_cols]
gdp_data.columns = ['Country', 'GDP']
gdp_data = gdp_data.set_index('Country')

# countries_data is still indexed by country name here, so it can be joined directly.
countries_data = pd.concat([countries_data, gdp_data], axis=1, join='inner')
countries_data = countries_data.reset_index()

# NOTE(review): the GDP values in the displayed output (e.g. ~623 for Afghanistan, ~39896 for
# Andorra) look like they are already per-capita figures; dividing by population again may not
# be what was intended - verify against the UNdata source.
countries_data['GDP per capita'] = countries_data['GDP'] / countries_data['Population[k]']

countries_data.head()

Unnamed: 0,Country,Code,Code2,Region,Wiki Stability MLE,Wiki Instability old,Wiki mean change day,Population[k],Internet Users%,Internet users[k],Big Internet,GDP,GDP per capita
0,Afghanistan,AFG,AF,Asia,,,,35530.081,8.26,2934.784691,True,623.184798,0.01754
1,Albania,ALB,AL,Europe,,,,2930.187,63.252933,1853.429211,True,3984.234302,1.35972
2,Algeria,DZA,DZ,Africa,,,,41318.142,38.2,15783.530244,True,4154.118319,0.10054
3,Andorra,AND,AD,Europe,,,,76.965,96.91,74.586782,False,39896.376936,518.370388
4,Angola,AGO,AO,Africa,,,,29784.193,12.4,3693.239932,True,4714.065956,0.158274


### Gathering the stability / instability data (#1 and #2) into the countries_data dataframe
#### The file is saved after every fetched country,
#### because fetching all the data from the internet takes ~3 h

In [31]:

# Resume from the file this loop itself writes
# (the original path 'datacountries_data.pckl' was missing the '/').
countries_data = pd.read_pickle('data/countries_data.pckl')

# Use the names expected by the wiki helpers. The renames are written to the frame
# explicitly; the original edited the array returned by `.values`, which only reaches
# the frame when that array happens to be a view.
countries_data.loc[countries_data['Country'] == 'Syrian Arab Republic', 'Country'] = 'Syria'
# NOTE(review): hard-coded row position, presumably the United States entry - confirm it
# still points at that row if the input tables change.
countries_data.iloc[163, countries_data.columns.get_loc('Country')] = 'United States'

countries = countries_data['Country'].values
codes = countries_data['Code'].values

# Time window passed to get_stability_for_country (the same for every country).
start = '20130101000000'
stop = '20171101000000'

for i in tqdm_notebook(range(len(countries))):
    try:
        # Only fetch countries whose stability value is still missing (row i of the frame).
        if pd.isna(countries_data['Wiki Stability MLE'].iloc[i]):
            print('fetching', countries[i])
            stab_factor, mean = get_stability_for_country(countries[i], start, stop)
            instab_factor = wiki_change_factor(countries[i], 2013, 2017, outlier_factor=3.4)
            print(countries[i], 'stab:', stab_factor, 'instab', instab_factor, 'mean', mean)
            # Positional .iloc writes go straight into the frame; the original chained
            # `countries_data[i:i+1][col] = ...` assigns to a temporary slice.
            countries_data.iloc[i, countries_data.columns.get_loc('Wiki Stability MLE')] = stab_factor
            countries_data.iloc[i, countries_data.columns.get_loc('Wiki Instability old')] = instab_factor
            countries_data.iloc[i, countries_data.columns.get_loc('Wiki mean change day')] = mean
            # Save after every fetched country so an interrupted run can be resumed.
            pd.to_pickle(countries_data, 'data/countries_data.pckl')
        else:
            print(countries[i], 'Done already')
    except KeyError:
        # A missing column (or a KeyError raised by a helper) stops the whole run.
        break
    except Exception as e:
        # Best effort: report the failure and move on to the next country.
        # (`e` was undefined in the original handler, which raised NameError here.)
        print('skipped: ', countries[i], e)


Processing  AA - Aruba
Processed  Aruba corr :  0.00297966381665
Processing  AC - Antigua and Barbuda
Processed  Antigua and Barbuda corr :  0.244411343431
Processing  AE - United Arab Emirates
Processed  United Arab Emirates corr :  0.0230502608718
Processing  AF - Afghanistan
Processed  Afghanistan corr :  0.15006895486
Processing  AG - Algeria
Processed  Algeria corr :  0.512626335958
Processing  AJ - Azerbaijan
Processed  Azerbaijan corr :  0.144147357664
Processing  AL - Albania
Processed  Albania corr :  0.0437615519295
Processing  AM - Armenia
Processed  Armenia corr :  0.000452808221881
Processing  AN - Andorra
Processed  Andorra corr :  0.0558129880402
Processing  AO - Angola
Processed  Angola corr :  0.220731012676
Processing  AR - Argentina
Processed  Argentina corr :  0.00176887707684
Processing  AS - Australia
Processed  Australia corr :  0.215757339925
Processing  AU - Austria
Processed  Austria corr :  0.190185443348
Processing  AV - nan
Processed  AV  NaN CODE 
Processi

Processed  Nicaragua corr :  0.140946181869
Processing  NZ - New Zealand
Processed  New Zealand corr :  0.103000339137
Processing  PA - Paraguay
Processed  Paraguay corr :  0.0803739268951
Processing  PE - Peru
Processed  Peru corr :  0.470972845416
Processing  PK - Pakistan
Processed  Pakistan corr :  0.0872624420383
Processing  PL - Poland
Processed  Poland corr :  0.37415325838
Processing  PM - Panama
Processed  Panama corr :  0.000231725465687
Processing  PO - Portugal
Processed  Portugal corr :  0.253703226538
Processing  PP - Papua New Guinea
Processed  Papua New Guinea corr :  0.304630708834
Processing  PU - Guinea-Bissau
Processed  Guinea-Bissau corr :  0.361201569268
Processing  QA - Qatar
Processed  Qatar corr :  0.0339311480135
Processing  RM - Marshall Islands
Processed  Marshall Islands corr :  0.041244309376
Processing  RO - Romania
Processed  Romania corr :  0.339121545976
Processing  RP - Philippines
Processed  Philippines corr :  0.280969747587
Processing  RQ - Puerto 

### Fetching the correlation data for the countries
##### this process takes ~ 3h

In [144]:


# One result column per QuadClass. Create them only when missing, so that re-running the
# cell keeps earlier results and the "Already processed." check below can actually fire
# (the original reset all four columns to NaN on every run).
corr_cols = ['Pearson Corr QuadClass' + str(k) for k in range(1, 5)]
for col in corr_cols:
    if col not in countries_data.columns:
        countries_data[col] = np.nan

for cntr_code in sorted(set(cntr_codes)):
    rows = countries_data['FIPS'] == cntr_code
    if not rows.any():
        # No country carries this FIPS code: nothing to update.
        continue

    prev_val = countries_data.loc[rows, corr_cols[0]].values[0]
    if not pd.isna(prev_val):
        print(cntr_code, 'Already processed.')
        continue

    name = countries_data.loc[rows, 'Country'].values
    print('Processing ', cntr_code, '-', name[0])

    # A missing country name renders as the 3-character string 'nan'; skip those rows.
    if len(str(name[0])) > 3:
        corr, _ = analyse_wiki_events_correlation_QuadClass(cntr_code, name[0], '20110101000000', '20171215000000')
        # .loc replaces .ix, which was removed in pandas 1.0.
        for k, col in enumerate(corr_cols):
            countries_data.loc[rows, col] = corr[k]
        print('Processed ', name[0], 'corr : ', corr)
        # Save right after each result; the original saved at the start of every
        # iteration, so the last country's result was never written to disk.
        pd.to_pickle(countries_data, 'countries_data.pckl')
    else:
        print('Processed ', cntr_code, ' NaN CODE ')

# Final save so the file exists even when no country needed processing.
pd.to_pickle(countries_data, 'countries_data.pckl')
        

Processing  AA - Aruba
[ 0.00263424  0.01465992 -0.02288378  0.0155933 ]
Processed  Aruba corr :  [ 0.00263424  0.01465992 -0.02288378  0.0155933 ]
Processing  AC - Antigua and Barbuda
[ 0.23580558  0.27464914  0.14103313  0.25541004]
Processed  Antigua and Barbuda corr :  [ 0.23580558  0.27464914  0.14103313  0.25541004]
Processing  AE - United Arab Emirates
[ 0.04952583 -0.00053588 -0.00732353  0.00061132]
Processed  United Arab Emirates corr :  [ 0.04952583 -0.00053588 -0.00732353  0.00061132]
Processing  AF - Afghanistan
[ 0.07250873  0.1873494   0.06244916  0.26092188]
Processed  Afghanistan corr :  [ 0.07250873  0.1873494   0.06244916  0.26092188]
Processing  AG - Algeria
[ 0.4971767   0.52476397  0.5496912   0.51797644]
Processed  Algeria corr :  [ 0.4971767   0.52476397  0.5496912   0.51797644]
Processing  AJ - Azerbaijan
[ 0.15340197  0.1522232   0.06912243  0.11087037]
Processed  Azerbaijan corr :  [ 0.15340197  0.1522232   0.06912243  0.11087037]
Processing  AL - Albania
[ 0

[-0.19317129 -0.01523386 -0.11253425  0.00430993]
Processed  Equatorial Guinea corr :  [-0.19317129 -0.01523386 -0.11253425  0.00430993]
Processing  EN - Estonia
[ 0.06615634  0.08644598  0.09215296 -0.0027769 ]
Processed  Estonia corr :  [ 0.06615634  0.08644598  0.09215296 -0.0027769 ]
Processing  ER - Eritrea
[ 0.10190961  0.07961875  0.18703867  0.20077372]
Processed  Eritrea corr :  [ 0.10190961  0.07961875  0.18703867  0.20077372]
Processing  ES - El Salvador
[-0.11344651 -0.26617261 -0.19604026 -0.27269753]
Processed  El Salvador corr :  [-0.11344651 -0.26617261 -0.19604026 -0.27269753]
Processing  ET - Ethiopia
[ 0.06870142  0.09257657  0.08140713  0.08562817]
Processed  Ethiopia corr :  [ 0.06870142  0.09257657  0.08140713  0.08562817]
Processing  EZ - Czech Republic
[ 0.06455809  0.00023994  0.08291628  0.0003863 ]
Processed  Czech Republic corr :  [ 0.06455809  0.00023994  0.08291628  0.0003863 ]
Processing  FI - Finland
[-0.06191077 -0.17086345 -0.14076717 -0.0389415 ]
Proc

[-0.00833967 -0.02134985 -0.0006723  -0.01726665]
Processed  Monaco corr :  [-0.00833967 -0.02134985 -0.0006723  -0.01726665]
Processing  MO - Morocco
[-0.53867252 -0.50986887 -0.4190237  -0.51988716]
Processed  Morocco corr :  [-0.53867252 -0.50986887 -0.4190237  -0.51988716]
Processing  MP - Mauritius
[-0.17037213 -0.08044107 -0.11493247 -0.05663029]
Processed  Mauritius corr :  [-0.17037213 -0.08044107 -0.11493247 -0.05663029]
Processing  MR - Mauritania
[ 0.25389482  0.31775454  0.20988949  0.36356617]
Processed  Mauritania corr :  [ 0.25389482  0.31775454  0.20988949  0.36356617]
Processing  MT - Malta
[ 0.02775499  0.06692454  0.00976494  0.05030192]
Processed  Malta corr :  [ 0.02775499  0.06692454  0.00976494  0.05030192]
Processing  MU - Oman
[  3.91893438e-02  -8.50145616e-05   2.90917307e-01   1.97017989e-01]
Processed  Oman corr :  [  3.91893438e-02  -8.50145616e-05   2.90917307e-01   1.97017989e-01]
Processing  MV - Maldives
[ 0.22590819  0.35038445  0.35556704  0.13229259

[ 0.01202341  0.08274156 -0.00362889 -0.01439373]
Processed  Sao Tome and Principe corr :  [ 0.01202341  0.08274156 -0.00362889 -0.01439373]
Processing  TS - Tunisia
[ 0.5431609   0.584162    0.66091514  0.44934258]
Processed  Tunisia corr :  [ 0.5431609   0.584162    0.66091514  0.44934258]
Processing  TT - Timor-Leste
[-0.06414697 -0.02735459  0.0009805  -0.08916316]
Processed  Timor-Leste corr :  [-0.06414697 -0.02735459  0.0009805  -0.08916316]
Processing  TU - Turkey
[-0.06038838 -0.05041926 -0.02621635 -0.02701476]
Processed  Turkey corr :  [-0.06038838 -0.05041926 -0.02621635 -0.02701476]
Processing  TV - Tuvalu
[-0.01980041  0.06744026 -0.01670348 -0.00463323]
Processed  Tuvalu corr :  [-0.01980041  0.06744026 -0.01670348 -0.00463323]
Processing  TX - Turkmenistan
[-0.07544349  0.00049526 -0.00135244 -0.03058747]
Processed  Turkmenistan corr :  [-0.07544349  0.00049526 -0.00135244 -0.03058747]
Processing  UG - Uganda
[-0.20201086 -0.1855492  -0.02908467 -0.08389769]
Processed  