In [1]:
# Show the kernel's current working directory (bare `pwd` relies on IPython automagic).
pwd

'/Users/peterhaglich/Dropbox/Work/IARPA/Mercury/peterhaglich/mercury-challenge/src/ExpressScore/notebooks'

In [2]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [3]:
import numpy as np
import pandas as pd
from pandas import Series, DataFrame
from dateutil.parser import parse
import datetime
import calendar
import json
import os
import re

import matplotlib
import matplotlib.pyplot as plt
matplotlib.style.use("fivethirtyeight")

from collections import Counter

from geopy.distance import distance

In [4]:
# Scenario being turned into test fixtures.
EVENT_TYPE = "Military Action"
EVT_ABBR = "MA"
COUNTRY = "Egypt"

# Month under test; the underscore form is what the input file names use.
month_str = "May 2018"
month_path_str = "_".join(month_str.split(" "))

# Inclusive date window, derived from month_str so the two cannot drift apart
# when the notebook is pointed at a different month.
FIRST_DATE = parse("1 " + month_str)
LAST_DATE = FIRST_DATE.replace(
    day=calendar.monthrange(FIRST_DATE.year, FIRST_DATE.month)[1]
)

# Repository root, resolved relative to the notebook's working directory.
MERC_CHALLENGE_HOME = os.path.abspath("../../..")

# NOTE(review): assumes a "mercury" directory sits next to the repository root -- confirm.
MERC_HOME = os.path.join(MERC_CHALLENGE_HOME, "..", "mercury")
EXPRESS_SCORE_PATH = os.path.join(MERC_CHALLENGE_HOME, "src", "ExpressScore")
# Listing the directory doubles as a quick check that the path resolves.
print(os.listdir(EXPRESS_SCORE_PATH))
# Directory that receives every fixture file written by this notebook.
ES_TEST_RESOURCE_PATH = os.path.join(EXPRESS_SCORE_PATH, "resources", "test", "eg_ma_may_2018")
DATA_HOME = os.path.join(MERC_HOME, "data")
WARN_PATH = os.path.join(DATA_HOME, "baserate_warnings", "MANSA")
month_warn_path = os.path.join(WARN_PATH, month_path_str)
MANSA_GSR_PATH = os.path.join(MERC_CHALLENGE_HOME, "data", "gsr", "ma_gsr")

# Scoring limits consumed by ls() and ds().
MAX_DIST = 100.0       # distance limit, compared against dist() results (km)
DIST_BUFFER = 16.67    # subtracted from distance and limit for approximate locations
MAX_DATE_DIFF = 4.0    # day-offset limit, compared against date_diff() results

['.DS_Store', '__init__.py', '__pycache__', 'main', 'notebooks', 'README.md', 'resources', 'test']


In [5]:
# Load the month's MANSA base-rate warnings and keep only the configured event type.
br_warn_filename = "Baserate_MANSA_{}.json".format(month_path_str)
br_warn_path = os.path.join(WARN_PATH, br_warn_filename)
with open(br_warn_path, "r", encoding="utf8") as f:
    br_warn_doc = json.load(f)
# Filter on the EVENT_TYPE config constant instead of repeating the literal,
# and keep the raw document under its own name so br_warn is always a list.
br_warn = [w for w in br_warn_doc["payload"] if w["Event_Type"] == EVENT_TYPE]
# Per-country tally of the retained warnings.
br_country_counts = Counter(w["Country"] for w in br_warn)
print(br_country_counts)

Counter({'Syria': 1685, 'Iraq': 529, 'Lebanon': 42, 'Egypt': 35, 'Saudi Arabia': 5, 'Yemen': 5})


In [6]:
# Load the month's GSR events and keep only the configured event type.
gsr_filename = "MA_{}.json".format(month_path_str)
gsr_path = os.path.join(MANSA_GSR_PATH, gsr_filename)
with open(gsr_path, "r", encoding="utf8") as f:
    gsr = json.load(f)
# Filter on the EVENT_TYPE config constant instead of repeating the literal.
ma_gsr = [e for e in gsr if e["Event_Type"] == EVENT_TYPE]
# Per-country tally of the retained events.
gsr_country_counts = Counter(e["Country"] for e in ma_gsr)
print(gsr_country_counts)

Counter({'Syria': 884, 'Iraq': 439, 'Saudi Arabia': 13, 'Egypt': 10, 'Yemen': 6, 'Lebanon': 4})


In [7]:
# Peek at the first GSR event recorded for the configured country.
test_evt = list(filter(lambda evt: evt["Country"] == COUNTRY, ma_gsr))[0]
test_evt

{'Actor': 'Egyptian Military;Egyptian Police',
 'Approximate_Location': 'False',
 'City': 'Muḩāfaz̧at Shamāl Sīnā’',
 'Country': 'Egypt',
 'Earliest_Reported_Date': '2018-05-10',
 'Event_Date': '2018-05-07',
 'Event_ID': 'MN268719',
 'Event_Subtype': 'Conflict',
 'Event_Type': 'Military Action',
 'First_Reported_Link': 'http://www.almasryalyoum.com/news/details/1289438',
 'GSS_Link': 'http://www.almasryalyoum.com/news/details/1289438',
 'Latitude': 30.5,
 'Longitude': 33.7,
 'News_Source': 'Almasry Alyoum',
 'Other_Links': 'http://www.almasryalyoum.com/news/details/1289770;',
 'Revision_Date': '2018-06-13',
 'State': 'Shamāl Sīnāʼ'}

In [8]:
# Select the GSR events for the country under test and give them small,
# predictable IDs ("MN0", "MN1", ...) before saving them as a fixture.
# The event dicts are shared with ma_gsr, so the new IDs are visible there too.
cc_gsr = [e for e in ma_gsr if e["Country"] == COUNTRY]
print(len(cc_gsr))
for i, e in enumerate(cc_gsr):
    e["Event_ID"] = "MN{}".format(i)
out_filename = "test_cc_gsr.json"
out_path = os.path.join(ES_TEST_RESOURCE_PATH, out_filename)
# ensure_ascii=False writes the non-ASCII place names verbatim, so open the
# file as UTF-8 explicitly rather than relying on the platform default
# encoding (the inputs are read with encoding="utf8" as well).
with open(out_path, "w", encoding="utf8") as f:
    json.dump(cc_gsr, f, ensure_ascii=False, indent=2)

10


In [9]:
# Warnings for the country under test whose Event_Date falls inside the
# inclusive [FIRST_DATE, LAST_DATE] window (each date is parsed once).
cc_warn = [
    w for w in br_warn
    if w["Country"] == COUNTRY
    and FIRST_DATE <= parse(w["Event_Date"]) <= LAST_DATE
]
for i, w in enumerate(cc_warn):
    # Rename the warning subtype to the label the GSR events use.
    if w["Event_Subtype"] == "Armed Conflict":
        w["Event_Subtype"] = "Conflict"
    # Drop every field whose name contains "Target"; collect the names first
    # so the dict is not modified while it is being iterated.
    target_keys = [k for k in w if "Target" in k]
    for k in target_keys:
        del w[k]
    # Small, predictable IDs ("BR_0", "BR_1", ...) for the fixture.
    w["Warning_ID"] = "BR_{}".format(i)
print(len(cc_warn))
out_filename = "test_cc_warnings.json"
out_path = os.path.join(ES_TEST_RESOURCE_PATH, out_filename)
# ensure_ascii=False writes the non-ASCII place names verbatim, so open the
# file as UTF-8 explicitly rather than relying on the platform default encoding.
with open(out_path, "w", encoding="utf8") as f:
    json.dump(cc_warn, f, ensure_ascii=False, indent=2)

35


In [10]:
# Inspect the first selected GSR event after its Event_ID was rewritten.
cc_gsr[0]

{'Actor': 'Egyptian Military;Egyptian Police',
 'Approximate_Location': 'False',
 'City': 'Muḩāfaz̧at Shamāl Sīnā’',
 'Country': 'Egypt',
 'Earliest_Reported_Date': '2018-05-10',
 'Event_Date': '2018-05-07',
 'Event_ID': 'MN0',
 'Event_Subtype': 'Conflict',
 'Event_Type': 'Military Action',
 'First_Reported_Link': 'http://www.almasryalyoum.com/news/details/1289438',
 'GSS_Link': 'http://www.almasryalyoum.com/news/details/1289438',
 'Latitude': 30.5,
 'Longitude': 33.7,
 'News_Source': 'Almasry Alyoum',
 'Other_Links': 'http://www.almasryalyoum.com/news/details/1289770;',
 'Revision_Date': '2018-06-13',
 'State': 'Shamāl Sīnāʼ'}

In [11]:
# Inspect the first selected warning after the subtype/Target-field cleanup.
cc_warn[0]

{'City': 'Hay’at Qanāt as Suways',
 'Country': 'Egypt',
 'Actor': 'Egyptian Police',
 'timestamp': '2018-05-24T2:53:26.0',
 'Longitude': 32.313,
 'Probability': 0.5458124337,
 'Event_Subtype': 'Conflict',
 'Event_Type': 'Military Action',
 'State': 'Muḩāfaz̧at Būr Sa‘īd',
 'Latitude': 31.2522,
 'Event_Date': '2018-05-30'}

In [12]:
def dist(warn, evt):
    """Return the distance in kilometres between a warning and an event.

    Both arguments are dicts with "Latitude" and "Longitude" entries; the
    distance itself is computed by geopy's ``distance``.
    """
    warn_point = (warn["Latitude"], warn["Longitude"])
    evt_point = (evt["Latitude"], evt["Longitude"])
    return distance(warn_point, evt_point).km
def dist_to_warn(w, gsr_list):
    """Return the dist() from warning ``w`` to each event in ``gsr_list``, in order."""
    distances = []
    for evt in gsr_list:
        distances.append(dist(w, evt))
    return distances
def date_diff(warn, evt):
    """Return the signed whole-day offset between the two Event_Date values.

    The result is positive when the warning's date is later than the event's.
    """
    return (parse(warn["Event_Date"]) - parse(evt["Event_Date"])).days
def date_diff_to_warn(w, gsr_list):
    """Return the date_diff() of warning ``w`` against each event in ``gsr_list``, in order."""
    offsets = []
    for evt in gsr_list:
        offsets.append(date_diff(w, evt))
    return offsets
def es_match(warn, evt):
    """Return True when the warning and the event carry the same Event_Subtype."""
    return warn["Event_Subtype"] == evt["Event_Subtype"]
def es_match_to_warn(w, gsr_list):
    """Return 1/0 subtype-match flags for warning ``w`` against each event in ``gsr_list``."""
    flags = []
    for evt in gsr_list:
        flags.append(int(es_match(w, evt)))
    return flags
def actor_match(warn, evt):
    """Return True when the warning's Actor equals one of the event's actors.

    The event's "Actor" field is a ';'-separated list; the warning's "Actor"
    is compared as a single value against each entry.
    """
    candidates = evt["Actor"].split(";")
    wanted = warn["Actor"]
    return any(name == wanted for name in candidates)
def actor_match_to_warn(w, gsr_list):
    """Return 1/0 actor-match flags for warning ``w`` against each event in ``gsr_list``."""
    flags = []
    for evt in gsr_list:
        flags.append(int(actor_match(w, evt)))
    return flags
def ls(dist, approx_flag=False):
    """Score a distance: 1 at zero distance, falling linearly to 0 at the limit.

    When ``approx_flag`` is truthy, DIST_BUFFER is taken off both the distance
    (floored at 0) and the MAX_DIST limit before scoring. Distances beyond the
    limit are not clipped here, so they produce negative scores.
    """
    buffer = approx_flag * DIST_BUFFER
    effective_dist = max(0, dist - buffer)
    return 1 - effective_dist / (MAX_DIST - buffer)
# Element-wise version of ls() for whole arrays.
ls_vfunc = np.vectorize(ls)
def ds(date_diff):
    """Score a day offset: 1 for the same day, falling linearly to 0 at MAX_DATE_DIFF.

    Only the magnitude of ``date_diff`` matters. Offsets beyond the limit are
    not clipped here, so they produce negative scores.
    """
    day_gap = abs(date_diff)
    return 1 - day_gap / MAX_DATE_DIFF
# Element-wise version of ds() for whole arrays.
ds_vfunc = np.vectorize(ds)

In [13]:
# Spot-check each pairwise helper on the first warning: first against the
# first event alone, then against every selected event.
first_warn, first_evt = cc_warn[0], cc_gsr[0]
helper_pairs = (
    (date_diff, date_diff_to_warn),
    (dist, dist_to_warn),
    (es_match, es_match_to_warn),
    (actor_match, actor_match_to_warn),
)
for pair_fn, row_fn in helper_pairs:
    print(pair_fn(first_warn, first_evt))
    print(row_fn(first_warn, cc_gsr))

23
[23, 20, 15, 15, 15, 15, 16, 23, 23, 2]
156.67169537088733
[156.67169537088733, 568.5091388686368, 156.67169537088733, 142.20606062483296, 156.67169537088733, 156.67169537088733, 156.67169537088733, 156.67169537088733, 156.67169537088733, 197.18227373911515]
True
[1, 0, 1, 1, 1, 1, 0, 0, 0, 1]
True
[1, 1, 0, 0, 0, 0, 0, 1, 0, 0]


In [14]:
# Show the subtypes behind the first two es_match results for the first warning.
print(cc_warn[0]["Event_Subtype"], cc_gsr[0]["Event_Subtype"], cc_gsr[1]["Event_Subtype"])

Conflict Conflict Force Posture


In [15]:
# Pairwise distance matrix: one row per warning, one column per GSR event.
warning_ids = [w["Warning_ID"] for w in cc_warn]
event_ids = [e["Event_ID"] for e in cc_gsr]
dist_rows = [[dist(w, e) for e in cc_gsr] for w in cc_warn]
# The reshape keeps the result 2-D even when there are no warnings.
dist_array = np.array(dist_rows).reshape(len(warning_ids), len(event_ids))

dist_df = pd.DataFrame(dist_array, index=warning_ids, columns=event_ids)

dist_df

Unnamed: 0,MN0,MN1,MN2,MN3,MN4,MN5,MN6,MN7,MN8,MN9
BR_0,156.671695,568.509139,156.671695,142.206061,156.671695,156.671695,156.671695,156.671695,156.671695,197.182274
BR_1,245.187081,429.623344,245.187081,281.529367,245.187081,245.187081,245.187081,245.187081,245.187081,35.434948
BR_2,254.486546,453.1969,254.486546,285.1704,254.486546,254.486546,254.486546,254.486546,254.486546,20.95536
BR_3,248.128067,552.115757,248.128067,251.812768,248.128067,248.128067,248.128067,248.128067,248.128067,116.05372
BR_4,70.653347,569.764664,70.653347,0.0,70.653347,70.653347,70.653347,70.653347,70.653347,306.124936
BR_5,70.653347,569.764664,70.653347,0.0,70.653347,70.653347,70.653347,70.653347,70.653347,306.124936
BR_6,0.0,499.51762,0.0,70.653347,0.0,0.0,0.0,0.0,0.0,274.858132
BR_7,0.0,499.51762,0.0,70.653347,0.0,0.0,0.0,0.0,0.0,274.858132
BR_8,183.552847,511.368029,183.552847,196.921849,183.552847,183.552847,183.552847,183.552847,183.552847,119.130765
BR_9,0.0,499.51762,0.0,70.653347,0.0,0.0,0.0,0.0,0.0,274.858132


In [16]:
# Save the distance matrix as a CSV fixture in the test resource directory.
out_filename = "test_cc_dist_matrix.csv"
out_path = os.path.join(ES_TEST_RESOURCE_PATH, out_filename)
dist_df.to_csv(out_path)

In [17]:
# Approximate_Location is stored as the text "True"/"False". Map it explicitly
# instead of eval()-ing strings read from a data file; any other value raises
# KeyError rather than being executed as code.
approx_flag_by_text = {"True": True, "False": False}
is_approx_list = [approx_flag_by_text[e["Approximate_Location"].strip()] for e in cc_gsr]
# The flag depends only on the event, so the same row is repeated once per
# warning to line up with the warning-by-event matrices.
is_approx_array = np.array(is_approx_list*len(cc_warn)).reshape(len(cc_warn), len(cc_gsr))
is_approx_df = pd.DataFrame(is_approx_array,
                       index = [w["Warning_ID"] for w in cc_warn],
                       columns = [e["Event_ID"] for e in cc_gsr])

is_approx_df

Unnamed: 0,MN0,MN1,MN2,MN3,MN4,MN5,MN6,MN7,MN8,MN9
BR_0,False,True,True,False,False,False,False,False,False,True
BR_1,False,True,True,False,False,False,False,False,False,True
BR_2,False,True,True,False,False,False,False,False,False,True
BR_3,False,True,True,False,False,False,False,False,False,True
BR_4,False,True,True,False,False,False,False,False,False,True
BR_5,False,True,True,False,False,False,False,False,False,True
BR_6,False,True,True,False,False,False,False,False,False,True
BR_7,False,True,True,False,False,False,False,False,False,True
BR_8,False,True,True,False,False,False,False,False,False,True
BR_9,False,True,True,False,False,False,False,False,False,True


In [18]:
# Save the approximate-location flags as a CSV fixture in the test resource directory.
out_filename = "test_approx_location_matrix.csv"
out_path = os.path.join(ES_TEST_RESOURCE_PATH, out_filename)
is_approx_df.to_csv(out_path)

In [19]:
# Cap every distance at MAX_DIST (the config constant, broadcast as a scalar
# rather than a hand-built array of the literal 100) so that ls() bottoms out
# at 0 instead of going negative.
ls_dist_array = np.minimum(dist_array, MAX_DIST)
# Score each warning/event pair, applying the buffer where the event's
# location is flagged as approximate.
ls_array = ls_vfunc(ls_dist_array, is_approx_array)
ls_df = pd.DataFrame(ls_array,
                       index = [w["Warning_ID"] for w in cc_warn],
                       columns = [e["Event_ID"] for e in cc_gsr])

ls_df

Unnamed: 0,MN0,MN1,MN2,MN3,MN4,MN5,MN6,MN7,MN8,MN9
BR_0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
BR_1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.774812
BR_2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.948574
BR_3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
BR_4,0.293467,0.0,0.352174,1.0,0.293467,0.293467,0.293467,0.293467,0.293467,0.0
BR_5,0.293467,0.0,0.352174,1.0,0.293467,0.293467,0.293467,0.293467,0.293467,0.0
BR_6,1.0,0.0,1.0,0.293467,1.0,1.0,1.0,1.0,1.0,0.0
BR_7,1.0,0.0,1.0,0.293467,1.0,1.0,1.0,1.0,1.0,0.0
BR_8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
BR_9,1.0,0.0,1.0,0.293467,1.0,1.0,1.0,1.0,1.0,0.0


In [20]:
# Save the ls() score matrix as a CSV fixture in the test resource directory.
out_filename = "test_ls_matrix.csv"
out_path = os.path.join(ES_TEST_RESOURCE_PATH, out_filename)
ls_df.to_csv(out_path)

In [21]:
# Signed day-offset matrix: one row per warning, one column per GSR event.
warning_ids = [w["Warning_ID"] for w in cc_warn]
event_ids = [e["Event_ID"] for e in cc_gsr]
date_diff_rows = [[date_diff(w, e) for e in cc_gsr] for w in cc_warn]
# The reshape keeps the result 2-D even when there are no warnings.
date_diff_array = np.array(date_diff_rows).reshape(len(warning_ids), len(event_ids))

date_diff_df = pd.DataFrame(date_diff_array, index=warning_ids, columns=event_ids)

date_diff_df

Unnamed: 0,MN0,MN1,MN2,MN3,MN4,MN5,MN6,MN7,MN8,MN9
BR_0,23,20,15,15,15,15,16,23,23,2
BR_1,5,2,-3,-3,-3,-3,-2,5,5,-16
BR_2,5,2,-3,-3,-3,-3,-2,5,5,-16
BR_3,2,-1,-6,-6,-6,-6,-5,2,2,-19
BR_4,-1,-4,-9,-9,-9,-9,-8,-1,-1,-22
BR_5,-6,-9,-14,-14,-14,-14,-13,-6,-6,-27
BR_6,-4,-7,-12,-12,-12,-12,-11,-4,-4,-25
BR_7,-2,-5,-10,-10,-10,-10,-9,-2,-2,-23
BR_8,0,-3,-8,-8,-8,-8,-7,0,0,-21
BR_9,16,13,8,8,8,8,9,16,16,-5


In [22]:
# Save the signed day-offset matrix as a CSV fixture in the test resource directory.
out_filename = "test_cc_date_diff_matrix.csv"
out_path = os.path.join(ES_TEST_RESOURCE_PATH, out_filename)
date_diff_df.to_csv(out_path)

In [23]:
# Work on a separately named absolute-value copy so the signed date_diff_array
# built earlier is not overwritten (re-running the cell that saves it would
# otherwise write absolute values).
abs_date_diff_array = np.abs(date_diff_array)
# Cap the offsets at MAX_DATE_DIFF (the config constant, broadcast as a scalar
# rather than an array of the literal 4) so that ds() bottoms out at 0.
# No lower clamp is needed: absolute values are already >= 0.
ds_dd_array = np.minimum(abs_date_diff_array, MAX_DATE_DIFF)

ds_array = ds_vfunc(ds_dd_array)
ds_df = pd.DataFrame(ds_array,
                       index = [w["Warning_ID"] for w in cc_warn],
                       columns = [e["Event_ID"] for e in cc_gsr])

ds_df

Unnamed: 0,MN0,MN1,MN2,MN3,MN4,MN5,MN6,MN7,MN8,MN9
BR_0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5
BR_1,0.0,0.5,0.25,0.25,0.25,0.25,0.5,0.0,0.0,0.0
BR_2,0.0,0.5,0.25,0.25,0.25,0.25,0.5,0.0,0.0,0.0
BR_3,0.5,0.75,0.0,0.0,0.0,0.0,0.0,0.5,0.5,0.0
BR_4,0.75,0.0,0.0,0.0,0.0,0.0,0.0,0.75,0.75,0.0
BR_5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
BR_6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
BR_7,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.5,0.0
BR_8,1.0,0.25,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
BR_9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [24]:
# Save the ds() score matrix as a CSV fixture in the test resource directory.
out_filename = "test_ds_matrix.csv"
out_path = os.path.join(ES_TEST_RESOURCE_PATH, out_filename)
ds_df.to_csv(out_path)

In [25]:
# Subtype-match matrix (1 = same Event_Subtype): warnings as rows, events as columns.
warning_ids = [w["Warning_ID"] for w in cc_warn]
event_ids = [e["Event_ID"] for e in cc_gsr]
es_rows = [[int(es_match(w, e)) for e in cc_gsr] for w in cc_warn]
# The reshape keeps the result 2-D even when there are no warnings.
es_array = np.array(es_rows).reshape(len(warning_ids), len(event_ids))

es_df = pd.DataFrame(es_array, index=warning_ids, columns=event_ids)

es_df

Unnamed: 0,MN0,MN1,MN2,MN3,MN4,MN5,MN6,MN7,MN8,MN9
BR_0,1,0,1,1,1,1,0,0,0,1
BR_1,1,0,1,1,1,1,0,0,0,1
BR_2,0,1,0,0,0,0,1,1,1,0
BR_3,1,0,1,1,1,1,0,0,0,1
BR_4,1,0,1,1,1,1,0,0,0,1
BR_5,0,1,0,0,0,0,1,1,1,0
BR_6,1,0,1,1,1,1,0,0,0,1
BR_7,1,0,1,1,1,1,0,0,0,1
BR_8,1,0,1,1,1,1,0,0,0,1
BR_9,1,0,1,1,1,1,0,0,0,1


In [26]:
# Save the subtype-match matrix as a CSV fixture in the test resource directory.
out_filename = "test_es_match_matrix.csv"
out_path = os.path.join(ES_TEST_RESOURCE_PATH, out_filename)
es_df.to_csv(out_path)

In [27]:
# Actor-match matrix (1 = warning actor is among the event's actors):
# warnings as rows, events as columns.
warning_ids = [w["Warning_ID"] for w in cc_warn]
event_ids = [e["Event_ID"] for e in cc_gsr]
acs_rows = [[int(actor_match(w, e)) for e in cc_gsr] for w in cc_warn]
# The reshape keeps the result 2-D even when there are no warnings.
acs_array = np.array(acs_rows).reshape(len(warning_ids), len(event_ids))
acs_df = pd.DataFrame(acs_array, index=warning_ids, columns=event_ids)

acs_df

Unnamed: 0,MN0,MN1,MN2,MN3,MN4,MN5,MN6,MN7,MN8,MN9
BR_0,1,1,0,0,0,0,0,1,0,0
BR_1,1,0,1,1,1,1,1,1,1,1
BR_2,1,0,1,1,1,1,1,1,1,1
BR_3,1,0,1,1,1,1,1,1,1,1
BR_4,1,1,0,0,0,0,0,1,0,0
BR_5,1,0,1,1,1,1,1,1,1,1
BR_6,1,1,0,0,0,0,0,1,0,0
BR_7,1,1,0,0,0,0,0,1,0,0
BR_8,1,1,0,0,0,0,0,1,0,0
BR_9,1,0,1,1,1,1,1,1,1,1


In [28]:
# Save the actor-match matrix as a CSV fixture in the test resource directory.
out_filename = "test_actor_match_matrix.csv"
out_path = os.path.join(ES_TEST_RESOURCE_PATH, out_filename)
acs_df.to_csv(out_path)

In [29]:
# Unmasked score: element-wise sum of the distance-based (ls), date-based (ds),
# subtype-match (es) and actor-match (acs) matrices.
qs_mat = ls_array + ds_array + es_array + acs_array
qs_mat

array([[2.        , 1.        , 1.        , 1.        , 1.        ,
        1.        , 0.        , 1.        , 0.        , 1.5       ],
       [2.        , 0.5       , 2.25      , 2.25      , 2.25      ,
        2.25      , 1.5       , 1.        , 1.        , 2.77481161],
       [1.        , 1.5       , 1.25      , 1.25      , 1.25      ,
        1.25      , 2.5       , 2.        , 2.        , 1.94857363],
       [2.5       , 0.75      , 2.        , 2.        , 2.        ,
        2.        , 1.        , 1.5       , 1.5       , 2.        ],
       [3.04346653, 1.        , 1.35217393, 2.        , 1.29346653,
        1.29346653, 0.29346653, 2.04346653, 1.04346653, 1.        ],
       [1.29346653, 1.        , 1.35217393, 2.        , 1.29346653,
        1.29346653, 2.29346653, 2.29346653, 2.29346653, 1.        ],
       [3.        , 1.        , 2.        , 1.29346653, 2.        ,
        2.        , 1.        , 2.        , 1.        , 1.        ],
       [3.5       , 1.        , 2.       

In [30]:
# A pair scores 0 overall whenever its ls or ds component is 0; qs_mat is
# updated in place.
zero_component = (ls_array == 0) | (ds_array == 0)
qs_mat[zero_component] = 0
warning_ids = [w["Warning_ID"] for w in cc_warn]
event_ids = [e["Event_ID"] for e in cc_gsr]
qs_df = pd.DataFrame(qs_mat, index=warning_ids, columns=event_ids)

qs_df

Unnamed: 0,MN0,MN1,MN2,MN3,MN4,MN5,MN6,MN7,MN8,MN9
BR_0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
BR_1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
BR_2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
BR_3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
BR_4,3.043467,0.0,0.0,0.0,0.0,0.0,0.0,2.043467,1.043467,0.0
BR_5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
BR_6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
BR_7,3.5,0.0,0.0,0.0,0.0,0.0,0.0,2.5,1.5,0.0
BR_8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
BR_9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [31]:
# Save the masked score matrix as a CSV fixture in the test resource directory.
out_filename = "test_qs_mat.csv"
out_path = os.path.join(ES_TEST_RESOURCE_PATH, out_filename)
qs_df.to_csv(out_path)