In [1]:
# Add more data, now have lat, lon, and area of each cell
# Merge WFP Price data w/ shapefile
# Rasterize the shapefile
# Logistic regression on whether an area experiences a price spike
# Use NDVI and SPI in an area... maybe trade data? macroeconomic indicators? crop calendars?

# Add a masked layer using Logistics Cluster Global Obstacles data

# File handling, miscellaneous libraries
import os
import pickle
from datetime import date
import zipfile
import json
import itertools

# Data handling libraries
import pandas as pd
pd.options.display.max_columns = 1000
pd.options.display.max_rows = 100
import numpy as np

# External data access & configuration
from hdx.hdx_configuration import Configuration
from hdx.data.dataset import Dataset
try:
    Configuration.create(hdx_site="test", hdx_read_only=True)
except:
    print("HDX already configured")
    
import requests as req

# Analysis libraries
from sklearn import linear_model
from sklearn.metrics import mean_squared_error, r2_score

# Adding data to Carto
from configparser import ConfigParser

config = ConfigParser()
config.read("../.env")
# FROM: https://resourcewatch.carto.com/u/wri-rw/your_apps
carto_api_token = config.get("auth", "carto_api_token")
rw_api_token = config.get("auth", "rw_api_token")

# Libraries for uploading data to S3
import boto3
import sys
import threading

s3_upload = boto3.client("s3")
s3_bucket = "wri-public-data"
s3_folder = "resourcewatch/"

In [None]:
### Load data from Humanitarian Data Exchange ###
print("Started")
datasets = Dataset.search_in_hdx('Food and Commodity WFP', rows=10)
resource = Dataset.get_resources(datasets[0])
url, path = resource[0].download()

wfp_food_price_data = pd.read_csv(path,encoding = "ISO-8859-1")
df = wfp_food_price_data

print("Data loaded")

In [None]:
### Helper functions ###

# can be used for the toordinal function
def create_ordinal_date_column(date_tuple):
    return(date.toordinal(date(*date_tuple)))

def add_dummy_columns(df, src_col):
    """Create additional columns on df as dummy variables 
        for values in src_col (i.e. months) """
    
    dummy_data = df.loc[:,(src_col)]
    dummy_vals = dummy_data.unique()
    
    for val in dummy_vals:
        dummy = (dummy_data == val).astype(int)
        df.loc[:,("month_"+str(val))] = dummy
    
    return(df)

def sanity_check(df):
    # TO DO: check whether there are at least 3 records for each month
    months = df["mp_month"]
    #print(months)
    ct_months = np.zeros(12)
    for month in months:
        ct_months[month-1] +=1
        
    #print(ct_months)
    for ct in ct_months:
        if ct < 3:
            return(False)
    return(True)

def calc_alps(dev):
    if(dev < .25):
        return("white")
    elif(dev < 1):
        return("yellow")
    elif(dev < 2):
        return("orange")
    else:
        return("red")

In [None]:
# List all unique markets, commodities
mkt_names = df.loc[:,"mkt_name"].unique()
print(mkt_names)
commodities = {}
for mkt in mkt_names:
    commodities[mkt] = df.loc[df.loc[:,"mkt_name"]==mkt, "cm_name"].unique()

print("Markets and Commodities Set")

for mkt in mkt_names:
    for cmdty in commodities[mkt]:
        print("Market:", mkt, ", Commodity:", cmdty)
        selection = (df["mkt_name"]==mkt) & (df["cm_name"]==cmdty)
        
        price_history = df.loc[(selection), :]
        
        # Sanity check - at least 3 years of data for each month?
        if(not sanity_check(price_history)):
            print("Not enough raw training data")
            continue
        
        # Create ordinal date column
        date_nums = list(zip(price_history["mp_year"], price_history["mp_month"], np.ones(price_history.shape[0]).astype(int)))
        ordinal_dates = list(map(create_ordinal_date_column, date_nums))
        price_history.loc[:,("ordinal_dates")] = ordinal_dates
        
        # Create dummy columns for each month
        price_history = add_dummy_columns(price_history, "mp_month")
        
        # Create training and label data
        training_cols = ["ordinal_dates", "month_1", "month_2", "month_3",
                        "month_4", "month_5", "month_6",
                        "month_7", "month_8", "month_9",
                        "month_10", "month_11", "month_12"]
        X = price_history.loc[:, training_cols]
        Y = price_history.loc[:,"mp_price"]
        
        lm = linear_model.LinearRegression()
        lm.fit(X, Y)

        #print(lm.coef_)
        #print(lm.intercept_)
        
        # Calculate model residuals
        Y_hat = lm.predict(X)
        residuals = Y - Y_hat
        #print(residuals)
        
        # Divide by standard deviation of residuals
        resid_std_dev = np.sqrt(mean_squared_error(Y, Y_hat))
        ## ^ how to find from lm object?
        std_devs = residuals / resid_std_dev
        
        # Retrain model without first pass outliers
        price_history_tame = price_history.loc[(std_devs > -1) & (std_devs < 1), :]
        
        if(not sanity_check(price_history_tame)):
            print("Not enough tame training data")
            continue
        
        X_tame = price_history.loc[:,training_cols]
        Y_tame = price_history.loc[:,("mp_price")]
                
        lm_tame = linear_model.LinearRegression()
        lm_tame.fit(X_tame, Y_tame)
        
        Y_hat_tame = lm_tame.predict(X)
        residuals_tame = Y - Y_hat_tame
        #print(residuals_tame)
        
        # Divide by standard error of estimatation
        resid_std_dev_tame = np.sqrt(mean_squared_error(Y, Y_hat_tame))
        ## ^ how to find from lm object?
        std_devs_tame = residuals_tame / resid_std_dev_tame
        
        # Calculate ALPS
        ALPS = list(map(calc_alps, std_devs_tame))
        print("setting ALPS on current selection")
        df.loc[selection, "ALPS"] = ALPS
        print(type(list(lm_tame.coef_)))
        df.loc[selection, "Fitted Model"] = lm
        df.loc[selection, "Fitted Model Tame"] = lm_tame
        df.loc[selection, "Model Residual"] = std_devs_tame
        df.loc[selection, "Model Std Error of Residuals"] = resid_std_dev_tame

### Export results to pickle ###
df.to_pickle("/Users/nathansuberi/Desktop/RW_Data/wfp_alps.pkl")

In [None]:
### Merge WFP food price data from hdx with point locations from FAO GIEWS ###

In [2]:
# Load pickle, rename for easy typing
wfp_data_with_alps = pd.read_pickle("/Users/nathansuberi/Desktop/RW_Data/wfp_alps.pkl")
df = wfp_data_with_alps

### Possible speedup - create a dictionary of adm0 and adm1 names for each market
# This could avoid a full join in the wfp_alps_summary_by_year block

# Group by market name, price spike level, and year
group_df = df.groupby(["mkt_name", "ALPS", "mp_year", "adm0_name", "adm1_name"])
wfp_alps_summary_by_year = group_df.count().pivot_table(values="adm0_id", index=["mp_year", "mkt_name", "adm0_name", "adm1_name"], columns=["ALPS"])

# Pivot to have a row for each market in each year. Values set to the adm0_id count...
# This represents a count of each ALPS level for the market in a given year
wfp_alps_summary_by_year = wfp_alps_summary_by_year.reset_index()
keep_cols = ['adm0_name', 'adm1_name','mkt_name', 'mp_year',
             'orange', 'red', 'white', 'yellow']
wfp_alps_summary_by_year = wfp_alps_summary_by_year[keep_cols]

# Fetch FAO data to join with it
res = req.get("http://www.fao.org/giews/food-prices/tool/api/v1/series/fao-domestic")
data = res.json()
mkt_df = pd.DataFrame(data)
mkt_df = mkt_df[["market", "marketId", "marketInfo", "marketLatitude", "marketLongitude"]]
mkt_df = mkt_df.drop_duplicates()

# Merge the two
wfp_alps_summary_with_points = wfp_alps_summary_by_year.merge(mkt_df,
                               left_on = "mkt_name",
                               right_on = "market",
                               how="left")
df = wfp_alps_summary_with_points 

# Keep only those records that have market lat longs
wfp_data_with_points_geocoded = df[pd.notnull(df["market"])]

keep_cols = ['adm0_name', 'adm1_name','market','mp_year',
             'marketLatitude','marketLongitude','marketInfo',
            'orange','red','white','yellow']

final_data = wfp_data_with_points_geocoded[keep_cols]
final_data.columns = ['adm0_name', 'adm1_name','market','mp_year',
             'latitude','longitude','marketInfo',
            'orange','red','white','yellow']

### Important! Must rename to "latitude" and "longitude" for auto-georeferencing

In [3]:
final_data.head(100)

Unnamed: 0,adm0_name,adm1_name,market,mp_year,latitude,longitude,marketInfo,orange,red,white,yellow
3,Burkina Faso,Sahel,Dori,1992,14.037,-0.05,Located in the northern Sahelian region. Food ...,,,12.0,
6,Burkina Faso,Centre-nord,Kongoussi,1992,13.333,-1.533,,,,12.0,
9,Burkina Faso,Centre-est,Tenkodogo,1992,11.783,-0.367,,,,10.0,2.0
14,Burkina Faso,Sahel,Dori,1993,14.037,-0.05,Located in the northern Sahelian region. Food ...,,,12.0,
17,Burkina Faso,Centre-nord,Kongoussi,1993,13.333,-1.533,,,,12.0,
20,Burkina Faso,Centre-est,Tenkodogo,1993,11.783,-0.367,,,,12.0,
34,India,Tamil Nadu,Chennai,1994,13.06,80.25,Important rice producing area. Wheat is not pr...,20.0,,1.0,23.0
41,Burkina Faso,Sahel,Dori,1994,14.037,-0.05,Located in the northern Sahelian region. Food ...,,,12.0,
56,Burkina Faso,Centre-nord,Kongoussi,1994,13.333,-1.533,,,,12.0,
63,India,Maharashtra,Mumbai,1994,19.018,72.856,Main urban food deficit area in the country. L...,18.0,1.0,11.0,18.0


In [None]:
# Push to S3 and then have Carto sync from there
zip_local_name = '/Users/nathansuberi/Desktop/RW_Data/com_008_wfp_alerts_price_spikes.zip'
zip_s3_name = s3_folder + "com_008_wfp_alerts_price_spikes.zip"
with zipfile.ZipFile(zip_local_name, 'w') as csv_zip:
    csv_zip.writestr("com_008_wfp_alerts_price_spikes.csv", final_data.to_csv())

# Upload to S3
s3_upload.upload_file(zip_local_name, s3_bucket, zip_s3_name)

In [None]:
### ONLY NEED TO RUN THIS ONCE TO SET UP THE SYNC ###

alps_data_url = "https://wri-public-data.s3.amazonaws.com/resourcewatch/com_008_wfp_alerts_price_spikes.zip"

# 3600 = sync every hour
# 3600 * 24 * 30 = sync every week
interval = str(3600*24*7)

alps_payload = {
    "url":alps_data_url,
    "interval":interval
}

sync_url = "https://wri-rw.carto.com/api/v1/synchronizations/?api_key={}".format(carto_api_token)
headers = {
    'content-type': "application/json"
}

alps_res = req.request("POST", sync_url, data=json.dumps(alps_payload), headers = headers)
print("alps:", alps_res.text)


In [4]:
# Base URL for getting dataset metadata from RW API
# Metadata = Data that describes Data 
url = "https://api.resourcewatch.org/v1/dataset?sort=slug,-provider,userId&status=saved&includes=metadata,vocabulary,widget,layer"

# page[size] tells the API the maximum number of results to send back
# There are currently between 200 and 300 datasets on the RW API
payload = { "application":"rw", "page[size]": 1000}

# Request all datasets, and extract the data from the response
res = req.get(url, params=payload)
data = res.json()["data"]

#############################################################

### Convert the json object returned by the API into a pandas DataFrame
# Another option: https://pandas.pydata.org/pandas-docs/stable/generated/pandas.io.json.json_normalize.html
datasets_on_api = {}

for ix, dset in enumerate(data):

    atts = dset["attributes"]
    metadata = atts["metadata"]
    layers = atts["layer"]
    widgets = atts["widget"]
    tags = atts["vocabulary"]
    datasets_on_api[atts["name"]] = {
        "rw_id":dset["id"],
        "upload_name":atts["name"],
        "table_name":atts["tableName"],
        "provider":atts["provider"],
        "date_updated":atts["updatedAt"],
        "num_metadata_keys":len(metadata),
        "metadata": metadata,
        "num_layers":len(layers),
        "layers": layers,
        "num_widgets":len(widgets),
        "widgets": widgets,
        "num_tags":len(tags),
        "tags":tags
    }    
    
# Create the DataFrame, name the index, and sort by date_updated
# More recently updated datasets at the top
current_datasets_on_api = pd.DataFrame.from_dict(datasets_on_api, orient='index')

def check_public_title(metadata):
    if len(metadata) > 0:
        mdata = metadata[0]
        if "attributes" in mdata:
            if "info" in mdata["attributes"]:
                if "name" in mdata["attributes"]["info"]:
                    return(mdata["attributes"]["info"]["name"])
        return(None)

# Grab public title, if it exists in metadata
current_datasets_on_api["public_title"] = current_datasets_on_api.apply(lambda row: check_public_title(row["metadata"]), axis=1)

current_datasets_on_api.set_index("rw_id", inplace=True)
current_datasets_on_api.index.rename("Dataset", inplace=True)
current_datasets_on_api.sort_values(by=["date_updated"], inplace=True, ascending = False)

In [5]:
current_datasets_on_api.loc["e4bdc4c9-96a3-4f0b-8d8f-1742cabd8f80", "layers"]

[{'attributes': {'application': ['rw'],
   'applicationConfig': {},
   'dataset': 'e4bdc4c9-96a3-4f0b-8d8f-1742cabd8f80',
   'default': False,
   'description': 'This shows the number of markets at alert level orange from the WFP, in the year 2014',
   'env': 'production',
   'interactionConfig': {},
   'iso': [''],
   'layerConfig': {'account': 'wri-rw',
    'body': {'layers': [{'options': {'cartocss': '#com_008_wfp_alerts_price_spikes{marker-width:3; marker-opacity:1; marker-fill:#FFA500; marker-allow-overlap: true;[orange>=20]{marker-width:15;} [orange>=15][orange<20]{marker-width:12;}[orange>=10][orange<15]{marker-width:7;} [orange>=5][orange<10]{marker-width:4;}[orange<5]{marker-width:3;}}',
        'cartocss_version': '2.3.0',
        'sql': 'SELECT * FROM com_008_wfp_alerts_price_spikes WHERE mp_year=2014'},
       'type': 'cartodb'}],
     'maxzoom': 18,
     'minzoom': 3}},
   'legendConfig': {'items': [{'color': '#FFA500',
      'name': 'WFP markets at alert level orange in 2

In [6]:
# Create list of SQL statements by year and spike level
## print(final_data.columns)
## markets = list(final_data["market"].unique())

table_name = "com_008_wfp_alerts_price_spikes"

years = list(final_data["mp_year"].unique().astype(str))
alert_levels = ["orange", "red", "white", "yellow"]
params_for_SQL = itertools.product(years, alert_levels)

list_of_SQL = [["SELECT * FROM " + table_name + " WHERE mp_year="+params[0], params]   for params in params_for_SQL]
list_of_SQL

[['SELECT * FROM com_008_wfp_alerts_price_spikes WHERE mp_year=1992',
  ('1992', 'orange')],
 ['SELECT * FROM com_008_wfp_alerts_price_spikes WHERE mp_year=1992',
  ('1992', 'red')],
 ['SELECT * FROM com_008_wfp_alerts_price_spikes WHERE mp_year=1992',
  ('1992', 'white')],
 ['SELECT * FROM com_008_wfp_alerts_price_spikes WHERE mp_year=1992',
  ('1992', 'yellow')],
 ['SELECT * FROM com_008_wfp_alerts_price_spikes WHERE mp_year=1993',
  ('1993', 'orange')],
 ['SELECT * FROM com_008_wfp_alerts_price_spikes WHERE mp_year=1993',
  ('1993', 'red')],
 ['SELECT * FROM com_008_wfp_alerts_price_spikes WHERE mp_year=1993',
  ('1993', 'white')],
 ['SELECT * FROM com_008_wfp_alerts_price_spikes WHERE mp_year=1993',
  ('1993', 'yellow')],
 ['SELECT * FROM com_008_wfp_alerts_price_spikes WHERE mp_year=1994',
  ('1994', 'orange')],
 ['SELECT * FROM com_008_wfp_alerts_price_spikes WHERE mp_year=1994',
  ('1994', 'red')],
 ['SELECT * FROM com_008_wfp_alerts_price_spikes WHERE mp_year=1994',
  ('1994', 

In [None]:
### Create the layer definitions, by year

# Make sure to include the year in the layer_config option
# Write out all the SQL queries, store in an object on the layer config... duplicate
# This among all the 

layer_defs = []

datasetID = "e4bdc4c9-96a3-4f0b-8d8f-1742cabd8f80"

for ix, sql in enumerate(list_of_SQL):
    
    alert_year = sql[1][0]
    alert_level = sql[1][1]
    
    alert_color = {
        "yellow":"#FFFF00",
        "red":"#FF0000",
        "white":"#FFFFFF",
        "orange":"#FFA500"
    }
    
    title = "Alerts for Price Spikes - level "+alert_level+" in "+alert_year
    provider = "cartodb"
    description = "This shows the number of markets at alert level "+alert_level+" from the WFP, in the year "+alert_year+\
    ". The current ALPS value for a food commodity is the number of standard deviations the current price is above the "+\
    "commodity's expected price for that month. Based on this ALPS value, markets are assigned to 1 of 4 situations, where "+\
    "all nominal values are measured in standard deviations from the seasonal mean price: Normal (white), ALPS < 0.25 Stress "+\
    "(yellow), 0.25 < ALPS < 1 Alert (orange), 1 < ALPS < 2 Crisis (red), ALPS >= 2."
    # Is this the dataset or layer slug?
    slug = ''

    # Do I need to set these?
    layer_id = ''
    user_id = ''
    updated_at = ''
    iso_var = ['']

    layer_type = "cartodb" # or mapnik?
    table_name = "com_008_wfp_alerts_price_spikes"

    layer_config = {
      "body": {
        "layers": [
          {
            "options": {
              "cartocss_version": "2.3.0",
              "cartocss": "#"+table_name+\
              "{marker-width:3; marker-opacity:1; marker-fill:"+alert_color[alert_level]+"; marker-allow-overlap: true;"+\
                "["+alert_level+">=20]{marker-width:15;} ["+alert_level+">=15]["+alert_level+"<20]{marker-width:12;}"+\
                "["+alert_level+">=10]["+alert_level+"<15]{marker-width:7;} ["+alert_level+">=5]["+alert_level+"<10]{marker-width:4;}"
                "["+alert_level+"<5]{marker-width:3;}}",
              "sql": sql[0]
            },
            "type": layer_type
          }
        ],
        "minzoom": 3,
        "maxzoom": 18
      },
      "account": "wri-rw"
    }

    legend_config = {
      "items": [
        {
          "color": alert_color[alert_level],
          "name": "WFP markets at alert level "+alert_level+" in "+alert_year
        }
      ],
      "type": "basic"
    }

    interaction_config = {}

    # {'attributes': 
    layer_object = {'application': ['rw'],
       'applicationConfig': {},
       'dataset': datasetID,
       'default': True if ix==(len(list_of_SQL)-1) else False,
       'description': description,
       'interactionConfig': interaction_config,
       'iso': iso_var,
       'layerConfig': layer_config,
       'legendConfig': legend_config,
       'name': title,
       'provider': provider,
       'slug': slug,
       'staticImageConfig': {},
       'updatedAt': updated_at,
       'userId': user_id}
#       'id': layer_id,
#       'type': 'layer'}
    
    layer_defs.append(layer_object)


In [None]:
num_years_to_display = 4

for layer_def in layer_defs[-(num_years_to_display*4):]:
    layer_update_url = "https://api.resourcewatch.org/v1/dataset/"+str(datasetID)+"/layer"
    #print(layer_update_url)
    #print(layer_def)
    headers = {
        'content-type': "application/json",
        'authorization': "Bearer " + rw_api_token,
    }

    res = req.request("POST", layer_update_url, data=json.dumps(layer_def), headers = headers)
    print(res.text)

In [None]:
req.get("https://api.resourcewatch.org/v1/dataset/")

In [None]:
### Experimentation ###

In [None]:
df[df["Model Std Error of Residuals"] > 10000]

In [None]:
df["mkt_id"].unique().shape

In [None]:
wfp_data_with_points_geocoded["mkt_name"].unique().shape

In [None]:
wfp_data_with_points_geocoded["mkt_name"]

In [None]:
adm1_names = df["adm1_name"].unique()

In [None]:
### Matching with GADM files ###

import requests as req

sql = "SELECT * FROM gadm28_adm1"
limit = 9000
sql = sql.format(limit)

rw_id = "098b33df-6871-4e53-a5ff-b56a7d989f9a"
query_base = "https://api.resourcewatch.org/v1/query/{}?sql={}"
query2 = query_base.format(rw_id, sql)

payload = { "application":"rw", "page[size]": 10000000}
res2 = req.get(query2, params=payload)

In [None]:
import fiona
# import geopandas as gpd
# Use geopandas or fiona to merge ALPS data with a shapefile - try GADM

In [None]:
file = "/Users/nathansuberi/Desktop/RW_Data/gadm28/gadm28.shp"
adm1_features = {}
with fiona.open(file) as src:
    print(len(src))
    for i, feature in enumerate(src):
        if i%10000==0:
            print(i)
        name = feature["properties"]["NAME_1"] 
        if(name in adm1_names):
            if(name not in adm1_features):
                adm1_features[name] = feature["geometry"]

In [None]:
adm1_features

In [None]:
import numpy as np
group_df = wfp_data_with_points_geocoded.groupby(["market", "ALPS", "mp_year"])
print(group_df.groups)
#group_df.groups.keys()

#group_df.groups

In [None]:
group_df.get_group(('Kabul', 'yellow', 2014))

In [None]:
pd.options.display.max_rows = 10000
summed_data = group_df.count().pivot_table(values="adm0_id", index=["mp_year", "market"], columns=["ALPS"])
#summed_data = summed_data.reset_index()
summed_data

In [None]:
keep_cols = ["orange","red","white","yellow","market", "mp_year", "marketLatitude", "marketLongitude"]



In [None]:
import numpy as np
df = pd.DataFrame({'A' : ['foo', 'bar', 'foo', 'bar',
                           'foo', 'bar', 'foo', 'foo'],
                   'B' : ['one', 'one', 'two', 'three',
                          'two', 'two', 'one', 'three'],
                   'C' : np.random.randn(8),
                   'D' : np.random.randn(8)})

grouped = df.groupby(['A', 'B'])
grouped.groups

In [None]:
red_alerts_fayzabad = df.loc[(df["ALPS"]=="red") & (df["mkt_name"]=="Fayzabad") , ["adm0_name", "adm1_name", "mkt_name", "cm_name", "cur_name", "um_name", "mp_month", "mp_year", "mp_price"]]
print(red_alerts.head(10))

red_alerts_fayzabad.to_csv("/Users/nathansuberi/Desktop/RW_Data/wfp_red_alerts_fayzabad.csv")

In [None]:
not_represented = [key for key in adm1_names if key not in adm1_features.keys()]
print(len(not_represented)/len(adm1_names))

In [None]:
import rasterio as rio

# Use rasterio to rasterize the vector, print to a geotiff

In [None]:
red_alerts_fayzabad = df.loc[(df["ALPS"]=="red") & (df["mkt_name"]=="Fayzabad") , ["adm0_name", "adm1_name", "mkt_name", "cm_name", "cur_name", "um_name", "mp_month", "mp_year", "mp_price"]]
red_alerts_fayzabad.head(10)

In [None]:
tuples = list(zip(red_alerts_fayzabad["mp_year"], red_alerts_fayzabad["mp_month"]))
multi_index = pd.MultiIndex.from_tuples(tuples, names=["Year", "Month"])
red_alerts_fayzabad.index = multi_index
red_alerts_fayzabad.loc[(2008, 5)]

In [None]:
multi_index = pd.MultiIndex.from_arrays([red_alerts_fayzabad["mp_year"], red_alerts_fayzabad["mp_month"]], names=["Year", "Month"])
red_alerts_fayzabad.index = multi_index
red_alerts_fayzabad
