Option 1: 
    * Regress against 2015 data (or end of summary year, whenever possible)
    
Option 2:
    * Regress against change over same summary period
    * Theory - this is observing structural shifts in economy (Material Flow and extractive activities) 
    ... or changes in political economy (World Bank)

Assumptions
* Country columns share the same georeferencing

In [7]:
import os
os.environ



# Import Libraries

In [1]:
import numpy as np
import pandas as pd
pd.options.display.max_columns = 200

import requests as req
import json
import boto3
import io

import sys
import logging
import os
logging.basicConfig(stream=sys.stderr, level=logging.INFO)
import random

from functools import reduce
from sklearn import linear_model
from sklearn import metrics
from sklearn.model_selection import train_test_split

import cartoframes

import matplotlib.pyplot as plt
%matplotlib inline

# Authenticating to Carto

In [2]:
CARTO_USER = 'wri-rw'#os.environ.get('CARTO_USER')
CARTO_KEY = ''#os.environ.get('CARTO_KEY')

cc = cartoframes.CartoContext(base_url='https://{}.carto.com/'.format(CARTO_USER),
                              api_key=CARTO_KEY)

# Authenticating to S3

In [None]:
aws_access_key_id = #os.environ.get('aws_access_key_id')
aws_secret_access_key = #os.environ.get('aws_secret_access_key')

s3_bucket = "wri-public-data"
s3_folder = "resourcewatch/wide_to_long/"

s3_client = boto3.client(
    's3',
    aws_access_key_id=aws_access_key_id,
    aws_secret_access_key=aws_secret_access_key
)
s3_resource = boto3.resource(
    's3',
    aws_access_key_id=aws_access_key_id,
    aws_secret_access_key=aws_secret_access_key
)

# Functions for reading and uploading data to/from S3
def read_from_S3(bucket, key, index_col=0):
    obj = s3_client.get_object(Bucket=bucket, Key=key)
    df = pd.read_csv(io.BytesIO(obj['Body'].read()), index_col=[index_col], encoding="utf8")
    return(df)

def write_to_S3(df, bucket, key):
    csv_buffer = io.StringIO()
    # Need to set encoding in Python2... default of 'ascii' fails
    df.to_csv(csv_buffer, encoding='utf-8')
    s3_resource.Object(bucket, key).put(Body=csv_buffer.getvalue())

# Load data to run regressions with

In [85]:
DATAX = pd.read_csv('/Users/nathansuberi/Documents/GitHub/nsuberi.github.io/Compass Degrees for Summary Period.csv')
DATAY = cc.read('com_009_flowmfa')
PROD_NAMES = cc.read('com_009_mfa13').set_index('v1')
FLOW_NAMES = cc.read('com_009_flow').set_index('flow')

In [13]:
datax.head(1)

Unnamed: 0.1,Unnamed: 0,country,prod_degree,cons_degree
0,58,FRA,89.199677,81.61690930746265


# Helper functions

In [89]:
def extract_unique(df, col):
    return df[col].unique()

def run_linear_regressions(datax, xyear, xval_col, xcountry_col, xname,
                           datay, ystartyear, yendyear,
                           yprod_col, yflow_col,
                           yyear_col, yval_col, 
                           ycountry_col,
                           test_size):
    ''' 
    Inputs: Data and needed column names
    Outputs: square matrix of regression coefficients for each indicator
    '''
    data = datay.copy()
    
    # Only look at comparisons of traded products
    data = data[pd.notnull(data[yprod_col])]
        
    # Create lists of countries, products, and flows to loop over
    all_countries, all_products, all_flows = [extract_unique(data, col) for col in [ycountry_col, yprod_col, yflow_col]]
    logging.debug('all_countries: {}'.format(all_countries))
    logging.debug('all_products: {}'.format(all_products))
    logging.debug('all_flows: {}'.format(all_flows))
    
    # Result will be an upper right triangular square matrix in 4 dimensions
    results = {}    
    for prod_y in all_products:
        for flow_y in all_flows:
            # Extract data
            # TO DO: allow for year ranges
            data_x = datax.copy()
                
            logging.debug('flow y: {}'.format(flow_y))
            logging.debug('prod y: {}'.format(prod_y))

            msg = "regressing GHG-GDP Divergence Index against {flow_y} of {prod_y}"
            msg = msg.format(flow_y = flow_y,
                       prod_y = prod_y)

            logging.info(msg)

            data_y_start = data.loc[(data[yyear_col] == ystartyear) & (data[yprod_col]==prod_y) & (data[yflow_col]==flow_y)]
            data_y_start = data_y_start.set_index(ycountry_col)
            # Avoid division by 0
            data_y_start = data_y_start.loc[data_y_start[yval_col] > 0]
            
            data_y_end = data.loc[(data[yyear_col] == yendyear) & (data[yprod_col]==prod_y) & (data[yflow_col]==flow_y)]
            data_y_end = data_y_end.set_index(ycountry_col)
            
            logging.debug('Start {}'.format(data_y_start.head()))
            logging.debug('End {}'.format(data_y_end.head()))
            
            data_y = data_y_end[yval_col].div(data_y_start[yval_col])
            data_y = data_y[pd.notnull(data_y)]
            logging.debug('Change percent: {}'.format(data_y.head()))
            
            # Throw away all but intersection of countries
            logging.debug('data_x countries: {}'.format(set(data_x[xcountry_col])))
            logging.debug('data_y countries: {}'.format(set(data_y.index)))
            keep_countries = set(data_x[xcountry_col]) & set(data_y.index)
            skipped_countries = [country for country in all_countries if country not in keep_countries]

            data_x = data_x.set_index(xcountry_col).loc[keep_countries, xval_col]
            data_y = data_y.loc[keep_countries]

            # Reshape for regression
            data_x = data_x.values.reshape(-1, 1)
            data_y = data_y.values.reshape(-1, 1)
            
            if (len(data_x)>test_size) & (len(data_y)>test_size):
                pass
            else:
                results[(flow_y, prod_y)] = {
                    'r_squared': None,
                    'skipped_countries': skipped_countries
                }
                continue

            # Split for training / test set
            X_train, X_test, y_train, y_test = train_test_split(data_x, data_y, 
                                                                test_size=test_size, random_state=42)

            # Run regression
            lm = linear_model.LinearRegression() 
            lm.fit(X_train, y_train)

            # Extract coefficient of determination (r^2)
            r_squared = lm.score(X_test, y_test)
            #y_pred = lm.predict(X_test)
            #r_squared2 = metrics.r2_score(y_test, y_pred)

            logging.info('rsquared: {}'.format(r_squared)) #, r_squared2))
            logging.info('num skipped countries: {}'.format(len(skipped_countries)))
            logging.info('num training countries: {}'.format(len(X_train)))
            logging.info('num testing countries: {}'.format(len(X_test)))

            # Store results
            results[(flow_y, prod_y)] = {
                'r_squared': r_squared,
                'skipped_countries': skipped_countries
            }
            

    return results

def pretty_print_results(data_tuple, df_prod_names, df_flow_names):
    
    flow_y, prod_y = data_tuple[0]
    
    
    ## ALERT TO MATERIAL FLOWS!!!! DATA DOESNT USE SHORTHAND FOR EXPORT AND IMPORT
    prod_y_name = df_prod_names.loc[prod_y, 'v2']
    try:
        flow_y_name = df_flow_names.loc[flow_y, 'flow.name']
    except:
        flow_y_name = flow_y
    
    new_tuple = (('compass_of_divergence', flow_y_name, prod_y_name), data_tuple[1])
    
    return new_tuple
    

# Run Regressions

In [70]:
kwargs = {
    'datax': DATAX,
    'xname':'compass of divergence',
    'xyear': None,
    'xval_col': 'prod_degree',
    'xcountry_col': 'country',
    'datay': DATAY,
    'ystartyear': 2000,
    'yendyear': 2015,
    'yprod_col': 'mfa13',
    'yflow_col': 'flow',
    'yyear_col': 'time',
    'yval_col': 'value',
    'ycountry_col': 'isoalpha3',
    'test_size':30
}

regression_results = run_linear_regressions(**kwargs)

INFO:root:regressing GHG-GDP Divergence Index against DE of WCH
INFO:root:rsquared: -0.04362537410502765
INFO:root:num skipped countries: 24
INFO:root:num training countries: 136
INFO:root:num testing countries: 30
INFO:root:regressing GHG-GDP Divergence Index against Imports of WCH
INFO:root:rsquared: -0.24490256047829706
INFO:root:num skipped countries: 24
INFO:root:num training countries: 136
INFO:root:num testing countries: 30
INFO:root:regressing GHG-GDP Divergence Index against DMC of WCH
INFO:root:rsquared: -0.031658568432826106
INFO:root:num skipped countries: 21
INFO:root:num training countries: 139
INFO:root:num testing countries: 30
INFO:root:regressing GHG-GDP Divergence Index against DMI of WCH
INFO:root:rsquared: -0.029232790986355006
INFO:root:num skipped countries: 20
INFO:root:num training countries: 140
INFO:root:num testing countries: 30
INFO:root:regressing GHG-GDP Divergence Index against Exports of WCH
INFO:root:rsquared: -0.003167617665086997
INFO:root:num skippe

# Examining Results

In [73]:
regression_results

{('DE', 'COA'): {'r_squared': -0.036397006889257622,
  'skipped_countries': ['AFG',
   'AGO',
   'ARE',
   'ARM',
   'BEL',
   'BEN',
   'BHR',
   'BHS',
   'BRB',
   'BRN',
   'CAF',
   'CHE',
   'CIV',
   'CMR',
   'COD',
   'COG',
   'COM',
   'CPV',
   'CRI',
   'CUB',
   'CYP',
   'DJI',
   'DMA',
   'DZA',
   'ECU',
   'ERI',
   'FJI',
   'GAB',
   'GHA',
   'GIN',
   'GMB',
   'GNQ',
   'GRD',
   'GTM',
   'HND',
   'ISL',
   'ISR',
   'JAM',
   'JOR',
   'JPN',
   'KEN',
   'KHM',
   'KIR',
   'KWT',
   'LBR',
   'LBY',
   'LKA',
   'LUX',
   'MAR',
   'MDG',
   'MDV',
   'MLI',
   'MLT',
   'MNE',
   'MRT',
   'MUS',
   'NAM',
   'NIC',
   'NLD',
   'OMN',
   'PAN',
   'PNG',
   'QAT',
   'ROU',
   'SAU',
   'SGP',
   'SLB',
   'SLE',
   'SLV',
   'SOM',
   'SUN',
   'SUR',
   'ATG',
   'BFA',
   'AZE',
   'ETH',
   'BGD',
   'LBN',
   'BLZ',
   'BOL',
   'CSK',
   'DOM',
   'FSM',
   'GNB',
   'GUY',
   'HRV',
   'HTI',
   'IRQ',
   'LSO',
   'MDA',
   'MHL',
   'PLW',
   'PR

In [90]:
#logging.info('Number of regressions attempted: {}'.format(len(regression_results)))
#logging.info('Results: {}'.format(regression_results))

def pick_not_null(d):
    new_d = dict()
    for key, vals in d.items():
        if vals['r_squared']:
            new_d[key] = vals
    return new_d

notnull_results = pick_not_null(regression_results)


sorted_results = sorted(notnull_results.items(), 
                        key=lambda res: res[1]['r_squared'], 
                        reverse=True)
             
# Only keep non-perfect correlations, 
# and ones for which no more than 10 countries are skipped
filterd_sorted_results = [res for res in sorted_results if 
                  (res[1]['r_squared'] < 1) and 
                  (len(res[1]['skipped_countries']) < 40) ]

readable_results = list(map(lambda tup: pretty_print_results(tup, PROD_NAMES, FLOW_NAMES), 
                            filterd_sorted_results))

In [91]:
readable_results

[(('compass_of_divergence',
   'DMC',
   'Non-metallic minerals - construction dominant'),
  {'r_squared': 0.12948898546171339,
   'skipped_countries': ['AFG',
    'BHS',
    'COD',
    'DMA',
    'GRD',
    'MNE',
    'ROU',
    'SOM',
    'SUN',
    'CSK',
    'FSM',
    'LSO',
    'MDA',
    'MHL',
    'PLW',
    'PRK',
    'SCG',
    'SDN',
    'SRB',
    'SSD',
    'SWZ',
    'SYR',
    'VGB',
    'WSM',
    'YUG',
    'TLS',
    'TUV']}),
 (('compass_of_divergence',
   'DMC/cap',
   'Non-metallic minerals - construction dominant'),
  {'r_squared': 0.07348915068911932,
   'skipped_countries': ['AFG',
    'BHS',
    'COD',
    'DMA',
    'GRD',
    'MNE',
    'ROU',
    'SOM',
    'SUN',
    'CSK',
    'FSM',
    'LSO',
    'MDA',
    'MHL',
    'PLW',
    'PRK',
    'SCG',
    'SDN',
    'SRB',
    'SSD',
    'SWZ',
    'SYR',
    'VGB',
    'WSM',
    'YUG',
    'TCD',
    'TLS',
    'TUV']}),
 (('compass_of_divergence', 'Imports', 'Crops'),
  {'r_squared': 0.063672843133786405,
