# Import libraries

In [2]:
import os
import json
import logging
import sys
LOG_LEVEL = logging.INFO
logging.basicConfig(stream=sys.stderr, level=LOG_LEVEL)

from functools import reduce
from sklearn import linear_model
from sklearn.model_selection import train_test_split

import numpy as np
import pandas as pd
pd.options.display.max_rows = 100

import cartoframes

import matplotlib.pyplot as plt
%matplotlib inline

# Authenticating to Carto

In [3]:
# CARTO account whose datasets this notebook reads and writes.
CARTO_USER = 'wri-rw'
# Read the API key from the environment so the secret never has to be pasted
# into the notebook. Falls back to '' (the previous hard-coded value) if unset.
CARTO_KEY = os.environ.get('CARTO_KEY', '')

# Shared context used by the cc.read / cc.write calls in later cells.
cc = cartoframes.CartoContext(base_url='https://{}.carto.com/'.format(CARTO_USER),
                              api_key=CARTO_KEY)

# Setting up the data

In [4]:
# Folder holding the raw com_009 Material Flows CSV files.
# Override with the COM_009_DATA_FOLDER environment variable; the original
# local path is kept as the default. os.path.join(..., '') guarantees a
# trailing separator, which code that concatenates DATA_FOLDER + name needs.
DATA_FOLDER = os.path.join(
    os.environ.get('COM_009_DATA_FOLDER',
                   "/Users/nathansuberi/Desktop/RW_Data/com_009 Material Flows/"),
    '')
# Raises immediately if the folder does not exist; the listing itself is
# discarded because this is not the last expression of the cell.
os.listdir(DATA_FOLDER)

####
## Lookup tables
####

# 13 categories mapped to 4 main categories
MFA_13 = 'MFA13.csv'

# Region codes
REGIONS = 'Regions.csv'

# Flow type table: Flow.name, Unit, Axis.label
FLOW = 'Flow.csv'

# 11 sectors (product group) and their names
PRODUCT_GROUP = 'ProductGroup.csv'

# Detailed item codes w/ names, aggregate 3 letter codes
MATERIAL = 'Material.csv'

# Country info - ISO, Name, Region, ISONum3, ALPHANUMISO
COUNTRY = 'Country.csv'

# Names for 4 main categories
MFA_4 = 'MFA4.csv'


####
## Country / Regional data sets
####


# Country Data: Flow category, MFA13 code, MFA4 code, Time, Value
FLOW_MFA = 'FlowMFA.csv'

# Country data: by CCC_Code, Time, Value
FLOW_CCC = 'FlowCCC.csv'

# Country & Regional Data: ISO or Region (but not both), Flow type, Time, Value
INDEX_DATA = 'IndexData.csv'

# Flow Data: Year, Source.Region, Consumer.Region, Material.Category, Final.Product, Value
FLOW_DETAILED = 'FlowDetailed.csv'

# File names grouped by role, so that they can be loaded in bulk.
all_data = [FLOW_MFA, FLOW_CCC, INDEX_DATA, FLOW_DETAILED]
all_lookups = [MFA_13, REGIONS, FLOW, PRODUCT_GROUP,
               COUNTRY, MATERIAL, MFA_4]

In [5]:
def read_df(df):
    """Load one semicolon-delimited source file into a DataFrame.

    Parameters
    ----------
    df : str
        File name relative to ``DATA_FOLDER`` (despite the parameter name,
        this is a file name, not a DataFrame).

    Returns
    -------
    pandas.DataFrame
        The parsed contents of the file.
    """
    # os.path.join works whether or not DATA_FOLDER ends with a separator;
    # plain concatenation silently produced a wrong path without one.
    path = os.path.join(DATA_FOLDER, df)
    return pd.read_csv(path, sep=';')

# https://stackoverflow.com/questions/35979620/get-the-last-10000-lines-of-a-csv-file
# Read only the tail... better this way than running pd.read_csv().tail(num)
def read_last_100_rows(df, n_rows=100):
    """Read only the last ``n_rows`` lines of a semicolon-delimited file.

    Parameters
    ----------
    df : str
        File name relative to ``DATA_FOLDER``.
    n_rows : int, optional
        Number of trailing lines to keep (default 100).

    Returns
    -------
    pandas.DataFrame
        The header plus the final ``n_rows`` lines of the file; the whole
        file when it has no more than ``n_rows`` lines after the header.
    """
    path = os.path.join(DATA_FOLDER, df)
    # Count the lines inside a context manager so the handle is closed.
    with open(path) as handle:
        size = sum(1 for _ in handle)
    # Keep line 0 (the header) and skip everything between it and the tail.
    # The previous range(100, size - 100) kept the first 100 lines as well.
    # An empty range (short file) skips nothing.
    return pd.read_csv(path, skiprows=range(1, size - n_rows), sep=';')

In [8]:
# Read the CARTO table named after FlowCCC.csv ('com_009_flowccc') and display it.
cc.read('com_009_{}'.format(os.path.splitext(FLOW_CCC)[0].lower()))

Unnamed: 0_level_0,ccc_code,isoalpha3,the_geom,time,value
cartodb_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
6303,A.3.5,SUN,,1970,12400.000000
17999,A.1.1.1.2,PRK,,1972,87.000000
27308,A.1.1.12,NPL,,1973,7.041000
29162,A.1.3.2,VUT,,1973,14.280000
37623,A.1.2.1.2,VGB,,1974,0.080000
42170,A.3.7.1,COL,,1974,1980.000000
42435,A.3.7.2,SYC,,1974,0.000000
51489,A.3.8.1,POL,,1975,603.000000
52409,A.4.1.2.2,IRQ,,1975,0.000000
55105,A.1.1.9,MAR,,1976,7.700000


In [11]:
def load_to_carto(ds):
    """Upload one local source file to CARTO, replacing any existing table.

    Parameters
    ----------
    ds : str
        File name of the data set (e.g. ``'FlowMFA.csv'``); the table is
        named ``com_009_<file name without extension, lower-cased>``.
    """
    # Lower-case the name so it matches the name used when reading tables
    # back, and so CARTO does not have to rename the table on upload.
    name = 'com_009_{}'.format(os.path.splitext(ds)[0].lower())
    cc.write(read_df(ds), name, overwrite=True)
    
#list(map(load_to_carto, all_data))
#list(map(load_to_carto, all_lookups))

In [None]:
## Uploading the whole table at once doesn't work because the data set is too big
# load_to_carto(FLOW_DETAILED)

# Split FlowDetailed into two halves that are uploaded separately.
flow_detailed = read_df(FLOW_DETAILED)
# Integer midpoint: `/` yields a float in Python 3, which is not a valid
# positional bound.
midpoint = flow_detailed.shape[0] // 2
# Positional .iloc slices are half-open, so every row lands in exactly one
# half. Label-based .loc slices are inclusive at both ends and duplicated
# the midpoint row in both halves.
first_half_flow = flow_detailed.iloc[:midpoint]
second_half_flow = flow_detailed.iloc[midpoint:]

In [12]:
# Upload the first half of FlowDetailed under the base table name (no overwrite flag is passed).
cc.write(first_half_flow, 'com_009_{}'.format(os.path.splitext(FLOW_DETAILED)[0]))

The following columns were changed in the CARTO copy of this dataframe:
[1mYear[0m -> [1myear[0m
[1mSource.Region[0m -> [1msource_region[0m
[1mConsumer.Region[0m -> [1mconsumer_region[0m
[1mMaterial.Category[0m -> [1mmaterial_category[0m
[1mFinal.Product[0m -> [1mfinal_product[0m
[1mValue[0m -> [1mvalue[0m


  warn('Table will be named `{}`'.format(table_name))
Uploading in batches: 100%|██████████| 38/38 [12:45<00:00, 15.96s/it]


Table successfully written to CARTO: https://wri-rw.carto.com/dataset/com_009_flowdetailed


In [13]:
# Upload the second half of FlowDetailed to a separate table whose name carries a 'b' suffix.
cc.write(second_half_flow, 'com_009_{}b'.format(os.path.splitext(FLOW_DETAILED)[0]))

The following columns were changed in the CARTO copy of this dataframe:
[1mYear[0m -> [1myear[0m
[1mSource.Region[0m -> [1msource_region[0m
[1mConsumer.Region[0m -> [1mconsumer_region[0m
[1mMaterial.Category[0m -> [1mmaterial_category[0m
[1mFinal.Product[0m -> [1mfinal_product[0m
[1mValue[0m -> [1mvalue[0m


  warn('Table will be named `{}`'.format(table_name))
Uploading in batches: 100%|██████████| 38/38 [15:56<00:00, 17.95s/it]


Table successfully written to CARTO: https://wri-rw.carto.com/dataset/com_009_flowdetailedb


In [4]:
def add_to_dict(agg, df):
    """Reducer step: load the file ``df`` and store it in ``agg``.

    ``agg`` is mutated in place, keyed by the file name, and returned so the
    function can be used as the callable of ``functools.reduce``.
    """
    # Echo the file name as a progress trace before the read.
    print(df)
    loaded_frame = read_df(df)
    agg[df] = loaded_frame
    return agg

In [5]:
# Load every data-set file into a dict keyed by file name.
data = reduce(add_to_dict, all_data, {})

FlowMFA.csv
FlowCCC.csv
IndexData.csv
FlowDetailed.csv


In [6]:
# Load every lookup-table file into a dict keyed by file name.
lookups = reduce(add_to_dict, all_lookups, {})

MFA13.csv
Regions.csv
Flow.csv
ProductGroup.csv
Country.csv
Material.csv
MFA4.csv


In [None]:
# Preview the first rows of the FlowMFA table.
data[FLOW_MFA].head()

In [None]:
# Preview the first rows of the FlowCCC table.
data[FLOW_CCC].head()

In [None]:
# Preview the first rows of the IndexData table.
data[INDEX_DATA].head()

In [None]:
# Preview the first rows of the FlowDetailed table.
data[FLOW_DETAILED].head()

In [None]:
# Display the full MFA13 lookup table.
lookups[MFA_13]

# Looking into the FLOW_MFA dataset

In [7]:
# List the column names of the FlowMFA table.
data[FLOW_MFA].columns

Index(['ISOAlpha3', 'Flow', 'MFA13', 'MFA4', 'Time', 'Value'], dtype='object')

In [8]:
# Distinct values of the ISOAlpha3 column in FlowMFA.
data[FLOW_MFA]['ISOAlpha3'].unique()

array(['ABW', 'AFG', 'AGO', 'ALB', 'AND', 'ANT', 'ARE', 'ARG', 'ARM',
       'ATG', 'AUS', 'AUT', 'AZE', 'BDI', 'BEL', 'BEN', 'BFA', 'BGD',
       'BGR', 'BHR', 'BHS', 'BIH', 'BLR', 'BLZ', 'BMU', 'BOL', 'BRA',
       'BRB', 'BRN', 'BTN', 'BWA', 'CAF', 'CAN', 'CHE', 'CHL', 'CHN',
       'CIV', 'CMR', 'COD', 'COG', 'COL', 'COM', 'CPV', 'CRI', 'CSK',
       'CUB', 'CYM', 'CYP', 'CZE', 'DEU', 'DJI', 'DMA', 'DNK', 'DOM',
       'DZA', 'ECU', 'EGY', 'ERI', 'ESP', 'EST', 'ETH', 'FIN', 'FJI',
       'FRA', 'FSM', 'GAB', 'GBR', 'GEO', 'GHA', 'GIN', 'GMB', 'GNB',
       'GNQ', 'GRC', 'GRD', 'GRL', 'GTM', 'GUY', 'HKG', 'HND', 'HRV',
       'HTI', 'HUN', 'IDN', 'IND', 'IRL', 'IRN', 'IRQ', 'ISL', 'ISR',
       'ITA', 'JAM', 'JOR', 'JPN', 'KAZ', 'KEN', 'KGZ', 'KHM', 'KIR',
       'KOR', 'KWT', 'LAO', 'LBN', 'LBR', 'LBY', 'LIE', 'LKA', 'LSO',
       'LTU', 'LUX', 'LVA', 'MAC', 'MAR', 'MCO', 'MDA', 'MDG', 'MDV',
       'MEX', 'MHL', 'MKD', 'MLI', 'MLT', 'MMR', 'MNE', 'MNG', 'MOZ',
       'MRT', 'MUS',

In [9]:
# Distinct values of the Flow column in FlowMFA.
data[FLOW_MFA]['Flow'].unique()

array(['RMC', 'RME_IMP', 'DE', 'DMC', 'DMC/cap', 'DMI', 'Exports', 'GDP',
       'Imports', 'Population', 'PTB', 'RME_EXP', 'RMI', 'DMC/GDP',
       'GDP/DMC', 'DE/cap'], dtype=object)

In [10]:
# Distinct values of the MFA13 column in FlowMFA.
data[FLOW_MFA]['MFA13'].unique()

array([nan, 'CRO', 'CRR', 'GBF', 'WCH', 'WOO', 'COA', 'NGA', 'PET', 'NFO',
       'NMC', 'NMI', 'FOR', 'OST'], dtype=object)

In [11]:
# Distinct values of the MFA4 column in FlowMFA.
data[FLOW_MFA]['MFA4'].unique()

array(['BM', 'FF', 'MO', 'NM', nan], dtype=object)

In [12]:
# Distinct values of the Time column in FlowMFA.
data[FLOW_MFA]['Time'].unique()

array([1990, 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000,
       2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011,
       2012, 2013, 1970, 1971, 1972, 1973, 1974, 1975, 1976, 1977, 1978,
       1979, 1980, 1981, 1982, 1983, 1984, 1985, 1986, 1987, 1988, 1989,
       2014, 2015, 2016, 2017])

In [23]:
# (rows, columns) of the Material lookup table.
lookups[MATERIAL].shape

(62, 3)

# Running a linear model against sections of the data

In [None]:
def extract_unique(df, col):
    """Return the distinct values found in column ``col`` of ``df``."""
    column = df[col]
    return column.unique()

def _split_is_feasible(n_samples, test_size):
    """Return True when n_samples can be split into non-empty train and test sets."""
    if isinstance(test_size, float):
        # A float test_size is treated as a fraction of the samples.
        n_test = int(np.ceil(test_size * n_samples))
    else:
        n_test = int(test_size)
    return n_test >= 1 and (n_samples - n_test) >= 1


def run_linear_regressions(data, year,
                           prod_col, flow_col,
                           year_col, val_col,
                           country_col,
                           test_size=30, random_state=42):
    '''
    Regress every (flow, product) series against every other one for a single year.

    Inputs:
        data: long-format DataFrame with one value per row
        year: value of year_col to keep
        prod_col, flow_col, year_col, val_col, country_col: names of the
            product, flow, year, value and country columns of data
        test_size: passed to train_test_split (an int is a number of
            countries, a float a fraction); default 30 as before
        random_state: seed passed to train_test_split; default 42 as before
    Outputs: dict keyed by (flow_x, prod_x, flow_y, prod_y) whose values are
        {'r_squared': ..., 'skipped_countries': [...]}. r_squared is the
        score of the fitted model on the held-out countries, or the sentinel
        -1 when there are too few shared countries to split and fit.
    '''
    # Only look at comparisons of traded products (rows with a product code).
    # Boolean filtering already returns a new frame, so no copy is needed.
    data = data[pd.notnull(data[prod_col])]

    # Create lists of countries, products, and flows to loop over
    all_countries, all_products, all_flows = [data[col].unique() for col in [country_col, prod_col, flow_col]]
    logging.debug('all_countries: {}'.format(all_countries))
    logging.debug('all_products: {}'.format(all_products))
    logging.debug('all_flows: {}'.format(all_flows))

    # The year filter does not depend on the loops below, so apply it once.
    # TO DO: allow for year ranges
    year_data = data[data[year_col] == year]

    # Each (product, flow) slice is needed many times; build it only once.
    series_cache = {}

    def values_by_country(prod, flow):
        key = (prod, flow)
        if key not in series_cache:
            rows = year_data[(year_data[prod_col] == prod) & (year_data[flow_col] == flow)]
            series_cache[key] = rows.set_index(country_col)[val_col]
        return series_cache[key]

    # Result will be an upper right triangular square matrix in 4 dimensions
    results = {}

    for ix_prod_x, prod_x in enumerate(all_products):
        for prod_y in all_products[:ix_prod_x+1]:
            for ix_flow_x, flow_x in enumerate(all_flows):
                for flow_y in all_flows[:ix_flow_x+1]:
                    logging.debug('flow x: {}'.format(flow_x))
                    logging.debug('prod x: {}'.format(prod_x))
                    logging.debug('flow y: {}'.format(flow_y))
                    logging.debug('prod y: {}'.format(prod_y))

                    msg = "regressing {flow_x} of {prod_x} against {flow_y} of {prod_y}"
                    msg = msg.format(flow_x=flow_x,
                                     flow_y=flow_y,
                                     prod_x=prod_x,
                                     prod_y=prod_y)

                    logging.info(msg)

                    series_x = values_by_country(prod_x, flow_x)
                    series_y = values_by_country(prod_y, flow_y)

                    # Throw away all but intersection of countries
                    shared = set(series_x.index) & set(series_y.index)
                    skipped_countries = [country for country in all_countries if country not in shared]

                    # Index with a sorted list: newer pandas rejects a set
                    # as a .loc indexer, and a fixed order makes the seeded
                    # train/test split reproducible between runs.
                    keep_countries = sorted(shared, key=str)

                    # Reshape for regression
                    data_x = series_x.loc[keep_countries].values.reshape(-1, 1)
                    data_y = series_y.loc[keep_countries].values.reshape(-1, 1)

                    r_squared = -1
                    if data_x.shape[0] != data_y.shape[0]:
                        # Can only happen when a country appears more than
                        # once in a slice; x and y can then not be paired.
                        logging.warning('skipping {}: duplicate country rows'.format(msg))
                    elif _split_is_feasible(data_x.shape[0], test_size):
                        # Split for training / test set. Doing this only
                        # when enough countries are shared avoids the
                        # ValueError train_test_split raises otherwise.
                        X_train, X_test, y_train, y_test = train_test_split(data_x, data_y,
                                                                            test_size=test_size,
                                                                            random_state=random_state)

                        # Run regression
                        lm = linear_model.LinearRegression()
                        lm.fit(X_train, y_train)

                        # Extract coefficient of determination (r^2)
                        r_squared = lm.score(X_test, y_test)

                    # Store results
                    results[(flow_x, prod_x, flow_y, prod_y)] = {
                        'r_squared': r_squared,
                        'skipped_countries': skipped_countries
                    }

    return results

def pretty_print_results(data_tuple, df_prod_names, df_flow_names):
    """Replace the codes in one regression result with readable names.

    Parameters
    ----------
    data_tuple : sequence
        ``data_tuple[0]`` is ``(flow_x, prod_x, flow_y, prod_y)``;
        ``data_tuple[1]`` is passed through unchanged.
    df_prod_names : pandas.DataFrame
        Indexed by product code, with the readable name in column ``'V2'``.
    df_flow_names : pandas.DataFrame
        Indexed by flow code, with the readable name in column ``'Flow.name'``.

    Returns
    -------
    tuple
        ``((flow_x_name, prod_x_name, flow_y_name, prod_y_name), data_tuple[1])``.

    Raises
    ------
    KeyError
        If a product code is missing from ``df_prod_names``.
    """
    flow_x, prod_x, flow_y, prod_y = data_tuple[0]

    def flow_name(code):
        # The data does not use the lookup's shorthand for exports and
        # imports, so some flow labels are absent from the lookup: fall
        # back to the raw label. Only KeyError is caught, so that other
        # errors are no longer swallowed silently.
        try:
            return df_flow_names.loc[code, 'Flow.name']
        except KeyError:
            return code

    prod_x_name = df_prod_names.loc[prod_x, 'V2']
    prod_y_name = df_prod_names.loc[prod_y, 'V2']
    flow_x_name = flow_name(flow_x)
    flow_y_name = flow_name(flow_y)

    new_tuple = ((flow_x_name, prod_x_name, flow_y_name, prod_y_name), data_tuple[1])

    return new_tuple
    

In [None]:
# Arguments for the regression run: FlowMFA table, year 2015, and the names
# of the columns that run_linear_regressions needs.
kwargs = dict(
    data=data[FLOW_MFA],
    year=2015,
    country_col='ISOAlpha3',
    prod_col='MFA13',
    flow_col='Flow',
    year_col='Time',
    val_col='Value',
)
regression_results = run_linear_regressions(**kwargs)

In [None]:
# Display the Flow lookup table.
lookups[FLOW]

In [None]:
# Display the MFA13 lookup table.
lookups[MFA_13]

# Examining results of the regressions

In [None]:
# The first call was missing its closing parenthesis (a SyntaxError).
logging.info('Number of regressions attempted: {}'.format(len(regression_results)))
logging.info('Results: {}'.format(regression_results))

# Rank the regressions from highest to lowest r^2
sorted_results = sorted(regression_results.items(),
                        key=lambda res: res[1]['r_squared'],
                        reverse=True)

# Only keep non-perfect correlations,
# and ones for which fewer than 10 countries are skipped
filterd_sorted_results = [res for res in sorted_results if
                  (res[1]['r_squared'] < 1) and
                  (len(res[1]['skipped_countries']) < 10) ]

# Index the lookup tables by code so that codes can be mapped to names
df_prod_names = lookups[MFA_13].copy().set_index('V1')
df_flow_names = lookups[FLOW].copy().set_index('Flow')

readable_results = list(map(lambda tup: pretty_print_results(tup, df_prod_names, df_flow_names),
                            filterd_sorted_results))

In [None]:
# Persist the readable results. The content is JSON even though the file
# name ends in .csv.
with open('com_009_material_flow_linear_regression_results.csv', 'w') as f:
    json.dump(readable_results, f)

In [None]:
# Reload the saved results (JSON content despite the .csv name) and display them.
with open('com_009_material_flow_linear_regression_results.csv', 'r') as f:
    readable_results = json.load(f)
readable_results