In [None]:
import os
import json
import logging
import sys
LOG_LEVEL = logging.INFO
logging.basicConfig(stream=sys.stderr, level=LOG_LEVEL)

from functools import reduce
from sklearn import linear_model
import numpy as np
import pandas as pd
pd.options.display.max_rows = 100

import matplotlib.pyplot as plt
%matplotlib inline

# Setting up the data

In [None]:
# Folder holding the com_009 Material Flows CSV exports.
# Set the RW_DATA_FOLDER environment variable to point somewhere else; when it
# is unset the original local path is used, so existing runs are unaffected.
DATA_FOLDER = os.environ.get(
    "RW_DATA_FOLDER",
    "/Users/nathansuberi/Desktop/RW_Data/com_009 Material Flows/",
)
# File paths are built by plain string concatenation (DATA_FOLDER + file name),
# so make sure the folder always ends with a path separator.
DATA_FOLDER = os.path.join(DATA_FOLDER, "")

# Fails fast with FileNotFoundError if the folder is missing. The listing is
# not shown because this is not the last expression of the cell.
os.listdir(DATA_FOLDER)

####
## Lookup tables
####

# MFA13: the 13 material categories and the 4 main categories they map to
MFA_13 = 'MFA13.csv'

# MFA4: names of the 4 main categories
MFA_4 = 'MFA4.csv'

# Region codes
REGIONS = 'Regions.csv'

# Country info - ISO, Name, Region, ISONum3, ALPHANUMISO
COUNTRY = 'Country.csv'

# Flow type table - Flow.name, Unit, Axis.label
FLOW = 'Flow.csv'

# The 11 sectors (product groups) and their names
PRODUCT_GROUP = 'ProductGroup.csv'

# Detailed item codes with names, plus aggregate 3 letter codes
MATERIAL = 'Material.csv'


####
## Country / Regional data sets
####

# Country data - Flow category, MFA13 code, MFA4 code, Time, Value
FLOW_MFA = 'FlowMFA.csv'

# Country data - CCC_Code, Time, Value
FLOW_CCC = 'FlowCCC.csv'

# Country & regional data - ISO or Region (but not both), Flow type, Time, Value
INDEX_DATA = 'IndexData.csv'

# Flow data - Year, Source.Region, Consumer.Region, Material.Category, Final.Product, Value
FLOW_DETAILED = 'FlowDetailed.csv'


# File names grouped by role; each list is loaded into its own dict later on.
all_data = [
    FLOW_MFA,
    FLOW_CCC,
    INDEX_DATA,
    FLOW_DETAILED,
]
all_lookups = [
    MFA_13,
    REGIONS,
    FLOW,
    PRODUCT_GROUP,
    COUNTRY,
    MATERIAL,
    MFA_4,
]

In [None]:
def read_df(df):
    '''
    Load one semicolon-separated CSV from DATA_FOLDER.

    Inputs: df - file name (not a DataFrame), appended directly to DATA_FOLDER
    Outputs: the file's contents as a pandas DataFrame
    '''
    return pd.read_csv(DATA_FOLDER + df, sep=';')

# https://stackoverflow.com/questions/35979620/get-the-last-10000-lines-of-a-csv-file
# Read only the tail: skipping rows while parsing avoids building the full
# DataFrame, unlike pd.read_csv().tail(num).
def read_last_100_rows(df, n_rows=100):
    '''
    Load the header plus the last `n_rows` data rows of a CSV in DATA_FOLDER.

    Inputs:
        df - file name (not a DataFrame), appended directly to DATA_FOLDER
        n_rows - number of trailing data rows to keep (default 100)
    Outputs: DataFrame with at most `n_rows` rows; the whole file when it has
        fewer data rows than that
    '''
    path = DATA_FOLDER + df

    # Count physical lines (header included); the with-block closes the handle.
    with open(path) as handle:
        size = sum(1 for _ in handle)

    # Line 0 is the header and must be kept. Skipping lines 1 .. size-n_rows-1
    # leaves exactly the last n_rows lines. The range is empty (nothing is
    # skipped) when the file is short.
    return pd.read_csv(path, skiprows=range(1, size - n_rows), sep=';')

In [None]:
def add_to_dict(agg, df):
    '''
    Reducer step: load file `df` and store it in `agg` under its file name.

    Prints the file name before loading. `agg` is modified in place and is
    also returned, so the function can be used with functools.reduce.
    '''
    print(df)
    loaded = read_df(df)
    agg[df] = loaded
    return agg

In [None]:
# Load every country / regional data set into one dict keyed by file name.
data = {}
for name in all_data:
    data = add_to_dict(data, name)

In [None]:
# Load every lookup table into one dict keyed by file name.
lookups = {}
for name in all_lookups:
    lookups = add_to_dict(lookups, name)

In [None]:
data[FLOW_MFA].head()

In [None]:
data[FLOW_CCC].head()

In [None]:
data[INDEX_DATA].head()

In [None]:
data[FLOW_DETAILED].head()

In [None]:
lookups[MFA_13]

# Looking into the FLOW_MFA dataset

In [None]:
data[FLOW_MFA].columns

In [None]:
data[FLOW_MFA]['ISOAlpha3'].unique()

In [None]:
data[FLOW_MFA]['Flow'].unique()

In [None]:
data[FLOW_MFA]['MFA13'].unique()

In [None]:
data[FLOW_MFA]['MFA4'].unique()

In [None]:
data[FLOW_MFA]['Time'].unique()

# Running a linear model against sections of the data

In [None]:
def extract_unique(df, col):
    '''Return the distinct values found in column `col` of `df`.'''
    column = df[col]
    return column.unique()

def run_linear_regressions(data, year, 
                           prod_col, flow_col,
                           year_col, val_col, 
                           country_col):
    '''
    Regress (flow, product) value series against each other for a single year.

    For every product pair (prod_x, prod_y) with prod_y at or before prod_x in
    the list of unique products, and every flow pair (flow_x, flow_y) with
    flow_y at or before flow_x in the list of unique flows, the rows of `data`
    for `year` are restricted to the countries present in both series and a
    one-variable linear regression of the y values on the x values is fitted.

    Inputs:
        data - DataFrame holding the observations (not modified)
        year - value of `year_col` selecting the rows to use
        prod_col, flow_col, year_col, val_col, country_col - names of the
            product, flow, year, value and country columns of `data`
    Outputs:
        dict keyed by (flow_x, prod_x, flow_y, prod_y); each value is a dict with
            'r_squared' - coefficient of determination, or -1 when fewer than
                two countries were available and no regression was run
            'skipped_countries' - countries of `data` left out of that pair
    '''
    # Only look at comparisons of traded products (rows that carry a product).
    # Boolean indexing returns a new frame, so the caller's data is untouched.
    data = data[pd.notnull(data[prod_col])]
        
    # Create lists of countries, products, and flows to loop over
    all_countries, all_products, all_flows = [extract_unique(data, col) for col in [country_col, prod_col, flow_col]]
    logging.debug('all_countries: {}'.format(all_countries))
    logging.debug('all_products: {}'.format(all_products))
    logging.debug('all_flows: {}'.format(all_flows))

    # The year filter is identical for every pair, so apply it once up front.
    # TO DO: allow for year ranges
    year_data = data[data[year_col] == year]
    
    # Result will be an upper right triangular square matrix in 4 dimensions
    results = {}
    
    for ix_prod_x, prod_x in enumerate(all_products):
        for prod_y in all_products[:ix_prod_x+1]:
            for ix_flow_x, flow_x in enumerate(all_flows):
                for flow_y in all_flows[:ix_flow_x+1]:
                    logging.debug('flow x: {}'.format(flow_x))
                    logging.debug('prod x: {}'.format(prod_x))
                    logging.debug('flow y: {}'.format(flow_y))
                    logging.debug('prod y: {}'.format(prod_y))
                    
                    msg = "regressing {flow_x} of {prod_x} against {flow_y} of {prod_y}"
                    msg = msg.format(flow_x = flow_x,
                                     flow_y = flow_y,
                                     prod_x = prod_x,
                                     prod_y = prod_y)
                    
                    logging.info(msg)
                    
                    # Extract the two series being compared
                    data_x = year_data.loc[(year_data[prod_col]==prod_x) & (year_data[flow_col]==flow_x)]
                    data_y = year_data.loc[(year_data[prod_col]==prod_y) & (year_data[flow_col]==flow_y)]

                    # Throw away all but intersection of countries
                    common_countries = set(data_x[country_col]) & set(data_y[country_col])
                    skipped_countries = [country for country in all_countries if country not in common_countries]

                    # .loc does not accept a set as an indexer, and both
                    # series must come out in the same country order, so
                    # index with one sorted list.
                    # NOTE(review): assumes at most one row per country within
                    # a (year, flow, product) slice; duplicates would give the
                    # two series different lengths - confirm against the data.
                    keep_countries = sorted(common_countries, key=str)
                    
                    data_x = data_x.set_index(country_col).loc[keep_countries, val_col]
                    data_y = data_y.set_index(country_col).loc[keep_countries, val_col]
                    
                    # Reshape for regression
                    data_x = data_x.values.reshape(-1, 1)
                    data_y = data_y.values.reshape(-1, 1)

                    # -1 marks "no regression run". R^2 is undefined for fewer
                    # than two samples (scikit-learn returns NaN), and a NaN
                    # here would break sorting of the results later on.
                    r_squared = -1
                    if data_x.shape[0] > 1:
                        # Run regression
                        lm = linear_model.LinearRegression() 
                        lm.fit(data_x, data_y)

                        # Extract coefficient of determination (r^2)
                        r_squared = lm.score(data_x, data_y)

                    # Store results
                    results[(flow_x, prod_x, flow_y, prod_y)] = {
                        'r_squared': r_squared,
                        'skipped_countries': skipped_countries
                    }

    return results

def _flow_display_name(df_flow_names, flow):
    '''Return the 'Flow.name' registered for `flow`, or `flow` itself when the lookup has no such row.'''
    try:
        return df_flow_names.loc[flow, 'Flow.name']
    except KeyError:
        # Only a missing label falls back to the raw value; any other error
        # (wrong lookup object, interrupt, ...) is allowed to surface.
        return flow


def pretty_print_results(data_tuple, df_prod_names, df_flow_names):
    '''
    Replace the codes in one regression result with human readable names.

    Inputs:
        data_tuple - pair of ((flow_x, prod_x, flow_y, prod_y), result)
        df_prod_names - product lookup indexed by product code, with the
            readable name in column 'V2'
        df_flow_names - flow lookup indexed by flow code, with the readable
            name in column 'Flow.name'
    Outputs:
        ((flow_x_name, prod_x_name, flow_y_name, prod_y_name), result) where
        `result` is passed through untouched
    Raises:
        KeyError if a product code is missing from df_prod_names
    '''
    flow_x, prod_x, flow_y, prod_y = data_tuple[0]
    
    prod_x_name = df_prod_names.loc[prod_x, 'V2']
    prod_y_name = df_prod_names.loc[prod_y, 'V2']

    # ALERT: the material flows data does not use the lookup's shorthand for
    # exports and imports, so those flows are not found in the lookup and keep
    # their original label.
    flow_x_name = _flow_display_name(df_flow_names, flow_x)
    flow_y_name = _flow_display_name(df_flow_names, flow_y)
    
    new_tuple = ((flow_x_name, prod_x_name, flow_y_name, prod_y_name), data_tuple[1])
    
    return new_tuple
    

In [None]:
# Regress the 2015 FlowMFA values; arguments are listed in signature order.
kwargs = dict(
    data=data[FLOW_MFA],
    year=2015,
    prod_col='MFA13',
    flow_col='Flow',
    year_col='Time',
    val_col='Value',
    country_col='ISOAlpha3',
)
regression_results = run_linear_regressions(**kwargs)

In [None]:
lookups[FLOW]

In [None]:
lookups[MFA_13]

# Examining results of the regressions

In [None]:
logging.info('Number of regressions attempted: {}'.format(len(regression_results)))
logging.info('Results: {}'.format(regression_results))

# Best fits first
sorted_results = sorted(regression_results.items(), 
                        key=lambda res: res[1]['r_squared'], 
                        reverse=True)
             
# Only keep non-perfect correlations, 
# and ones for which fewer than 10 countries are skipped
filterd_sorted_results = [res for res in sorted_results if 
                  (res[1]['r_squared'] < 1) and 
                  (len(res[1]['skipped_countries']) < 10) ]

# set_index returns a new frame, so the lookup tables are left untouched
df_prod_names = lookups[MFA_13].set_index('V1') 
df_flow_names = lookups[FLOW].set_index('Flow')

# Swap product / flow codes for their readable names
readable_results = [pretty_print_results(tup, df_prod_names, df_flow_names)
                    for tup in filterd_sorted_results]

In [None]:
# NOTE: despite the .csv extension, the file holds a single JSON document.
with open('com_009_material_flow_linear_regression_results.csv', 'w') as f:
    json.dump(readable_results, f)

In [None]:
# Read the saved results back; the file content is JSON (tuples return as lists).
with open('com_009_material_flow_linear_regression_results.csv', 'r') as f:
    readable_results = json.load(f)
readable_results