In [195]:
import pandas as pd
import numpy as np
from scipy import io
import plotly.express as px
import math
import re
import random as rnd
import json

In [196]:
# The 41 sector labels used for the rows and columns of the investment matrix.
# Order matters: the labels are assigned positionally to the matrix axes.
sectors = [
    # primary industries, utilities, construction
    'Agriculture', 'Mining', 'Oil/Gas', 'Mining Support', 'Util', 'Const',
    # durable-goods manufacturing
    'Wood', 'Minerals', 'Primary Metals', 'Fabricated Metals', 'Machinery',
    'Computers', 'Electrical', 'Vehicles', 'Transport', 'Furniture', 'Misc Mfg',
    # non-durable-goods manufacturing
    'Food Mfg', 'Textile', 'Apparel', 'Paper', 'Printing', 'Petroleum',
    'Chemical', 'Plastics',
    # trade and transportation
    'Wholesale Trade', 'Retail Trade', 'Transit/Warehouse',
    # services
    'Info', 'Finance/Insurance', 'Real Estate', 'Rental', 'Prof/Tech', 'Mgmt',
    'Admin', 'Educ', 'Health', 'Arts', 'Accom', 'Food Services', 'Other Services',
]

In [197]:
# sanity check: number of sector labels (must match the matrix dimension they are assigned to)
len(sectors)

41

In [198]:
# Load the replication-packet .mat file; its 'invmat' entry is averaged over the third axis below.
invnet = io.loadmat('./investmentNetwork/Replication Packet/Investment Network Construction/invmatdat_41.mat')

In [199]:
# Average the investment array over its third axis and label both axes by sector.
avg_net = pd.DataFrame(invnet['invmat'].mean(axis=2), index=sectors, columns=sectors)

The investment network was constructed based on sector-level asset investments and production using the below equation:

$$ I_{ijt} = \sum_{a=1}^{A}\omega_{iat} I_{ajt}^{exp}$$

Where $I$ is the (37x37x72) investment matrix for 1947-2018. $I_{ajt}^{exp}$ is the expenditure by sector $j$ on asset $a$ in year $t$. $\omega_{iat}$ is the fraction of asset $a$ produced by sector $i$ in year $t$. The network below is the mean (37x37) matrix across time. 

The original network relies on a number of assumptions that largely contribute to the investment hubs it identified. These assumptions are mostly due to the lack of data on sector-level asset production.

1. Construction sector produces all non-mining structures
2. Prof/Technical Services produce all custom software and R&D
3. Artistic originals are only produced by info/comm or arts
4. Info/comm produce all pre-packaged software but not delivery
5. Wood Manufacturing, insurance/finance, and Prof/Tech services produce residential structures but not non-residential structures

In addition to these theoretic assumptions, much of the data pre-1997 is estimated bluntly by taking the average production/expenditure ratios post-1997 or interpolating with data from 1987 and 1992.

Moreover, the asset expenditure data was flagged as unreliable by BEA and it was specifically said they're "more likely to be based on judgemental trends, on trends in the higher level aggregate, or on less reliable source data." On the other hand, the asset production data is mostly just for "equipment" and the allocations are based on the assumptions above.

Are there asset classes baked into the macro model? If so, that would be a natural set of assets to base an investment network off of

In [200]:
# Heat map of the averaged investment network at a fixed 800x800 size.
fig = px.imshow(avg_net).update_layout(autosize=False, width=800, height=800)
fig.show()

### Simple Version of Investment Network

In this network, I aggregate the sectors into 18 that map more cleanly between BEA codes and the ISIC codes and attempt to construct an investment network using the same methods as VW. The loss is the granularity of the data into 37+ sectors and I will only use 1997+ data.

### Attempt at Mapping BEA -> NAICS -> ISIC/NACE

One of the issues with the mapping between NAICS and ISIC is that both the Investment Network paper and the macro model use intermediate aggregates of the coding scheme while the mapping is on the lowest-level. I haven't found any mappings that map this intermediate level so I have to map from NAICS-mid -> NAICS-low -> ISIC-low -> ISIC-mid. Which is a many-to-many mapping. 

The approach I'm taking above maps the highest-level aggregate sectors into 18 relatively clean-to-match sectors and reconstructs an investment matrix for these using VW's approach.

In [201]:
# compares to "Replication Packet/Converting SIC to NAICS/SIC_NAICS_BEA_allsec.do"

# NAICS code prefixes that make up each of the 41 sectors. Every entry is an
# integer prefix; downstream cells match it against the leading digits of the
# NAICS codes in the concordance table.
NAICS_groups = {
    'Agriculture': [111,112,113,114,115], # excluded
    'Mining': [212],
    'Oil/Gas': [211], # excluded
    # 213 = support activities for mining. This used to be '212', which duplicated
    # 'Mining' and made the two sectors map to identical ISIC/NACE codes.
    'Mining Support': [213], # excluded
    'Util': [22], # 221 doesn't exist in NAICS codes in asset expenditures but in VW mapping
    'Const': [23],
    'Wood': [321],
    'Minerals': [327],
    'Primary Metals': [331],
    'Fabricated Metals': [332],
    'Machinery': [333],
    'Computers': [334],
    'Electrical': [335],
    'Vehicles': [3361,3362,3363],
    'Transport': [3364,3365,3366,3367,3368,3369],
    'Furniture': [337],
    'Misc Mfg': [339],
    'Food Mfg': [311,312],
    'Textile': [313,314],
    'Apparel': [315,316],
    'Paper': [322],
    'Printing': [323],
    'Petroleum': [324],
    'Chemical': [325],
    'Plastics': [326],
    'Wholesale Trade': [42],
    'Retail Trade': [44,45],
    'Transit/Warehouse': [48,49],
    'Info': [51],
    'Finance/Insurance': [52],
    'Real Estate': [531],
    'Rental': [532,533], # excluded
    'Prof/Tech': [54],
    'Mgmt': [55],
    'Admin': [561,562], # just 56 in the VW mapping
    'Educ': [61],
    'Health': [62],
    'Arts': [71],
    'Accom': [721],
    'Food Services': [722],
    'Other Services': [81]
}

In [202]:
# sanity check: one NAICS group per sector label
len(NAICS_groups)

41

In [203]:
naics_isic_df = pd.read_csv('./raw_data/NAICS_ISIC.csv', dtype={'NAICS2012Code': str, 'ISIC4Code': str})

# Build {NAICS code -> ISIC code} from the first and third columns of the table
# (presumably 'NAICS2012Code' and 'ISIC4Code' - confirm against the CSV header).
# Columns are selected with .iloc because integer-position lookup through
# `row[0]` on a label-indexed row is deprecated in pandas 2.x.
# When a NAICS code appears on several rows, the last row wins.
naics_isic_map = dict(zip(naics_isic_df.iloc[:, 0].astype(str), naics_isic_df.iloc[:, 2]))
    

In [204]:
# For every sector, collect the ISIC codes of all NAICS codes that start with
# one of the sector's NAICS prefixes.
ISIC_groups = {}
for key, ids in NAICS_groups.items():
    # str.startswith accepts a tuple, so all prefixes of a sector are tested at once
    prefixes = tuple(str(i) for i in ids)
    matched = {
        str(isic_code)
        for naics_code, isic_code in naics_isic_map.items()
        if naics_code.startswith(prefixes)
    }
    # sorted so the result is identical from run to run (set order is not)
    ISIC_groups[key] = sorted(matched)

In [205]:
isic_nace_df = pd.read_csv('./raw_data/ISIC_NACE.csv', dtype={'ISIC4code': str, 'NACE2code': str})

# Build {ISIC code -> NACE code} from the first and third columns of the table
# (presumably 'ISIC4code' and 'NACE2code' - confirm against the CSV header).
# Columns are selected with .iloc because integer-position lookup through
# `row[0]` on a label-indexed row is deprecated in pandas 2.x.
# When an ISIC code appears on several rows, the last row wins.
isic_nace_map = dict(zip(isic_nace_df.iloc[:, 0].astype(str), isic_nace_df.iloc[:, 2]))

In [206]:
# For every sector, collect the NACE codes of all ISIC codes that start with
# one of the sector's ISIC codes; each value is a sorted numpy array of strings.
NACE2_groups = {}
for key, isic_ids in ISIC_groups.items():
    matched = {
        str(nace_code)
        for prefix in map(str, isic_ids)
        for isic_code, nace_code in isic_nace_map.items()
        if isic_code is not None and isic_code[:len(prefix)] == prefix
    }
    NACE2_groups[key] = np.sort(list(matched))

In [207]:
def _division_index(codes):
    """Return the two-character prefixes of `codes` as a value_counts() index."""
    return pd.Series([code[0:2] for code in codes]).value_counts().index

# mining == mining support (08), fabricated metals == furniture (25)
NACE2_groups_agg = {key: _division_index(val) for key, val in NACE2_groups.items()}
# mining == mining support (08), minerals == petroleum (23)
ISIC_groups_agg = {key: _division_index(val) for key, val in ISIC_groups.items()}

In [208]:
sea_df = pd.read_csv('sample_data/Socio_Economic_Accounts.csv')

# Map the third column of the accounts table to the fourth. The values are
# later parsed by clean_isic, so the fourth column presumably holds ISIC
# labels - confirm against the CSV header.
# Columns are selected with .iloc because integer-position lookup through
# `row[2]` on a label-indexed row is deprecated in pandas 2.x.
our_map = dict(zip(sea_df.iloc[:, 2], sea_df.iloc[:, 3]))

In [209]:
def clean_isic(x):
    """Expand an ISIC label into the two-digit division numbers it covers.

    Parameters
    ----------
    x : str
        A label such as 'A01', 'C10-C12', 'C31_C32' or a bare section letter
        such as 'Q'.

    Returns
    -------
    list
        * ``[x]`` unchanged when the label contains no two-digit group;
        * ``[n]`` when it contains exactly one;
        * every integer from the smallest to the largest group, both ends
          included, when it contains several ('C10-C12' -> [10, 11, 12]).
    """
    # raw string: '\d' in a normal string literal is an invalid escape sequence
    division_ids = [int(match) for match in re.findall(r'\d{2}', x)]
    if not division_ids:
        return [x]
    if len(division_ids) == 1:
        return division_ids
    # + 1 so the upper bound of the range is included; it used to be dropped
    return list(range(min(division_ids), max(division_ids) + 1))
    
# Same keys as our_map, with every label expanded by clean_isic.
clean_our_map = {label: clean_isic(isic_label) for label, isic_label in our_map.items()}

In [210]:
# For every sector, list the our_map values whose expanded division numbers
# overlap the sector's two-digit ISIC divisions.
isic_ours_map = {}
for vw_sector, isic_divisions in ISIC_groups_agg.items():
    matches = []
    for label, covered in clean_our_map.items():
        division_ids = set(int(code) for code in isic_divisions)
        if set(covered) & division_ids:
            matches.append(our_map[label])
    isic_ours_map[vw_sector] = matches

In [211]:
# Hand-edited sector -> ISIC label mapping, keyed by the same sector names as
# `sectors`. Sectors with an empty list ('Oil/Gas', 'Rental') have no label
# assigned. Several labels appear under more than one sector, so the mapping
# is many-to-many.
isic_ours_map_manual_edit = {
    'Agriculture': ['A01', 'A02', 'A03', 'C10-C12', 'C16'],
    'Mining': ['B'],
    'Oil/Gas': [],
    'Mining Support': ['B'],
    'Util': ['D35', 'E37-E39', 'H49'],
    'Const': ['F'],
    'Wood': ['C16', 'C31_C32'],
    'Minerals': ['C23'],
    'Primary Metals': ['C24', 'C25', 'C27', 'C28', 'C30'],
    'Fabricated Metals': ['C24', 'C25', 'C28', 'C29', 'C30'],
    'Machinery': ['C25', 'C26', 'C28', 'C30'],
    'Computers': ['C18', 'C26', 'C30'],
    'Electrical': ['C27'],
    'Vehicles': ['C25', 'C29', 'C30'],
    'Transport': ['C30', 'C33'],
    'Furniture': ['C25', 'C31_C32'],
    'Misc Mfg': ['C27', 'C28'],
    'Food Mfg': ['C10-C12', 'C20', 'D35'],
    'Textile': ['C13-C15', 'C22'],
    'Apparel': ['C13-C15', 'C22'],
    'Paper': ['C17', 'C23', 'C25'],
    'Printing': ['C18'],
    'Petroleum': ['C19', 'C23'],
    'Chemical': ['C20', 'C21', 'C22', 'C28'],
    'Plastics': ['C22', 'C27', 'C31_C32'],
    'Wholesale Trade': ['G45', 'G46'],
    'Retail Trade': ['G45', 'G47'],
    'Transit/Warehouse': ['H49', 'H50', 'H51', 'H52', 'H53'],
    'Info': ['J58', 'J59_J60', 'J61', 'M74_M75'],
    'Finance/Insurance': ['K64', 'K65', 'K66', 'L68'],
    'Real Estate': ['L68'],
    'Rental': [],
    'Prof/Tech': ['J62_J63', 'M69_M70', 'M71', 'M72', 'M73', 'M74_M75'],
    'Mgmt': ['K64'],
    'Admin': ['E37-E39', 'O84'],
    'Educ': ['P85'],
    'Health': ['Q'],
    'Arts': ['M74_M75'],
    'Accom': ['I'],
    'Food Services': ['I'],
    'Other Services': ['C33', 'G45', 'M71', 'M74_M75']
 }

In [212]:
# avg_net = pd.DataFrame(np.mean(invnet['invmat'], axis=2))
# selection = [rnd.choice(isic_ours_map_manual_edit[s]) for s in sectors]
# avg_net.columns = selection
# avg_net.index = selection

In [213]:
# fig = px.imshow(avg_net)
# fig.update_layout(
#     autosize=False,
#     width=800,
#     height=800
# )
# fig.show()

In [214]:
# Rebuild the full-period average network, labelled by sector on both axes.
avg_net = pd.DataFrame(invnet['invmat'].mean(axis=2), index=sectors, columns=sectors)

In [215]:
# Heat map of the averaged investment network at a fixed 800x800 size.
fig = px.imshow(avg_net).update_layout(autosize=False, width=800, height=800)
fig.show()

## Quick and Dirty Mapping

This maps and aggregates the VW investment matrix into the ISIC high-level sectors (A-U).

In [216]:
# ISIC section letter -> sector labels aggregated into it.
# NOTE(review): 'Other Services' is listed under both 'C' and 'S', and 'Admin'
# under both 'E' and 'N', so those sectors contribute in full to two letters
# in the aggregation below - confirm this double counting is intended.
agg_sectors = {
 'A': ['Agriculture'],
 'B': ['Mining', 'Oil/Gas', 'Mining Support'],
 'C': ['Wood',
  'Minerals',
  'Primary Metals',
  'Fabricated Metals',
  'Machinery',
  'Computers',
  'Electrical',
  'Vehicles',
  'Transport',
  'Furniture',
  'Misc Mfg',
  'Food Mfg',
  'Textile',
  'Apparel',
  'Paper',
  'Printing',
  'Petroleum',
  'Chemical',
  'Plastics',
  'Other Services'],
 'D': ['Util'],
 'E': ['Admin'],
 'F': ['Const'],
 'G': ['Wholesale Trade', 'Retail Trade'],
 'H': ['Transit/Warehouse'],
 'I': ['Accom', 'Food Services'],
 'J': ['Info'],
 'K': ['Finance/Insurance', 'Mgmt'],
 'L': ['Real Estate'],
 'M': ['Prof/Tech'],
 'N': ['Rental', 'Admin'],
 'P': ['Educ'],
 'Q': ['Health'],
 'R': ['Arts'],
 'S': ['Other Services']}

# avg_net = pd.DataFrame(np.mean(invnet['invmat'], axis=2))


In [217]:
# Average over the most recent 22 years only (1997-2018 inclusive).
# The slice used to be -21:, which silently dropped 1997.
# Assumes the last slice of 'invmat' is 2018, as stated in the notebook text - TODO confirm.
N_RECENT_YEARS = 2018 - 1997 + 1
avg_net = pd.DataFrame(
    np.mean(invnet['invmat'][:, :, -N_RECENT_YEARS:], axis=2),
    index=sectors,
    columns=sectors,
)

In [218]:
rows = []

# Sum row-wise: for every ISIC letter, add up the avg_net rows of its member
# sectors. Each record keeps one entry per original sector column plus a
# 'sector' entry holding the letter.
for key, vals in agg_sectors.items():
    if len(vals) > 0:
        # Label-based selection replaces the old nested loop, which reused `i`
        # for both the row label and the column position and relied on
        # integer-position lookup (`row[i]`) on a label-indexed row.
        member_total = avg_net.loc[avg_net.index.isin(vals)].sum(axis=0)
        new_row = {s: member_total[s] for s in sectors}
        new_row['sector'] = key
        rows.append(new_row)

In [219]:
row_summed_df = pd.DataFrame(rows)
summed_df = row_summed_df.copy()

# Mean column-wise: one new column per ISIC letter, holding the plain average
# of its member-sector columns. Very rough - this does not take relative
# sub-sector sizes into account.
sector_letters = [key for key, vals in agg_sectors.items() if len(vals) > 0]
for key in sector_letters:
    summed_df[key] = row_summed_df[agg_sectors[key]].mean(axis=1)

rough_matrix = summed_df[sector_letters]
rough_matrix.index = sector_letters
rough_matrix.to_csv('rough_invmat.csv')

In [220]:
# Heat map of the letter-level rough matrix at a fixed 800x800 size.
fig = px.imshow(rough_matrix).update_layout(autosize=False, width=800, height=800)
fig.show()

### Limitations

It's essentially just grouping the sub-sectors, summing row-wise and taking the mean column-wise. Which means it's not weighted for sub-sector size (e.g. if wholesale trade and retail trade purchase 10% and 20% of their total expenditures from manufacturing, G which is their combined sector, will have 15% expenditure to manufacturing regardless of relative size).

In addition, it's missing A (agriculture), E (waste management), O (public admin), T (household activities), U (extraterritorial) since the paper was focused on non-farm firms. If I were to repeat their process using the full data, I could add these to the matrix but that would take a bit more time.

## Network Recreation

For now, using the same data that VW use to create the network.

In [221]:
# Excel workbook passed to InvestmentNetworkPipe, which reads one sheet per industry.
expenditure_data = './raw_data/assetInvestments.xlsx'

In [223]:
# Show up to 200 rows when a DataFrame is displayed.
pd.set_option('display.max_rows', 200)

class InvestmentNetworkPipe:
    """Load per-industry asset-expenditure sheets into one 3-D array.

    Attributes
    ----------
    matrix : numpy.ndarray
        Expenditures indexed as ``[year - BASE_YEAR, industry, asset]``.
    industries : sequence of str
        Sheet names, in the order they were passed in.
    assets : list
        'NIPA Asset Types' column of the last sheet read.
    asset_codes : list
        'Asset Codes' column of the last sheet read.
    """

    # First year kept in each sheet; it becomes slice 0 of `matrix`.
    BASE_YEAR = 2000
    # Earliest year column present in the workbook; FIRST_SHEET_YEAR..BASE_YEAR-1 are dropped.
    FIRST_SHEET_YEAR = 1901
    # Row positions removed from each sheet (presumably the group sub-header
    # rows - confirm against the workbook).
    GROUP_HEADER_ROWS = (39, 72)
    # Asset-type label and number of consecutive rows carrying it, in sheet order.
    ASSET_TYPE_COUNTS = (('equipment', 39), ('structures', 32), ('intellectual', 25))

    def __init__(self, path, industries):
        """Read one sheet per industry from `path` and stack them.

        Raises
        ------
        ValueError
            If `industries` is empty.
        """
        # Keep every parsed table so the asset labels can be taken from one of
        # them, instead of re-reading a sheet through a leaked loop variable.
        tables = [self.extract_expenditure_table(path, ind) for ind in industries]
        if not tables:
            raise ValueError("industries must name at least one sheet")

        # Columns 3+ are the year columns (after 'type', 'Asset Codes' and
        # 'NIPA Asset Types'); stacking gives (industry, asset, year).
        stacked = np.array([np.array(tab.iloc[:, 3:]) for tab in tables])

        # Reorder to (year, industry, asset).
        self.matrix = np.ascontiguousarray(np.moveaxis(stacked, 2, 0))
        self.industries = industries
        full_tab = tables[-1]
        self.assets = list(full_tab['NIPA Asset Types'])
        self.asset_codes = list(full_tab['Asset Codes'])

    def display_df(self, year):
        """Return the industry x asset table for `year`, with asset names as columns."""
        return pd.DataFrame(self.matrix[year % self.BASE_YEAR], index=self.industries, columns=self.assets)

    def display_total_expenditures(self, year = None):
        """Return total expenditure per industry as a one-column ('total') DataFrame.

        With ``year=None`` the total runs over all years and assets. With a
        year it runs over the assets of that year only; this case used to
        return None.
        """
        if year is None:
            totals = self.matrix.sum(axis=2).sum(axis=0)
        else:
            totals = self.matrix[year % self.BASE_YEAR].sum(axis=1)
        return pd.DataFrame(totals, index=self.industries, columns=['total'])

    def extract_expenditure_table(self, path, industry):
        """Read the sheet `industry` from `path` and return the cleaned table.

        The result has a leading 'type' column followed by the sheet's own
        columns, with the pre-BASE_YEAR year columns and the rows listed in
        GROUP_HEADER_ROWS removed.
        """
        df = pd.read_excel(path, header=5, sheet_name=industry).iloc[2:].reset_index(drop=True)
        types = [label for label, count in self.ASSET_TYPE_COUNTS for _ in range(count)]
        df = df.drop(list(self.GROUP_HEADER_ROWS), axis=0)
        df = df.drop([str(y) for y in range(self.FIRST_SHEET_YEAR, self.BASE_YEAR)], axis=1)
        # Raises ValueError when the sheet does not have exactly len(types) rows left.
        df['type'] = types
        # Move the freshly added 'type' column to the front.
        df = df[['type'] + list(df.columns[0:-1])]

        return df.reset_index(drop=True)
    
    def get_year(self, year, code=True, sectoral=True):
        """Return the industry x asset table for `year`.

        Columns are asset codes when `code` is true and asset names otherwise.
        `sectoral` is accepted for backward compatibility and is not used.
        """
        df = pd.DataFrame(self.matrix[year % self.BASE_YEAR])

        df.columns = self.asset_codes if code else self.assets
        df.index = self.industries

        return df

# NOTE(review): `bea_naics_map` is not defined in any cell visible in this notebook,
# so this line fails on a fresh kernel unless it is defined elsewhere - TODO confirm.
pipe = InvestmentNetworkPipe(expenditure_data, list(bea_naics_map.keys()))

In [224]:

def convert_to_isic(pipe, year):
    """Aggregate one year of industry x asset expenditures to mapped sector labels.

    Reads 'bea_isic_map.json' (industry name -> sector label), relabels the
    rows of ``pipe.get_year(year, code=True)`` and sums the rows that share a
    label. Returns a DataFrame with a 'sector' column followed by one column
    per asset code, sorted by sector label.

    NOTE(review): the original comment says this "gets rid of 'other services,
    except government'", but the slice drops the last *column*, not a row -
    confirm which was intended.
    """
    with open('bea_isic_map.json', 'r') as handle:
        bea_to_isic = json.load(handle)

    # drop the last column of the yearly table
    expenditures = pipe.get_year(year, code=True).iloc[:, :-1]
    expenditures.index = [bea_to_isic[name] for name in list(expenditures.index)]

    def _collapse(label):
        # .loc gives a DataFrame when several industries share the label,
        # and a Series when only one does
        selected = expenditures.loc[label]
        totals = selected.sum(axis=0) if selected.ndim > 1 else selected
        return {'sector': label, **dict(totals)}

    labels = np.sort(np.unique(expenditures.index))
    return pd.DataFrame([_collapse(label) for label in labels])

In [225]:
# sector -> list of industry names, as stored in vw_bea_map.json
with open('vw_bea_map.json', 'r') as f:
    vw_bea_map = json.load(f)

ind_expenditures = pipe.display_total_expenditures()

# Total expenditure of each sector = sum of the 'total' column over its industries.
# The column is selected explicitly instead of calling np.sum on the DataFrame and
# taking `.values[0]`, which depends on the deprecated axis=None reduction.
vw_total_exp = {
    sec: ind_expenditures.loc[ind_vals, 'total'].sum()
    for sec, ind_vals in vw_bea_map.items()
}


## Build Matrices and Combine

In [36]:
# Load the two JSON sector mappings used to build the combined matrix.
with open('isic_vw.json', 'r') as handle:
    isic_vw = json.load(handle)

with open("oecd_vw.json", "r") as handle:
    oecd_vw = json.load(handle)

In [37]:
from sklearn.preprocessing import normalize

# Every industry referenced by the mapping, with repeats for industries that
# are assigned to more than one target sector.
mapped_industries = [vw for members in oecd_vw.values() for vw in members]

print(len(vw_total_exp))
print(len(mapped_industries))

# NOTE: from here on `sectors` holds the keys of oecd_vw, not the 41 labels
# defined at the top of the notebook; `industries` holds the avg_net columns.
industries = list(avg_net.columns)
sectors = list(oecd_vw.keys())

# Number of target sectors each industry is assigned to. This depends only on
# the industry, so it is computed once rather than once per (sector, industry) pair.
num_splits = {
    vw: sum(vw in members for members in oecd_vw.values())
    for vw in industries
}

# R: row-summing matrix. An industry assigned to k sectors contributes 1/k to each.
# C: column-weighting matrix. Each member industry is weighted by its total expenditure.
R = np.zeros((len(sectors), len(industries)))
C = np.zeros((len(industries), len(sectors)))
for sec_i, sec in enumerate(sectors):
    for ind_i, vw in enumerate(industries):
        if vw in oecd_vw[sec]:
            R[sec_i, ind_i] = 1 / num_splits[vw]
            C[ind_i, sec_i] = vw_total_exp[vw]

# C = normalize(C, axis=0, norm='l1')

print(R.shape)
print(np.array(avg_net).shape)
print(C.shape)

res = np.dot(np.dot(R, np.array(avg_net)), C)
# scale every column so its absolute values sum to 1
res = normalize(res, axis = 0, norm='l1')

res = pd.DataFrame(res, index=sectors, columns=sectors)

# order rows and columns alphabetically by sector key
ordered = sorted(sectors)
res = res.loc[ordered, ordered]

pd.set_option('display.max_columns', 100)

41
45
(44, 41)
(41, 41)
(41, 44)


In [40]:
# Heat map of the combined matrix at a fixed 800x800 size, then save it to CSV.
fig = px.imshow(res).update_layout(autosize=False, width=800, height=800)
fig.show()
res.to_csv('investment_matrix.csv')

In [39]:
# Heat map of the rough matrix at a fixed 800x800 size.
fig = px.imshow(rough_matrix).update_layout(autosize=False, width=800, height=800)
fig.show()

# Direct from BEA to OECD Matrix

## Expenditure Data

In [141]:
def _strip_thousands(column):
    """Remove thousands separators from the string cells of one column."""
    return [cell.replace(",", "") if type(cell) is str else cell for cell in column]

# Read the expenditure table (rows keyed by series id) and make every cell numeric.
expend_df = (
    pd.read_csv("raw_data/BEA_Expenditures.csv", index_col=0)
    .apply(_strip_thousands)
    .astype(float)
)

# The series id carries the industry code in characters 3-6 and the asset code
# from character 8 up to the two-character suffix.
series_ids = list(expend_df.index)
expend_df["industry"] = [series_id[3:7] for series_id in series_ids]
expend_df["asset"] = [series_id[8:-2] for series_id in series_ids]

# drop the "EQ00" and "ST00" rows
expend_df = expend_df[~expend_df["asset"].isin(["EQ00", "ST00"])]
expend_df

Unnamed: 0,1901,1902,1903,1904,1905,1906,1907,1908,1909,1910,...,2014,2015,2016,2017,2018,2019,2020,2021,industry,asset
I3N110C1EP1A.A,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,110C,EP1A
I3N110C1EP1B.A,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,5.0,3.0,3.0,3.0,3.0,3.0,4.0,5.0,110C,EP1B
I3N110C1EP1C.A,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,110C,EP1C
I3N110C1EP1D.A,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,110C,EP1D
I3N110C1EP1E.A,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,110C,EP1E
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
I3N81001AE10.A,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8100,AE10
I3N81001AE20.A,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8100,AE20
I3N81001AE30.A,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8100,AE30
I3N81001AE40.A,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8100,AE40


## Production Data

These assumptions follow VomLehn-Winberry, however, some subcategories were chosen by me as VW were always aggregating into their sector groups rather than at the BEA sector level.

### Equipment

VL Maps Commodity Codes from the PEQ bridge and Asset Codes from the Investment tables into 25 NIPA categories (ignoring residential equipment). The split production is weighted by the PEQ bridge.

### Structures

1. Mining structures (SM02) are produced by mining support activities (2130)
2. All other structures are produced by construction (2300)

### Intellectual Property

1. Prepackaged software (ENS1) is produced by publishing industries (5110)
2. Custom software (ENS2) is produced by computer systems design (5415)
3. Own-account software (ENS3) is produced by computer systems design (5415)
4. Scientific research and development services (RD11-92) is produced by misc. professional, scientific, and technical services (5412)
5. Theatrical movies (AE10) are produced by motion picture and sound recording industries (5120)
6. Long-lived television programs (AE20) are produced by radio and television broadcasting (5150) (not included)
7. Books (AE30) are produced by publishing industries (5110)
8. Music (AE40) are produced by motion picture and sound recording industries (5120)
9. Other entertainment originals are categorized under Art, entertainment, and recreation (7100) but for now placed in subcategory Performing Arts, spectator sports, and related activities (711A)


**Uncaptured: Margin payments on pre-packaged software and equipment**

**Uncaptured: Real Estate is the sole purchaser of residential assets**

**Uncaptured: The recirculation of Used equipment**

## Equipment Investment Network

In [117]:
# Map each equipment asset code (as found in the expenditure table's "asset"
# column) to its NIPA category number. Several asset codes can share one
# category; the category numbers are the keys used by the PEQ bridge split below.
asset_nipa = {
    "EP1A": 4,
    "EP1B": 4,
    "EP1C": 4,
    "EP1D": 4,
    "EP1E": 4,
    "EP1F": 4,
    "EP1G": 4,
    "EP1H": 4,
    "EP20": 5,
    "EP34": 6,
    "EP35": 6,
    "EP36": 7,
    "EP31": 8,
    "EP12": 9,
    "EI11": 11,
    "EI12": 11,
    "EI21": 12,
    "EI22": 12,
    "EI30": 13,
    "EI40": 14,
    "EI50": 15,
    "EI60": 16,
    "ET11": 19,
    "ET12": 20,
    "ET20": 21,
    "ET30": 22,
    "ET40": 23,
    "ET50": 24,
    "EO11": 26,
    "EO12": 26,
    "EO30": 27,
    "EO21": 27,
    "EO40": 28,
    "EO22": 28,
    "EO50": 29,
    "EO60": 30,
    "EO71": 31,
    "EO72": 32,
    "EO80": 34
}

In [None]:
# Year labels 2000-2020 as strings; used as PEQ bridge sheet names and as expenditure column labels.
years = [str(y) for y in range(2000, 2021)]

In [164]:


def read_bridge(year, path="PEQBridge_1997-2021_SUM.xlsx"):
    """Read one year of the PEQ bridge and return its production values.

    Parameters
    ----------
    year : str
        Name of the sheet to read.
    path : str, optional
        Workbook to read from; defaults to the file this notebook always used.

    Returns
    -------
    pandas.DataFrame
        Columns 'nipa', 'code' and 'prod_val'. Rows whose code is "Used" are
        removed. Letters are stripped from the remaining codes, and
        three-character codes are padded with a trailing "0".
    """
    equip_df = pd.read_excel(path, sheet_name=year, header=4)
    equip_df.columns = ["nipa", "peq_name", "code", "description","prod_val","trans_costs", "wholesale", "retail", "buy_val", "year"]
    # .copy() so the column assignment below writes to an independent frame
    # rather than to a filtered view of the sheet
    equip_df = equip_df[equip_df["code"] != "Used"].copy()
    stripped = equip_df["code"].map(lambda raw: re.sub(r"[a-zA-Z]", "", str(raw)))
    equip_df["code"] = stripped.map(lambda code: code + "0" if len(code) == 3 else code)
    # A second "Used" filter used to follow here. It could never match, because
    # all letters have already been stripped from the codes.
    return equip_df[["nipa", "code", "prod_val"]]


# nipa_split[year][nipa category][producer code] = share of that category's
# production value attributed to the producer code in that year.
nipa_split = {}
for y in years:

    equip_df = read_bridge(y)

    yearly = {}
    for n in equip_df["nipa"].unique():
        category = equip_df[equip_df["nipa"] == n]
        # category total, computed once per category instead of once per row
        tot = category["prod_val"].sum()
        # Add up rows that share a code before dividing. Previously a later row
        # overwrote an earlier one, so the shares of a category could sum to
        # less than 1. sort=False keeps the codes in sheet order.
        by_code = category.groupby("code", sort=False)["prod_val"].sum()
        yearly[n] = (by_code / tot).to_dict()
    nipa_split[y] = yearly

  warn("""Cannot parse header or footer so it will be ignored""")
  warn("""Cannot parse header or footer so it will be ignored""")
  warn("""Cannot parse header or footer so it will be ignored""")
  warn("""Cannot parse header or footer so it will be ignored""")
  warn("""Cannot parse header or footer so it will be ignored""")
  warn("""Cannot parse header or footer so it will be ignored""")
  warn("""Cannot parse header or footer so it will be ignored""")
  warn("""Cannot parse header or footer so it will be ignored""")
  warn("""Cannot parse header or footer so it will be ignored""")
  warn("""Cannot parse header or footer so it will be ignored""")
  warn("""Cannot parse header or footer so it will be ignored""")
  warn("""Cannot parse header or footer so it will be ignored""")
  warn("""Cannot parse header or footer so it will be ignored""")
  warn("""Cannot parse header or footer so it will be ignored""")
  warn("""Cannot parse header or footer so it will be ignored""")


In [142]:
# Keep just the equipment expenditures: asset codes starting with "E",
# minus the three software products.
is_equipment = expend_df["asset"].str.startswith("E", na=False)
is_software = expend_df["asset"].isin(["ENS1", "ENS2", "ENS3"])
# .copy() so the new column is written to an independent frame, not to a
# filtered view of expend_df
equip_expend_df = expend_df[is_equipment & ~is_software].copy()

# Fail loudly when an asset has no NIPA category, as the plain dict lookup did.
unmapped = sorted(set(equip_expend_df["asset"]) - set(asset_nipa))
if unmapped:
    raise KeyError(f"assets without a NIPA category: {unmapped}")
equip_expend_df["nipa"] = equip_expend_df["asset"].map(asset_nipa)

In [159]:
# Yearly expenditure totals for every (industry, NIPA category) pair.
equip_expend_df2 = (
    equip_expend_df[years + ["industry", "nipa"]]
    .groupby(["industry", "nipa"])
    .sum()
)

VL's investment matrix calculation

$$I_{ijt}=\sum_{a=1}^{A}\omega_{iat}I_{ajt}^{exp}$$

In [189]:
industries = expend_df["industry"].unique()
n_industries = len(industries)
# equip_matrix[t, i, j]: equipment produced by industry i and purchased by
# industry j in year t, i.e. the sum over NIPA categories a of
# (production share of i in a) * (expenditure of j on a).
equip_matrix = np.zeros((len(years), n_industries, n_industries))

industry_pos = {code: pos for pos, code in enumerate(industries)}

for t, year in enumerate(years):
    # {(industry, nipa category): expenditure} for this year. One dict lookup
    # per term replaces a MultiIndex .loc call per matrix cell.
    spending = equip_expend_df2[year].to_dict()
    for nipa, producer_weights in nipa_split[year].items():
        for producer, weight in producer_weights.items():
            i = industry_pos.get(producer)
            if i is None:
                # producer code is not one of the purchasing industries
                continue
            for j, purchaser in enumerate(industries):
                spent = spending.get((purchaser, nipa))
                # Missing (industry, category) pairs contribute nothing. These
                # explicit checks replace a bare `except:` that hid every error.
                if spent is not None:
                    equip_matrix[t, i, j] += weight * spent

# total over all years, labelled by industry code on both axes
final_equip_matrix = pd.DataFrame(equip_matrix.sum(axis = 0), index=industries, columns=industries)

In [194]:
# Heat map of the equipment matrix summed over all years, at 1200x1200.
px.imshow(final_equip_matrix, width=1200, height=1200)

In [154]:
# Display the production shares.
# NOTE(review): the recorded output is keyed by NIPA category rather than by year,
# so it looks like it was produced by an earlier version of nipa_split - re-run to confirm.
nipa_split

{4: {'3340': 0.5736835752156513, '5415': 0.42631642478434867},
 5: {'3340': 0.8839221259700415,
  '3350': 0.0019062443602237864,
  '3364': 0.004917884858328821,
  '5130': 0.08060368164591229,
  '5412': 0.028650063165493594},
 6: {'3340': 0.4350266593358821,
  '3370': 0.004372898259791072,
  '3390': 0.5276630566814561,
  '5412': 0.03293738572287077},
 7: {'3340': 0.9753199868723335, '5412': 0.02468001312766656},
 8: {'3330': 0.9759121501948282, '5412': 0.024087849805171802},
 9: {'3330': 0.1907790143084261,
  '3340': 0.794912559618442,
  '3390': 0.0,
  '5412': 0.014308426073131956},
 11: {'2120': 0.0021675517502980383,
  '3320': 0.7705646472309526,
  '3330': 0.0020049853690256854,
  '3370': 0.0015714750189660778,
  '3250': 0.16549257613525523,
  '5412': 0.05819876449550233},
 12: {'3330': 0.9503433703116746, '5412': 0.04965662968832541},
 13: {'3330': 0.9387305472984787, '5412': 0.061269452701521246},
 14: {'3320': 0.0010681822762359675,
  '3330': 0.9166817824535946,
  '3260': 0.0055424