In [1]:
import pandas as pd
import glob

In [115]:
# Columns kept from each raw export; every other column is skipped at read time.
cols = ['Year', 'Reporter Code', 'Reporter', 'Partner Code', 'Partner', 'Trade Value (US$)']
# Force these columns to str (Year is later compared against string literals).
schema = {'Year': str, 'Reporter Code': str, 'Partner Code': str}


def dfs():
    """Yield one DataFrame per CSV file matching ``data/*.csv``.

    Files are visited in sorted path order. ``glob.glob`` returns matches in
    arbitrary order, and the row order of the concatenated frame decides the
    order in which ``.unique()`` later numbers the countries, so an unsorted
    listing could change the tensor layout from one machine to the next.
    """
    for file in sorted(glob.glob('data/*.csv')):
        yield pd.read_csv(file, encoding='latin-1', usecols=cols, dtype=schema)


# ignore_index=True gives every row its own label; without it each file's
# 0..n-1 labels repeat in the combined frame (and in any CSV written from it).
# dropna/rename are chained rather than applied in place so `df` is built in
# one expression.
df = (
    pd.concat(dfs(), ignore_index=True)
    .dropna()
    .rename(columns={
        'Reporter Code': 'Reporter_Code',
        'Partner Code': 'Partner_Code',
        'Trade Value (US$)': 'Trade_Value',
    })
)

In [132]:
# Build the induced sub-graph: keep only rows whose partner code also occurs
# as a reporter code.
reporters = set(df['Reporter_Code'].unique())
print(len(reporters), "Source nodes are present. (Some have invalid data so dropped)")
partner_is_reporter = df['Partner_Code'].isin(reporters)
df_ = df[partner_is_reporter]
print("Filtere records from", len(df), "to", len(df_))

143 Source nodes are present. (Some have invalid data so dropped)
Filtere records from 74077 to 66316


In [121]:
# df_ now holds the induced graph's rows; persist it to disk
# (header row and row index are both written).
df_.to_csv('filtered_df.csv', index=True, header=True)

In [None]:
# Number the reporter names in order of first appearance, then build the
# inverse lookup (position -> name).
countries = df_['Reporter'].unique()
country_to_index = {name: position for position, name in enumerate(countries)}
index_to_country = {position: name for name, position in country_to_index.items()}

In [173]:
# We can now start creating tensors
import numpy as np

# Number of countries. It has to be assigned here: no visible cell defines N,
# so on a fresh kernel the allocation below would stop with a NameError.
N = len(country_to_index)

# Axis 0: time (10 years)
# Axis 1: FROM country (filled from the row's Partner)
# Axis 2: TO country (filled from the row's Reporter)
tensor = np.zeros((10, N, N))

In [174]:
# Fill tensor[year, from, to] with the trade value of each row, in one
# vectorized assignment instead of a Python loop over rows.
years = ['2012', '2013', '2014', '2015', '2016', '2017', '2018', '2019', '2020', '2021']
year_to_index = {year: position for position, year in enumerate(years)}

# Rows outside the listed years are ignored. For a repeated
# (Year, Partner, Reporter) key only the last row is kept: that is the value a
# row-by-row loop would leave behind, and NumPy does not define the result of
# assigning to the same index more than once.
edges = (
    df_[df_.Year.isin(years)]
    .drop_duplicates(subset=['Year', 'Partner', 'Reporter'], keep='last')
)

# Fail loudly, like a dict lookup would, if a name has no tensor index.
unknown = (set(edges.Partner) | set(edges.Reporter)) - set(country_to_index)
if unknown:
    raise KeyError(f"No tensor index for: {sorted(unknown)}")

year_idx = edges.Year.map(year_to_index).to_numpy(dtype=int)
from_idx = edges.Partner.map(country_to_index).to_numpy(dtype=int)
to_idx = edges.Reporter.map(country_to_index).to_numpy(dtype=int)
tensor[year_idx, from_idx, to_idx] = edges.Trade_Value.to_numpy()

In [175]:
# Share of tensor cells that are exactly zero, printed as a percentage.
nonzero_share = np.count_nonzero(tensor) / tensor.size
sparsity = 1.0 - nonzero_share
print(f"{sparsity*100:.2f}% of the tensor is sparse")

67.58% of the tensor is sparse


In [176]:
# Display the filled tensor (NumPy abbreviates the repr of a large array with "...").
tensor

array([[[0.0000000e+00, 0.0000000e+00, 0.0000000e+00, ...,
         0.0000000e+00, 0.0000000e+00, 0.0000000e+00],
        [0.0000000e+00, 0.0000000e+00, 0.0000000e+00, ...,
         0.0000000e+00, 0.0000000e+00, 0.0000000e+00],
        [0.0000000e+00, 2.2651200e+05, 0.0000000e+00, ...,
         0.0000000e+00, 0.0000000e+00, 0.0000000e+00],
        ...,
        [0.0000000e+00, 0.0000000e+00, 0.0000000e+00, ...,
         0.0000000e+00, 0.0000000e+00, 0.0000000e+00],
        [0.0000000e+00, 0.0000000e+00, 0.0000000e+00, ...,
         0.0000000e+00, 0.0000000e+00, 3.5345950e+06],
        [0.0000000e+00, 0.0000000e+00, 0.0000000e+00, ...,
         0.0000000e+00, 1.6782854e+07, 0.0000000e+00]],

       [[0.0000000e+00, 0.0000000e+00, 0.0000000e+00, ...,
         0.0000000e+00, 0.0000000e+00, 0.0000000e+00],
        [0.0000000e+00, 0.0000000e+00, 0.0000000e+00, ...,
         0.0000000e+00, 0.0000000e+00, 0.0000000e+00],
        [0.0000000e+00, 1.5125220e+06, 0.0000000e+00, ...,
         0.000

In [168]:
!pip install tensorly

Collecting tensorly
  Downloading tensorly-0.7.0-py3-none-any.whl (198 kB)
Collecting nose
  Downloading nose-1.3.7-py3-none-any.whl (154 kB)
Installing collected packages: nose, tensorly
Successfully installed nose-1.3.7 tensorly-0.7.0


In [177]:
# Normalise the tensor. For this first version we simply take the natural log
# of every non-zero entry; cells that are exactly zero stay zero.
# NOTE: `tensor` is overwritten in place, so running this cell a second time
# would take the log of the already-logged values.
non_zeros = tensor != 0
observed_values = tensor[non_zeros]
tensor[non_zeros] = np.log(observed_values)
tensor

array([[[ 0.        ,  0.        ,  0.        , ...,  0.        ,
          0.        ,  0.        ],
        [ 0.        ,  0.        ,  0.        , ...,  0.        ,
          0.        ,  0.        ],
        [ 0.        , 12.3305532 ,  0.        , ...,  0.        ,
          0.        ,  0.        ],
        ...,
        [ 0.        ,  0.        ,  0.        , ...,  0.        ,
          0.        ,  0.        ],
        [ 0.        ,  0.        ,  0.        , ...,  0.        ,
          0.        , 15.07810928],
        [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         16.63586833,  0.        ]],

       [[ 0.        ,  0.        ,  0.        , ...,  0.        ,
          0.        ,  0.        ],
        [ 0.        ,  0.        ,  0.        , ...,  0.        ,
          0.        ,  0.        ],
        [ 0.        , 14.22928901,  0.        , ...,  0.        ,
          0.        ,  0.        ],
        ...,
        [ 0.        ,  0.        ,  0.        , ...,  

In [245]:
from tensorly.decomposition import parafac

# CP (PARAFAC) decomposition of the tensor. The rank is a knob to experiment with.
# The non-zero mask is handed to the solver via `mask`; what exactly the mask
# does to the fit still needs to be discussed and explored (TODO).
cp_result = parafac(
    tensor,
    rank=150,
    mask=non_zeros,
    normalize_factors=True,
    random_state=42,
)
weights, factors = cp_result

def get_slice(A, B, T, W):
    """Return the 2-D slice ``A @ diag(W * T) @ B.T``.

    A and B are 2-D factor matrices and T and W are 1-D vectors, with one
    entry (or column) per component. The shapes of the four inputs and of
    the diagonal scaling matrix are printed before the product is returned.
    """
    for label, operand in (
        ("Shape of A", A),
        ("Shape of B", B),
        ("Shape of T", T),
        ("Shape of W", W),
    ):
        print(label, operand.shape)

    # Per-component scale (weight times the T entry) laid out on a diagonal.
    scaling = np.diag(W * T)
    print("Shape of np.diag(W*T)", scaling.shape)
    return A @ scaling @ B.T

# Rebuild the slice for the first time step (row 0 of the mode-0 factor) and
# show only the entries that were non-zero in the data at that step.
mode0_factors, mode1_factors, mode2_factors = factors
sl = get_slice(mode1_factors, mode2_factors, mode0_factors[0], weights)
sl[non_zeros[0]]

Shape of A (143, 150)
Shape of B (143, 150)
Shape of T (150,)
Shape of W (150,)
Shape of np.diag(W*T) (150, 150)


array([ 6.03127935, 18.26250606, 12.36066673, ..., 16.11366776,
        9.5186101 , 16.40344908])

In [247]:
# Absolute difference between the data and the rebuilt slice, on the non-zero
# entries of the first time step.
observed_first = non_zeros[0]
np.abs(tensor[0][observed_first] - sl[observed_first])

array([0.05322007, 0.35896177, 0.03118399, ..., 0.57954401, 0.09017892,
       0.23241925])

In [249]:
# Component weights from the decomposition, listed in ascending order.
sorted(weights)

[137.39251625468327,
 161.67177631385053,
 166.7337774454309,
 179.1791473387217,
 179.50554036489348,
 181.39027839453132,
 182.70012840535136,
 189.14555475934338,
 191.52911719434852,
 193.66959994399187,
 196.24818911704196,
 197.80161868366523,
 200.70945401253252,
 201.11295879024217,
 201.78149235802553,
 202.1204402348674,
 202.541241430834,
 205.1790255626776,
 205.80586062153264,
 208.56791299820014,
 210.64061369936255,
 213.05286914525504,
 213.61710743854434,
 215.57384701906597,
 218.49618490407275,
 218.6661669950523,
 219.277845852132,
 220.11190359253047,
 221.1104126330611,
 222.3132436579153,
 223.64219523890932,
 227.30688430815886,
 229.15663478372892,
 230.7824504185294,
 233.6050637590476,
 236.08730048503884,
 240.39019238098317,
 240.83601482960458,
 241.80773327069025,
 243.5184396126036,
 244.50528089797425,
 245.04541880169873,
 247.16420188073155,
 247.30894563064123,
 247.88151782000406,
 250.44548171306826,
 254.36430937149467,
 255.1683344406849,
 255.97

Shape of A (143, 4)
Shape of B (143, 4)
Shape of T (4,)
Shape of W (4,)
Shape of np.diag(W*T) (4, 4)


array([13.04819786, 12.08905928,  7.82025492, ..., 16.10650448,
       12.51119898, 13.90951056])

array([ 6.08449941, 17.9035443 , 12.39185072, ..., 16.69321177,
        9.42843118, 16.63586833])