In [1]:
import pandas as pd
# Directory prefix for the processed input files. It is a relative path, so it is
# resolved against the current working directory; the export CSV is read from here.
data_path = './../../data/processed/'

import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

In [2]:
# Load the export records from the processed-data directory.
export_csv = data_path + 'export_bundles_qr.csv'
df = pd.read_csv(export_csv)
# Keep only the rows whose YEAR is strictly below 2001.
df = df[df['YEAR'] < 2001]

### Growth rates

In [3]:
# Wide table of summed VART: one row per ID, one ('VART', YEAR) column per year.
yearly_totals = df.groupby(['ID', 'YEAR'])[['VART']].sum()
table_levels = yearly_totals.unstack()


In [4]:
# Year-over-year change of log10(summed VART) per ID, reshaped to long form
# (one row per ID/YEAR) with the value column named 'yr_log_diff'.
log_levels = np.log10(table_levels)
yearly_log_change = log_levels.diff(1, axis=1)
grs = (
    yearly_log_change
    .stack()
    .rename(columns={'VART': 'yr_log_diff'})
    .reset_index()
)

# Flag growth rates strictly inside the fixed band (-0.06, 0.09).
# An earlier, commented-out variant used the band between the 33% and 67%
# quantiles of yr_log_diff instead of these fixed thresholds.
grs['Gaussian'] = grs.yr_log_diff.gt(-0.06) & grs.yr_log_diff.lt(0.09)

# Three equal-frequency bins of the growth rate, labelled from lowest to highest.
grs['Category'] = pd.qcut(
    grs.yr_log_diff,
    3,
    labels=['Shrinkage', 'Fluctuation', 'Expansion'],
)

### Margins

In [5]:
def _quarterly_totals(records, keys):
    """Sum VART per `keys` + (YEAR, QUARTER) and pivot the two time levels into sorted columns."""
    totals = records.groupby(list(keys) + ['YEAR', 'QUARTER'])['VART'].sum()
    # The last two index levels are YEAR and QUARTER; move them to the columns.
    return totals.unstack([-2, -1]).sort_index(axis=1)


# One wide table per aggregation level: rows are the listed keys, columns are (YEAR, QUARTER).
# NOTE(review): the suffixes presumably stand for Firm / Product / Country / Buyer - confirm.
F = _quarterly_totals(df, ['ID'])
FP = _quarterly_totals(df, ['ID', 'CN ID 4'])
FC = _quarterly_totals(df, ['ID', 'PYOD'])
FPC = _quarterly_totals(df, ['ID', 'CN ID 4', 'PYOD'])
FB = _quarterly_totals(df, ['ID', 'VAT'])

In [6]:
# # b = df_.cumsum(1)
# # c1 = b.sub(b.mask(df_).ffill(1).fillna(0)).astype(int)
# df_ = FP > 0
# b = df_.cumsum(1)
# #how long since current active period started
# c1 = b.sub(b.mask(df_).ffill(1).fillna(0)).astype(int)
# b = (~df_).cumsum(1)
# #how long since current INactive period started
# c0 = -b.sub(b.mask(~df_).ffill(1).fillna(0)).astype(int)


In [7]:
import numpy as np

def get_diff(table):
    """Return the four-column change in smoothed activity status for every row.

    A cell counts as active when its value is > 0 (NaN counts as inactive).
    Inactivity gaps of up to three columns that are followed by renewed
    activity are treated as active. A gap that reaches the last column without
    renewed activity and is at most three columns long is also kept as active,
    because nothing after it marks the closure as definitive.

    Parameters
    ----------
    table : pandas.DataFrame
        Wide numeric table. Assumes the columns are consecutive,
        chronologically sorted periods (the callers pass sorted
        (YEAR, QUARTER) columns) - TODO confirm no quarter is missing.

    Returns
    -------
    pandas.DataFrame
        Same shape as `table`. Each cell is the status (1 active, 0 inactive)
        minus the status four columns earlier: 1 = became active,
        -1 = became inactive, 0 = unchanged. The first four columns are NaN.
    """
    # Manual gap filling; the original author noted that the interpolate method gets stuck.
    # 1.0 where active, NaN elsewhere, so that ffill has gaps to fill.
    active = (table > 0).astype(int).replace(0, np.nan)
    # Carry activity forward over at most three columns. `axis` must be passed
    # by keyword: it is keyword-only in pandas >= 2.0.
    bridged = active.ffill(axis=1, limit=3)

    # 1.0 where still inactive after bridging, NaN elsewhere.
    inactive = (1 - bridged.fillna(0)).replace(0, np.nan)
    # Where inactivity follows, the up-to-three bridged columns right before it
    # are marked inactive again, i.e. the forward fill is undone there.
    inactive = inactive.bfill(axis=1, limit=3)

    status = 1 - inactive.fillna(0).astype(int)

    return status.diff(4, axis=1)

In [8]:
# Long-format margin tables, one per aggregation level. Each wide table goes through
# get_diff, both column levels are stacked into the index, and the resulting unnamed
# value column (0) is renamed to the margin name.
margin_long = {}
for margin_name, wide in [
    ('margin_FPC', FPC),
    ('margin_FB', FB),
    ('margin_F', F),
    ('margin_FP', FP),
    ('margin_FC', FC),
]:
    diff = get_diff(wide)
    stacked = diff.stack([-2, -1]).reset_index()
    margin_long[margin_name] = stacked.rename(columns={0: margin_name})  # long

diff_FPC = margin_long['margin_FPC']
diff_FB = margin_long['margin_FB']
diff_F = margin_long['margin_F']
diff_FP = margin_long['margin_FP']
diff_FC = margin_long['margin_FC']

In [9]:
# Wide table at the finest level: rows are (ID, CN ID 4, PYOD, VAT); the YEAR and
# QUARTER index levels are moved to the columns, which are then sorted.
bundle_keys = ['ID', 'CN ID 4', 'PYOD', 'VAT', 'YEAR', 'QUARTER']
indexed = df.set_index(bundle_keys)
diff_4Q = indexed.unstack([-2, -1]).sort_index(axis=1)

In [10]:
# Change against the value four columns earlier (the same quarter of the previous
# year if every quarter is present as a column); missing values count as 0.
# NOTE: this cell rebinds diff_4Q from the wide table to the long result, so it
# cannot be re-run without first re-running the cell that builds the wide table.
change_wide = diff_4Q.fillna(0).diff(4, axis=1)  # table

# Zero changes are replaced by NaN before stacking (presumably so that the stacked
# result omits them), then the value column is renamed.
diff_4Q = (
    change_wide
    .replace(0, np.nan)
    .stack([-2, -1])
    .reset_index()
    .rename(columns={'VART': 'VART_4Q_diff'})
)  # long

In [11]:
# Fraction of the long 4-quarter-difference table that is sampled before merging.
frac = .05
# Fixed seed so that the sample, and every table derived from it, is the same on
# each Restart & Run All. Without it the summary tables change on every run.
sample_seed = 0

# Attach the five margin tables and the yearly growth-rate table with left merges.
# No `on=` is given, so each merge joins on all columns the two frames share.
diff_4Q_margins = diff_4Q.sample(frac=frac, random_state=sample_seed)
for lookup in (diff_F, diff_FP, diff_FC, diff_FPC, diff_FB, grs):
    diff_4Q_margins = diff_4Q_margins.merge(lookup, how='left')

In [12]:
# Split the merged sample by the sign of the 4-quarter change.
# Rows whose change is zero or NaN end up in neither frame.
is_increase = diff_4Q_margins['VART_4Q_diff'] > 0
is_decrease = diff_4Q_margins['VART_4Q_diff'] < 0
growth = diff_4Q_margins.loc[is_increase]
shrink = diff_4Q_margins.loc[is_decrease]

In [13]:
# Import from the public IPython.display module: importing `display` from
# IPython.core.display has been deprecated since IPython 7.14.
from IPython.display import display, HTML
# Inject CSS so that elements with class "container" use the full page width.
display(HTML("<style>.container { width:100% !important; }</style>"))

In [14]:
# Lookup table that is later left-merged onto the summary tables.
# NOTE(review): presumably it maps each margin combination to a 'Description'
# (that column appears in the rendered output) - confirm against the file.
margins_desc_csv = './../../data/margins_desc.csv'
margins_dec = pd.read_csv(margins_desc_csv)

In [15]:
# Summarise the sampled 4-quarter changes by margin combination and growth category,
# first for the decreases (shrink), then for the increases (growth).
# The loop variable is deliberately not called `df`: that name holds the raw data
# loaded at the top of the notebook, and rebinding it here would make every earlier
# cell that is re-run afterwards operate on the sampled subset instead.
for part in [shrink, growth]:
    grouped = part.groupby(['margin_F', 'margin_FP','margin_FC','margin_FPC', 'margin_FB', 'Category'])
    s = grouped['VART_4Q_diff'].sum()
    # Average over the distinct years, express in units of 1e6, and scale the
    # sample total back up by 1/frac.
    s = s / part.YEAR.nunique() / 1e6 / frac
    # One column per Category; drop rows with a missing value in any category.
    s = s.unstack().dropna()
    # Order rows by their total across categories and round to the nearest ten.
    s = s.loc[s.sum(axis=1).sort_values().index].round(-1)
    # Rebuild the index with integer margin values and space-separated level names.
    names = [name.replace('_', ' ') for name in s.index.names]
    s.index = pd.MultiIndex.from_tuples([tuple(int(i) for i in ix) for ix in s.index.tolist()])
    s.index.names = names
    # Column labels are converted to plain strings before the index is reset;
    # the lookup table is then left-merged on the shared columns.
    s = s.rename(columns=str).reset_index().merge(margins_dec, how = 'left').drop_duplicates().dropna()
    s = s.set_index(['margin F','margin FP','margin FC','margin FPC','margin FB'])
    display(s.style.bar(vmin = -3e4, vmax = 3e4, align='mid', color=['#d65f5f', '#5fba7d'], subset = ['Shrinkage','Fluctuation','Expansion']))

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Shrinkage,Fluctuation,Expansion,Description
margin F,margin FP,margin FC,margin FPC,margin FB,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,0,0,0,0,-9860,-28530,-2890,Continuing
0,0,0,0,-1,-2520,-5840,-1160,Exit of buyer
0,0,-1,-1,-1,-3460,-2010,-590,Exit from a country
0,0,0,-1,0,-1220,-2300,-390,Exit of country-product
0,-1,0,-1,0,-1060,-730,-300,Exit from product staying in country
0,-1,-1,-1,-1,-1530,-260,-180,Exit of product and country
-1,-1,-1,-1,-1,-660,-40,-30,Exit of firm
0,0,0,-1,-1,-410,-200,-70,Lost only buyer of the country-product
0,-1,0,-1,-1,-210,-110,-40,Exit of only buyer of the product


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Shrinkage,Fluctuation,Expansion,Description
margin F,margin FP,margin FC,margin FPC,margin FB,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,1,0,1,1,40,180,640,New product and buyer in continuing country
0,0,0,1,1,130,370,550,New country-product with new buyer
1,1,1,1,1,50,250,1820,New firm
0,1,0,1,0,430,1080,1460,New product in continuing buyer
0,1,1,1,1,180,370,3210,New product to a new country
0,0,0,1,0,620,3110,1150,New country-product with old buyer
0,0,1,1,1,750,6180,3600,New country
0,0,0,0,1,1230,8130,2340,New buyer
0,0,0,0,0,3270,32300,9730,Continuing


In [16]:
# Vast majority are continuing. It is easier to have the country changing, rather than the product.

In [17]:
# The gaussian / fat tail characteristic of them should be merged here!!! 

In [18]:
# for i in [0, 1]:   
#     df_level_g = df_level_g_list[i].sample(frac = .05)
#     df_level_g['annual_log_bin'] = pd.cut(df_level_g['annual_log'], np.arange(4, 10, .5))
    
#     # separate it into 2:
#     df_level_g_gauss = df_level_g.loc[abs(df_level_g.annual_logdiff - 0.015) < .075]