### Set up

In [1]:
#Import modules

import pandas as pd
# Widen pandas' display limits so wide/long frames are not truncated in cell output.
pd.options.display.max_columns = 99
pd.options.display.max_rows = 999
# `display` and `HTML` are imported from the public IPython.display module;
# importing them from IPython.core.display is deprecated.
from IPython.display import display, HTML
# Inject a CSS rule that makes elements with class "container" use the full page width.
display(HTML("<style>.container { width:100% !important; }</style>"))

# from numpy import arange, log10

import matplotlib.pyplot as plt
%matplotlib inline

### Load data directly

In [None]:
# Raw inputs, read relative to the notebook's working directory.
# NOTE(review): these two paths climb two levels ('./../../data/') while every
# other read/write in this notebook uses './../data/' - confirm which root is right.
CN_full = pd.read_csv(
    './../../data/CN_full.csv',
    encoding='utf-8',
)
# reset_index() moves the index produced by read_csv into a regular column.
out = (
    pd.read_csv('./../../data/sourcing_strategies.csv')
    .reset_index()
)
# out = pd.read_csv('./../data/export_bundles.csv')

# sources_count = out.groupby('ID')[['PYOD']].nunique()
# multisourcing_firms = sources_count[sources_count['PYOD'] > 1].index
# df = out.loc[out.ID.isin(multisourcing_firms)]

In [None]:
# Large countries: the 10 PYOD values with the highest summed VART,
# listed in ascending order of that sum.
(
    out.groupby('PYOD')['VART']
    .sum()
    .sort_values()
    .tail(10)
    .index
)

In [None]:
# Main products: the 30 'CN ID 4' codes with the highest summed VART,
# listed in ascending order of that sum.
(
    out.groupby(['CN ID 4'])['VART']
    .sum()
    .sort_values()
    .tail(30)
    .index
)

In [None]:
from scipy.special import erfinv, erf
import numpy as np
# import matplotlib.mlab as mlab
# import math

# x = np.linspace(-3.5, 3.5, 100000)
# 1/mlab.normpdf(x, mu, sigma).max()
# np.sqrt(2*np.pi)

def erfinv_(x):
    """Apply the inverse error function to a share recentred around 0.5.

    ``x`` is shifted by -0.5 and doubled before being passed to
    ``scipy.special.erfinv``, so 0.5 maps to 0, 1 maps to +inf and
    0 maps to -inf.
    """
    centred = x - .5
    return erfinv(2*centred)

In [None]:
# Firm sizes: summed VART per firm (ID) and YEAR over the rows flagged
# IMPORT == 1, plus its base-10 logarithm.
firm_sizes = pd.read_csv('./../data/firm_sizes.csv')
fs = (
    firm_sizes.loc[firm_sizes['IMPORT'].eq(1)]
    .groupby(['ID', 'YEAR'])['VART']
    .sum()
    .reset_index()
)
fs = fs.assign(log_M_size=np.log10(fs['VART']))


In [None]:
# Observed mu coefficients.
# VART summed per firm (ID) x product (CN ID 4) x country (PYOD) x YEAR.
fpcy = (
    out.groupby(['ID', 'CN ID 4', 'PYOD', 'YEAR'])[['VART']]
    .sum()
    .reset_index()
)

# Denominators: firm-product-year totals and firm-year totals.
totals_fpy = (
    fpcy.groupby(['ID', 'CN ID 4', 'YEAR'])[['VART']]
    .sum()
    .reset_index()
    .rename(columns={'VART': 'VART_sum_fpy'})
)
totals_fy = (
    fpcy.groupby(['ID', 'YEAR'])[['VART']]
    .sum()
    .reset_index()
    .rename(columns={'VART': 'VART_sum_fy'})
)

mu_coeffs = fpcy.merge(totals_fpy, on=['YEAR', 'ID', 'CN ID 4'])
mu_coeffs = mu_coeffs.merge(totals_fy, on=['YEAR', 'ID'])

# Share of each country within the firm-product-year total.
mu_coeffs['frac'] = mu_coeffs['VART'].div(mu_coeffs['VART_sum_fpy'])
# erfinv_ sends a share of 1 to +inf (and 0 to -inf); clipping bounds those at +/-3.
mu_coeffs['mu_fpcy'] = erfinv_(mu_coeffs['frac'])
mu_coeffs['mu_fpcy'] = mu_coeffs['mu_fpcy'].clip(lower=-3, upper=3)

# Attach firm size; YEAR and ID are the only columns shared with this slice of fs.
mu_coeffs = mu_coeffs.merge(
    fs[['YEAR', 'ID', 'log_M_size']],
    how='left',
    on=['YEAR', 'ID'],
)


In [None]:
# Writing the processed file is opt-in: set the flag to True to (re)generate the
# CSV that the "Load mu coeffs and use" section reads back.
# An explicit flag is used as the guard (instead of a statement that raises
# NameError) so that Restart & Run All does not abort at this cell.
WRITE_MU_COEFFS = False
if WRITE_MU_COEFFS:
    mu_coeffs.to_csv('./../data/processed/mu_coeffs.csv', index = False)

In [None]:
# Peek at 20 rows (ordered by firm ID) with YEAR < 2008, 'CN ID 4' < 2000
# and a share strictly below 1.
(
    mu_coeffs
    .loc[
        (mu_coeffs['YEAR'] < 2008)
        & (mu_coeffs['CN ID 4'] < 2000)
        & (mu_coeffs['frac'] < 1)
    ]
    .sort_values(by='ID')
    .head(20)
)

In [None]:
# Bin log10 firm size using the integer edges 0, 1, ..., 13.
mu_coeffs['log_size_bin'] = pd.cut(mu_coeffs['log_M_size'], bins=range(14))
# Rows whose mu_fpcy lies strictly inside (-3, 3), i.e. not at the clip bounds.
mu_coeffs_finite = mu_coeffs.loc[
    mu_coeffs['mu_fpcy'].gt(-3) & mu_coeffs['mu_fpcy'].lt(3)
]

In [None]:
# 60% of rows, summing 86% of volume if we keep cases where there are at least 2 countries to compare their relative importance.
# len(mu_coeffs_finite)/float(len(mu_coeffs))
# mu_coeffs_finite['VART'].sum()/float(mu_coeffs['VART'].sum())

## Load mu coeffs and use

In [None]:
# Start from the processed file on disk; this rebinds mu_coeffs to the stored table.
mu_coeffs = pd.read_csv(
    filepath_or_buffer='./../data/processed/mu_coeffs.csv',
)

In [None]:
# import seaborn as sns
# cm = sns.light_palette("green", as_cmap=True)

# table = x.unstack()['mean'].sort_index(axis = 1).iloc[:, 100:120]
# s = table.style.background_gradient(cmap=cm)
# s

In [None]:
# Keep only rows with mu_fpcy strictly inside (-3, 3); values sitting on the
# +/-3 bounds (and NaN) are dropped.
# .copy() makes the result an independent frame, so the columns added to
# mu_coeffs further down do not raise pandas' SettingWithCopyWarning.
mu_coeffs = mu_coeffs.loc[(mu_coeffs.mu_fpcy > -3) & (mu_coeffs.mu_fpcy < 3)].copy()

# mu_coeffs['FC'] = mu_coeffs['ID'].astype(str) + mu_coeffs['PYOD']
# mu_coeffs['CPY'] = mu_coeffs['PYOD'] + mu_coeffs['CN ID 4'].astype(str) + mu_coeffs['YEAR'].astype(str)
# mu_coeffs['CY'] = mu_coeffs['PYOD'] + mu_coeffs['YEAR'].astype(str)

# mu_mean = mu_coeffs.groupby(['FC', 'CPY'])[['mu_fpcy']].mean().mean()
# Hard-coded constant; presumably the value the commented-out expression above
# produced on the full data - TODO confirm.
mu_mean = -0.681732

### Firm-country fixed effects can be obtained by demeaning

In [None]:
# Number of leading rows of mu_coeffs used below; set a smaller value
# (e.g. 10000) to prototype on a subset.
rows = len(mu_coeffs)

# Obtain CPY_tilde fixed effects.

# Weighted mean: each observation is weighted by the share of its firm-product-year
# total in the firm-year total, so that 'small' products do not make the average
# mu so volatile. E.g. 9.99 USD and 1 cent from two countries get their mu
# weighted by 10 USD / total imports of the firm in the year.
# assign() returns a new frame, so no column is set on a slice of another frame.
mu_coeffs = mu_coeffs.assign(weight=mu_coeffs['VART_sum_fpy']/mu_coeffs['VART_sum_fy'])
mu_coeffs = mu_coeffs.assign(mu_fpcy_w=mu_coeffs['mu_fpcy']*mu_coeffs['weight'])

# Numerator and denominator of the weighted average per firm-country pair.
# The aggregations are named by string: passing the builtin `sum` to agg is
# deprecated in recent pandas.
FC_mean = mu_coeffs.head(rows)[['ID', 'PYOD', 'mu_fpcy_w', 'weight']].groupby(['ID', 'PYOD']).agg(
    {'mu_fpcy_w': 'sum', 'weight': 'sum'})
# Summing 'weight' is like summing 1*weight, i.e. the denominator of the weighted avg.
FC_mean['mu_fpcy_w_avg'] = FC_mean['mu_fpcy_w']/FC_mean['weight']

FC_mean = FC_mean[['mu_fpcy_w_avg']].rename(columns = {'mu_fpcy_w_avg': 'fc_mean'}).reset_index()

# Demean every observation by its firm-country weighted mean.
mu_coeffs_1 = mu_coeffs.head(rows).merge(FC_mean[['ID', 'PYOD', 'fc_mean']], on = ['ID', 'PYOD'])
mu_coeffs_1['mu_pcycy'] = mu_coeffs_1['mu_fpcy'] - mu_coeffs_1['fc_mean']

# Average the demeaned values per country-product-year and add mu_mean back.
CPY_FE = mu_coeffs_1.groupby(['PYOD', 'CN ID 4', 'YEAR'])[['mu_pcycy']].mean() + mu_mean
CPY_FE = CPY_FE.reset_index()

In [None]:
# FC fixed effects: subtract the country-product-year mean of mu_fpcy from every
# observation, then average what is left per firm-country pair.
CPY_mean = (
    mu_coeffs.head(rows)
    .groupby(['CN ID 4', 'PYOD', 'YEAR'])[['mu_fpcy']]
    .mean()
    .rename(columns={'mu_fpcy': 'cpy_mean'})
    .reset_index()
)

mu_coeffs_2 = mu_coeffs.head(rows).merge(CPY_mean, on=['CN ID 4', 'PYOD', 'YEAR'])
mu_coeffs_2['mu_fc'] = mu_coeffs_2['mu_fpcy'].sub(mu_coeffs_2['cpy_mean'])

FC_FE = (
    mu_coeffs_2.groupby(['ID', 'PYOD'])[['mu_fc']]
    .mean()
    .reset_index()
)

In [None]:
# FE = mu_coeffs.head(rows).merge(FC_FE, on = ['ID', 'PYOD']).merge(CPY_FE, on = ['CN ID 4', 'PYOD', 'YEAR'])

In [None]:
#subtract mean residual, so that it is centered in zero and epsilons can be observed.
# FE['x'] = FE.mu_fc + FE.mu_pcycy
# FE['residual'] = FE.mu_fpcy - FE.x

# meanres = FE.groupby(['FC', 'CPY'])[['residual']].mean().mean()## is it needed??? mu coeffs differ from 'true values' by a constant. But if we make it be zero, we can have a measure of the epsilon...

# FE.head(10000).plot('x', 'mu_fpcy', marker = 'o', linewidth = 0, alpha = 0.02)

In [None]:
# mu_coeffs.head(rows).groupby(['FC', 'CPY'])[['mu_fpcy']].mean().unstack()

In [None]:
# Separate CY from CPY: split each country-product-year effect into a
# country-year mean (mu_cy) and the remainder (mu_pcy).
CY_mean = CPY_FE.groupby(['PYOD', 'YEAR'])[['mu_pcycy']].mean()
CY_mean = CY_mean.reset_index().rename(columns = {'mu_pcycy': 'mu_cy'})
mu_coeffs_3 = CPY_FE.merge(CY_mean, on =['PYOD', 'YEAR'])
mu_coeffs_3['mu_pcy'] = mu_coeffs_3['mu_pcycy'] - mu_coeffs_3['mu_cy']
CP_FE = mu_coeffs_3.drop('mu_pcycy', axis = 1)

# Attach firm-country and country(-product)-year effects to every observation.
FE = mu_coeffs.head(rows).merge(FC_FE, on = ['ID', 'PYOD']).merge(CP_FE, on = ['CN ID 4', 'PYOD', 'YEAR'])

# Grand mean of the country-year effects (mean of the per country-year means).
# It is kept under its own name as a plain scalar: rebinding `mu_mean` to a
# pandas Series here would make the cell that computes CPY_FE produce all-NaN
# columns when re-run, because DataFrame + Series aligns on column labels.
grand_mean_cy = FE.groupby(['PYOD', 'YEAR'])['mu_cy'].mean().mean()
FE['mu_mean'] = grand_mean_cy
# Centre mu_cy so that the grand mean is carried by the separate mu_mean column.
FE['mu_cy'] = FE['mu_cy'] - FE['mu_mean']


In [None]:
# Fitted value: sum of the grand mean and the three effect columns.
FE['x'] = FE['mu_mean'] + FE['mu_fc'] + FE['mu_cy'] + FE['mu_pcy']

# Part of the observed coefficient that the fitted value leaves unexplained.
FE['residual'] = FE['mu_fpcy'].sub(FE['x'])

In [None]:
# Persist the decomposition (without the row index) for later use.
FE.to_csv(
    './../data/processed/mu_FE_decomposition.csv',
    index=False,
)

In [None]:
# FE.loc[FE['VART_sum'] > 100000].sample(20)

In [None]:
# Scatter of the fitted value (x) against the observed coefficient (mu_fpcy).
fig, ax = plt.subplots(1, figsize = (8, 6))
# Cap the sample at len(FE) so a small frame does not raise, and fix the seed so
# the figure is reproducible across runs.
n_points = min(10000, len(FE))
FE.sample(n_points, random_state = 0).plot('x', 'mu_fpcy', marker = 'o', linewidth = 0, alpha = 0.04, ax = ax)
# ax.plot(x = arange(-1, 2), y = arange(-1, 2))

# Identity (y = x) reference line. Joining the corners of the axes is only the
# identity when both axes share the same limits, so the line is built from one
# common range and the original view limits are restored afterwards.
xlim, ylim = ax.get_xlim(), ax.get_ylim()
lo, hi = min(xlim[0], ylim[0]), max(xlim[1], ylim[1])
diag_line, = ax.plot([lo, hi], [lo, hi], ls="--", c=".3")
ax.set(xlim = xlim, ylim = ylim);

# old

In [None]:
# A single product code; note that the `for pr in prs` loop further down rebinds `pr`.
pr = 4202
# The 30 'CN ID 4' codes with the highest summed VART, in ascending order of that sum.
prs = (
    out.groupby(['CN ID 4'])['VART']
    .sum()
    .sort_values()
    .tail(30)
    .index
)

def f(mu, *args):
    """Return ``erf(erfinv(data)[0] - mu) - row``.

    ``args[0]`` is ``data``, an indexable collection handed to ``erfinv`` (only
    element ``[0]`` of the result is used); ``args[1]`` is ``row``, the value
    subtracted at the end. The function is zero where the shifted error
    function equals ``row``.
    """
    data = args[0]
    row = args[1]
    anchor = erfinv(data)[0]
    return erf(anchor - mu) - row

# Product code -> label lookup, built once instead of on every iteration.
labels_by_product = CN_full.set_index('CN ID 4')['CN label 4']

for pr in prs:
    # The list indexer [[pr]] always yields a Series, even when the product has
    # a single label row (a scalar lookup would return a bare string there).
    print(labels_by_product.loc[[pr]].drop_duplicates().values)
    product_df = out.loc[out['CN ID 4'] == pr]
    # All rows of one randomly drawn firm ID among the rows for this product.
    sample = product_df.loc[product_df.ID.isin(product_df.ID.sample(1))]

    sample = sample.groupby(['YEAR', 'PYOD'])['VART'].sum().reset_index()
    # Share of each country within the year. transform() keeps the frame's own
    # index, whereas groupby().apply() returns a group-keyed index on recent
    # pandas and cannot be assigned back as a column.
    sample['pct'] = sample['VART'] / sample.groupby('YEAR')['VART'].transform('sum')
    sample = sample[['YEAR', 'PYOD', 'VART', 'pct']].reset_index(drop = True)

    # Average share per country across years (a missing year-country pair counts as 0),
    # sorted from largest to smallest.
    fraction_values = sample.set_index(['YEAR', 'PYOD'])[['pct']].unstack().fillna(0).mean()
    fv = fraction_values.sort_values(ascending = False)
    
    
#     res = fv.apply(lambda row: optimize.brentq(f, -1, 10, args=(fv - .5, row - .5)))

#     mu = 0; variance = 1/20.
#     sigma = math.sqrt(variance)
#     x = np.linspace(-3.5, 3.5, 1000)

#     fig, ax = plt.subplots(1, figsize = (10, 3))
#     try:

#         for i in range(8):

#                 lab = str(res.index.get_level_values(1)[i])+', '+str(100*fv.round(2)[i])
#                 mu = res[i]
#                 ax.plot(x,mlab.normpdf(x, mu, sigma), label = lab)
#         #         ax.plot(x, .5*(1 + erf(x - mu)), label = res.index.get_level_values(1)[i])
#                 if i == 0:
#                     sum_ = .5*(1 + erf(x - mu))
#                 else:
#                     sum_ += .5*(1 + erf(x - mu))

#         ax.plot(x, sum_, '--')

#     except:
#         pass
        
#     ax.set_ylim(0, 2)
#     ax.set_xlim(-2, 1)
    
#     plt.legend(loc = 'upper left')
#     plt.show()

### Show some examples of firm sourcing strategies in time

In [None]:
# df_.set_index('ID').loc[sample_ids]

In [None]:

# NOTE(review): `df` is not assigned by any active cell visible in this notebook;
# the only definition in view is commented out in the load-data cell (firms with
# more than one distinct PYOD). Confirm and restore it before running this cell.
sample_ids = df.sample(10)['ID'].values

#Select firms that import more than 10% of some product
# df_ = firm_prod.loc[(firm_prod['CN ID 4'] == 601) & (firm_prod['pct'] > 1)]
# sample_ids = df_.sample(10)['ID'].values

# Product code -> label lookup, built once instead of on every iteration.
product_labels = CN_full.set_index('CN ID 4')[['CN label 4']].drop_duplicates()

for id_ in sample_ids:
    df_ID = df.loc[df.ID == id_]

    # Share of each product in the firm's total VART, largest first.
    by_product = df_ID.groupby('CN ID 4')['VART'].sum().sort_values(ascending = False)
    by_product /= by_product.sum()
    # Keep products until 99% of the firm's value is covered, including the
    # product that crosses the threshold. Testing the share accumulated *before*
    # each product (instead of cumsum() < .99) means a firm dominated by a single
    # product still keeps that product rather than ending up with an empty table.
    share_before = by_product.cumsum() - by_product
    main_prods = by_product[share_before < .99].index
    df_ID_mp = df_ID.loc[df_ID['CN ID 4'].isin(main_prods)]

    # Rows: (YEAR, QUARTER); columns: (product, country) once transposed below.
    table = df_ID_mp.set_index([u'CN ID 4', u'PYOD', u'QUARTER', u'YEAR'])['VART'].unstack([-1, -2]).fillna(0)
    # Two helper columns holding 0 and the overall maximum of the table.
    table[('min','')] = 0
    table[('max','')] = table.max().max()

    display(product_labels.loc[df_ID_mp['CN ID 4'].unique()].sort_index())

    display(table.T.style.bar(color='#d65f5f'))

In [None]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
# Histogram (100 bins) of log10(pct); -inf values (log10 of zero) are turned
# into NaN and dropped together with any other NaN.
# NOTE(review): `firm_prod` is not defined in any cell visible here - confirm its source.
log_pct = np.log10(firm_prod['pct'])
finite_log_pct = log_pct.replace(-np.inf, np.nan).dropna()
plt.hist(finite_log_pct.values, 100)
plt.show()