In [1]:
import pandas as pd

import matplotlib.pyplot as plt
%matplotlib inline

import seaborn

# Relative path to the folder holding the processed CSV files read below.
data_path = './../../data/processed/'

import numpy as np

import warnings
warnings.filterwarnings('ignore')

import json

# Load the contents of variables.json into `vardict`; the context manager
# guarantees the file handle is closed once parsing finishes.
with open("variables.json") as variables_file:
    vardict = json.load(variables_file)
def add_entry(vardict, name, element):
    """Store ``element`` under ``name`` in ``vardict`` and hand the container back.

    The container is modified in place; the returned object is the very same
    ``vardict`` that was passed in, which allows calls to be chained.
    """
    vardict[name] = element
    return vardict

## Data

In [2]:
col = 'ID'
df = pd.read_csv(data_path + col.replace(' ', '_') + '_YM.csv')
# Assemble the first-of-month timestamp from the YEAR and MONTH columns.
# Building it from calendar components avoids multiplying a month count by an
# average month length (the ambiguous 'M' time unit), which drifts away from
# the first day of the month and can place February inside January.
df['date'] = pd.to_datetime(
    df[['YEAR', 'MONTH']]
    .rename(columns={'YEAR': 'year', 'MONTH': 'month'})
    .assign(day=1)
)


In [3]:
# Preview the first rows, including the derived `date` column.
df.head()

Unnamed: 0,ID,IMPORT,YEAR,MONTH,VART,date
0,0,0,1997,1,2003887,1997-01-01
1,0,1,1997,1,3212282,1997-01-01
2,215,0,1997,1,53758,1997-01-01
3,223,1,1997,1,29123,1997-01-01
4,330,0,1997,1,2870,1997-01-01


## Growth rates
### Levels and self-dependence

In [None]:
# Autocorrelation of log10 growth rates at several aggregation frequencies.
# For each flow (IMPORT == 0, then IMPORT == 1) a firm x month table of VART is
# built, aggregated to each frequency, turned into log10 differences between
# consecutive periods, and correlated with itself at lags 0 .. n_periods - 1
# (n_periods spans four years at every frequency). One list of correlations per
# frequency is written to ./autocorr_results_<i>.txt, one line per frequency.
for i in range(2):

    # Rows: firm ID, columns: date, values: VART.
    cuadro = df.loc[df.IMPORT == i].set_index(['ID', 'date'])[['VART']].unstack()['VART']

    # Keep firms observed in more than six months.
    cuadro = cuadro.loc[cuadro.count(axis=1) > 6]

    # Relabel the columns with a regular month-end index starting in January of
    # the first year. The relabelling is positional, so it assumes that every
    # month is present as a column.
    start_yr = str(df.YEAR.min())
    cuadro.columns = pd.date_range(start='1/1/'+start_yr, periods=cuadro.shape[1], freq='M')
    cuadro_rolled = cuadro

    result = []

    for k, freq in enumerate(['Y', '6M', 'Q', 'M']):
        print(freq)

        if freq != 'M':
            # Sum the monthly columns into periods of length `freq`.
            levels = cuadro_rolled.T.groupby(pd.Grouper(level=0, freq=freq)).sum().T
            # Firms present more than one period.
            # NOTE(review): sum() presumably fills empty periods with 0 rather
            # than NaN, in which case this filter keeps every row - confirm.
            levels = levels.loc[levels.count(axis=1) > 1]
        else:
            levels = cuadro_rolled

        # log10 growth between consecutive periods; log10(0) = -inf is treated
        # as a missing observation.
        annual_logdiff = np.log10(levels).replace(-np.inf, np.nan).diff(axis=1)
        # Not used further in this cell; kept so the name stays defined.
        annual_log = np.log10(levels.reindex(annual_logdiff.index))
        annual_logdiff = annual_logdiff.dropna(how='all').dropna(axis=1, how='all')

        growth = annual_logdiff.values

        # Lag 0 is the correlation of the series with itself.
        corrs = [1]
        n_periods = int((12*4)/[12, 6, 3, 1][k])
        for d in range(1, n_periods):
            earlier = growth[:, :-d]
            later = growth[:, d:]

            # Keep only (firm, period) cells observed at both t and t + d.
            # Selecting them with an explicit mask keeps x and y aligned
            # without depending on stack() silently dropping NaNs.
            valid = ~(np.isnan(earlier) | np.isnan(later))
            x = earlier[valid]
            y = later[valid]

            # Cast to a plain float so the text file holds bare numbers that
            # ast.literal_eval can parse, whatever repr NumPy scalars use.
            corrs += [float(np.corrcoef(x, y)[0, 1])]

        result += [corrs]

    with open('./autocorr_results_'+str(i)+'.txt', 'w') as f:
        for item in result:
            f.write("%s\n" % item)

Y
6M
Q
M


In [None]:
import ast

In [None]:
def load_autocorr_frames(flow):
    """Read ./autocorr_results_<flow>.txt into one DataFrame per frequency.

    Each line of the file is parsed as a Python list literal. The k-th line is
    paired with the k-th frequency of ['Y', '6M', 'Q', 'M'] and indexed by a
    PeriodIndex of that frequency starting at year 1, so the index encodes the
    lag rather than a real calendar date.

    Returns a list of four single-column ('autocorr') DataFrames.
    """
    # The context manager closes the file once all lines are parsed.
    with open('./autocorr_results_'+str(flow)+'.txt') as f:
        result = [ast.literal_eval(line.rstrip('\n')) for line in f]

    frames = []
    for k, freq in enumerate(['Y', '6M', 'Q', 'M']):
        n_periods = int((12*4)/[12, 6, 3, 1][k])
        ix = pd.period_range(start='0001-01-01', freq=freq, periods=n_periods)
        frames += [pd.DataFrame(result[k], index=ix, columns=['autocorr'])]
    return frames


# Figure 1: autocorrelation at every aggregation frequency, one panel per flow.
fig, axs = plt.subplots(1, 2, figsize = (15, 5))

for i in range(2):
    ax = axs[i]
    res_dfs = load_autocorr_frames(i)

    res_dfs[0].plot(ax = ax, marker = 'o')

    # Half-yearly lags: every second row is drawn as its own line.
    for offset in range(2):
        res_dfs[1].iloc[offset::2].asfreq('M', how = 'start').plot(ax = ax, marker = '|')

    # Quarterly lags: every fourth row is drawn as its own line.
    for offset in range(4):
        res_dfs[2].iloc[offset::4].asfreq('M', how = 'start').plot(ax = ax, marker = 's')

    res_dfs[3].plot(ax = ax, marker = '.')
    ax.set_title(['Exports', 'Imports'][i])

plt.show()

# Figure 2: monthly autocorrelation only, on a tighter vertical scale.
fig, axs = plt.subplots(1, 2, figsize = (15, 5))
for i in range(2):
    ax = axs[i]
    res_dfs = load_autocorr_frames(i)

    res_dfs[3].plot(ax = ax, marker = '.', color = '.3')
    ax.set_ylim(-.5, .2)
    ax.axhline(0, c = '.5', lw = .5)
    ax.set_title(['Exports', 'Imports'][i])

plt.show()

### Autocorrelation
Particular cases: autocorrelation after one year and after one month.

In [None]:
# Collect lag-one growth pairs for the scatter plots.
# result_corrs receives one [x, y] entry per (flow, frequency) in the order
# IMPORT == 0 (Y, M), IMPORT == 1 (Y, M); x holds log10 growth in one period
# and y the log10 growth of the same firm in the following period.
result_corrs = []
for i in range(2):
    # Rows: firm ID, columns: date, values: VART.
    cuadro = df.loc[df.IMPORT == i].set_index(['ID', 'date'])[['VART']].unstack()['VART']

    # Keep firms observed in more than six months.
    cuadro = cuadro.loc[cuadro.count(axis=1) > 6]

    # Relabel the columns with a regular month-end index starting in January of
    # the first year. The relabelling is positional, so it assumes that every
    # month is present as a column.
    start_yr = str(df.YEAR.min())
    cuadro.columns = pd.date_range(start='1/1/'+start_yr, periods=cuadro.shape[1], freq='M')
    cuadro_rolled = cuadro

    for freq in ['Y', 'M']:
        print(freq)

        if freq != 'M':
            # NOTE(review): tail(10000) keeps only the last 10000 firms for the
            # yearly case while the monthly case uses all of them - confirm
            # this asymmetry is intended.
            levels = cuadro_rolled.T.groupby(pd.Grouper(level=0, freq=freq)).sum().T.tail(10000)
            # Firms present more than one period
            levels = levels.loc[levels.count(axis=1) > 1]
        else:
            levels = cuadro_rolled

        # log10 growth between consecutive periods; log10(0) = -inf is treated
        # as a missing observation.
        annual_logdiff = np.log10(levels).replace(-np.inf, np.nan).diff(axis=1)
        # Not used further in this cell; kept so the name stays defined.
        annual_log = np.log10(levels.reindex(annual_logdiff.index))
        annual_logdiff = annual_logdiff.dropna(how='all').dropna(axis=1, how='all')

        d = 1
        growth = annual_logdiff.values
        earlier = growth[:, :-d]
        later = growth[:, d:]

        # Keep only (firm, period) cells observed at both t and t + 1. The
        # explicit mask keeps x and y aligned element by element without
        # depending on stack() silently dropping NaNs.
        valid = ~(np.isnan(earlier) | np.isnan(later))
        x = earlier[valid]
        y = later[valid]

        result_corrs += [[x, y]]

In [None]:
# result_corrs has 4 elements: exports (Y, M), then imports (Y, M).
# Each panel shows a random subsample of (growth at t, growth at t + 1) pairs
# and is annotated with the correlation computed on all pairs.

N_PLOT_POINTS = 30000  # number of points drawn (with replacement) per panel
sampler = np.random.RandomState(0)  # fixed seed so reruns draw the same points

fig, axs = plt.subplots(2, 2, figsize = (8, 8))

for i in range(2):
    for k, freq in enumerate(['Y', 'M']):
        ax = axs[k][i]
        xy = result_corrs[2*i + k]

        # One row per pair: column 0 is growth at t, column 1 growth at t + 1.
        A = np.array(xy).T

        # Sampling from an empty set of pairs would raise, so the scatter is
        # skipped in that case and only the annotations are drawn.
        if A.shape[0] > 0:
            plot_pts = A[sampler.randint(A.shape[0], size=N_PLOT_POINTS), :]
            ax.plot(plot_pts[:, 0], plot_pts[:, 1], marker = '.', lw = 0, alpha = .02, c = '.3')

        ax.set_xlim(-5, 5)
        ax.set_ylim(-5, 5)
        ax.annotate(['YEAR', 'MONTH'][k]+', '+['exports', 'imports'][i], xy=(0, 3.5), ha="center")
        ax.annotate('corr: '+str(np.corrcoef(A.T)[0][1].round(3)), xy=(0, -4.5), ha="center")
        ax.set_xlabel('Growth at t')
        ax.set_ylabel('Growth at t + 1')

plt.show()
