# Extract from database

This notebook contains all queries to the source database.

In [4]:
#Import modules
import pandas as pd

import dask.dataframe as dd
from dask.diagnostics import ProgressBar

from functions import chunk, agg, finalize
tunique = dd.Aggregation('tunique', chunk, agg,finalize)
first = dd.Aggregation('first', chunk, agg,finalize)

# %load_ext autoreload

General settings

In [5]:
drive_path = './../../../../../media/miglesia/Elements/export_france/data/type1/DP1610_MAASTRICHT1_1997_2013/'
# save_path = './../../../../../media/miglesia/Elements/export_france/data/processed/'
save_path = './'

colnames = [u'YEAR', u'MONTH', u'FLUX', u'ID', u'DEPT', u'CN ID 8', u'CPA6',
       u'PYOD', u'PAYP', u'VAT', u'PRIFAC', u'DEVFAC', u'VFTE', u'VART', u'D_MASSE', u'MASSE', u'USUP', u'USUP_MT']
colname_no = dict(zip(colnames, range(18)))

def get_data(columns, drive_path, start_year = 1997, end_year = 2014):
    df_list = []
    for y in range(start_year, end_year):
        df_list += [dd.read_table(drive_path+'DP1610_MAASTRICHT1_'+str(y)+'.txt', 
                usecols = map(colname_no.get, columns),
                delimiter = ';', header = None, dtype = {9: 'object'})]
    data = dd.concat(df_list)
    data.columns = columns
    return data

## Build datasets
### - Price and quantities

In [58]:
columns = [u'YEAR', u'MONTH', u'FLUX', u'ID', u'CN ID 8', u'PYOD', u'VART', u'MASSE', u'USUP', u'USUP_MT']

data_ = get_data(columns, drive_path, end_year = 1999)

grouped = data_.loc[data_.FLUX == 2].groupby(['ID', 'CN ID 8', 'MONTH', 'YEAR'])

with ProgressBar():
    yearly_qv = grouped[['VART', 'MASSE']].sum().compute()
yearly_qv.to_csv(save_path + 'units_qv.csv')

# with ProgressBar():
#     yearly_details = data_.loc[data_.FLUX == 2].head(1000).groupby(['ID', 'CN ID 8', 'YEAR']).agg(
#         {'VART': sum, 'MASSE': sum, 'USUP': tunique, 'USUP': first, 'USUP_MT': sum}).compute()
# yearly_details.to_csv(save_path + 'units_detail.csv')

[########################################] | 100% Completed |  1min  7.2s


In [62]:
pd.read_csv(save_path + 'units_qv.csv').head()

Unnamed: 0,ID,CN ID 8,MONTH,YEAR,VART,MASSE
0,0,1029061,1,1997,7621,5400
1,0,1029079,1,1997,762,600
2,0,7051105,1,1997,1185,454
3,0,7052900,1,1997,2398,2828
4,0,7093000,1,1997,919,502


### - Firm sizes

In [29]:
columns = [u'YEAR', u'MONTH', u'FLUX', u'ID', u'VAT', u'VART']
data = get_data(columns, drive_path, end_year = 1999)

data['IMPORT'] = data['FLUX'] % 2

firm_sizes = data.groupby(['ID', 'IMPORT','YEAR'])[['VART']].sum().reset_index()
buyr_sizes = data.groupby(['VAT', 'IMPORT','YEAR'])[['VART']].sum().reset_index()

with ProgressBar():
    firm_sizes = firm_sizes.compute()
    buyr_sizes = buyr_sizes.compute()
    

firm_sizes.to_csv(save_path + 'firm_sizes_99.csv', index = False)
buyr_sizes.to_csv(save_path + 'buyr_sizes_99.csv', index = False)

[########################################] | 100% Completed | 52.5s
[########################################] | 100% Completed |  1min  9.3s


In [63]:
pd.read_csv(save_path + 'buyr_sizes_99.csv').head()

Unnamed: 0,VAT,IMPORT,YEAR,VART
0,AT0000769,0,1997,81678
1,AT0001269,0,1997,48169
2,AT0001271,0,1997,39296
3,AT0001620,0,1997,184006
4,AT0002530,0,1997,180568


### - Value of buyer-seller links

In [60]:
columns = [u'YEAR', u'FLUX', u'ID', u'VAT', u'VART']

data = get_data(columns, drive_path, end_year = 1999)

data['IMPORT'] = data['FLUX'] % 2

links = data.groupby(['IMPORT','YEAR','ID','VAT'])['VART'].sum().reset_index()

with ProgressBar():
    out = links.compute()
out.to_csv(save_path + 'buyer_seller_link_value.csv', index = False)

In [64]:
pd.read_csv(save_path + 'buyer_seller_link_value.csv').head()

Unnamed: 0,IMPORT,YEAR,ID,VAT,VART
0,0,1997,215,IT0018705,569748
1,0,1997,330,IT0224285,14459
2,0,1997,413,AT0026337,69955
3,0,1997,413,BE1043987,41595
4,0,1997,413,DE0161168,294


### - Sourcing info

In [None]:
columns = [u'YEAR', u'MONTH', u'FLUX', u'ID', u'CN ID 8', 'PYOD', u'VART']

data = get_data(columns, drive_path, end_year = 1999)

data['IMPORT'] = data['FLUX'] % 2
data['QUARTER'] = ((data['MONTH'] -1)// 3) + 1

CN_full = pd.read_csv('./../data/CN_full.csv', encoding = 'utf-8')
data = data.merge(CN_full[['CN ID 8', 'CN ID 4', 'CN label 4']])#.persist()

# Compute and save
# sourcing_strategies = data.loc[data.IMPORT == 1].groupby(['YEAR', 'ID', 'CN ID 4', 'PYOD'])[['VART']].sum() #rm QUARTER for yearly dataset
# with ProgressBar():
#     out = sourcing_strategies.compute()
# out.to_csv(save_path + 'sourcing_strategies_99.csv')

# export_bundles = data.loc[data.IMPORT == 0].groupby(['YEAR', 'ID', 'CN ID 4', 'PYOD'])[['VART']].sum()
# with ProgressBar():
#     out2 = export_bundles.compute()
# out2.to_csv(save_path + 'export_bundles_99.csv')

# Compute and save
sourcing_strategies_qr = data.loc[data.IMPORT == 1].groupby(['YEAR', 'QUARTER','ID', 'CN ID 4', 'PYOD'])[['VART']].sum() #rm QUARTER for yearly dataset
with ProgressBar():
    out = sourcing_strategies_qr.compute()
out.to_csv(save_path + 'sourcing_strategies_99_qr.csv')

export_bundles_qr = data.loc[data.IMPORT == 0].groupby(['YEAR', 'QUARTER', 'ID', 'CN ID 4', 'PYOD'])[['VART']].sum()
with ProgressBar():
    out2 = export_bundles_qr.compute()
out2.to_csv(save_path + 'export_bundles_99_qr.csv')

In [79]:
pd.read_csv(save_path + 'export_bundles_99.csv').head()

Unnamed: 0,YEAR,ID,CN ID 4,PYOD,VART
0,1997,0,705,CH,71651
1,1997,0,709,CH,45022
2,1997,0,1806,JP,25972
3,1997,0,2204,BS,2389
4,1997,0,2204,CA,1512


### - Bernard's margins

In [1]:
#It's failing for some reason

# # columns = [u'YEAR', u'FLUX', u'ID', u'VAT', 'CN ID 8', 'PYOD', u'VART']
# columns = [u'YEAR', u'FLUX', u'ID', u'CN ID 8', 'PYOD', u'VART',  u'VAT']

# data = get_data(columns, drive_path, end_year = 1999)

# data['IMPORT'] = data['FLUX'] % 2

# # CN_full = pd.read_csv('./../data/CN_full.csv', encoding = 'utf-8')
# # data = data.merge(CN_full[['CN ID 8', 'CN ID 4']])#.persist()

# # margins_info = data.groupby(['IMPORT','YEAR','ID']).agg({'VAT': tunique, 'PYOD': tunique, 'CN ID 4': tunique, 'VART': sum})

# with ProgressBar():
#     out = data.compute()
# # out.to_csv(save_path + 'bernards_margins.csv')

In [6]:
data.head()#.compute()

Unnamed: 0,YEAR,FLUX,ID,CN ID 8,PYOD,VART,VAT,IMPORT,CN ID 4
0,1997,1,0,50050090,CH,,1398,1,5005
1,1997,1,955502745,50050090,IT,,1734,1,5005
2,1997,2,665750287,50050090,US,,1886,0,5005
3,1997,3,315093195,50050090,DE,,11960,1,5005
4,1997,3,317779197,50050090,ES,,3294,1,5005


### - Krammar's determinants of diversification

In [None]:
columns = [u'YEAR', u'FLUX', u'ID', u'CN ID 8', u'VAT', u'PYOD', u'VART']

data = get_data(columns, drive_path, end_year = 1999)
data['IMPORT'] = data['FLUX'] % 2

grouped = data.groupby(['ID', 'YEAR', 'IMPORT'])

with ProgressBar():
    df = grouped.agg({'VART': 'sum', u'VAT': tunique, 'CN ID 8': tunique, u'PYOD': tunique}).compute()

df.to_csv(save_path + 'dets_of_diversification.csv')

[#################                       ] | 43% Completed | 16min 35.7s

In [None]:
pd.read_csv(save_path + 'dets_of_diversification.csv').head()

### - Degree distribution

In [61]:
# window = 3
# assortativity_res = []
ID_degree_res = []
VAT_degree_res = []

for window in [1]:
    gap = (window-1)/2
    center_years = arange(1997, 2000, 2)
    print window

    for Yc in center_years:
        print Yc
        data_sec = data.loc[data.YEAR - Yc <= gap]
#         data_sec.groupby(['ID', 'VAT']).agg({'VART': sum })

        data_sec_by_ID = data_sec.groupby(['ID']).agg({'VAT': tunique, 'VART': sum})

        ID_degree = data_sec_by_ID[['VAT']].reset_index()
        ID_degree.columns = [u'ID', u'ID_degree']
        ID_degree['center_year'] = Yc
        ID_degree['window'] = window
        
        with ProgressBar():
            ID_deg = ID_degree.compute()
            ID_deg['bin'] = pd.cut(log10(ID_deg['ID_degree']), bins = arange(-.49, 5.99, .25))
            ID_deg.to_csv(save_path + 'ID_deg_'+str(Yc)+'_'+str(window)+'.csv', index = False)
#         ID_degree_res += [ID_degree]     

#         ID_deg = pd.read_csv()
        sampling = ID_deg.groupby(['bin'], observed = True).apply(lambda x: x.sample(200, replace = True))

        data_sec_sample = data_sec.loc[data_sec.ID.isin(sampling['ID'].values)]
        data_sec_by_VAT = data_sec_sample.groupby(['VAT']).agg({'ID': tunique, 'VART': sum})

        VAT_degree = data_sec_by_VAT[['ID']].reset_index()
        VAT_degree.columns = [u'VAT', u'VAT_degree']
        VAT_degree['center_year'] = Yc
        VAT_degree['window'] = window
        VAT_degree_res += [VAT_degree]
        with ProgressBar():
            VAT_deg = VAT_degree.compute()
            VAT_deg.to_csv(save_path + 'VAT_deg_'+str(Yc)+'_'+str(window)+'.csv', index = False)

1
1997
[########################################] | 100% Completed |  7min  3.7s
[########################################] | 100% Completed |  4min 30.6s
1999
[########################################] | 100% Completed | 14min  4.1s
[########################################] | 100% Completed |  8min  6.8s


In [66]:
pd.read_csv(save_path + 'ID_deg_'+str(Yc)+'_'+str(window)).head()

Unnamed: 0.1,Unnamed: 0,ID,ID_degree,center_year,window,bin
0,0,0,1,1999,1,"(-0.24, 0.01]"
1,1,18,1,1999,1,"(-0.24, 0.01]"
2,2,157,1,1999,1,"(-0.24, 0.01]"
3,3,215,5,1999,1,"(0.51, 0.76]"
4,4,223,1,1999,1,"(-0.24, 0.01]"


In [67]:
# fig, ax = plt.subplots(1)
# df_degrees.groupby('VAT_degree_bin')['ID_degree','VAT_degree'].quantile(.25).plot(x = 'VAT_degree', y = 'ID_degree', marker = '', ax = ax)
# df_degrees.groupby('VAT_degree_bin')['ID_degree','VAT_degree'].quantile(.5).plot(x = 'VAT_degree', y = 'ID_degree', marker = '', ax = ax)
# df_degrees.groupby('VAT_degree_bin')['ID_degree','VAT_degree'].quantile(.75).plot(x = 'VAT_degree', y = 'ID_degree', marker = '', ax = ax)

# # df_degrees.groupby('ID_nunique_bin')['VAT_nunique','ID_nunique'].mean().plot(x = 'ID_nunique', y = 'VAT_nunique', marker = 'o', ax = ax)
# df_degrees.groupby('ID_nunique')['VAT_nunique'].median().plot(x = 'index', y = 'VAT_nunique', marker = '.', linewidth = 0, ax = ax)
# ax.set_xscale('log')
# ax.set_yscale('log')

NameError: name 'df_degrees' is not defined

### - Buyers and sellers by foreign country

In [None]:
columns = [u'YEAR', u'FLUX', u'ID', u'PYOD', u'VART']
data = get_data(columns, drive_path, end_year = 1999)
data['IMPORT'] = data['FLUX'] % 2

data_by_dest = data.groupby(['IMPORT','YEAR','ID','PYOD'])['VART'].sum().reset_index()

result = data_by_dest.groupby(['PYOD', 'YEAR']).agg({'ID': tunique, 'VART': 'sum'})

with ProgressBar():
    out = result.compute()
    
out.to_csv(save_path + 'destination.csv')

# Older stuff

In [45]:
links = pd.read_csv(save_path + 'buyer_seller_link_value.csv')
links['PERIOD'] = (links['YEAR'] - 1996) // 2

In [47]:
degrees = links.groupby(['PERIOD', 'ID'])[['VAT']].nunique().rename(columns = {'VAT': 'ID_degree'})

In [50]:
from numpy import log10, arange
degrees['log_ID_degree'] = log10(degrees['ID_degree'])
degrees['bin_ID_degree'] = pd.cut(degrees['log_ID_degree'], arange(-.25, 4.5, 0.25))

In [51]:
degree_dist = degrees.reset_index().groupby(['PERIOD', 'bin_ID_degree'])[['ID']].count()

In [53]:
import matplotlib.pyplot as plt
fig, axs = plt.subplots(1, 2, figsize =(15, 6))

ax = axs[0]
for t in links['PERIOD'].unique():
    log10(degree_dist.loc[t]).reset_index().plot(marker = 'o', linewidth = 0, ax = ax, mec = 'None')