# Extract from database

This notebook contains all queries to the source database.

In [2]:
#Import modules
import pandas as pd

import dask.dataframe as dd
from dask.diagnostics import ProgressBar

from functions import chunk, agg, finalize
tunique = dd.Aggregation('tunique', chunk, agg,finalize)
first = dd.Aggregation('first', chunk, agg,finalize)

# %load_ext autoreload
from IPython.display import display, HTML


General settings

In [3]:
drive_path = './../../../export_france/data/type1/DP1610_MAASTRICHT1_1997_2013/'
# save_path = './../../../../../media/miglesia/Elements/export_france/data/processed/'
save_path = './../../data/processed/'

colnames = [u'YEAR', u'MONTH', u'FLUX', u'ID', u'DEPT', u'CN ID 8', u'CPA6',
       u'PYOD', u'PAYP', u'VAT', u'PRIFAC', u'DEVFAC', u'VFTE', u'VART', u'D_MASSE', u'MASSE', u'USUP', u'USUP_MT']
colname_no = dict(zip(colnames, range(18)))

def get_data(columns, drive_path, start_year = 1997, end_year = 2014):
    df_list = []
    usecols = map(colname_no.get, columns)
    no_colname = {v: k for k, v in colname_no.items()}

    for y in range(start_year, end_year):
        df_list += [dd.read_table(drive_path+'DP1610_MAASTRICHT1_'+str(y)+'.txt', 
                usecols = usecols,
                delimiter = ';', header = None, dtype = {9: 'object'})]

    data = dd.concat(df_list)
    data.columns = [no_colname[k] for k in sorted(usecols)]
    
    data = data.loc[data.VART >= 1000] # This is to continue the effect of older thresholding after 2010

    # map LU and BE to XU
    if 'PYOD' in columns: data['PYOD'] = data['PYOD'].str.replace('BE', 'XU').str.replace('LU', 'XU')
    if 'PAYP' in columns: data['PAYP'] = data['PAYP'].str.replace('BE', 'XU').str.replace('LU', 'XU')
        
    data['IMPORT'] = data['FLUX'] % 2

    return data

In [4]:
# Product classification info

CN_full = pd.read_csv('./../../data/CN_full.csv', encoding = 'utf-8')

In [12]:
# print for latex

# df = get_data(colnames, drive_path, end_year = 1998)
# print(df.sample(frac = 0.000001).compute().to_latex())


## Build datasets
### - Price and quantities

In [13]:
columns = [u'YEAR', u'MONTH', u'FLUX', u'ID', u'CN ID 8', u'PYOD', u'VART', u'MASSE', u'USUP', u'USUP_MT']

data_ = get_data(columns, drive_path, end_year = 1999)

# grouped = data_.loc[data_.FLUX == 2].groupby(['ID', 'CN ID 8', 'MONTH', 'YEAR'])
grouped = data_.loc[data_.FLUX == 2].groupby(['ID', 'CN ID 8', 'YEAR'])

with ProgressBar():
    yearly_qv = grouped[['VART', 'MASSE']].sum().compute()
yearly_qv.to_csv(save_path + 'units_qv.csv')

[########################################] | 100% Completed |  1min 40.2s


In [15]:
# with ProgressBar():
#     yearly_details = data_.loc[data_.FLUX == 2].groupby(['ID', 'CN ID 8', 'YEAR']).agg(
#         {'VART': sum, 'MASSE': sum, 'USUP': tunique, 'USUP': first, 'USUP_MT': sum}).compute()
# yearly_details.to_csv(save_path + 'units_detail.csv')

[######################################  ] | 95% Completed | 14min  9.7s


ValueError: multiple levels only valid with MultiIndex

In [22]:
# data_.loc[data_.FLUX == 2].head(100).groupby(['ID', 'CN ID 8', 'YEAR'])[['MASSE', 'USUP', 'USUP_MT']].first()#.agg(
# #         {'VART': sum, 'MASSE': sum, 'USUP': tunique, 'USUP': first, 'USUP_MT': sum})

In [None]:
pd.read_csv(save_path + 'units_qv.csv').head()

Unnamed: 0,ID,CN ID 8,MONTH,YEAR,VART,MASSE
0,0,1029061,1,1997,7621,5400
1,0,1029079,1,1997,762,600
2,0,7051105,1,1997,1185,454
3,0,7052900,1,1997,2398,2828
4,0,7093000,1,1997,919,502


### - Location of transactions

In [25]:
columns = [u'YEAR', u'FLUX', u'ID', u'DEPT', 'CN ID 8', 'VART']
data = get_data(columns, drive_path, end_year = 1999)

data = data.merge(CN_full[['CN ID 8', 'CN ID 4', 'CN label 4']])#.persist()

data = data.groupby(['ID', 'CN ID 4', 'YEAR', 'DEPT', 'IMPORT'])[['VART']].sum().reset_index()

with ProgressBar():
    df = data.compute()

df.to_csv(save_path + '_transactions_location.csv', index = False)

[########################################] | 100% Completed |  1min 27.3s


## - Product nomenclature comparison

In [28]:
columns = [u'YEAR', u'FLUX', 'CN ID 8', 'CPA6', 'VART']

data = get_data(columns, drive_path, end_year = 1999)

data = data.merge(CN_full[['CN ID 8', 'CN ID 4', 'CN label 4']])#.persist()

data = data.groupby(['CN ID 8', 'CN ID 4', 'CPA6', 'IMPORT', 'YEAR'])[['VART']].sum().reset_index()

with ProgressBar():
    df = data.compute()

df.to_csv(save_path + 'product_compare.csv', index = False)

[######################                  ] | 55% Completed |  8min 21.2s


  args2 = [_get_recursive(dsk, k, cache) for k in args]


KeyboardInterrupt: 

## - Total sales by year, month, import

In [39]:
columns = [u'YEAR', 'MONTH', u'FLUX', u'VART']
data = get_data(columns, drive_path)

agg = data.groupby(['IMPORT','YEAR', 'MONTH'])[['VART']].sum().reset_index()
with ProgressBar():
    agg = agg.compute()
agg.to_csv(save_path +'_YM.csv', index = False)


PYOD
[########################################] | 100% Completed | 15min 11.5s


## - Agregate each of the variables by year, flux

In [29]:
for col in ['ID','CN ID 8', 'CPA6', 'VAT', 'DEPT', 'PYOD']:
    print(col)
    columns = [u'YEAR', u'FLUX', col, u'VART']
    data = get_data(columns, drive_path)

    agg = data.groupby([col, 'IMPORT','YEAR'])[['VART']].sum().reset_index()
    with ProgressBar():
        agg = agg.compute()
    agg.to_csv(save_path + col.replace(' ', '_')+'_Y.csv', index = False)
    
    if col == 'CN ID 8':
        data = data.merge(CN_full[['CN ID 8', 'CN ID 4', 'CN label 4']])#.persist()

        agg = data.groupby(['CN ID 4', 'IMPORT', 'YEAR'])[['VART']].sum().reset_index()

        with ProgressBar():
            agg = agg.compute()
        agg.to_csv(save_path + 'CN ID 4'.replace(' ', '_')+'_Y.csv', index = False)


ID
[#################                       ] | 42% Completed |  6min  8.4s


KeyboardInterrupt: 

## - Agregate each of the variables by YEAR, QUARTER, flux

In [5]:
for col in ['ID']:#,'CN ID 8', 'CPA6', 'VAT', 'DEPT', 'PYOD']:
    print(col)
    columns = [u'YEAR', 'MONTH', u'FLUX', col, u'VART']
    data = get_data(columns, drive_path)

    data['QUARTER'] = ((data['MONTH'] -1)// 3) + 1

    agg = data.groupby([col, 'IMPORT','YEAR', 'QUARTER'])[['VART']].sum().reset_index()
    with ProgressBar():
        agg = agg.compute()
    agg.to_csv(save_path + col.replace(' ', '_')+'_YQ.csv', index = False)
    
    if col == 'CN ID 8':
        data = data.merge(CN_full[['CN ID 8', 'CN ID 4', 'CN label 4']])#.persist()

        agg = data.groupby(['CN ID 4', 'IMPORT', 'YEAR', 'QUARTER'])[['VART']].sum().reset_index()

        with ProgressBar():
            agg = agg.compute()
        agg.to_csv(save_path + 'CN ID 4'.replace(' ', '_')+'_YQ.csv', index = False)


ID
[########################################] | 100% Completed | 15min 42.4s


In [6]:
save_path + col.replace(' ', '_')+'_QY.csv'

'./../../data/processed/ID_QY.csv'

## - Agregate each of the variables by year, month, flux

In [None]:

# for col in ['CN ID 8']:#, 'CPA6', 'ID', 'VAT', 'DEPT', 'PYOD']:

# #     col = 
    
#     columns = [u'YEAR', u'MONTH', u'FLUX', col, u'VART']
#     data = get_data(columns, drive_path)


# #     agg = data.groupby([col, 'IMPORT','YEAR', 'MONTH'])[['VART']].sum().reset_index()
# #     with ProgressBar():
# #         agg = agg.compute()
# #     agg.to_csv(save_path + col.replace(' ', '_')+'_YM.csv', index = False)
    
#     if col == 'CN ID 8':
        
#         data = data.merge(CN_full[['CN ID 8', 'CN ID 4', 'CN label 4']])#.persist()

#         agg = data.groupby(['CN ID 4', 'IMPORT', 'YEAR', 'MONTH'])[['VART']].sum().reset_index()

#         with ProgressBar():
#             agg = agg.compute()
#         filename = save_path + 'CN ID 4'.replace(' ', '_')+'_YM.csv'; print(filename)
#         agg.to_csv(filename, index = False)


## - Agregate each of the variables by month, year, flux

In [30]:
for col in ['CN ID 8', 'CPA6', 'ID', 'VAT', 'DEPT', 'PYOD']:

    columns = [u'YEAR', 'MONTH', u'FLUX', col, u'VART']
    data = get_data(columns, drive_path, end_year = 1999)

    agg = data.groupby([col, 'IMPORT','YEAR', 'MONTH'])[['VART']].sum().reset_index()
    with ProgressBar():
        agg = agg.compute()
    agg.to_csv(save_path + col.replace(' ', '_')+'_YM.csv', index = False)
    
    if col == 'CN ID 8':
        data = data.merge(CN_full[['CN ID 8', 'CN ID 4', 'CN label 4']])#.persist()

        agg = data.groupby(['CN ID 4', 'IMPORT', 'YEAR', 'MONTH'])[['VART']].sum().reset_index()

        with ProgressBar():
            agg = agg.compute()
        agg.to_csv(save_path + 'CN ID 4'.replace(' ', '_')+'_YM.csv', index = False)


[########################################] | 100% Completed |  1min 23.4s
[########################################] | 100% Completed |  1min 27.0s
[########################################] | 100% Completed |  1min 30.8s
[########################################] | 100% Completed |  1min 27.8s
[########################################] | 100% Completed |  1min 34.4s
[########################################] | 100% Completed |  1min 21.8s
[########################################] | 100% Completed |  1min 54.8s


## Number of unique products, sellers, buyers, destinations, monthly.

In [36]:
for col in ['CN ID 8', 'CPA6', 'ID', 'VAT', 'DEPT', 'PYOD']:

    columns = [u'YEAR', 'MONTH', u'FLUX', col, u'VART']
    data = get_data(columns, drive_path, end_year = 1999)

    agg = data.groupby(['IMPORT','YEAR', 'MONTH']).agg({col: tunique}).reset_index()
    with ProgressBar():
        agg = agg.compute()
    agg.to_csv(save_path + col.replace(' ', '_')+'_YM_nuq.csv', index = False)
    
    if col == 'CN ID 8':
        data = data.merge(CN_full[['CN ID 8', 'CN ID 4', 'CN label 4']])#.persist()

        agg = data.groupby(['IMPORT', 'YEAR', 'MONTH']).agg({'CN ID 4': tunique}).reset_index()

        with ProgressBar():
            agg = agg.compute()
        agg.to_csv(save_path + 'CN ID 4'.replace(' ', '_')+'_YM_nuq.csv', index = False)


[########################################] | 100% Completed |  1min 30.8s
[########################################] | 100% Completed |  1min 36.7s
[########################################] | 100% Completed |  1min 38.9s
[########################################] | 100% Completed |  1min 31.7s
[########################################] | 100% Completed |  1min 48.1s
[########################################] | 100% Completed |  1min 30.9s
[########################################] | 100% Completed |  1min 58.3s


## total yearly firm level sales by product, origin and location
Number of products, destinations, buyers and FR places per firm-year

In [7]:
# for col in ['CN ID 8', 'CPA6', 'PYOD', 'VAT', 'DEPT']:
for col in ['PYOD']:

    columns = ['ID', u'YEAR', u'FLUX', col, u'VART']
    data = get_data(columns, drive_path, start_year = 1997, end_year = 2014)
    
    agg = data.groupby(['ID','YEAR', 'IMPORT', col])[['VART']].sum().reset_index()
    with ProgressBar():
        agg = agg.compute()
    agg.to_csv(save_path + col.replace(' ', '_')+'_FY.csv', index = False)
    
    if col == 'CN ID 8':
        data = data.merge(CN_full[['CN ID 8', 'CN ID 4', 'CN label 4']])#.persist()

        agg = data.groupby(['ID','YEAR', 'IMPORT', 'CN ID 4'])[['VART']].sum().reset_index()

        with ProgressBar():
            agg = agg.compute()
        agg.to_csv(save_path + 'CN ID 4'.replace(' ', '_')+'_FY.csv', index = False)
        

[########################################] | 100% Completed | 20min 23.2s


In [6]:
# save_path + col.replace(' ', '_')+'_FY.csv'

'./../../data/processed/PYOD_FY.csv'

### - Firm sizes

In [9]:
# Not necessary, there is ID_Y and VAT_Y

# columns = [u'YEAR', u'MONTH', u'FLUX', u'ID', u'VAT', u'VART']
# data = get_data(columns, drive_path, end_year = 2014)

# firm_sizes = data.groupby(['ID', 'IMPORT','YEAR'])[['VART']].sum().reset_index()
# buyr_sizes = data.groupby(['VAT', 'IMPORT','YEAR'])[['VART']].sum().reset_index()

# with ProgressBar():
#     firm_sizes = firm_sizes.compute()
#     buyr_sizes = buyr_sizes.compute()
    

# firm_sizes.to_csv(save_path + 'firm_sizes.csv', index = False)
# buyr_sizes.to_csv(save_path + 'buyr_sizes.csv', index = False)

[########################################] | 100% Completed | 11min 12.0s
[########################################] | 100% Completed | 16min 56.9s


## Dataset total (pool all years)

In [3]:
# Promedio anual de un desagregado por producto, id, vat, etc. No se si es necesario..


# columns = [u'FLUX', 'YEAR','CN ID 8', 'CPA6', 'PYOD', 'VAT', u'ID', u'VART']
# data = get_data(columns, drive_path, start_year = 2009)

# data['VAT'] = data['VAT'].fillna(data['PYOD']) # out of EU replace VAT with country.

# data = data.merge(CN_full[['CN ID 8', 'CN ID 4']])#.persist()
        
# # data = data.groupby(['ID', 'IMPORT','CN ID 8', 'CN ID 4', 'CPA6', 'PYOD', 'VAT'])[['VART']].sum().reset_index()

# # Records within each year are summed
# yearly = data.groupby(['ID', 'IMPORT', 'CN ID 4','CPA6', 'CN ID 8', 'PYOD', 'VAT', 'YEAR'])[['VART']].sum().reset_index()
# yearly['PERIOD'] = (yearly.YEAR - 1997)//6

# # Yearly mean, in 3 periods, to relieve memory
# data = yearly.groupby(['ID', 'IMPORT', 'CN ID 4','CPA6', 'CN ID 8','PYOD', 'VAT', 'PERIOD'])[['VART']].mean().reset_index()

# with ProgressBar():
#     data = data.compute()
    
# data.to_csv(save_path + '_FBCP.csv', index = False)

[########################################] | 100% Completed | 13min 29.0s


  args2 = [_get_recursive(dsk, k, cache) for k in args]


### - Value of buyer-seller links

In [6]:
columns = [u'YEAR', u'FLUX', u'ID', u'VAT', u'VART']

data = get_data(columns, drive_path, start_year = 2005, end_year = 2014)
data['IMPORT'] = data['FLUX'] % 2

links = data.dropna().groupby(['IMPORT','YEAR','ID','VAT'])['VART'].sum().reset_index()

with ProgressBar():
    out = links.compute()
    
# out.to_csv(save_path + 'buyer_seller_link_value.csv', index = False)
# NAME CHANGE
out.to_csv(save_path + 'ID_VAT_Y.csv', index = False)

[                                        ] | 0% Completed |  6.6s


KeyboardInterrupt: 

In [37]:
save_path

'./../../data/processed/'

In [None]:
# pd.read_csv(save_path + 'buyer_seller_link_value.csv').head()

### - Sourcing info (QUARTER)

In [3]:
columns = [u'YEAR', u'MONTH', u'FLUX', u'ID', u'CN ID 8', 'PYOD', 'VAT', u'VART']

data = get_data(columns, drive_path, end_year = 2004)
data['VAT'] = data['VAT'].fillna(data['PYOD'])
data['QUARTER'] = ((data['MONTH'] -1)// 3) + 1

data = data.merge(CN_full[['CN ID 8', 'CN ID 4']])#.persist()

# Compute and save
sourcing_strategies_qr = data.loc[data.IMPORT == 1].groupby(['YEAR', 'QUARTER','ID', 'CN ID 4', 'PYOD'])[['VART']].sum() #rm QUARTER for yearly dataset
with ProgressBar():
    out = sourcing_strategies_qr.compute()
out.to_csv(save_path + 'sourcing_strategies_qr.csv')
out.to_csv(save_path + '_FCPYq.csv')

export_bundles_qr = data.loc[data.IMPORT == 0].groupby(['YEAR', 'QUARTER', 'ID', 'CN ID 4', 'PYOD', 'VAT'])[['VART']].sum()
with ProgressBar():
    out2 = export_bundles_qr.compute()
out2.to_csv(save_path + 'export_bundles_qr.csv')

[########################################] | 100% Completed |  6min 53.7s
[########################################] | 100% Completed |  7min 37.7s


In [None]:
# pd.read_csv(save_path + 'sourcing_strategies_qr.csv').head()

### - Sourcing info (YEARS)

In [38]:
columns = [u'YEAR', u'MONTH', u'FLUX', u'ID', u'CN ID 8', 'PYOD', u'VART']

data = get_data(columns, drive_path, end_year = 2014)

data = data.merge(CN_full[['CN ID 8', 'CN ID 4', 'CN label 4']])#.persist()

# Compute and save
sourcing_strategies = data.groupby(['IMPORT', 'YEAR', 'ID', 'CN ID 4', 'PYOD'])[['VART']].sum() #rm QUARTER for yearly dataset
with ProgressBar():
    out = sourcing_strategies.compute()
out.to_csv(save_path + '_FCPY.csv')


[################################        ] | 81% Completed | 16min 56.4s


KeyboardInterrupt: 

### - Sourcing info (many years)

In [None]:
# Compute and save (many year period). First compute sum in year, then average multiple years

df = dd.read_csv('./../../data/processed/_FCPY.csv')

d = 6
df['PERIOD'] = (df.YEAR - 1997)//d
df['PERIOD_str'] = '-'.join([str(1997 + 6*df.PERIOD), 
                               str(1997 + d*df.PERIOD + (d - 1))])

# df = df.groupby(['IMPORT', 'ID', 'CN ID 4', 'PYOD', 'PERIOD'])[['VART']].mean().reset_index()
with ProgressBar():
    df = df.groupby(['IMPORT', 'ID', 'CN ID 4', 'PYOD', 'PERIOD'])[['VART']].mean().reset_index().compute()

df.to_csv(save_path + '_FCPp'+str(d)+'Y.csv')

### - Basket of products

In [5]:
# # Reusndant. these are already built for effective diversification.
# columns = [u'YEAR', u'FLUX', u'ID', u'CN ID 8', 'CPA6', 'VART']

# data = get_data(columns, drive_path, start_year = 2012, end_year = 2014)

# data['IMPORT'] = data['FLUX'] % 2

# CN_full = pd.read_csv('./../../data/CN_full.csv', encoding = 'utf-8')
# data = data.merge(CN_full[['CN ID 8', 'CN ID 4']])#.persist()

# # margins_info = data.groupby(['IMPORT','YEAR','ID']).agg({'VAT': tunique, 'PYOD': tunique, 'CN ID 4': tunique, 'VART': sum})

# for col in ['CN ID 8', 'CN ID 4', 'CPA6']:
#     agg = data.groupby(['ID','YEAR', 'IMPORT', col])[['VART']].sum().reset_index()

#     with ProgressBar():
#         agg = agg.compute()
#     agg.to_csv(save_path + col.replace(' ', '_')+'_FYP.csv', index = False)

[########################################] | 100% Completed |  2min 36.9s
[                                        ] | 0% Completed |  6.6s


  args2 = [_get_recursive(dsk, k, cache) for k in args]


KeyboardInterrupt: 

### - Bernard's margins

In [None]:
# It's failing for some reason

# columns = [u'YEAR', u'FLUX', u'ID', u'VAT', 'CN ID 8', 'PYOD', u'VART']
columns = [u'YEAR', u'FLUX', u'ID', u'CN ID 8', 'PYOD',  u'VAT', u'VART']

data = get_data(columns, drive_path, end_year = 2014)
data = data.loc[data.VART >= 1000] # This is to continue the effect of older thresholding after 2010


data['IMPORT'] = data['FLUX'] % 2

CN_full = pd.read_csv('./../data/CN_full.csv', encoding = 'utf-8')
data = data.merge(CN_full[['CN ID 8', 'CN ID 4']])#.persist()

# margins_info = data.groupby(['IMPORT','YEAR','ID']).agg({'VAT': tunique, 'PYOD': tunique, 'CN ID 4': tunique, 'VART': sum})
data = data.loc[data.FLUX == 4].groupby(['IMPORT', 'YEAR', 'ID', 'VAT', 'CN ID 8'])['VART'].sum().reset_index()

with ProgressBar():
    out = data.compute()
out.to_csv(save_path + 'bernards_margins_info.csv')

[######################################  ] | 97% Completed | 15min 10.3s

In [None]:
data.loc[data.FLUX == 4].head()#.compute()

### - Krammar's determinants of diversification

In [None]:
columns = [u'YEAR', u'FLUX', u'ID', u'CN ID 8', u'PYOD', u'VAT', u'VART']

data = get_data(columns, drive_path, end_year = 2014)
data = data.loc[data.VART >= 1000] # This is to continue the effect of older thresholding after 2010

data['IMPORT'] = data['FLUX'] % 2

data = data.loc[data.FLUX == 4]

grouped = data.groupby(['ID', 'YEAR', 'IMPORT'])

with ProgressBar():
    df = grouped.agg({'VART': 'sum','CN ID 8': tunique, u'PYOD': tunique, u'VAT': tunique}).compute()

df.to_csv(save_path + 'dets_of_diversification.csv')

[########################################] | 100% Completed |  3hr  1min 18.2s


In [None]:
pd.read_csv(save_path + 'dets_of_diversification.csv').head()

### - Buyers and sellers by foreign country

In [None]:
columns = [u'YEAR', u'FLUX', u'ID', u'PYOD', u'VART']
data = get_data(columns, drive_path, end_year = 2014)
data = data.loc[data.VART >= 1000] # This is to continue the effect of older thresholding after 2010

data['IMPORT'] = data['FLUX'] % 2

data_by_dest = data.groupby(['IMPORT','YEAR','ID','PYOD'])['VART'].sum().reset_index()

result = data_by_dest.groupby(['PYOD', 'YEAR']).agg({'ID': tunique, 'VART': 'sum'})

with ProgressBar():
    out = result.compute()
    
out.to_csv(save_path + 'destination.csv')

[                                        ] | 0% Completed |  0.0s[                                        ] | 0% Completed |  0.1s[                                        ] | 0% Completed |  0.2s

### - Degree distribution

In [None]:
# window = 3
# assortativity_res = []
ID_degree_res = []
VAT_degree_res = []

for window in [1, 3]:
    gap = (window-1)/2
    center_years = arange(1997, 2014, 2)
    print window

    for Yc in center_years:
        print Yc
        data_sec = data.loc[data.YEAR - Yc <= gap]
#         data_sec.groupby(['ID', 'VAT']).agg({'VART': sum })

        data_sec_by_ID = data_sec.groupby(['ID']).agg({'VAT': tunique, 'VART': sum})

        ID_degree = data_sec_by_ID[['VAT']].reset_index()
        ID_degree.columns = [u'ID', u'ID_degree']
        ID_degree['center_year'] = Yc
        ID_degree['window'] = window
        
        with ProgressBar():
            ID_deg = ID_degree.compute()
            ID_deg['bin'] = pd.cut(log10(ID_deg['ID_degree']), bins = arange(-.49, 5.99, .25))
            ID_deg.to_csv(save_path + 'ID_deg_'+str(Yc)+'_'+str(window)+'.csv', index = False)
#         ID_degree_res += [ID_degree]     

#         ID_deg = pd.read_csv()
        sampling = ID_deg.groupby(['bin'], observed = True).apply(lambda x: x.sample(200, replace = True))

        data_sec_sample = data_sec.loc[data_sec.ID.isin(sampling['ID'].values)]
        data_sec_by_VAT = data_sec_sample.groupby(['VAT']).agg({'ID': tunique, 'VART': sum})

        VAT_degree = data_sec_by_VAT[['ID']].reset_index()
        VAT_degree.columns = [u'VAT', u'VAT_degree']
        VAT_degree['center_year'] = Yc
        VAT_degree['window'] = window
        VAT_degree_res += [VAT_degree]
        with ProgressBar():
            VAT_deg = VAT_degree.compute()
            VAT_deg.to_csv(save_path + 'VAT_deg_'+str(Yc)+'_'+str(window)+'.csv', index = False)

In [None]:
pd.read_csv(save_path + 'ID_deg_'+str(Yc)+'_'+str(window)).head()

In [None]:
# fig, ax = plt.subplots(1)
# df_degrees.groupby('VAT_degree_bin')['ID_degree','VAT_degree'].quantile(.25).plot(x = 'VAT_degree', y = 'ID_degree', marker = '', ax = ax)
# df_degrees.groupby('VAT_degree_bin')['ID_degree','VAT_degree'].quantile(.5).plot(x = 'VAT_degree', y = 'ID_degree', marker = '', ax = ax)
# df_degrees.groupby('VAT_degree_bin')['ID_degree','VAT_degree'].quantile(.75).plot(x = 'VAT_degree', y = 'ID_degree', marker = '', ax = ax)

# # df_degrees.groupby('ID_nunique_bin')['VAT_nunique','ID_nunique'].mean().plot(x = 'ID_nunique', y = 'VAT_nunique', marker = 'o', ax = ax)
# df_degrees.groupby('ID_nunique')['VAT_nunique'].median().plot(x = 'index', y = 'VAT_nunique', marker = '.', linewidth = 0, ax = ax)
# ax.set_xscale('log')
# ax.set_yscale('log')

# Older stuff

In [None]:
# links = pd.read_csv(save_path + 'buyer_seller_link_value.csv')
# links['PERIOD'] = (links['YEAR'] - 1996) // 2

In [None]:
# degrees = links.groupby(['PERIOD', 'ID'])[['VAT']].nunique().rename(columns = {'VAT': 'ID_degree'})

In [None]:
# from numpy import log10, arange
# degrees['log_ID_degree'] = log10(degrees['ID_degree'])
# degrees['bin_ID_degree'] = pd.cut(degrees['log_ID_degree'], arange(-.25, 4.5, 0.25))

In [None]:
# degree_dist = degrees.reset_index().groupby(['PERIOD', 'bin_ID_degree'])[['ID']].count()

In [None]:
# import matplotlib.pyplot as plt
# fig, axs = plt.subplots(1, 2, figsize =(15, 6))

# ax = axs[0]
# for t in links['PERIOD'].unique():
#     log10(degree_dist.loc[t]).reset_index().plot(marker = 'o', linewidth = 0, ax = ax, mec = 'None')