### Set up

In [1]:
# Import modules (third-party first, then project-local helpers)

import pandas as pd
# `IPython.display` is the public import path for these names; importing them
# from `IPython.core.display` is deprecated in recent IPython releases.
from IPython.display import display, HTML

import dask.dataframe as dd
from dask.diagnostics import ProgressBar

# from numpy import arange, log10

from functions import chunk, agg, finalize

# Notebook display configuration: show up to 99 columns and widen the page.
pd.options.display.max_columns = 99
display(HTML("<style>.container { width:100% !important; }</style>"))

# Custom dask groupby aggregations built from the project-local helpers.
# NOTE(review): `tunique` and `first` receive the same chunk/agg/finalize
# callables, so they differ only by name -- confirm that this is intended.
tunique = dd.Aggregation('tunique', chunk, agg,finalize)
first = dd.Aggregation('first', chunk, agg,finalize)

## Sourcing info
### Arrange data

In [3]:
# Directory containing the yearly DP1610_MAASTRICHT1_<year>.txt files.
drive_path = './../../../../../../media/miglesia/Elements/export_france/data/type1/DP1610_MAASTRICHT1_1997_2013'

# Positional names for the 18 fields of the raw files (the files are read
# with header=None, so the names are attached by position).
colnames = [
    u'YEAR', u'MONTH', u'FLUX', u'ID', u'DEPT', u'CN ID 8', u'CPA6',
    u'PYOD', u'PAYP', u'VAT', u'PRIFAC', u'DEVFAC', u'VFTE', u'VART',
    u'D_MASSE', u'MASSE', u'USUP', u'USUP_MT',
]

# Field name -> zero-based column position.
colname_no = {name: position for position, name in enumerate(colnames)}

# columns = [u'YEAR', u'MONTH', u'FLUX', u'ID', u'VART']
# Fields actually loaded for the sourcing analysis.
columns = [u'YEAR', u'MONTH', u'FLUX', u'ID', u'CN ID 8', 'PYOD', u'VART']

# Zero-based positions of the requested fields, as a concrete sorted list.
# - A list (not `map(...)`) is required: on Python 3 `map` returns a one-shot
#   iterator, which cannot be safely reused across the reads below.
# - pandas ignores the order of `usecols` and returns columns in file order,
#   so the positions are sorted and the labels assigned at the end are derived
#   from this same list instead of assuming `columns` is already in file order.
usecols = sorted(colname_no[name] for name in columns)

# One lazy dask dataframe per yearly file (2007-2013 inclusive).
df_list = []
for y in range(2007, 2014):
    # NOTE(review): dtype key 9 is the VAT field per `colnames`, which is not
    # among `usecols`; it presumably has no effect here -- kept unchanged.
    df_list.append(dd.read_table(drive_path+'/DP1610_MAASTRICHT1_'+str(y)+'.txt',
                                 usecols = usecols,
                                 delimiter = ';', header = None, dtype = {9: 'object'}))

data = dd.concat(df_list)
# Attach the field names matching the loaded positions, in file order.
data.columns = [colnames[position] for position in usecols]
# Import flag: 1 when FLUX is odd, 0 when it is even.
# NOTE(review): assumes odd FLUX codes denote imports -- confirm against the
# data dictionary.
data['IMPORT'] = data['FLUX'] % 2
# Calendar quarter (1-4), assuming MONTH is coded 1-12. The previous formula
# `MONTH // 3 + 1` pushed March/June/September into the following quarter and
# mapped December to a non-existent quarter 5.
data['QUARTER'] = (data['MONTH'] - 1) // 3 + 1

# Lookup table relating 'CN ID 8' to 'CN ID 4' and 'CN label 4'.
CN_full = pd.read_csv('./../data/CN_full.csv', encoding = 'utf-8')

# Keep only the mapping columns and drop exact duplicate rows: a repeated
# lookup row would multiply the matching trade rows in the join and inflate
# the VART sums computed below.
cn_lookup = CN_full[['CN ID 8', 'CN ID 4', 'CN label 4']].drop_duplicates()

# Inner join on the shared product code (spelled out instead of relying on
# the implicit "all common columns" default).
data = data.merge(cn_lookup, on = 'CN ID 8', how = 'inner')

# Total VART of import flows per year, quarter, ID, CN ID 4 and PYOD.
# This is still a lazy dask object; nothing is computed here.
sourcing_strategies = data.loc[data.IMPORT == 1].groupby(['YEAR', 'QUARTER', 'ID', 'CN ID 4', 'PYOD'])[['VART']].sum()

## Compute and save

# Execute the lazy dask computation, reporting progress while it runs.
with ProgressBar():
    out = sourcing_strategies.compute()

# Persist the aggregated table; the group keys are written along with VART.
out.to_csv(path_or_buf='./../data/sourcing_strategies.csv')

[########################################] | 100% Completed |  1min 22.0s


### Load data directly

In [None]:
# Reload the saved aggregate so the analysis can start from this cell.
out = pd.read_csv('./../data/sourcing_strategies.csv')
# `read_csv` already returns flat columns with a fresh RangeIndex, so a plain
# `reset_index()` would only add a spurious 'index' column; `drop=True` keeps
# `out_ri` as a separate copy without that extra column.
out_ri = out.reset_index(drop=True)
# product_count = out_ri.groupby('ID')[['CN ID 4']].nunique()
# Number of distinct PYOD values per ID.
sources_count = out_ri.groupby('ID')[['PYOD']].nunique()
# IDs associated with more than one distinct PYOD value.
multisourcing_firms = sources_count[sources_count['PYOD'] > 1].index
# Rows belonging to those IDs only.
df = out_ri.loc[out_ri.ID.isin(multisourcing_firms)]

In [None]:
# Number of rows in the reloaded table.
out_ri.shape[0]

### Show some examples of firm sourcing strategies over time

In [None]:

# Disabled example: for 10 randomly sampled rows' IDs, keep the products that
# make up the bulk of the ID's VART, pivot VART by product/source against
# time, and display the table with bar styling next to the product labels.
# NOTE(review): the pivot below indexes on u'MONTH', but the saved aggregate
# is grouped by QUARTER, so that column is presumably absent from `df` --
# adjust before re-enabling. It also needs `CN_full`, which is loaded in the
# "Arrange data" cell rather than in the "Load data directly" cell.
# sample_ids = df.sample(10)['ID'].values
# for id_ in sample_ids:
#     df_ID = df.loc[df.ID == id_]
    
#     by_product = df_ID.groupby('CN ID 4')['VART'].sum().sort_values(ascending = False)
#     by_product/=by_product.sum()
#     main_prods = by_product[by_product.cumsum() < .99].index
#     df_ID_mp = df_ID.loc[df_ID['CN ID 4'].isin(main_prods)]
    
#     table = df_ID_mp.set_index([u'CN ID 4', u'PYOD',u'MONTH', u'YEAR'])['VART'].unstack([-1, -2]).fillna(0)
#     table[('min','')] = 0; table[('max','')] = table.max().max()

#     display(CN_full.set_index('CN ID 4')[['CN label 4']].drop_duplicates().loc[df_ID_mp['CN ID 4'].unique()].sort_index())

#     display(table.T.style.bar(color='#d65f5f'))