# Data reading and sampling

In [1]:
import pandas as pd
import numpy as np
import time

### Useful functions for chunk filtering and aggregation

In [2]:

# Root directory of the raw data. It ends with a slash, which the
# concatenations below rely on.
path = './../../../../media/miglesia/Elements/export_france/data/'

# Directories holding the type-1 and type-2 files, both located under `path`.
path1 = ''.join([path, 'type1/DP1610_MAASTRICHT1_1997_2013'])
path2 = ''.join([path, 'type2/DP1611_MAASTRICHT2_1997_2013'])

# Writing as a function

def aggregation(chunk, index, func):
    """Group ``chunk`` by ``index`` and aggregate every group with ``func``.

    ``index`` is handed to ``DataFrame.groupby`` and ``func`` to ``.agg``.
    Grouping uses ``sort=False``: groups come back in order of first
    appearance, and skipping the sort gives a minor speedup.
    """
    return chunk.groupby(index, sort=False).agg(func)
                                
# Names of the columns of the raw files, listed in positional order
# (the files are read with header=None and addressed by position).
colnames = [
    u'YEAR', u'MONTH', u'FLUX', u'ID', u'DEPT', u'CN ID 8', u'CPA6',
    u'PYOD', u'PAYP', u'VAT', u'PRIFAC', u'DEVFAC', u'VFTE', u'VART',
    u'D_MASSE', u'MASSE', u'USUP', u'USUP_MT',
]

# Two-way lookup between a column name and its position in `colnames`.
colname_no = {name: position for position, name in enumerate(colnames)}
colno_name = {position: name for position, name in enumerate(colnames)}

# We need to know the number of rows of each yearly file
# (it is used to size the chunks when reading).

def _read_row_counts(counts_file, first_year=1997, last_year=2013):
    """Return a ``{year: row_count}`` dict read from ``counts_file``.

    The file is read as one integer per line; the n-th integer is assigned
    to year ``first_year + n``. Blank lines are skipped so that stray empty
    lines do not crash the ``int`` conversion.
    NOTE(review): assumes the file lists the years in ascending order
    starting at ``first_year`` -- confirm against how the file was produced.
    """
    with open(counts_file) as f:
        counts = [int(line) for line in f if line.strip()]
    return dict(zip(range(first_year, last_year + 1), counts))

# `path` already ends with '/', so no extra separator is added here
# (same convention as path1/path2).
nrows1 = _read_row_counts(path+'type1_row_ct')
nrows2 = _read_row_counts(path+'type2_row_ct')

def merge_ref(df, filter_ref, filter_ix):
    """Left-join ``filter_ref`` onto ``df`` while keeping ``df``'s row labels.

    ``df``'s index is first moved into a column, the two frames are merged
    on ``filter_ix`` (``how='left'``, so every row of ``df`` is kept), and
    that column is then restored as the index.

    The column is looked up under the name ``'index'``, which is what
    ``reset_index`` produces for an unnamed index.
    """
    flat = df.reset_index()
    joined = flat.merge(filter_ref, how='left', on=filter_ix)
    return joined.set_index('index')
     
def select_entries(df, filter_col, value, by_df = None):
    """Return the rows of ``df`` whose ``filter_col`` matches ``value``.

    ``value`` may be a list (membership test via ``isin``) or anything else
    (equality test). The condition is evaluated on ``by_df[filter_col]``
    when ``by_df`` is given, otherwise on ``df`` itself; the resulting
    boolean mask is applied to ``df`` through ``.loc``.
    """
    lookup = df if by_df is None else by_df
    column = lookup[filter_col]

    if isinstance(value, list):
        mask = column.isin(value)
    else:
        mask = column == value
    return df.loc[mask]

IOError: [Errno 2] No such file or directory: './../../../../media/miglesia/Elements/export_france/data//type1_row_ct'

### Load and process references for filtering (e.g. data on firm sizes, used to filter by size)

In [None]:

# Reference table with one `exp_mma` value per firm `ID`.
# NOTE(review): the exact meaning of `exp_mma` is not visible here -- confirm.
sizes_index = pd.read_csv('./../export_france/data/formatted/sizes_index.csv')

# Number of size bins.
n_bins = 20

# Rounded log10 size, used as the ordering/grouping key for the binning.
sizes_index['log_exp_mma'] = np.log10(sizes_index.exp_mma).round(3)

# Cumulative `exp_mma` over increasing log size. Only `exp_mma` is summed:
# the other columns (e.g. ID) are not needed and may not even be summable.
cum_exp = sizes_index.groupby('log_exp_mma')['exp_mma'].sum().cumsum()

# Cut the cumulative total into `n_bins` equal-width intervals labelled
# 0..n_bins-1; each log-size value is thereby assigned one bin label.
cuts = pd.cut(cum_exp, n_bins, labels=range(n_bins))

# Attach the bin label (`exp_mma_cat`) to every firm through its log size,
# then drop firms without an `exp_mma` value.
exp_index = (
    sizes_index
    .merge(cuts.reset_index().rename({'exp_mma': 'exp_mma_cat'}, axis = 1), on = 'log_exp_mma')
    .dropna(subset = ['exp_mma'])
)

# Lookup table (firm ID -> size bin) and the column names used with it.
exp_size_ref = exp_index[['ID', 'exp_mma_cat']]
filter_ix = 'ID'
filter_col = 'exp_mma_cat'

In [None]:
# Display the column-name -> column-position mapping for reference.
colname_no

### Main loop for reading from hard drive and saving

In [None]:
# names = tuple(index)
# sampling_name = name

# ---- Configuration ---------------------------------------------------------
years = list(reversed(range(1997, 2014)))  # processed from 2013 down to 1997
n_chunks = 5  # target number of chunks per yearly file
data_path = path+'type1/DP1610_MAASTRICHT1_1997_2013'
columns = ['YEAR', 'MONTH', 'FLUX', 'ID', 'CN ID 8', 'VAT','VART']
aggregate_chunks = True
chunk_agg_func = {'VART':['sum']}
aggregate_years = True
# 'VART_sum' is the flattened name of the ('VART', 'sum') column produced by
# the chunk-level aggregation.
yr_agg_func = {'VART_sum':['sum']}
verbose = True
output_index = ['YEAR', 'MONTH', 'IMPORT', 'ID', 'CN ID 8', 'VAT']

save_yr_agg_result = True
save_path = path1+'/samplings'

# Positions of the requested columns in the raw files. Built as a list (not a
# lazy `map`) so it can be reused for every file and on Python 3.
usecols = [colname_no[name] for name in columns]

# ---- Main loop: one pass over all years per size bin -----------------------
# Single-argument print(...) calls behave identically on Python 2 and 3.
for bin_ in list(reversed(range(5))): #n_bins
    print('bin no. '+str(bin_))
    sampling_name = 'YMxpb_size'+str(n_bins).zfill(2)+str(bin_).zfill(2)
    yearly_results = []

    for y in years:
        print('year: '+str(y))
        # Rows per chunk. True division is forced with float(): with integer
        # division the size is rounded down and the last rows of the file
        # would not fit into n_chunks chunks.
        size = max(1, int(np.ceil(nrows1[y] / float(n_chunks))))
        print('max_chunk_size: '+str(size))

        reader = pd.read_csv(data_path+'/DP1610_MAASTRICHT1_'+str(y)+'.txt', chunksize = size,
                             usecols = usecols,
                             delimiter = ';', header = None)

        yr_result_list = []

        # Iterate until the reader is exhausted instead of calling next() a
        # fixed number of times: no row is skipped and no StopIteration is
        # raised if the stored row count is slightly off.
        start_time = time.time()
        for i, chunk in enumerate(reader):
            chunk = chunk.rename(colno_name, axis = 1)
            chunk['IMPORT'] = chunk['FLUX'] % 2 # so that '1' indicates imports

            # --- Chunk filtering ---
            # keep the rows with IMPORT == 0 (the non-import flow)
            chunk = select_entries(chunk, filter_col = 'IMPORT', value = 0)

            # keep the firms belonging to the current size bin; the bin label
            # lives in the merged frame, which shares the chunk's row labels
            chunk_ = merge_ref(chunk, exp_size_ref, filter_ix = 'ID')
            chunk = select_entries(chunk, filter_col = 'exp_mma_cat', value = bin_, by_df = chunk_)

            # --- Chunk aggregation ---
            if aggregate_chunks:
                result_chunk = aggregation(chunk, output_index, chunk_agg_func)
            else:
                result_chunk = chunk

            yr_result_list.append(result_chunk)
            if verbose:
                print("Number of rows %s" % result_chunk.shape[0])
                print("Loop %s took %s seconds" % (i, time.time() - start_time))
            # restart the timer so the next measurement includes the read
            start_time = time.time()

        result = pd.concat(yr_result_list)

        if verbose: print("size of concat_result: %s" % len(result))

        # Flatten ('VART', 'sum') -> 'VART_sum'. Only done for MultiIndex
        # columns: joining a plain string column name would mangle it.
        if isinstance(result.columns, pd.MultiIndex):
            result.columns = ['_'.join(col).strip() for col in result.columns.values]

        if aggregate_years:
            # The same key can appear in several chunks: aggregate once more
            # over the whole year.
            yr_agg_result = aggregation(result, output_index, yr_agg_func)
            yr_agg_result.index = pd.MultiIndex.from_tuples(list(yr_agg_result.index), names=tuple(output_index))
            yr_agg_result.columns = yr_agg_result.columns.droplevel(1)
        else:
            yr_agg_result = result

        if verbose: print("size of yr_agg_result: %s" % len(yr_agg_result))

        yearly_results.append(yr_agg_result)

        if save_yr_agg_result:
            yr_agg_result.to_csv(save_path+'/'+sampling_name+'_'+str(y)+'.csv')

    # One file per bin with all years stacked.
    filename = save_path+'/'+sampling_name+'.csv'
    pd.concat(yearly_results).to_csv(filename, index = True)
    print('saved at: '+str(filename))



In [None]:
# # Imports and exports by firm/year

# IDs = firm_stats['ID']
# Mns = [1, 2, 3]

# def sample_data(columns, index = ['ID', 'IMPORT','YEAR'], chunk_agg = True, chunk_agg_func = {}, 
#             yr_agg_func = {}, name = 'test_sampling', n_chunks = 30.,
#                 firm_filtering = True, firm_filtering_cols = ['ID'], firm_filtering_arrays = [IDs],
#                 time_filtering = False, time_filtering_arrays = [Mns],
#                 verbose = False, yr_chunks_agg = True, save_yr_agg_result = False,
#                data_path = path1, save_path = path1+'/samplings/', select_IMPORTS = None):

#     names = tuple(index)
#     sampling_name = name

#     l = []

#     for y in years:
#         print 'year: '+str(y)
#         #Rows for each chunk
#         size = np.ceil(nrows1[y]/n_chunks)
#         print 'max_chunk_size: '+str(size)

#         reader = pd.read_csv(data_path+'/DP1610_MAASTRICHT1_'+str(y)+'.txt', chunksize = size, 
#                              usecols = map(colname_no.get, columns)
#                              , delimiter = ';', header = None) #'CN ID 8', 'PYOD'

#         yr_result_list = []

#         for i in range(int(n_chunks)):
#             start_time = time.time()
#             chunk = next(reader).rename(colno_name, axis = 1)
#             chunk['IMPORT'] = chunk['FLUX'] % 2 # so that '1' indicates imports
# #             print chunk.head()

#             if firm_filtering == True:
#                 for i in range(len(firm_filtering_cols)):
#                     chunk = chunk.loc[chunk[firm_filtering_cols[i]].isin(firm_filtering_arrays[i])]
#                     print 'filtered_chunk_size: '+str(len(chunk))
# #             print chunk.head()
                    
#             if time_filtering == True:
#                 time_filtering_cols = ['MONTH']
#                 for i in range(len(time_filtering_cols)):
#                     chunk = chunk.loc[chunk[time_filtering_cols[i]].isin(time_filtering_arrays[i])]
#                     print 'filtered_chunk_size: '+str(len(chunk))

#             if select_IMPORTS != None: chunk = chunk.loc[chunk.IMPORT == select_IMPORTS]

#             if chunk_agg: 
#                 result_chunk = aggregation(chunk, index, chunk_agg_func)
#             else: 
#                 result_chunk = chunk
                
# #             print result_chunk.head()

#             yr_result_list += [result_chunk]
#             if verbose: print("Number of rows ",result_chunk.shape[0])
#             if verbose: print("Loop ",i,"took %s seconds" % (time.time() - start_time))
#             del(result_chunk) 


#         concat_result = pd.concat(yr_result_list)
# #         print concat_result.head()
        
#         # Unique users vs Number of rows after the first computation    
#         if verbose: print("size of concat_result:", len(concat_result))
# #         if verbose: print("unique firms in concat_result:", len(concat_result.index.unique()))

#         result = concat_result
#         result.columns = ['_'.join(col).strip() for col in result.columns.values]

#         if yr_chunks_agg:
# #             yr_agg_result = result.groupby(index).agg(yr_agg_func)
#             yr_agg_result = aggregation(result, index, yr_agg_func)
#             yr_agg_result.index = pd.MultiIndex.from_tuples(list(yr_agg_result.index), names=names)
#             yr_agg_result.columns = yr_agg_result.columns.droplevel(1)

#         else:
#             yr_agg_result = result

#         if verbose: print("size of yr_agg_result:", len(yr_agg_result))

#         l += [yr_agg_result]

#         if save_yr_agg_result:
#             yr_agg_result.to_csv(save_path+'/'+sampling_name+'_'+str(y)+'.csv')
    
#     filename = save_path+'/'+sampling_name+'.csv'
#     pd.concat(l).to_csv(filename, index = True)
#     print 'saved at: '+str(filename)



In [None]:
# sizes_index = pd.read_csv('./../export_france/data/formatted/sizes_index.csv')

# n = 10

# sizes_index['log_exp_mma'] = np.log10(sizes_index.exp_mma).round(3)
# cuts = pd.cut(sizes_index.groupby(sizes_index['log_exp_mma']).sum().cumsum()['exp_mma'],n, labels=range(n))
# exp_index = sizes_index.merge(cuts.reset_index().rename({'exp_mma': 'exp_mma_cat'}, axis = 1), on = 'log_exp_mma').dropna(subset = ['exp_mma'])

In [None]:
# # Oct 4
# sizes_index = pd.read_csv('./../export_france/data/formatted/sizes_index.csv', index_col=0)
# n = 20
# exp_index = sizes_index.dropna(subset = ['exp_mma'])
# exp_index['exp_qcut_'+str(n)+'_label'] = pd.qcut(exp_index['exp_mma'], n, labels=range(n))
# exp_index['exp_qcut_'+str(n)] = pd.qcut(exp_index['exp_mma'], n)

# # imp_index = sizes_index.dropna(subset = ['imp_mma'])
# # imp_index['imp_cut_'+str(n)+'_label'] = pd.cut(imp_index['imp_mma'], n, labels=range(n))
# # imp_index['imp_cut_'+str(n)] = pd.cut(imp_index['imp_mma'], n)

In [None]:

# Mns = range(1,13)
# time_filtering_arrays = [Mns]

# # use all firms
# sample_n_firms = 'max'

# years = list(reversed(range(1997, 2014)))

# for bin_label in list(reversed(range(n))):
#     print 'bin label: '+str(bin_label)
#     IDs = exp_index.loc[exp_index.exp_mma_cat == bin_label].index
#     print IDs
# #     IDs = imp_index.loc[imp_index.imp_cut_20_label == bin_label].index
#     columns = ['YEAR', 'MONTH', 'FLUX', 'ID', 'CN ID 8', 'VAT','VART']
#     sample_data(columns, index = ['YEAR', 'MONTH', 'IMPORT', 'ID', 'CN ID 8', 'VAT'], chunk_agg_func = {'VART':['sum']}, 
#                 firm_filtering = True, firm_filtering_cols = ['ID'], firm_filtering_arrays = [IDs],
#                 yr_agg_func = {'VART_sum':['sum']}, n_chunks = 10., 
#                 name = 'firm_sample_YMxpb_n'+str(n).zfill(2)+str(bin_label).zfill(2),
#                 verbose = False, yr_chunks_agg = True, select_IMPORTS = 0, save_yr_agg_result = True)
