# Data reading and sampling

In [9]:
import pandas as pd
import numpy as np
import time

In [10]:
# Function definitions
def aggregation(chunk, index, func):
    grouped_object = chunk.groupby(index,sort = False)
    # not sorting results in a minor speedup
    return grouped_object.agg(func)

def merge_ref(df, filter_ref, filter_ix):
    return df.reset_index().merge(filter_ref, on = filter_ix, how = 'left').set_index('index')
     
def select_entries(df, filter_col, value, by_df = None):
    if by_df is None:
        by_df = df
        
    if isinstance(value, (list,)):
        return df.loc[by_df[filter_col].isin(value)]
    else:
        return df.loc[by_df[filter_col] == value]

def aggregate_chunk(chunk, output_index, chunk_agg_func, verbose):
    result_chunk = aggregation(chunk, output_index, chunk_agg_func) if aggregate_chunks else chunk
    if verbose:
        print("Number of rows ", result_chunk.shape[0])
    return result_chunk

def process_year(year, n_chunks, data_path, columns, output_index, chunk_agg_func, verbose):
    size = np.ceil(nrows1[year] / n_chunks)
    reader = pd.read_csv(f'{data_path}/DP1610_MAASTRICHT1_{year}.txt', chunksize=size, 
                         usecols=map(colname_no.get, columns), delimiter=';', header=None)
    yr_result_list = []

    for i in range(int(n_chunks)):
        start_time = time.time()
        chunk = next(reader).rename(colname_no, axis=1)
        chunk['IMPORT'] = chunk['FLUX'] % 2

        chunk = select_entries(chunk, filter_col='IMPORT', value=0)
        chunk_ = merge_ref(chunk, exp_size_ref, filter_ix='ID')
        chunk = select_entries(chunk, filter_col='exp_mma_cat', value=bin_, by_df=chunk_)

        yr_result_list.append(aggregate_chunk(chunk, output_index, chunk_agg_func, verbose))
        if verbose:
            print("Loop ", i, "took %s seconds" % (time.time() - start_time))

    return pd.concat(yr_result_list)

def save_yearly_aggregated_result(year_result, output_index, yr_agg_func, save_path, sampling_name, year):
    yr_agg_result = aggregation(year_result, output_index, yr_agg_func)
    yr_agg_result.index = pd.MultiIndex.from_tuples(list(yr_agg_result.index), names=tuple(output_index))
    yr_agg_result.columns = yr_agg_result.columns.droplevel(1)
    yr_agg_result.to_csv(f'{save_path}/{sampling_name}_{year}.csv')

def main():
    years = list(reversed(range(1997, 2014)))
    n_chunks = 5
    data_path = './../../data/type1/DP1610_MAASTRICHT1_1997_2013'
    columns = ['YEAR', 'MONTH', 'FLUX', 'ID', 'CN ID 8', 'VAT', 'VART']
    output_index = ['YEAR', 'MONTH', 'IMPORT', 'ID', 'CN ID 8', 'VAT']
    chunk_agg_func = {'VART': ['sum']}
    yr_agg_func = {'VART_sum': ['sum']}
    verbose = True
    save_path = './../../data/samplings'

    for bin_ in reversed(range(5)):
        sampling_name = f'YMxpb_size{str(n_bins).zfill(2)}{str(bin_).zfill(2)}'
        all_year_results = []

        for year in years:
            print(f'Processing year: {year}')
            year_result = process_year(year, n_chunks, data_path, columns, output_index, chunk_agg_func, verbose)
            all_year_results.append(year_result)
            save_yearly_aggregated_result(year_result, output_index, yr_agg_func, save_path, sampling_name, year)

        pd.concat(all_year_results).to_csv(f'{save_path}/{sampling_name}.csv', index=True)
        print(f'Saved at: {save_path}/{sampling_name}.csv')


### Useful functions for chunk filtering and aggregation

In [11]:

data_path = '/media/matias/Elements/export_france/data/'
path1 = data_path+'type1/DP1610_MAASTRICHT1_1997_2013'
path2 = data_path+'type2/DP1611_MAASTRICHT2_1997_2013'

# Writing as a function

                                
colnames = [u'YEAR', u'MONTH', u'FLUX', u'ID', u'DEPT', u'CN ID 8', u'CPA6',
       u'PYOD', u'PAYP', u'VAT', u'PRIFAC', u'DEVFAC', u'VFTE', u'VART', u'D_MASSE', u'MASSE', u'USUP', u'USUP_MT']

colname_no = dict(zip(colnames, range(18)))
colno_name = dict(zip(range(18), colnames))



In [13]:

# #We need to know number of rows of each file:
# l = []
# with open(data_path+'/type1_row_ct') as f:
#     for line in f:
#         l += [int(line.strip())]
# nrows1 = dict(zip(range(1997, 2014), l))



In [14]:
import subprocess

nrows1 = {}
years = range(1997, 2014)

for y in years:
    print(f'Processing year: {y}')
    file_path = f'{path1}/DP1610_MAASTRICHT1_{y}.txt'

    # Use wc -l to count the number of lines in the file
    process = subprocess.Popen(['wc', '-l', file_path], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    out, err = process.communicate()

    if process.returncode != 0:
        print(f'Error processing file for year {y}: {err}')
    else:
        # The output is 'number_of_lines filename', so split and take the first part
        row_count = int(out.decode('utf-8').split()[0])
        nrows1[y] = row_count
        print(f'Year {y} has {row_count} rows')

# nrows1 now contains the number of rows for each year
print(nrows1)

Processing year: 1997


Year 1997 has 19303127 rows
Processing year: 1998
Year 1998 has 20906709 rows
Processing year: 1999
Year 1999 has 22004431 rows
Processing year: 2000
Year 2000 has 23044998 rows
Processing year: 2001
Year 2001 has 23172197 rows
Processing year: 2002
Year 2002 has 23696908 rows
Processing year: 2003
Year 2003 has 24498599 rows
Processing year: 2004
Year 2004 has 26093520 rows
Processing year: 2005
Year 2005 has 27798440 rows
Processing year: 2006
Year 2006 has 28949804 rows
Processing year: 2007
Year 2007 has 30802487 rows
Processing year: 2008
Year 2008 has 31545582 rows
Processing year: 2009
Year 2009 has 31393346 rows
Processing year: 2010
Year 2010 has 34968248 rows
Processing year: 2011
Year 2011 has 35979867 rows
Processing year: 2012
Year 2012 has 38311600 rows
Processing year: 2013
Year 2013 has 40902059 rows
{1997: 19303127, 1998: 20906709, 1999: 22004431, 2000: 23044998, 2001: 23172197, 2002: 23696908, 2003: 24498599, 2004: 26093520, 2005: 27798440, 2006: 28949804, 2007: 30802

### Load and process references for filtering (eg. data on firm sizes to filter by size)

In [21]:
import pandas as pd
import numpy as np

# Assuming median_sizes is already computed and saved as per the previous script
median_sizes = pd.read_csv('./../../data/processed/sizes_index.csv')

# The median_sizes DataFrame already contains the 'exp_mma_cat' column, so we can directly use it
# Create a reference dataframe that contains only the IDs and their corresponding 'exp_mma' category
exp_size_ref = median_sizes[['ID', 'exp_mma_cat']]

# Define column names for filtering operations
filter_ix = 'ID'
filter_col = 'exp_mma_cat'


### Main loop for reading from hard drive and saving

In [22]:

main()


NameError: name 'n_bins' is not defined

In [None]:
xx

In [None]:
# # names = tuple(index)
# # sampling_name = name

# years = list(reversed(range(1997, 2014)))
# n_chunks = 5
# data_path = data_path+'type1/DP1610_MAASTRICHT1_1997_2013'
# columns = ['YEAR', 'MONTH', 'FLUX', 'ID', 'CN ID 8', 'VAT','VART']
# aggregate_chunks = True
# chunk_agg_func = {'VART':['sum']}
# aggregate_years = True
# yr_agg_func = {'VART_sum':['sum']}
# verbose = True
# output_index = ['YEAR', 'MONTH', 'IMPORT', 'ID', 'CN ID 8', 'VAT']

# save_yr_agg_result = True
# save_path = path1+'/samplings'

# for bin_ in list(reversed(range(5))): #n_bins
#     print 'bin no. '+str(bin_)
#     sampling_name = 'YMxpb_size'+str(n_bins).zfill(2)+str(bin_).zfill(2)
#     l = []

#     for y in years:
#         print 'year: '+str(y)
#         #Rows for each chunk
#         size = np.ceil(nrows1[y]/n_chunks)
#         print 'max_chunk_size: '+str(size)

#         reader = pd.read_csv(data_path+'/DP1610_MAASTRICHT1_'+str(y)+'.txt', chunksize = size, 
#                              usecols = map(colname_no.get, columns)
#                              , delimiter = ';', header = None) #'CN ID 8', 'PYOD'

#         yr_result_list = []

#         #Loop over chunks
#         for i in range(int(n_chunks)):
#             start_time = time.time()
#             chunk = next(reader).rename(colno_name, axis = 1)
#             chunk['IMPORT'] = chunk['FLUX'] % 2 # so that '1' indicates imports

#             """
#             Chunk filtering
#             """
#             #select imports/exports
#             chunk = select_entries(chunk, filter_col = 'IMPORT', value = 0)

#             # select by firm size
#             chunk_ = merge_ref(chunk, exp_size_ref, filter_ix = 'ID')
#             chunk = select_entries(chunk, filter_col = 'exp_mma_cat', value = bin_, by_df = chunk_)

#             """
#             Chunk aggregation
#             """
#             if aggregate_chunks: 
#                 result_chunk = aggregation(chunk, output_index, chunk_agg_func)
#             else: 
#                 result_chunk = chunk

#     #             print result_chunk.head()

#             yr_result_list += [result_chunk]
#             if verbose: print("Number of rows ",result_chunk.shape[0])
#             if verbose: print("Loop ",i,"took %s seconds" % (time.time() - start_time))
#             del(result_chunk) 


#         concat_result = pd.concat(yr_result_list)
#     #         print concat_result.head()

#         # Unique users vs Number of rows after the first computation    
#         if verbose: print("size of concat_result:", len(concat_result))
#     #         if verbose: print("unique firms in concat_result:", len(concat_result.index.unique()))

#         result = concat_result
#         result.columns = ['_'.join(col).strip() for col in result.columns.values]

#         if aggregate_years:
#     #             yr_agg_result = result.groupby(index).agg(yr_agg_func)
#             yr_agg_result = aggregation(result, output_index, yr_agg_func)
#             yr_agg_result.index = pd.MultiIndex.from_tuples(list(yr_agg_result.index), names=tuple(output_index))
#             yr_agg_result.columns = yr_agg_result.columns.droplevel(1)

#         else:
#             yr_agg_result = result

#         if verbose: print("size of yr_agg_result:", len(yr_agg_result))

#         l += [yr_agg_result]

#         if save_yr_agg_result:
#             yr_agg_result.to_csv(save_path+'/'+sampling_name+'_'+str(y)+'.csv')

#     filename = save_path+'/'+sampling_name+'.csv'
#     pd.concat(l).to_csv(filename, index = True)
#     print 'saved at: '+str(filename)



In [None]:
# # Imports and exports by firm/year

# IDs = firm_stats['ID']
# Mns = [1, 2, 3]

# def sample_data(columns, index = ['ID', 'IMPORT','YEAR'], chunk_agg = True, chunk_agg_func = {}, 
#             yr_agg_func = {}, name = 'test_sampling', n_chunks = 30.,
#                 firm_filtering = True, firm_filtering_cols = ['ID'], firm_filtering_arrays = [IDs],
#                 time_filtering = False, time_filtering_arrays = [Mns],
#                 verbose = False, yr_chunks_agg = True, save_yr_agg_result = False,
#                data_path = path1, save_path = path1+'/samplings/', select_IMPORTS = None):

#     names = tuple(index)
#     sampling_name = name

#     l = []

#     for y in years:
#         print 'year: '+str(y)
#         #Rows for each chunk
#         size = np.ceil(nrows1[y]/n_chunks)
#         print 'max_chunk_size: '+str(size)

#         reader = pd.read_csv(data_path+'/DP1610_MAASTRICHT1_'+str(y)+'.txt', chunksize = size, 
#                              usecols = map(colname_no.get, columns)
#                              , delimiter = ';', header = None) #'CN ID 8', 'PYOD'

#         yr_result_list = []

#         for i in range(int(n_chunks)):
#             start_time = time.time()
#             chunk = next(reader).rename(colno_name, axis = 1)
#             chunk['IMPORT'] = chunk['FLUX'] % 2 # so that '1' indicates imports
# #             print chunk.head()

#             if firm_filtering == True:
#                 for i in range(len(firm_filtering_cols)):
#                     chunk = chunk.loc[chunk[firm_filtering_cols[i]].isin(firm_filtering_arrays[i])]
#                     print 'filtered_chunk_size: '+str(len(chunk))
# #             print chunk.head()
                    
#             if time_filtering == True:
#                 time_filtering_cols = ['MONTH']
#                 for i in range(len(time_filtering_cols)):
#                     chunk = chunk.loc[chunk[time_filtering_cols[i]].isin(time_filtering_arrays[i])]
#                     print 'filtered_chunk_size: '+str(len(chunk))

#             if select_IMPORTS != None: chunk = chunk.loc[chunk.IMPORT == select_IMPORTS]

#             if chunk_agg: 
#                 result_chunk = aggregation(chunk, index, chunk_agg_func)
#             else: 
#                 result_chunk = chunk
                
# #             print result_chunk.head()

#             yr_result_list += [result_chunk]
#             if verbose: print("Number of rows ",result_chunk.shape[0])
#             if verbose: print("Loop ",i,"took %s seconds" % (time.time() - start_time))
#             del(result_chunk) 


#         concat_result = pd.concat(yr_result_list)
# #         print concat_result.head()
        
#         # Unique users vs Number of rows after the first computation    
#         if verbose: print("size of concat_result:", len(concat_result))
# #         if verbose: print("unique firms in concat_result:", len(concat_result.index.unique()))

#         result = concat_result
#         result.columns = ['_'.join(col).strip() for col in result.columns.values]

#         if yr_chunks_agg:
# #             yr_agg_result = result.groupby(index).agg(yr_agg_func)
#             yr_agg_result = aggregation(result, index, yr_agg_func)
#             yr_agg_result.index = pd.MultiIndex.from_tuples(list(yr_agg_result.index), names=names)
#             yr_agg_result.columns = yr_agg_result.columns.droplevel(1)

#         else:
#             yr_agg_result = result

#         if verbose: print("size of yr_agg_result:", len(yr_agg_result))

#         l += [yr_agg_result]

#         if save_yr_agg_result:
#             yr_agg_result.to_csv(save_path+'/'+sampling_name+'_'+str(y)+'.csv')
    
#     filename = save_path+'/'+sampling_name+'.csv'
#     pd.concat(l).to_csv(filename, index = True)
#     print 'saved at: '+str(filename)



In [None]:
# sizes_index = pd.read_csv('./../export_france/data/formatted/sizes_index.csv')

# n = 10

# sizes_index['log_exp_mma'] = np.log10(sizes_index.exp_mma).round(3)
# cuts = pd.cut(sizes_index.groupby(sizes_index['log_exp_mma']).sum().cumsum()['exp_mma'],n, labels=range(n))
# exp_index = sizes_index.merge(cuts.reset_index().rename({'exp_mma': 'exp_mma_cat'}, axis = 1), on = 'log_exp_mma').dropna(subset = ['exp_mma'])

In [None]:
# # Oct 4
# sizes_index = pd.read_csv('./../export_france/data/formatted/sizes_index.csv', index_col=0)
# n = 20
# exp_index = sizes_index.dropna(subset = ['exp_mma'])
# exp_index['exp_qcut_'+str(n)+'_label'] = pd.qcut(exp_index['exp_mma'], n, labels=range(n))
# exp_index['exp_qcut_'+str(n)] = pd.qcut(exp_index['exp_mma'], n)

# # imp_index = sizes_index.dropna(subset = ['imp_mma'])
# # imp_index['imp_cut_'+str(n)+'_label'] = pd.cut(imp_index['imp_mma'], n, labels=range(n))
# # imp_index['imp_cut_'+str(n)] = pd.cut(imp_index['imp_mma'], n)

In [None]:

# Mns = range(1,13)
# time_filtering_arrays = [Mns]

# # use all firms
# sample_n_firms = 'max'

# years = list(reversed(range(1997, 2014)))

# for bin_label in list(reversed(range(n))):
#     print 'bin label: '+str(bin_label)
#     IDs = exp_index.loc[exp_index.exp_mma_cat == bin_label].index
#     print IDs
# #     IDs = imp_index.loc[imp_index.imp_cut_20_label == bin_label].index
#     columns = ['YEAR', 'MONTH', 'FLUX', 'ID', 'CN ID 8', 'VAT','VART']
#     sample_data(columns, index = ['YEAR', 'MONTH', 'IMPORT', 'ID', 'CN ID 8', 'VAT'], chunk_agg_func = {'VART':['sum']}, 
#                 firm_filtering = True, firm_filtering_cols = ['ID'], firm_filtering_arrays = [IDs],
#                 yr_agg_func = {'VART_sum':['sum']}, n_chunks = 10., 
#                 name = 'firm_sample_YMxpb_n'+str(n).zfill(2)+str(bin_label).zfill(2),
#                 verbose = False, yr_chunks_agg = True, select_IMPORTS = 0, save_yr_agg_result = True)
