In [1]:
import os

import dask.dataframe as dd
from dask.diagnostics import ProgressBar
import pandas as pd

# Notebook display settings: render plots inline, show every DataFrame column, cap displayed rows at 10
%matplotlib inline
pd.set_option('display.max_columns', None)  # None = never truncate the column list
pd.set_option('display.max_rows', 10)       # longer frames are abbreviated when displayed


In [2]:
# Locations of the raw input files and of the processed outputs.
# The raw-data directory defaults to the original external-drive mount point, but can be
# overridden with the EXPORT_FRANCE_DATA_DIR environment variable on machines where the
# data lives elsewhere. os.path.join(..., '') guarantees the trailing separator that the
# file-name f-strings built from drive_path rely on.
_default_drive_path = '/media/matias/Elements/export_france/data/type1/'
drive_path = os.path.join(os.environ.get('EXPORT_FRANCE_DATA_DIR') or _default_drive_path, '')
save_path = './../../data/processed/'

# Column names of the raw data files, listed in file order, and the reverse lookup
# (column name -> zero-based position in the file).
colnames = ['YEAR', 'MONTH', 'FLUX', 'ID', 'DEPT', 'CN ID 8', 'CPA6', 'PYOD', 'PAYP', 'VAT', 
            'PRIFAC', 'DEVFAC', 'VFTE', 'VART', 'D_MASSE', 'MASSE', 'USUP', 'USUP_MT']
colname_no = {name: idx for idx, name in enumerate(colnames)}


In [3]:
# Function to read and preprocess the data
def get_data(columns, start_year, end_year):
    """Lazily load selected columns of the yearly raw files into one Dask DataFrame.

    One file ``DP1610_MAASTRICHT1_<year>.txt`` under ``drive_path`` is read for
    every year in ``range(start_year, end_year)``. Files are parsed as
    ``;``-delimited text without a header row.

    Parameters
    ----------
    columns : iterable of int
        Zero-based positions of the raw-file columns to load (indices into
        ``colnames``). The order only determines the column order of the result.
    start_year : int
        First year to load (inclusive).
    end_year : int
        Year at which loading stops (exclusive, as in ``range``).

    Returns
    -------
    dask.dataframe.DataFrame
        Concatenation of the yearly frames, with columns named from
        ``colnames`` and arranged in the order given by ``columns``.

    Raises
    ------
    ValueError
        If the year range is empty (``start_year >= end_year``).
    """
    if start_year >= end_year:
        raise ValueError(
            f'empty year range: start_year={start_year} must be smaller than '
            f'end_year={end_year} (end_year is exclusive)'
        )

    # De-duplicate while keeping the caller's order.
    requested = list(dict.fromkeys(columns))
    # `usecols` ignores element order: the parsed frame always comes back in file
    # order. Names therefore have to be assigned in sorted-index order, otherwise an
    # unsorted `columns` argument would silently mislabel the data.
    file_order = sorted(requested)
    file_order_names = [colnames[i] for i in file_order]
    requested_names = [colnames[i] for i in requested]

    # Columns 4 (DEPT) and 9 (VAT) are read as strings rather than inferred dtypes.
    dtype = {4: 'object', 9: 'object'}

    yearly_frames = []
    for year in range(start_year, end_year):
        df = dd.read_csv(f'{drive_path}DP1610_MAASTRICHT1_{year}.txt', usecols=file_order,
                         delimiter=';', header=None, dtype=dtype, blocksize='100MB')
        df.columns = file_order_names
        if requested_names != file_order_names:
            # Honour the column order the caller asked for.
            df = df[requested_names]
        yearly_frames.append(df)
    return dd.concat(yearly_frames)

# Example usage:
# columns_to_load = [0, 2, 3, 5, 7, 13]  # YEAR, FLUX, ID, CN ID 8, PYOD, VART
# data = get_data(columns_to_load, 1997, 2014)  # end year is exclusive: loads 1997 through 2013


In [5]:
# Function to calculate and save firm sizes and buyer-seller links
def compute_firm_sizes_and_links(data):
    """Aggregate ``VART`` per firm and per firm/VAT pair, and write both tables as CSV.

    Two aggregates are built from ``data``:

    * ``firm_sizes``: sum of ``VART`` grouped by ``ID``, ``IMPORT`` and ``YEAR``.
    * ``buyer_seller_links``: sum of ``VART`` grouped by ``ID``, ``VAT`` and ``YEAR``.

    Both are written under ``save_path`` as ``firm_sizes.csv`` and
    ``buyer_seller_links.csv`` and then returned.

    Parameters
    ----------
    data : DataFrame
        Frame containing the columns ``ID``, ``IMPORT``, ``VAT``, ``YEAR`` and
        ``VART``.
        NOTE(review): ``IMPORT`` is not one of the raw column names in
        ``colnames``; presumably it has to be derived (e.g. from ``FLUX``) before
        calling this function. Confirm against the original pipeline.

    Returns
    -------
    tuple
        ``(firm_sizes, buyer_seller_links)``, each with the group keys restored as
        ordinary columns via ``reset_index``.

    Raises
    ------
    KeyError
        If any required column is missing from ``data``. All missing columns are
        reported at once, before any computation or file output starts.
    """
    required = ('ID', 'IMPORT', 'VAT', 'YEAR', 'VART')
    available = list(data.columns)
    missing = [col for col in required if col not in available]
    if missing:
        raise KeyError(
            f'compute_firm_sizes_and_links requires columns {missing} that are not '
            f'present in data (available columns: {available})'
        )

    firm_sizes = data.groupby(['ID', 'IMPORT', 'YEAR'])['VART'].sum().reset_index()
    buyer_seller_links = data.groupby(['ID', 'VAT', 'YEAR'])['VART'].sum().reset_index()

    # Write both aggregates to disk. This does not cache anything: with Dask inputs
    # the returned frames stay lazy and are recomputed whenever they are used again.
    # NOTE(review): with Dask and no single_file=True, each target path is expected to
    # become a directory of part files rather than one CSV file. Confirm that this is
    # the intended layout.
    firm_sizes.to_csv(f'{save_path}firm_sizes.csv', index=False)
    buyer_seller_links.to_csv(f'{save_path}buyer_seller_links.csv', index=False)

    return firm_sizes, buyer_seller_links

# Example usage:
# firm_sizes, buyer_seller_links = compute_firm_sizes_and_links(data)


In [8]:

# Load the raw columns needed downstream and build the aggregates.
# VAT (column 9) is loaded because compute_firm_sizes_and_links groups by it.
# NOTE(review): compute_firm_sizes_and_links also groups by an 'IMPORT' column, which is
# not among the raw column names. Presumably it must be derived (e.g. from FLUX) before
# this call; confirm and add that step.
columns_to_load = [0, 2, 3, 5, 7, 9, 13]  # YEAR, FLUX, ID, CN ID 8, PYOD, VAT, VART
data = get_data(columns_to_load, 1997, 1999)  # end year is exclusive: loads 1997 and 1998

firm_sizes, buyer_seller_links = compute_firm_sizes_and_links(data)


FileNotFoundError: An error occurred while calling the read_csv method registered to the pandas backend.
Original Message: [Errno 2] No such file or directory: '/media/matias/Elements/export_france/data/type1/DP1610_MAASTRICHT1_1997.txt'

In [4]:
# Export the two aggregates and then the full Dask dataset as parquet, in that order,
# so later sessions can read them back without re-parsing the raw text files.
parquet_targets = [
    (firm_sizes, f'{save_path}firm_sizes.parquet'),
    (buyer_seller_links, f'{save_path}buyer_seller_links.parquet'),
    (data, f'{save_path}full_dataset.parquet'),
]
for frame, target in parquet_targets:
    frame.to_parquet(target)


NameError: name 'firm_sizes' is not defined