In [39]:
import os

import dask
import dask.dataframe as dd
import pandas as pd
from dask.diagnostics import ProgressBar

# Notebook settings for better control over execution and display
%matplotlib inline
# Show every column of a frame, but cap the number of printed rows at 10.
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", 10)


In [40]:
# Locations of the raw yearly files (input) and of the processed outputs.
# Both end with a slash because file names are appended by plain string formatting.
drive_path = '/media/matias/Elements/export_france/data/type1/DP1610_MAASTRICHT1_1997_2013/'
save_path = './../../data/processed/'

# Names of the raw-file columns, listed in file order (the files have no header row).
colnames = [
    'YEAR', 'MONTH', 'FLUX', 'ID', 'DEPT', 'CN ID 8',
    'CPA6', 'PYOD', 'PAYP', 'VAT', 'PRIFAC', 'DEVFAC',
    'VFTE', 'VART', 'D_MASSE', 'MASSE', 'USUP', 'USUP_MT',
]
# Reverse lookup: column name -> zero-based position in the raw file.
colname_no = dict(zip(colnames, range(len(colnames))))


In [41]:
# Function to read and preprocess the data
def get_data(columns, start_year, end_year):
    """Lazily load selected columns of the yearly raw files into one Dask DataFrame.

    One file per year is read from ``drive_path`` (named
    ``DP1610_MAASTRICHT1_<year>.txt``, ``;``-delimited, no header row) and the
    per-year frames are concatenated. Nothing is read until the result is computed.

    Parameters
    ----------
    columns : iterable of int
        Zero-based positions of the raw-file columns to load, i.e. indices
        into ``colnames``. Order and duplicates are irrelevant: the result
        always has the columns in file order.
    start_year : int
        First year to load (inclusive).
    end_year : int
        Year at which loading stops (exclusive), as in ``range``.

    Returns
    -------
    dask.dataframe.DataFrame
        The selected columns, labelled with the matching ``colnames`` entries.

    Raises
    ------
    ValueError
        If the year range is empty.
    """
    # pandas ignores the order of `usecols` and always returns the columns in
    # file order, so the labels must be generated in that same (sorted) order;
    # otherwise an unsorted `columns` argument would silently mislabel the data.
    columns = sorted(set(columns))
    labels = [colnames[i] for i in columns]

    years = range(start_year, end_year)
    if not years:
        raise ValueError(
            f'Empty year range: start_year={start_year}, end_year={end_year} '
            '(end_year is exclusive)'
        )

    # With header=None pandas labels columns by position, hence the integer keys.
    # Positions 4 (DEPT) and 9 (VAT) are read as strings -- presumably because
    # they hold non-numeric codes; TODO confirm against the raw files.
    dtype = {4: 'object', 9: 'object'}

    df_list = []
    for y in years:
        df = dd.read_csv(f'{drive_path}DP1610_MAASTRICHT1_{y}.txt', usecols=columns,
                         delimiter=';', header=None, dtype=dtype, blocksize='100MB')
        df.columns = labels  # Set the column names
        df_list.append(df)
    return dd.concat(df_list)

# Example usage:
# columns_to_load = [0, 2, 3, 5, 7, 13]  # YEAR, FLUX, ID, CN ID 8, PYOD, VART
# data = get_data(columns_to_load, 1997, 2014)


In [42]:
# Function to calculate and save firm sizes and buyer-seller links
def compute_firm_sizes_and_links(data):
    """Aggregate VART by firm and by firm/VAT pair, and export both tables as CSV.

    Parameters
    ----------
    data : dask.dataframe.DataFrame
        Must contain the columns ``ID``, ``IMPORT``, ``VAT``, ``YEAR`` and
        ``VART``. ``IMPORT`` is not a raw-file column; the caller has to add it
        before calling this function.

    Returns
    -------
    (firm_sizes, buyer_seller_links) : tuple of dask.dataframe.DataFrame
        ``firm_sizes`` is the sum of ``VART`` per (``ID``, ``IMPORT``, ``YEAR``);
        ``buyer_seller_links`` is the sum of ``VART`` per (``ID``, ``VAT``, ``YEAR``).
        Both are still lazy: the CSV files are the persisted result, and using
        the returned frames again re-runs the aggregation.

    Side effects
    ------------
    Writes ``firm_sizes.csv`` and ``buyer_seller_links.csv`` (one file each)
    into ``save_path``, creating that directory if necessary.
    """
    firm_sizes = data.groupby(['ID', 'IMPORT', 'YEAR'])['VART'].sum().reset_index()
    buyer_seller_links = data.groupby(['ID', 'VAT', 'YEAR'])['VART'].sum().reset_index()

    # Writing into a missing directory would fail only after the expensive
    # aggregation has started, so make sure it exists up front.
    os.makedirs(save_path, exist_ok=True)

    # Build both exports lazily and run them in a single dask.compute call, so
    # the scheduler can share the tasks common to both aggregations instead of
    # executing the whole pipeline once per output file.
    pending_writes = (
        firm_sizes.to_csv(f'{save_path}firm_sizes.csv', index=False,
                          single_file=True, compute=False)
        + buyer_seller_links.to_csv(f'{save_path}buyer_seller_links.csv', index=False,
                                    single_file=True, compute=False)
    )
    dask.compute(*pending_writes)

    return firm_sizes, buyer_seller_links

# Example usage:
# firm_sizes, buyer_seller_links = compute_firm_sizes_and_links(data)


In [43]:
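# Troubleshooting note: an earlier run raised
# FileNotFoundError: [Errno 2] No such file or directory: '/media/matias/Elements/export_france/data/type1/DP1610_MAASTRICHT1_1997.txt'
# The yearly files appear to live one directory deeper, so drive_path must point to:
# '/media/matias/Elements/export_france/data/type1/DP1610_MAASTRICHT1_1997_2013/'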

In [44]:

# Example usage: load the years 1997 and 1998 (end_year is exclusive) and
# build the two aggregate tables.
# Column positions are looked up by name so the list cannot drift out of sync
# with a hand-written comment.
selected_columns = ['YEAR', 'FLUX', 'ID', 'CN ID 8', 'PYOD', 'VAT', 'VART']
columns_to_load = [colname_no[name] for name in selected_columns]  # [0, 2, 3, 5, 7, 9, 13]
data = get_data(columns_to_load, 1997, 1999)
# IMPORT is the parity of FLUX -- presumably odd FLUX codes mark imports; TODO confirm.
data['IMPORT'] = data['FLUX'] % 2

# The aggregation scans every raw file and runs for a long time; show progress
# so the cell is not mistaken for a hang.
with ProgressBar():
    firm_sizes, buyer_seller_links = compute_firm_sizes_and_links(data)


KeyboardInterrupt: 

In [None]:
# IPython introspection of the lazy Dask DataFrame (debugging aid: prints its
# type, structure and class docstring). Safe to drop from the final notebook.
data?

[0;31mType:[0m        DataFrame
[0;31mString form:[0m
Dask DataFrame Structure:
           YEAR   FLUX     ID CN ID 8    PYOD     VAT   VART IMPO <...>         ...    ...    ...     ...     ...     ...    ...    ...
           Dask Name: assign, 8 graph layers
[0;31mLength:[0m      40209836
[0;31mFile:[0m        ~/anaconda3/envs/base2/lib/python3.11/site-packages/dask/dataframe/core.py
[0;31mDocstring:[0m  
Parallel Pandas DataFrame

Do not use this class directly.  Instead use functions like
``dd.read_csv``, ``dd.read_parquet``, or ``dd.from_pandas``.

Parameters
----------
dsk: dict
    The dask graph to compute this DataFrame
name: str
    The key prefix that specifies which keys in the dask comprise this
    particular DataFrame
meta: pandas.DataFrame
    An empty ``pandas.DataFrame`` with names, dtypes, and index matching
    the expected output.
divisions: tuple of index values
    Values along which we partition our blocks on the index

In [None]:
# Dask is lazy: the row count comes back as a Delayed object (it would require
# reading the files), while the column count is known up front.
data.shape

(Delayed('int-fd56ec31-6f24-4381-979b-c3dc2ef978fd'), 8)

In [None]:
# One-time setup: %pip install pyarrow==14.0.2  (presumably required by the to_parquet calls in the next cell)

Collecting pyarrow
  Downloading pyarrow-14.0.2-cp311-cp311-manylinux_2_28_x86_64.whl (38.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m38.0/38.0 MB[0m [31m16.6 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Installing collected packages: pyarrow
Successfully installed pyarrow-14.0.2
Note: you may need to restart the kernel to use updated packages.


In [None]:
# Save the intermediate results and the full dataset as parquet for efficient
# access later on.
# firm_sizes, buyer_seller_links and data are all lazy, so three separate
# to_parquet calls would each re-run the pipeline from the raw files. Scheduling
# the writes with compute=False and running them in one dask.compute call lets
# the scheduler share the work common to all three.
parquet_writes = [
    firm_sizes.to_parquet(f'{save_path}firm_sizes.parquet', compute=False),
    buyer_seller_links.to_parquet(f'{save_path}buyer_seller_links.parquet', compute=False),
    data.to_parquet(f'{save_path}full_dataset.parquet', compute=False),
]
with ProgressBar():
    dask.compute(*parquet_writes)
# 7 mins (timing noted by the author on an earlier run with separate writes)

In [None]:
# Scratch check: full path of the 1998 raw file, i.e. drive_path + 'DP1610_MAASTRICHT1_1998.txt'.
'/media/matias/Elements/export_france/data/type1/DP1610_MAASTRICHT1_1997_2013/DP1610_MAASTRICHT1_1998.txt'
'/media/matias/Elements/export_france/data/type1/DP1610_MAASTRICHT1_1997_2013/DP1610_MAASTRICHT1_1998.txt'
