In [1]:
# Import necessary modules
import pandas as pd
import matplotlib.pyplot as plt
import dask.dataframe as dd
from dask.diagnostics import ProgressBar
from numpy import arange, log10

# Import custom functions; add ./../../scripts/ to sys.path so the `functions` module can be imported
import sys
sys.path.append('./../../scripts/')
from functions import chunk, agg, finalize

# Set up Matplotlib to display plots inline
%matplotlib inline


In [4]:

# Custom Dask aggregations assembled from the chunk/agg/finalize helpers
# imported from the `functions` module.
# NOTE(review): `first` is built from exactly the same three callables as
# `tunique`, so the two differ only by name — confirm this is intended.
tunique = dd.Aggregation('tunique', chunk, agg, finalize)
first = dd.Aggregation('first', chunk, agg, finalize)

# Root folder of the raw dataset (absolute path on an external drive).
# NOTE(review): the recorded run of this cell ended in FileNotFoundError for
# .../samplings/YMxpb_size2000.csv — check that the drive is mounted and that
# the file-name pattern below matches the files on disk.
drive_path = '/media/matias/Elements/export_france/data/type1/DP1610_MAASTRICHT1_1997_2013'

# Number of sample files to read (suffixes 00 .. n_bins - 1) and the columns kept from each
n_bins = 20
columns_to_load = ['YEAR', 'ID', 'VAT', 'VART_sum']

# One lazy Dask DataFrame per sample file
df_list = [
    dd.read_csv(
        f'{drive_path}/samplings/YMxpb_size20{str(dataset_i).zfill(2)}.csv',
        usecols=columns_to_load,
    )
    for dataset_i in range(n_bins)
]

# Stack all samples and total 'VART_sum' per (YEAR, ID, VAT) combination
data = (
    dd.concat(df_list)
    .groupby(['YEAR', 'ID', 'VAT'])['VART_sum']
    .sum()
    .reset_index()
)

# Materialise the lazy result as a pandas DataFrame, showing progress
with ProgressBar():
    out = data.compute()

# Persist the aggregated table
out.to_csv('./../../data/buyer_seller_links.csv', index=False)


FileNotFoundError: An error occurred while calling the read_csv method registered to the pandas backend.
Original Message: [Errno 2] No such file or directory: '/media/matias/Elements/export_france/data/type1/DP1610_MAASTRICHT1_1997_2013/samplings/YMxpb_size2000.csv'

In [None]:
import pandas as pd
from dask.diagnostics import ProgressBar
from numpy import arange, log10
from numpy.random import RandomState

# Degree tables per (window width, center year).
#
# For every combination this cell:
#   1. keeps the rows of `data` whose YEAR lies inside the window centered on Yc,
#   2. aggregates VAT per ID with the custom `tunique` aggregation -> ID_degree
#      (presumably a distinct count; `tunique` comes from scripts/functions — confirm),
#   3. bins IDs by log10(ID_degree) and draws IDS_PER_BIN IDs per non-empty bin,
#      with replacement,
#   4. aggregates ID per VAT with `tunique` on the sampled IDs only -> VAT_degree,
#   5. writes both tables as CSV files into the current working directory.
#
# Depends on `data` (Dask DataFrame with YEAR, ID, VAT, VART_sum) and `tunique`
# defined in the earlier cells.

SAMPLE_SEED = 42   # fixed seed so the per-bin sampling is reproducible on re-run
IDS_PER_BIN = 200  # number of IDs drawn (with replacement) from each degree bin
LOG_DEGREE_BIN_EDGES = arange(-.49, 5.99, .25)

# One generator shared by all iterations: its state advances from draw to draw,
# so different bins / iterations do not reuse the same random positions.
sampler = RandomState(SAMPLE_SEED)

center_years = arange(1997, 2014, 2)  # does not depend on `window`, so built once

for window in [1, 3, 5]:
    gap = (window - 1) / 2  # half-width of the window, in years

    for Yc in center_years:
        print(f'Window: {window}, Center Year: {Yc}')

        # Keep only the years in [Yc - gap, Yc + gap]. The absolute value is
        # essential: `YEAR - Yc <= gap` alone also keeps every year *before*
        # the window, so all windows would share the same lower bound.
        data_sec = data.loc[(data.YEAR - Yc).abs() <= gap]

        # Calculate ID_degree
        data_sec_by_ID = data_sec.groupby(['ID']).agg({'VAT': tunique, 'VART_sum': 'sum'})
        ID_degree = data_sec_by_ID[['VAT']].reset_index()
        ID_degree.columns = ['ID', 'ID_degree']
        ID_degree['center_year'] = Yc
        ID_degree['window'] = window

        # Compute and save ID_deg
        with ProgressBar():
            ID_deg = ID_degree.compute()
        ID_deg['bin'] = pd.cut(log10(ID_deg['ID_degree']), bins=LOG_DEGREE_BIN_EDGES)
        ID_deg.to_csv(f'ID_deg_{Yc}_{window}.csv')

        # Sample IDs: the same number from every non-empty log-degree bin
        sampling = ID_deg.groupby(['bin'], observed=True).apply(
            lambda x: x.sample(IDS_PER_BIN, replace=True, random_state=sampler))
        # Sampling with replacement yields duplicates; `isin` only needs each ID once
        sampled_ids = sampling['ID'].unique()
        data_sec_sample = data_sec.loc[data_sec.ID.isin(sampled_ids)]

        # Calculate VAT_degree (restricted to links of the sampled IDs)
        data_sec_by_VAT = data_sec_sample.groupby(['VAT']).agg({'ID': tunique, 'VART_sum': 'sum'})
        VAT_degree = data_sec_by_VAT[['ID']].reset_index()
        VAT_degree.columns = ['VAT', 'VAT_degree']
        VAT_degree['center_year'] = Yc
        VAT_degree['window'] = window

        # Compute and save VAT_deg
        with ProgressBar():
            VAT_deg = VAT_degree.compute()
        VAT_deg.to_csv(f'VAT_deg_save_{Yc}_{window}.csv')


5
1997
[###############                         ] | 37% Completed | 22.9s

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import dask.dataframe as dd
from dask.diagnostics import ProgressBar
from numpy import arange, log10, power

# Set matplotlib to inline and define a function for histogram plotting
%matplotlib inline

def plot_histogram(data, column, bins_range):
    """Draw and show a histogram of log10 of one column.

    Parameters
    ----------
    data : object indexable by ``column`` (e.g. a DataFrame)
        Source of the values to plot.
    column : str
        Name of the column whose base-10 logarithm is histogrammed.
    bins_range : sequence or int
        Passed straight through as ``bins`` to ``plt.hist``; it applies to the
        log10-transformed values, not to the raw ones.
    """
    log_values = log10(data[column])
    plt.hist(log_values, bins=bins_range)
    plt.show()

# Define path for data: folder (relative to the working directory) that result
# files are written to. It is joined to file names by plain string
# concatenation, so the trailing slash must be kept.
data_path = './../../data/'

In [None]:

# Concatenate and compute VAT degrees
# NOTE(review): `VAT_degree_res` is not defined in any visible cell; presumably
# a list of Dask DataFrames collected per (window, center year) — confirm.
# The concatenation is bound to a new name instead of overwriting
# `VAT_degree_res`, so this cell can be re-run without its input having been
# replaced by its own output.
VAT_degree_all = dd.concat(VAT_degree_res)
with ProgressBar():
    VAT_deg = VAT_degree_all.compute()

# Save VAT degrees to file
VAT_deg.to_csv(data_path + 'VAT_deg_save.csv', index=False)

# ID_deg_res = dd.concat(ID_degree_res)
# with ProgressBar():
#     ID_deg = ID_deg_res.compute()
# ID_deg.to_csv(data_path + 'ID_deg_save.csv', index=False)

# Merge and compute assortativity information
# Uses `data_sec_sample`, `ID_degree` and `VAT_degree` as left behind by an
# earlier cell. Mean VART_sum per (ID, VAT) link is joined with both degree
# tables; no `on=` is given, so each merge joins on whatever column names the
# two frames have in common.
link_mean_value = data_sec_sample.groupby(['ID', 'VAT'])[['VART_sum']].mean().reset_index()
assortativity_info = link_mean_value.merge(ID_degree).merge(VAT_degree)

with ProgressBar():
    assortativity_df = assortativity_info.compute()

In [None]:
# Histogram of log10(ID_degree), using bin edges from -0.49 up to (not
# including) 5.99 in steps of 0.25. Requires `ID_deg` from an earlier cell.
log_degree_bin_edges = arange(-.49, 5.99, .25)
plot_histogram(ID_deg, 'ID_degree', log_degree_bin_edges)


In [None]:

# Creating bins for degree data: log-spaced intervals 10**e .. 10**(e + 0.2),
# with both edges rounded to one decimal
bin_exponents = arange(-.1, 5, .2)
left = power(10, bin_exponents).round(1)
right = power(10, bin_exponents + .2).round(1)
bins = pd.IntervalIndex.from_arrays(left, right)

# Sample IDs and prepare data for assortativity analysis
# `Yc`, `window` and `gap` are whatever values an earlier cell left behind.
ID_deg_part = ID_deg.loc[(ID_deg.center_year == Yc) & (ID_deg.window == window)]

# Group once by degree bin and reuse the grouping for both the counts and the sampling
ID_deg_by_bin = ID_deg_part.groupby(pd.cut(ID_deg_part['ID_degree'], bins), observed=True)

# Rows per degree bin; kept in a variable so it can be inspected
# (the bare expression in the middle of a cell was never displayed)
ID_deg_bin_counts = ID_deg_by_bin.count().sort_index()

# 1000 IDs per non-empty degree bin, drawn with replacement
sample_IDs = ID_deg_by_bin.apply(lambda x: x.sample(1000, replace=True))['ID'].values

# Keep only the years in [Yc - gap, Yc + gap] for the sampled IDs. The
# absolute value is essential: `YEAR - Yc <= gap` alone also keeps every year
# before the window.
data_sec = data.loc[((data.YEAR - Yc).abs() <= gap) & (data.ID.isin(sample_IDs))]

# Compute summary results
# NOTE(review): `results` is not defined in any visible cell — confirm where it comes from.
summary_results = []
for df_degrees in results:
    with ProgressBar():
        x = df_degrees.compute()
    # TODO: `{...}` is a placeholder (a set holding Ellipsis), not a valid
    # aggregation spec — replace it with the intended column -> function mapping.
    summary_result = x.groupby(pd.cut(x['ID_degree'], bins)).agg({...})  # Add relevant aggregation
    summary_results.append(summary_result)

# Concatenate and save summary results
pd.concat(summary_results).dropna().to_csv(data_path + 'assortativity_summary.csv')


In [None]:

# Quantile plot on log-log axes: ID_degree against VAT_degree, taken from the
# 25% / 50% / 75% quantiles within each VAT_degree_bin group.
# Requires `df_degrees` (with a 'VAT_degree_bin' column) from an earlier cell.
fig, ax = plt.subplots(1)

degree_quartiles = df_degrees.groupby('VAT_degree_bin').quantile([.25, .5, .75])
degree_quartiles.plot(x='VAT_degree', y='ID_degree', marker='', ax=ax)

# Both axes logarithmic (x first, then y)
for set_scale in (ax.set_xscale, ax.set_yscale):
    set_scale('log')

plt.show()



In [None]:
# ### Choose a bin_size, 
# size_df_list = n_size_bins * [ None ]

# for s in range(14, n_size_bins):

#     bs_ix_df = exp_index.loc[exp_index.size_bins == s]

#     size_i_df_list = []
#     for dataset_i in bs_ix_df.exp_mma_cat.unique():
#         df = df_list[dataset_i]
#         size_i_df_list += [df.loc[df.ID.isin(bs_ix_df.ID)]]

#         size_df_list[s] = pd.concat(size_i_df_list)