In [7]:
import json
from collections import Counter

# Load the JSON data
# Decode explicitly as UTF-8: the importer names contain non-ASCII characters
# (e.g. 'ñ'), and without an explicit encoding open() falls back to the
# platform's locale encoding, which is not guaranteed to be UTF-8.
with open('/home/luisvinatea/Data/Gdrive/BERAQUA/docs/datasets/product_data.json', 'r', encoding='utf-8') as json_file:
    data = json.load(json_file)

# Function to remove 'desconocido' from all values in the dictionary
def remove_desconocido(data):
    """Return a copy of ``data`` with every 'desconocido' entry dropped.

    Args:
        data: Mapping of field name -> mapping of importer -> iterable of
            values.

    Returns:
        A new nested dict with the same fields and importers, in the same
        order, where each value list omits entries equal to the string
        'desconocido'. Importers whose values were all 'desconocido' are kept
        with an empty list.

    The input is not modified, so the raw data stays available to the caller
    and re-running the call always starts from the same state.
    """
    return {
        field: {
            importer: [value for value in values if value != 'desconocido']
            for importer, values in importers.items()
        }
        for field, importers in data.items()
    }

# Drop every 'desconocido' placeholder entry from each importer's value list
cleaned_data = remove_desconocido(data)

# Function to remove duplicate values from each importer's list
def remove_duplicates_and_count(data):
    """Return a copy of ``data`` with duplicates removed from every value list.

    Despite the function's name, occurrence counts are NOT part of the
    result: only the distinct values are returned, in order of first
    appearance.

    Args:
        data: Mapping of field name -> mapping of importer -> iterable of
            hashable values.

    Returns:
        A new nested dict with the same fields and importers, where each
        value list contains every distinct value exactly once.

    The input is not modified.
    """
    return {
        field: {
            # dict.fromkeys keeps first-seen order while discarding repeats
            importer: list(dict.fromkeys(values))
            for importer, values in importers.items()
        }
        for field, importers in data.items()
    }

# Collapse repeated values so each importer lists every value only once
unique_data = remove_duplicates_and_count(cleaned_data)

# Dump the result field by field for a visual check
for field in unique_data:
    print(f"Field: {field}")
    for importer, values in unique_data[field].items():
        print(f"Importer: {importer} -> {values}")
    print("\n")

Field: telefono
Importer: acuarios_del_mar_s.a._acdelmar -> []
Importer: allamericancorp_s.a. -> []
Importer: aquagroup_cia._ltda. -> ['02485789-413']
Importer: arboleda_faini_juan_xavier -> []
Importer: automotores_de_francia_compañia_anonima_autofranci -> ['02_2482002']
Importer: automotores_y_anexos_s.a._ayasa -> ['02_2222740_ext_202']
Importer: camaronera_agromarina -> []
Importer: camarosursa_sa -> []
Importer: campoverde_palacios_marco_vinicio -> []
Importer: comercial_kywi_s.a. -> ['02_2501713__ext_2239_/_501722_/_501716']
Importer: consorcio_generacion_ecuador -> []
Importer: corpiecam_cia_ltda -> []
Importer: crustaceos_&_peces_de_sudamerica_s.a._crupesa -> []
Importer: crustaceos_&_peces_de_sudamerica_s_a_crupesa -> []
Importer: dolca_s.a. -> ['593_(4)_2240300']
Importer: dreu_sa -> ['099427930']
Importer: ecuacobre_fv.s.a -> ['02_2332233']
Importer: ecuayamata_s.a -> ['04_2323940']
Importer: ecuayamata_s.a. -> ['04_2323940']
Importer: edesa_sa -> ['02_2671717']
Importer: ele

In [8]:
unique_data_path = '/home/luisvinatea/Data/Gdrive/BERAQUA/docs/datasets/product_data_unique.json'
# Write with an explicit UTF-8 encoding so the file does not depend on the
# platform's locale encoding (matches the explicit encoding used for the CSVs).
with open(unique_data_path, 'w', encoding='utf-8') as json_file:
    json.dump(unique_data, json_file, indent=4)

# Confirm save
print(f"Unique data saved to {unique_data_path}")

Unique data saved to /home/luisvinatea/Data/Gdrive/BERAQUA/docs/datasets/product_data_unique.json


In [12]:
import csv
import os

# Path to the cleaned and unique JSON data
json_data_path = '/home/luisvinatea/Data/Gdrive/BERAQUA/docs/datasets/product_data_unique.json'

# Path to save the CSV files
csv_output_path = '/home/luisvinatea/Data/Gdrive/BERAQUA/docs/datasets/csv/'

# Load the cleaned JSON data
# Decode explicitly as UTF-8 instead of relying on the platform's locale
# encoding, so the load behaves the same on every machine.
with open(json_data_path, 'r', encoding='utf-8') as json_file:
    data = json.load(json_file)

# Function to save each field as a separate CSV file with dynamic columns
def save_as_csv(data, output_path):
    """Write one CSV file per field, with one row per importer.

    For each field, the file ``<field>.csv`` is written inside
    ``output_path``. Its header is ``Importer`` followed by ``<field>_1`` ..
    ``<field>_n``, where n is the longest value list of any importer in that
    field. Shorter rows are padded with the literal string 'nan' so every
    row has the same number of columns. The path of each written file is
    printed.

    Args:
        data: Mapping of field name -> mapping of importer -> list of values.
        output_path: Directory to write into; created (including parents)
            if it does not exist.

    A field with no importers produces a file containing only the
    ``Importer`` header instead of raising.
    """
    # exist_ok=True replaces the separate exists() check and is a no-op when
    # the directory is already present.
    os.makedirs(output_path, exist_ok=True)

    for field, importers in data.items():
        # default=0 guards against a field with no importers, for which a
        # bare max() over an empty sequence would raise ValueError.
        max_entries = max((len(values) for values in importers.values()), default=0)

        # Column headers: 'Importer', '<field>_1', ..., '<field>_n'
        headers = ['Importer'] + [f"{field}_{i+1}" for i in range(max_entries)]

        # The CSV filename is derived from the field name
        csv_filepath = os.path.join(output_path, f"{field}.csv")

        # newline='' lets the csv module control line endings itself
        with open(csv_filepath, 'w', newline='', encoding='utf-8') as csv_file:
            writer = csv.writer(csv_file)
            writer.writerow(headers)

            for importer, values in importers.items():
                # Pad with 'nan' so every row matches the header width
                padded_values = values + ['nan'] * (max_entries - len(values))
                writer.writerow([importer] + padded_values)

        print(f"CSV file for '{field}' saved to {csv_filepath}")

# Write one CSV file per field into csv_output_path (the directory is created if missing)
save_as_csv(data, csv_output_path)

CSV file for 'telefono' saved to /home/luisvinatea/Data/Gdrive/BERAQUA/docs/datasets/csv/telefono.csv
CSV file for 'agencia_de_transporte' saved to /home/luisvinatea/Data/Gdrive/BERAQUA/docs/datasets/csv/agencia_de_transporte.csv
CSV file for 'email' saved to /home/luisvinatea/Data/Gdrive/BERAQUA/docs/datasets/csv/email.csv
CSV file for 'direccion' saved to /home/luisvinatea/Data/Gdrive/BERAQUA/docs/datasets/csv/direccion.csv
CSV file for 'descripcion_arancelaria' saved to /home/luisvinatea/Data/Gdrive/BERAQUA/docs/datasets/csv/descripcion_arancelaria.csv
CSV file for 'pais_de_origen' saved to /home/luisvinatea/Data/Gdrive/BERAQUA/docs/datasets/csv/pais_de_origen.csv
CSV file for 'us$_fob' saved to /home/luisvinatea/Data/Gdrive/BERAQUA/docs/datasets/csv/us$_fob.csv
CSV file for 'agencia_de_carga' saved to /home/luisvinatea/Data/Gdrive/BERAQUA/docs/datasets/csv/agencia_de_carga.csv
CSV file for 'modelo_mercaderia' saved to /home/luisvinatea/Data/Gdrive/BERAQUA/docs/datasets/csv/modelo_m