In [None]:
import pandas as pd
import numpy as np
import re

# Load the 2016 concatenated import records and parse 'fecha' into datetimes
# in a single expression.
df_costs = pd.read_csv(
    '/home/luisvinatea/Data/Gdrive/aquaculture/beraqua/processed/prospectos_importadores/concat/concat_2016.csv'
).assign(fecha=lambda frame: pd.to_datetime(frame['fecha']))

def clean_data(df_costs):
    """Return a cleaned version of an import-records DataFrame.

    The input frame is never modified; every step works on a new object.

    Steps, each applied only when the relevant columns are present:

    1. 'modelo_mercaderia' is split once on '_' and replaced, at the same
       column position, by a single 'modelo' column holding the text after
       the first underscore, or the whole value when there is no underscore.
    2. A fixed list of unneeded columns is dropped.
    3. 'us$_fob_unit' is computed as 'us$_fob' divided by 'cantidad'
       (preferred) or 'unidades'. Missing or zero quantities yield NaN.
    4. NaN values in 'us$_fob_unit' are replaced by the column mean and the
       column is rounded to 2 decimals.

    Parameters
    ----------
    df_costs : pandas.DataFrame
        Raw records to clean.

    Returns
    -------
    pandas.DataFrame
        The cleaned frame. If any step raises, the error is printed and the
        frame is returned in the state reached before the failure
        (best-effort behaviour; no exception propagates to the caller).
    """
    cleaned = df_costs
    try:
        if 'modelo_mercaderia' in cleaned.columns:
            position = cleaned.columns.get_loc('modelo_mercaderia')
            # n=1 -> at most two pieces: text before / after the first '_'
            pieces = (
                cleaned['modelo_mercaderia']
                .str.split(pat='_', expand=True, n=1)
                .add_prefix('modelo_mercaderia_')
            )

            # Splice the split columns in where the original column was
            cleaned = pd.concat(
                [cleaned.iloc[:, :position], pieces, cleaned.iloc[:, position + 1:]],
                axis=1,
            )

            # The second piece only exists when at least one value had a '_'.
            # Prefer it; fall back to the first piece where it is missing.
            if 'modelo_mercaderia_1' in cleaned.columns:
                cleaned['modelo_mercaderia_0'] = cleaned['modelo_mercaderia_1'].combine_first(
                    cleaned['modelo_mercaderia_0']
                )
                cleaned = cleaned.drop(columns=['modelo_mercaderia_1'])

            cleaned = cleaned.rename(columns={'modelo_mercaderia_0': 'modelo'})

        # Accented and unaccented spellings are both listed on purpose
        columns_to_drop = [
            'regimen_aduanero', 'refrendo', 'item', 'probable_importador', 'pais_de_origen', 'pais_de_procedencia', 'ciudad_embarque', 'via_de_transporte', 'aduana', 'partida_arancelaria', 'descripcion_arancelaria', 'descripción_arancelaria',
            'descripcion_producto_comercial', 'estado_de_mercancia', 'unidad_de_medida', 'embarcador',
            'empresa_de_transporte', 'pais_de_embarque', 'país_de_embarque', 'país_de_origen', 'agente_de_aduana', 'nave', 'conocimiento_de_embarque', 'peso_neto_kg', 'tipo_aforo', 'dirección_consignatario', 'caracteristicas', 'producto', 'marca', 'año_fabricación', 'código_tnan', 'contenedor',
            'conocimiento_embarque', 'agencia_de_carga'
        ]
        present = [col for col in columns_to_drop if col in cleaned.columns]
        # Non-inplace drop always returns a new frame, so from here on the
        # caller's object can no longer be affected by the assignments below.
        cleaned = cleaned.drop(columns=present)

        if 'us$_fob' in cleaned.columns:
            # 'cantidad' takes precedence over 'unidades' when both exist
            quantity_col = next(
                (col for col in ('cantidad', 'unidades') if col in cleaned.columns),
                None,
            )
            if quantity_col is not None:
                quantity = cleaned[quantity_col]
                # Mask zero quantities to NaN so the division yields NaN
                # instead of inf; already-missing quantities stay NaN.
                cleaned['us$_fob_unit'] = cleaned['us$_fob'] / quantity.where(quantity != 0)

        if 'us$_fob_unit' in cleaned.columns:
            mean_fob_unit = cleaned['us$_fob_unit'].mean(skipna=True)
            cleaned['us$_fob_unit'] = cleaned['us$_fob_unit'].fillna(mean_fob_unit).round(2)

    except Exception as e:
        print(f"An error occurred during data cleaning: {e}")

    return cleaned

# Run the cleaning routine on a copy of the loaded records, then preview the result
df_costs_clean = df_costs.copy().pipe(clean_data)
df_costs_clean.head()


In [None]:
import pandas as pd
import numpy as np

# Directory holding one wrangled aerator file per year, and the years to load
WRANGLED_DIR = '/home/luisvinatea/Data/Gdrive/aquaculture/beraqua/processed/prospectos_importadores/concat/wrangled'
YEARS = range(2017, 2024)  # 2017 through 2023 inclusive
# Rows whose unit FOB price is not above this value (US$) are discarded first
MIN_FOB_UNIT_USD = 500

file_paths = [f'{WRANGLED_DIR}/aerators_wrangled_{year}.csv' for year in YEARS]

# Read all the files into a list of DataFrames
dataframes = [pd.read_csv(file_path) for file_path in file_paths]

# Concatenate all DataFrames into one
merged_df = pd.concat(dataframes, ignore_index=True)

# Convert 'fecha' column to datetime
merged_df['fecha'] = pd.to_datetime(merged_df['fecha'])

# Set 'fecha' as the index and sort chronologically
merged_df = merged_df.set_index('fecha').sort_index()

# Keep only rows whose unit FOB price exceeds the minimum threshold
filtered_df = merged_df[merged_df['us$_fob_unit'] > MIN_FOB_UNIT_USD]

# Mean unit price of the filtered rows; used as the split point below
mean_fob_unit = filtered_df['us$_fob_unit'].mean()

# Rows priced above the mean form a separate mini dataset
mini_dataset = filtered_df[filtered_df['us$_fob_unit'] > mean_fob_unit]

# Rows priced at or below the mean form the aerator dataset
df_aerator_data = filtered_df[filtered_df['us$_fob_unit'] <= mean_fob_unit]

print(df_aerator_data.head())

# Save df_aerator_data to a CSV file.
# 'fecha' lives in the index at this point, so the index must be written;
# with index=False the exported file would contain no dates at all.
df_aerator_data.to_csv('/home/luisvinatea/Data/Gdrive/aquaculture/beraqua/reports/aerator_data.csv', index=True)