1. Loading The Data

In [41]:
import pandas as pd

def load_data(file_paths):
    """
    Load data from specified file paths into pandas DataFrames.
    
    Args:
    file_paths (dict): A dictionary where keys are descriptive names of the data and values are the file paths.
    
    Returns:
    dict: A dictionary containing loaded dataframes.
    """
    return {name: pd.read_csv(path) for name, path in file_paths.items()}

def calculate_missing_values(dataframes):
    """
    Calculate and print the percentage of missing values for each dataframe.
    
    Args:
    dataframes (dict): A dictionary of pandas DataFrames.
    """
    missing_values_summary = {
        name: (df.isnull().sum() / len(df) * 100).sort_values(ascending=False)
        for name, df in dataframes.items()
    }
    
    for dataset_name, missing_values in missing_values_summary.items():
        if not missing_values.empty:
            print(f"Missing Values in {dataset_name} Dataset:\n{missing_values}\n")
        else:
            print(f"No missing values in {dataset_name} Dataset.\n")

# Example Usage
data_files = {
    "cycles": "data/cycles.csv",
    "farms": "data/farms.csv",
    "fasting": "data/fasting.csv",
    "fastings": "data/fastings.csv",
    "feed_tray": "data/feed_tray.csv",
    "feeds": "data/feeds.csv",
    "harvests": "data/harvests.csv",
    "measurements": "data/measurements.csv",
    "mortalities": "data/mortalities.csv",
    "ponds": "data/ponds.csv",
    "samplings": "data/samplings.csv"
}

dataframes = load_data(data_files)
calculate_missing_values(dataframes)


Missing Values in cycles Dataset:
ordered_at                58.196408
remark                    48.949178
species_id                31.104318
hatchery_id               17.768437
hatchery_name             17.768437
total_seed_type            9.247230
pond_depth                 4.508980
initial_age                1.834161
limit_weight_per_area      0.267482
pond_length                0.229270
pond_width                 0.229270
target_size                0.152847
target_cultivation_day     0.114635
finished_at                0.038212
id                         0.000000
total_seed                 0.000000
created_at                 0.000000
started_at                 0.000000
pond_id                    0.000000
updated_at                 0.000000
subscription_type          0.000000
extracted_at               0.000000
area                       0.000000
pond_name                  0.000000
dtype: float64

Missing Values in farms Dataset:
regency     16.878403
province    13.067151
id       

2. Calculate Survival Rate

In [42]:
def calculate_survival_rate(dataframe):
    """
    Calculate the Survival Rate (SR) for shrimp cycles.
    
    Args:
    dataframe (pd.DataFrame): DataFrame containing columns 'total_harvested_shrimp' and 'total_seed'.
    
    Returns:
    pd.DataFrame: DataFrame with an additional 'SR' column.
    """
    dataframe['SR'] = (dataframe['total_harvested_shrimp'] / dataframe['total_seed']) * 100
    return dataframe


3. Calculating Cycle Age

In [43]:
def calculate_cycle_age(dataframe, start_date_col, end_date_col):
    """
    Calculate the cycle age in days.
    
    Args:
    dataframe (pd.DataFrame): DataFrame with cycle start and end dates.
    start_date_col (str): Column name for the start date of the cycle.
    end_date_col (str): Column name for the end date of the cycle.
    
    Returns:
    pd.DataFrame: DataFrame with an additional 'cycle_age_days' column.
    """
    dataframe['cycle_age_days'] = (pd.to_datetime(dataframe[end_date_col]) - pd.to_datetime(dataframe[start_date_col])).dt.days
    return dataframe


4. Calculating Mean Measurement for Environmental Factors

In [44]:
def calculate_mean_measurements(dataframe, measurement_cols):
    """
    Calculate mean measurements for specified columns.
    
    Args:
    dataframe (pd.DataFrame): DataFrame containing measurement data.
    measurement_cols (list of str): List of column names to calculate the mean for.
    
    Returns:
    pd.DataFrame: DataFrame with mean values calculated for specified columns.
    """
    for col in measurement_cols:
        mean_col_name = f"mean_{col}"
        dataframe[mean_col_name] = dataframe[col].mean()
    return dataframe


5. Calculating Total Shrimp and Total Weight

In [46]:
def calculate_harvest_totals(harvests_df):
    """
    Calculate the total number of harvested shrimp and total weight, then aggregate by cycle_id.
    
    Args:
    harvests_df (pd.DataFrame): DataFrame containing harvest data.
    
    Returns:
    pd.DataFrame: DataFrame with aggregated harvest totals and total weight by cycle.
    """
    # Calculate the total harvested shrimp by multiplying weight and size
    harvests_df['total_harvested_shrimp'] = harvests_df['weight'] * harvests_df['size']
    
    # Assuming total weight is just the sum of weights - modify this as necessary
    harvests_df['total_weight'] = harvests_df['weight']
    
    # Aggregate the harvests data by cycle_id for total harvested shrimp and total weight
    aggregated_harvests_df = harvests_df.groupby('cycle_id').agg({
        'total_harvested_shrimp': 'sum',
        'total_weight': 'sum'
    }).reset_index()
    
    return aggregated_harvests_df

# Example Usage
harvests_df = pd.read_csv("data/harvests.csv")
harvest_totals = calculate_harvest_totals(harvests_df)
final_data = merge_cycle_harvest_data(cleaned_cycles, harvest_totals)

# Assuming 'dataframes' contains all your relevant datasets
harvests_df = calculate_survival_rate(harvests_df)
cycles_df = calculate_cycle_age(cycles_df, 'start_date', 'end_date')
environmental_df = calculate_mean_measurements(environmental_df, ['temperature', 'salinity', 'pH'])

# Continue with data preparation and model training...


KeyError: 'total_seed'

In [None]:
final_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2490 entries, 0 to 2489
Data columns (total 27 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   id                      2490 non-null   int64  
 1   pond_id                 2490 non-null   int64  
 2   species_id              1700 non-null   float64
 3   total_seed              2490 non-null   int64  
 4   started_at              2490 non-null   object 
 5   finished_at             2490 non-null   object 
 6   remark                  1328 non-null   object 
 7   created_at              2490 non-null   object 
 8   updated_at              2490 non-null   object 
 9   area                    2490 non-null   float64
 10  initial_age             2443 non-null   float64
 11  limit_weight_per_area   2483 non-null   float64
 12  target_cultivation_day  2487 non-null   float64
 13  target_size             2486 non-null   float64
 14  extracted_at            2490 non-null   

In [None]:
import pandas as pd

def prepare_model_data(dataframe, fill_strategy='median'):
    """
    Prepare the data for modeling by filling missing values according to the specified strategy and selecting relevant features.
    
    Args:
    dataframe (pd.DataFrame): DataFrame to be prepared.
    fill_strategy (str): Strategy to fill missing values ('median', 'mean', etc.).
    
    Returns:
    pd.DataFrame: DataFrame ready for modeling, or None if critical errors are found.
    """
    features = ["cycle_age_days", "total_seed", "area", "total_shrimp", "total_weight",
                "feed_quantity", "morning_temperature", "evening_temperature", "morning_do", 
                "evening_do", "morning_salinity", "evening_salinity", "morning_pH", "evening_pH",
                "nitrate", "nitrite", "alkalinity", "SR"]  # Including 'SR' in the features list

    # Check for missing columns
    missing_columns = [col for col in features if col not in dataframe.columns]
    if missing_columns:
        print(f"Error: Missing columns in the dataframe - {missing_columns}")
        return None  # or handle differently based on your needs

    # Fill missing values
    numeric_cols = dataframe.select_dtypes(include=[np.number]).columns.tolist()
    if fill_strategy == 'median':
        dataframe[numeric_cols] = dataframe[numeric_cols].fillna(dataframe[numeric_cols].median())
    elif fill_strategy == 'mean':
        dataframe[numeric_cols] = dataframe[numeric_cols].fillna(dataframe[numeric_cols].mean())

    # Select features
    dataframe = dataframe[features]

    return dataframe

# Example Usage
# Assuming 'final_data' is already loaded and is the raw DataFrame
prepared_data = prepare_model_data(final_data)

if prepared_data is not None:
    print("Data prepared successfully.")
else:
    print("Failed to prepare data.")


Error: Missing columns in the dataframe - ['cycle_age_days', 'total_shrimp', 'feed_quantity', 'morning_temperature', 'evening_temperature', 'morning_do', 'evening_do', 'morning_salinity', 'evening_salinity', 'morning_pH', 'evening_pH', 'nitrate', 'nitrite', 'alkalinity', 'SR']
Failed to prepare data.
