In [2]:
import pandas as pd
import os

def create_value_map(data):
    """Create a mapping from unique string values to integer codes, excluding N/A values."""
    unique_values = data.dropna().unique()
    value_map = {val: idx for idx, val in enumerate(unique_values)}
    return value_map

def preprocess_column(data, value_map):
    """Convert string values to integers based on the value map, leaving N/A values as is."""
    return data.map(value_map)

def add_features_to_csv(input_csv, new_excel_file, output_csv, column_name_mapping):
    # Read the existing CSV file
    df_existing = pd.read_csv(input_csv)

    # Read the new Excel file
    df_new = pd.read_excel(new_excel_file)

    # Define the new columns to add and their processing types
    new_columns = {
        'Wetland Type - Provincial Class': 'Class',
        'Wetland Type - Federal Class': 'Class',
        'Water Regime Indicator': 'Class',
        'Specific Vegetation Type': 'Class',
        '% Vegetation Cover for Specific Vegetation Cover Types': 'Class',
        '% High Woody Canopy Cover (>5m)': 'Class',
        '% Moss Cover': 'Class',
        'Phragmites present (Y/N)': 'Class',
        'Soil Type': 'Class',
        '% of Surface Water Present': 'Class',
        'Depth of Saturation (cm)': 'Class',
        'Average Depth of Living Moss (cm)': 'Float',
        'Average Organic Depth (cm)': 'Float',
        'Hydrogeomorphic Class': 'Class'
    }

    preprocessed_data = df_new[['ELG_Site_ID']]

    # Preprocess each column in the new data
    for old_col, processing_type in new_columns.items():
        new_col = column_name_mapping.get(old_col, old_col)
        if processing_type == 'Class':
            value_map = create_value_map(df_new[old_col])
            preprocessed_data[new_col] = preprocess_column(df_new[old_col], value_map)
        elif processing_type == 'Float':
            preprocessed_data[new_col] = pd.to_numeric(df_new[old_col], errors='coerce')

    # Merge the new features based on the 'ELG_Site_ID' and 'id'
    df_merged = df_existing.merge(preprocessed_data, left_on='id', right_on='ELG_Site_ID', how='left')

    # Drop the redundant 'ELG_Site_ID' column
    df_merged.drop(columns=['ELG_Site_ID'], inplace=True)

    # Save the updated DataFrame to a new CSV file
    df_merged.to_csv(output_csv, index=False)

# Specify the paths to the files
input_csv_file = 'output.csv'
new_excel_file = 'new_data.xlsx'
output_csv_file = 'updated_output.csv'

# Define the column name mapping (old column name to new column name)
column_name_mapping = {
    'Wetland Type - Provincial Class': 'Provincial_Class',
    'Wetland Type - Federal Class': 'Federal_Class',
    'Water Regime Indicator': 'Regime',
    'Specific Vegetation Type': 'Vegetation_Type',
    '% Vegetation Cover for Specific Vegetation Cover Types': 'Vegetation_Cover',
    '% High Woody Canopy Cover (>5m)': 'Woody_Canopy_Cover',
    '% Moss Cover': 'Moss_Cover',
    'Phragmites present (Y/N)': 'Phragmites',
    'Soil Type': 'Soil_Type',
    '% of Surface Water Present': 'Surface_Water_Present',
    'Depth of Saturation (cm)': 'Saturation_Depth',
    'Average Depth of Living Moss (cm)': 'Living_Moss_Depth',
    'Average Organic Depth (cm)': 'Organic_Depth',
    'Hydrogeomorphic Class': 'Hydrogeomorphic_Class'
}



In [3]:

# Specify the paths to the files
input_csv_file = '2_output_reg_normalized_removed.csv'
new_excel_file = 'WESP_extra.xlsx'
output_csv_file = '3_output_reg_normalized_removed_extra.csv'

# Add features to the CSV
add_features_to_csv(input_csv_file, new_excel_file, output_csv_file, column_name_mapping)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  preprocessed_data[new_col] = preprocess_column(df_new[old_col], value_map)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  preprocessed_data[new_col] = preprocess_column(df_new[old_col], value_map)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  preprocessed_data[new_col] = preprocess_column(df_new[o