In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import plotly
import plotly.graph_objs as go
import plotly.express as px
from plotly.subplots import make_subplots
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

# Split a DataFrame into its feature matrix (X) and its target vector (y)
def split_data(df, target_column):
    """
    Separate a DataFrame into predictors and a single target column.

    Parameters:
    - df (DataFrame): The input DataFrame.
    - target_column (str): Name of the column to use as the target.

    Returns:
    - X (DataFrame): Every column of ``df`` except ``target_column``.
    - y (Series): The ``target_column`` column of ``df``.

    Raises:
    - ValueError: If ``target_column`` is not one of the columns of ``df``.
    """
    # Guard clause: fail early with an explicit message when the target is absent
    target_missing = target_column not in df.columns
    if target_missing:
        raise ValueError(f"A coluna alvo '{target_column}' não está presente no DataFrame.")

    # Pull out the target, then keep everything else as features
    y = df[target_column]
    X = df.drop(columns=target_column)
    return X, y

# Remove the 'latitude' and 'longitude' columns from a DataFrame
def drop_lat_lon(df):
    """
    Return a copy of the DataFrame without its coordinate columns.

    Parameters:
    - df (DataFrame): The input DataFrame; it must contain both a
      'latitude' and a 'longitude' column (pandas raises KeyError otherwise).

    Returns:
    - new_df (DataFrame): ``df`` without the 'latitude' and 'longitude' columns.
      The input DataFrame itself is not modified.
    """
    coordinate_columns = ['latitude', 'longitude']
    return df.drop(columns=coordinate_columns)

# Select the DataFrame columns whose names match a regex pattern
def get_bands(df, regex):
    """
    Keep only the columns whose label matches a regular expression.

    Parameters:
    - df (DataFrame): The input DataFrame.
    - regex (str): Regular expression searched for in each column label.
      It is a regex, not a literal substring, so characters such as '.'
      keep their special meaning.

    Returns:
    - new_df (DataFrame): The matching columns of ``df``, in their original order.
    """
    return df.filter(regex=regex)

# Get the nutrient columns of a DataFrame, i.e. everything that is not a band column
def get_nutrients(df, bands):
    """
    Drop from ``df`` every column that also appears in ``bands``.

    Parameters:
    - df (DataFrame): The input DataFrame.
    - bands (DataFrame): DataFrame whose column labels are the band columns
      to exclude; all of them must exist in ``df``.

    Returns:
    - new_df (DataFrame): The columns of ``df`` that are not band columns.
    """
    band_labels = bands.columns
    return df.drop(columns=band_labels)



def calculate_select_k_best(X, y, k=10):
    """
    Calculate feature importance using SelectKBest with the f_regression score.

    The returned array holds one score per input feature; ``k`` only
    configures the selector and does not filter the scores returned here.

    Parameters:
    - X (DataFrame): Features.
    - y (Series): Target variable.
    - k (int or 'all'): Number of top features the selector is configured to keep.
      Values larger than the number of features are clamped to that number.

    Returns:
    - feature_importance (array): Array containing the score of every feature.
    """
    # Asking SelectKBest for more features than exist either raises or warns,
    # depending on the scikit-learn version. Since only scores_ is used, clamping
    # k is harmless and keeps the call working for narrow feature sets.
    shape = np.shape(X)
    if len(shape) == 2 and not isinstance(k, str):
        k = min(k, shape[1])

    selector = SelectKBest(score_func=f_regression, k=k)
    selector.fit(X, y)
    return selector.scores_

# Compute feature importance with a Linear Regression model
def calculate_linear_regression(X, y):
    """
    Fit an ordinary LinearRegression and use coefficient magnitudes as importance.

    Parameters:
    - X (DataFrame): Features.
    - y (Series): Target variable.

    Returns:
    - feature_importance (array): Absolute value of each fitted coefficient.
    """
    fitted = LinearRegression().fit(X, y)
    return np.abs(fitted.coef_)

# Compute feature importance with a Random Forest model
def calculate_random_forest(X, y, random_state=42):
    """
    Calculate feature importance using a Random Forest regressor.

    Parameters:
    - X (DataFrame): Features.
    - y (Series): Target variable.
    - random_state (int or None): Seed passed to RandomForestRegressor. A fixed
      default makes the importances reproducible across notebook re-runs;
      pass None to get an unseeded forest.

    Returns:
    - feature_importance (array): Array containing the feature importances from the Random Forest model.
    """
    # Seed the forest: an unseeded model yields different importances on every run
    model = RandomForestRegressor(random_state=random_state)
    model.fit(X, y)
    return model.feature_importances_

# Build a one-row DataFrame holding the importance of every feature
def create_importance_df(feature_importance, feature_names, method):
    """
    Wrap an array of importance scores in a single-row DataFrame.

    Parameters:
    - feature_importance (array): Importance scores, positionally aligned
      with ``feature_names``.
    - feature_names (Index or list): Names of the features; they become the columns.
    - method (str): Name of the method that produced the scores; it becomes
      the label of the single row.

    Returns:
    - importance_df (DataFrame): One row (labelled ``method``) and one column per feature.
    """
    # Map each feature name to a one-element list so pandas builds exactly one row
    score_by_feature = {}
    for position in range(len(feature_names)):
        score_by_feature[feature_names[position]] = [feature_importance[position]]

    importance_df = pd.DataFrame(score_by_feature)
    importance_df.index = [method]
    return importance_df

# Compute feature importance with several methods and consolidate the results
def calculate_feature_importance(X, y, methods=['select_k_best', 'linear_regression', 'random_forest'], k=10, scaler=False):
    """
    Calculate feature importance using multiple methods and consolidate the results.

    Parameters:
    - X (DataFrame): Features.
    - y (Series): Target variable.
    - methods (list): Methods to use; each must be one of 'select_k_best',
      'linear_regression' or 'random_forest'. The default list is never mutated.
    - k (int): Number of top features to select in the case of SelectKBest.
    - scaler (bool): Whether to standardize the features with StandardScaler
      before computing the importances.

    Returns:
    - consolidated_df (DataFrame): One row per feature and one column per method.

    Raises:
    - ValueError: If ``methods`` contains an unsupported method name.
    """
    # Capture the names before any scaling: StandardScaler returns a bare ndarray,
    # which has no .columns attribute.
    feature_names = X.columns

    if scaler:
        # Re-wrap the scaled values so downstream code still receives a DataFrame
        # with the original labels. A separate local avoids shadowing the flag.
        scaled_values = StandardScaler().fit_transform(X)
        X = pd.DataFrame(scaled_values, columns=feature_names, index=X.index)

    results = {}

    # Compute the importance with each requested method
    for method in methods:
        if method == 'select_k_best':
            feature_importance = calculate_select_k_best(X, y, k)
        elif method == 'linear_regression':
            feature_importance = calculate_linear_regression(X, y)
        elif method == 'random_forest':
            feature_importance = calculate_random_forest(X, y)
        else:
            raise ValueError("Método não suportado. Escolha entre 'select_k_best', 'linear_regression' ou 'random_forest'.")

        # One single-row DataFrame per method, keyed by the method name
        results[method] = create_importance_df(feature_importance, feature_names, method)

    # Stack the per-method rows, then transpose so features become the rows
    consolidated_df = pd.concat(list(results.values()), axis=0)

    return consolidated_df.T

def process_multiple_targets(X_df, y_df, methods=['select_k_best', 'linear_regression', 'random_forest'], k=10):
    """
    Compute feature importance separately for every target column.

    Parameters:
    - X_df (DataFrame): Features DataFrame, shared by all targets.
    - y_df (DataFrame): Target DataFrame; each column is treated as one target.
    - methods (list): List of methods to use for feature importance calculation.
    - k (int): Number of top features to select in the case of SelectKBest.

    Returns:
    - consolidated_df (DataFrame): Features as rows and two-level columns:
      the target name on the outer level, the method name on the inner level.
    """
    # One importance table per target, keyed by the target's column name
    importance_by_target = {
        target: calculate_feature_importance(X_df, y_df[target], methods, k)
        for target in y_df.columns
    }

    # Place the per-target tables side by side; the keys become the outer column level
    return pd.concat(
        list(importance_by_target.values()),
        keys=importance_by_target.keys(),
        axis=1,
    )

# Plot feature-importance bar charts with Plotly
def feature_importance_plot(df, title="Feature Importance"):
    """
    Draw a grid of bar charts, one per (outer column, inner column) pair.

    Parameters:
    - df (DataFrame): Feature importance scores with two-level columns;
      the index supplies the bar labels on the x axis.
    - title (str): Title for the whole figure.

    Returns:
    - None (displays the plot).
    """
    outer_labels = df.columns.levels[0]  # one subplot row per outer label
    inner_labels = df.columns.levels[1]  # one subplot column per inner label (method)
    n_rows = len(outer_labels)
    n_cols = len(inner_labels)

    # Subplot titles are listed row by row, matching the grid layout
    panel_titles = []
    for outer in outer_labels:
        for inner in inner_labels:
            panel_titles.append(f"{outer} - {inner}")

    fig = make_subplots(rows=n_rows, cols=n_cols, subplot_titles=panel_titles)

    # Plotly grid positions are 1-based, hence start=1
    for row, outer in enumerate(outer_labels, start=1):
        for col, inner in enumerate(inner_labels, start=1):
            bars = go.Bar(x=df.index, y=df[outer, inner], showlegend=False)
            fig.add_trace(bars, row=row, col=col)

            # Only the first column of each row gets an x-axis title
            if col == 1:
                axis_title = f'{outer} - {inner}'
            else:
                axis_title = ''
            fig.update_xaxes(title_text=axis_title, row=row, col=col)

    # The figure height grows with the number of rows
    fig.update_layout(height=400 * n_rows, title_text=title)

    fig.show()



# Talhão 1

In [2]:
# Identifier of the map processed in this cell; it selects the data sub-directory below.
map_name = 'map1'

# Input CSV for this map, and the nutrient columns used as regression targets.
filepath = f'data/{map_name}/interpolation/universal_interpolated_df.csv'
target_columns = ['N', 'P', 'K', 'Ca', 'Mg', 'S', 'B', 'Cu', 'Mn', 'Zn', 'Mo', 'Ni']

map1 = pd.read_csv(filepath)

# Drop the coordinate columns, then split what is left into band features and nutrient targets.
map1 = drop_lat_lon(map1)
# NOTE(review): '.tiff' is interpreted as a regex, so the leading '.' matches any character;
# confirm against the real column names whether r'\.tiff' was intended.
bands = get_bands(map1, '.tiff')
nutrients = get_nutrients(map1, bands)
# Keep only the listed targets, in the order given in target_columns.
nutrients = nutrients[target_columns]

# Score every band against every nutrient with the two chosen methods,
# then draw one bar chart per (nutrient, method) pair.
map1_feature_importance_df = process_multiple_targets(bands, nutrients, methods=['select_k_best', 'linear_regression'], k=10)
feature_importance_plot(map1_feature_importance_df, title="Feature Importance - Map 1")


# Talhão 2

In [3]:
# Identifier of the map processed in this cell; it selects the data sub-directory below.
map_name = 'map2'

# Input CSV for this map, and the nutrient columns used as regression targets.
filepath = f'data/{map_name}/interpolation/universal_interpolated_df.csv'
target_columns = ['N', 'P', 'K', 'Ca', 'Mg', 'S', 'B', 'Cu', 'Mn', 'Zn', 'Mo', 'Ni']

map2 = pd.read_csv(filepath)

# Drop the coordinate columns, then split what is left into band features and nutrient targets.
map2 = drop_lat_lon(map2)
# NOTE(review): '.tiff' is interpreted as a regex, so the leading '.' matches any character;
# confirm against the real column names whether r'\.tiff' was intended.
bands = get_bands(map2, '.tiff')
nutrients = get_nutrients(map2, bands)
# Keep only the listed targets, in the order given in target_columns.
nutrients = nutrients[target_columns]

# Score every band against every nutrient with the two chosen methods,
# then draw one bar chart per (nutrient, method) pair.
map2_feature_importance_df = process_multiple_targets(bands, nutrients, methods=['select_k_best', 'linear_regression'], k=10)
feature_importance_plot(map2_feature_importance_df, title="Feature Importance - Map 2")
