In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import plotly
import plotly.graph_objs as go
import plotly.express as px
import plotly.io as pio
import plotly.figure_factory as ff
import plotly.offline as pyo
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import geopandas as gpd

from pptx import Presentation
from pptx.util import Inches, Pt
import os


import warnings
warnings.filterwarnings('ignore')

In [2]:
# Map to analyse: 'map2' is plot ("talhão") 2, 'map1' is plot 1.
map_name = 'map2'

# --- Input CSV locations, all relative to data/<map_name>/ ---

# Nutrient table.
original_path = f'data/{map_name}/nutrients/nutrients.csv'

# Files under interpolation/, one per interpolation method.
uk_interpolation_path = f'data/{map_name}/interpolation/universal_interpolation_df.csv'
linear_interpolation_path = f'data/{map_name}/interpolation/linear_interpolation_df.csv'
nearest_neighbors_interpolation_path = f'data/{map_name}/interpolation/nearest_neighbors_interpolation_df.csv'

# Files under closest_points/, one per interpolation method.
uk_closest_interpolation_path = f'data/{map_name}/closest_points/universal_interpolation_closest_points.csv'
linear_closest_points_path = f'data/{map_name}/closest_points/linear_interpolation_closest_points.csv'
nearest_neighbors_closest_path = f'data/{map_name}/closest_points/nearest_neighbors_interpolation_closest_points.csv'

# Files under errors/, one per model.
lr_errors_path = f'data/{map_name}/errors/linear_regression_error.csv'
rf_errors_path = f'data/{map_name}/errors/random_forest_error.csv'
nn_errors_path = f'data/{map_name}/errors/keras_error.csv'

# --- Column selections used by the plotting cells ---

# Un-prefixed columns, then their DRIS_-prefixed counterparts (N through Zn
# only), then the two trailing columns; concatenated in exactly that order.
_base_columns = ['N', 'P', 'K', 'Ca', 'Mg', 'S', 'B', 'Cu', 'Fe',
                 'Mn', 'Zn', 'Mo', 'Ni', 'Al', 'Se', 'Si', 'Na', 'Va']
_dris_columns = ['DRIS_N', 'DRIS_P', 'DRIS_K', 'DRIS_Ca', 'DRIS_Mg', 'DRIS_S',
                 'DRIS_B', 'DRIS_Cu', 'DRIS_Fe', 'DRIS_Mn', 'DRIS_Zn']
_summary_columns = ['IMS', 'IBN']
columns_of_interest = _base_columns + _dris_columns + _summary_columns

# Names of the coordinate columns.
coordinates = ['latitude', 'longitude']


# Dataframes

In [3]:
# Read every input CSV. The files are read in the same order as before;
# each group is unpacked from a single pass over its paths.

# Interpolation outputs, one frame per method.
uk_interpolation, linear_interpolation, nearest_neighbors_interpolation = map(
    pd.read_csv,
    (uk_interpolation_path, linear_interpolation_path, nearest_neighbors_interpolation_path),
)

# Error tables, one frame per model.
lr_errors, rf_errors, nn_errors = map(
    pd.read_csv,
    (lr_errors_path, rf_errors_path, nn_errors_path),
)

# Nutrient table plus the closest-points file of each interpolation method.
(
    original_df,
    uk_closest_points_df,
    linear_closest_points_df,
    nearest_neighbors_closest_points_df,
) = map(
    pd.read_csv,
    (
        original_path,
        uk_closest_interpolation_path,
        linear_closest_points_path,
        nearest_neighbors_closest_path,
    ),
)

In [4]:
def remove_useless_column(df, columns=('Unnamed: 0', 'Amostra', 'Local', 'Lat', 'Long')):
    """Return ``df`` without the bookkeeping columns listed in ``columns``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean. It is never modified in place.
    columns : iterable of str, optional
        Column names to remove when present. Names that are absent from
        ``df`` are skipped silently. Defaults to the five names this notebook
        has always stripped: 'Unnamed: 0', 'Amostra', 'Local', 'Lat', 'Long'.

    Returns
    -------
    pandas.DataFrame
        A new frame without the matching columns, or ``df`` itself when none
        of the names is present.
    """
    # Only ask pandas to drop what actually exists, so a missing name is not an error.
    present = [name for name in columns if name in df.columns]
    if not present:
        return df
    return df.drop(columns=present)

def add_latitude_and_longitude(df, coords_df):
    """Attach whichever of 'latitude' / 'longitude' is missing from ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that should end up with both coordinate columns.
    coords_df : pandas.DataFrame
        Source of the coordinate columns. It is only accessed when ``df``
        lacks at least one of them, and must then contain the missing ones.

    Returns
    -------
    pandas.DataFrame
        ``df`` itself when both columns are already present; otherwise the
        result of merging ``df`` with the missing columns of ``coords_df`` on
        the two frames' indices. The merge is pandas' default inner join, so
        rows of ``df`` whose index label does not occur in ``coords_df`` are
        dropped.
    """
    # Check each column separately: a frame holding only one of the two
    # coordinates still needs the other one added.
    missing = [name for name in ('latitude', 'longitude') if name not in df.columns]
    if not missing:
        return df

    return pd.merge(df, coords_df[missing], left_index=True, right_index=True)

    

# Run every loaded frame through remove_useless_column and rebind each name
# to its cleaned version (same frames, same call order as before).
(
    uk_interpolation,
    uk_closest_points_df,
    linear_interpolation,
    linear_closest_points_df,
    nearest_neighbors_interpolation,
    nearest_neighbors_closest_points_df,
    original_df,
    lr_errors,
    rf_errors,
    nn_errors,
) = map(
    remove_useless_column,
    (
        uk_interpolation,
        uk_closest_points_df,
        linear_interpolation,
        linear_closest_points_df,
        nearest_neighbors_interpolation,
        nearest_neighbors_closest_points_df,
        original_df,
        lr_errors,
        rf_errors,
        nn_errors,
    ),
)


# Estatísticas

In [5]:
from scipy.stats import t

def describe_data(dataframe, confidence_level=0.95):
    """Summarise every numeric column of ``dataframe`` with extended statistics.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Data to summarise. Non-numeric columns are ignored.
    confidence_level : float, default 0.95
        Two-sided confidence level of the Student-t interval computed around
        each column mean.

    Returns
    -------
    pandas.DataFrame
        One column per numeric input column and one row per statistic: the
        ``DataFrame.describe`` rows (count, mean, std, min, the percentiles
        listed below, max) followed by ``range``, ``coef_of_variation``,
        ``iqr``, ``skewness``, ``kurtosis``, ``confidence_interval_low`` and
        ``confidence_interval_high``.
    """
    # describe() keeps only numeric columns of a mixed frame, so select them
    # once and use the same selection for skew/kurtosis/count; otherwise a
    # text column makes those calls fail or misalign with the describe table.
    numeric = dataframe.select_dtypes(include='number')

    # Requested percentiles
    percentiles = [0.25, 0.5, 0.75, 0.8, 0.85, 0.9, 0.95, 0.99]

    # Base descriptive statistics, transposed so each row is one variable
    desc_stats_df = numeric.describe(percentiles=percentiles).T

    # Additional descriptive statistics
    desc_stats_df['range'] = desc_stats_df['max'] - desc_stats_df['min']
    desc_stats_df['coef_of_variation'] = desc_stats_df['std'] / desc_stats_df['mean']  # coefficient of variation
    desc_stats_df['iqr'] = desc_stats_df['75%'] - desc_stats_df['25%']  # interquartile range
    desc_stats_df['skewness'] = numeric.skew()
    desc_stats_df['kurtosis'] = numeric.kurtosis()

    # Confidence interval for the means (Student t, n - 1 degrees of freedom)
    num_samples = numeric.count()
    degrees_freedom = num_samples - 1
    # Wrap the critical values in a Series so the arithmetic below aligns by
    # column name rather than by position.
    t_critical = pd.Series(
        t.ppf((1 + confidence_level) / 2, degrees_freedom.to_numpy()),
        index=degrees_freedom.index,
    )
    margin_of_error = t_critical * desc_stats_df['std'] / np.sqrt(num_samples)
    desc_stats_df['confidence_interval_low'] = desc_stats_df['mean'] - margin_of_error
    desc_stats_df['confidence_interval_high'] = desc_stats_df['mean'] + margin_of_error

    # Back to "statistics as rows, variables as columns"
    return desc_stats_df.T


In [6]:
# Extended descriptive statistics for each model's error table.
stats_lr, stats_rf, stats_nn = (
    describe_data(error_table) for error_table in (lr_errors, rf_errors, nn_errors)
)

# Write each table to the errors folder, keeping the index as a CSV column.
for error_stats, error_stats_csv in (
    (stats_lr, f'data/{map_name}/errors/linear_regression_error_stats.csv'),
    (stats_rf, f'data/{map_name}/errors/random_forest_error_stats.csv'),
    (stats_nn, f'data/{map_name}/errors/keras_error_stats.csv'),
):
    error_stats.to_csv(error_stats_csv, index=True)

# Same statistics for the nutrient table, the interpolation outputs and the
# closest-points frames.
(
    original_stats,
    uk_interpolation_stats,
    uk_closest_points_stats,
    linear_interpolation_stats,
    linear_interpolation_closest_points_stats,
    nearest_neighbors_stats,
    nearest_neighbors_interpolation_closest_points_stats,
) = (
    describe_data(source_frame)
    for source_frame in (
        original_df,
        uk_interpolation,
        uk_closest_points_df,
        linear_interpolation,
        linear_closest_points_df,
        nearest_neighbors_interpolation,
        nearest_neighbors_closest_points_df,
    )
)

# Save each statistics table into the folder of the data it summarises.
for data_stats, data_stats_csv in (
    (uk_interpolation_stats, f'data/{map_name}/interpolation/universal_interpolation_stats.csv'),
    (uk_closest_points_stats, f'data/{map_name}/closest_points/universal_interpolation_closest_points_stats.csv'),
    (original_stats, f'data/{map_name}/nutrients/nutrients_stats.csv'),
    (linear_interpolation_stats, f'data/{map_name}/interpolation/linear_interpolation_stats.csv'),
    (linear_interpolation_closest_points_stats, f'data/{map_name}/closest_points/linear_interpolation_closest_points_stats.csv'),
    (nearest_neighbors_stats, f'data/{map_name}/interpolation/nearest_neighbors_interpolation_stats.csv'),
    (nearest_neighbors_interpolation_closest_points_stats, f'data/{map_name}/closest_points/nearest_neighbors_interpolation_closest_points_stats.csv'),
):
    data_stats.to_csv(data_stats_csv, index=True)




In [7]:
# Pass the statistics tables through the same column cleanup as the raw frames.
(
    stats_lr,
    stats_rf,
    stats_nn,
    original_stats,
    uk_interpolation_stats,
    uk_closest_points_stats,
) = map(
    remove_useless_column,
    (
        stats_lr,
        stats_rf,
        stats_nn,
        original_stats,
        uk_interpolation_stats,
        uk_closest_points_stats,
    ),
)

In [8]:
# Give every frame 'latitude'/'longitude' columns, taking them from
# uk_interpolation when a frame does not carry its own.
(
    lr_errors,
    rf_errors,
    nn_errors,
    original_df,
    uk_interpolation,
    uk_closest_points_df,
    linear_interpolation,
    linear_closest_points_df,
    nearest_neighbors_interpolation,
    nearest_neighbors_closest_points_df,
) = [
    add_latitude_and_longitude(frame, uk_interpolation)
    for frame in (
        lr_errors,
        rf_errors,
        nn_errors,
        original_df,
        uk_interpolation,
        uk_closest_points_df,
        linear_interpolation,
        linear_closest_points_df,
        nearest_neighbors_interpolation,
        nearest_neighbors_closest_points_df,
    )
]




In [9]:
# Export the statistics of the Keras-model errors (stats_nn) to an Excel
# workbook, then show the table as the cell output.
stats_nn = remove_useless_column(stats_nn)
stats_nn.to_excel(excel_writer=f'data/{map_name}/errors/keras_errors_stats.xlsx')

stats_nn

Unnamed: 0,N,P,K,Ca,Mg,S,B,Cu,Fe,Mn,...,DRIS_Ca,DRIS_Mg,DRIS_S,DRIS_B,DRIS_Cu,DRIS_Fe,DRIS_Mn,DRIS_Zn,IMS,IBN
count,9349.0,9349.0,9349.0,9349.0,9349.0,9349.0,9349.0,9349.0,9349.0,9349.0,...,9349.0,9349.0,9349.0,9349.0,9349.0,9349.0,9349.0,9349.0,9349.0,9349.0
mean,0.11279,0.066815,0.086014,0.04409139,0.042231,0.05587,0.060671,0.094836,0.335606,0.079089,...,1.538431,1.923103,0.869265,1.546382,0.443274,1.033533,0.301607,0.764715,1.170072,0.533275
std,0.087117,0.050448,0.057864,0.03244591,0.029385,0.041144,0.043503,0.079077,0.226033,0.056871,...,3.263677,4.102636,2.348719,3.690627,1.401465,0.871497,0.537303,1.64316,2.442733,1.790795
min,4e-06,1e-05,4e-06,1.994613e-07,6e-06,2e-06,1.6e-05,2e-05,2e-05,1.1e-05,...,0.000239,0.000329,6.9e-05,0.00013,4.2e-05,0.000302,8.2e-05,5.8e-05,3.1e-05,1.9e-05
25%,0.045127,0.025842,0.047614,0.01752335,0.019205,0.022885,0.025348,0.0336,0.15084,0.031413,...,0.222206,0.24682,0.138792,0.214967,0.067441,0.442322,0.08301,0.145565,0.153811,0.071289
50%,0.094718,0.056202,0.08156,0.03694586,0.038976,0.045912,0.053946,0.068147,0.310799,0.07354,...,0.535242,0.488844,0.320751,0.336467,0.125679,0.775977,0.173386,0.307459,0.284183,0.14267
75%,0.146444,0.096853,0.107212,0.06640421,0.05923,0.08634,0.089444,0.142716,0.47252,0.113456,...,1.330485,0.937343,0.54632,0.864123,0.213255,1.37349,0.285007,0.54624,0.597628,0.215623
80%,0.166266,0.108115,0.117417,0.07431547,0.064363,0.097278,0.100008,0.171238,0.520007,0.12146,...,1.734495,1.193707,0.605234,1.356288,0.241227,1.581492,0.317609,0.683274,0.861208,0.23992
85%,0.223595,0.118885,0.136783,0.08397794,0.070279,0.106378,0.110287,0.193474,0.581691,0.132059,...,2.18363,1.438313,0.693463,1.878648,0.27444,1.875412,0.366516,0.962735,2.694041,0.276605
90%,0.269154,0.141925,0.154115,0.09326849,0.080243,0.116373,0.120836,0.230732,0.632845,0.148762,...,2.533245,5.894706,0.910355,3.211808,0.337025,2.226585,0.453975,1.25063,3.312296,0.345798


# Histogramas


In [10]:
import random
import os

def random_color():
    """
    Return a random colour as a '#rrggbb' hex string.

    The value is drawn from the ``random`` module's global generator.
    """
    rgb_value = random.randint(0, 0xFFFFFF)
    return "#" + format(rgb_value, "06x")

def plot_two_histograms(df1, df2, output_dir, legend_label1="Interpolation", legend_label2="Original"):
    """
    Overlay the histograms of positionally paired columns of two DataFrames and save one PNG per pair.

    Columns are paired by position (first with first, and so on); pairing
    stops at the shorter of the two column lists. Each figure is written to
    ``<output_dir>/<column1>/<column1>.png``, where ``column1`` is the column
    name taken from ``df1``.

    Parameters:
    - df1: DataFrame containing the first set of data.
    - df2: DataFrame containing the second set of data.
    - output_dir: Directory where the plots will be saved (created if needed).
    - legend_label1: Legend label for the first DataFrame.
    - legend_label2: Legend label for the second DataFrame.

    Returns:
    - None. The function only writes image files.
    """
    # Colours come from a private, fixed-seed generator so every call yields
    # the same colour sequence. Seeding NumPy would not achieve this (the
    # colours are not drawn from NumPy) and would reset the global NumPy RNG.
    color_rng = random.Random(42)

    for column1, column2 in zip(df1.columns, df2.columns):
        # One sub-directory per column, named after df1's column
        column_dir = os.path.join(output_dir, column1)
        os.makedirs(column_dir, exist_ok=True)
        file_path = os.path.join(column_dir, f'{column1}.png')

        # Two random '#rrggbb' colours, one per DataFrame
        color1 = f"#{color_rng.randint(0, 0xFFFFFF):06x}"
        color2 = f"#{color_rng.randint(0, 0xFFFFFF):06x}"

        fig, ax = plt.subplots(figsize=(10, 6))
        try:
            # Both histograms share the same axes so they can be compared directly
            sns.histplot(df1[column1], kde=True, color=color1, label=legend_label1, ax=ax)
            sns.histplot(df2[column2], kde=True, color=color2, label=legend_label2, ax=ax)

            ax.set_title(f'{column1} - {legend_label1} vs {column2} - {legend_label2}',
                         fontsize=14, fontweight='bold', color='black')
            ax.legend()

            fig.savefig(file_path)
        finally:
            # Always release the figure, even if plotting or saving failed,
            # so a long run does not accumulate open figures.
            plt.close(fig)

def plot_histograms_separate(dataframe, output_base_directory):
    """
    Save one histogram (with KDE curve) per column of a DataFrame.

    Each figure is written to ``<output_base_directory>/<column>/<column>.png``;
    missing directories are created. The list of columns is printed first.

    Parameters:
    - dataframe: DataFrame containing the data; every column is plotted.
    - output_base_directory: Base directory that receives one sub-directory per column.
    """
    variables_to_plot = list(dataframe.columns)

    print(f'Variables to be plotted: {variables_to_plot}')

    for variable in variables_to_plot:
        # Per-variable output folder
        target_dir = os.path.join(output_base_directory, variable)
        os.makedirs(target_dir, exist_ok=True)

        fig, ax = plt.subplots(figsize=(8, 5))

        # 30-bin histogram with a kernel density estimate on top
        sns.histplot(data=dataframe, x=variable, kde=True, color='skyblue', bins=30, ax=ax)
        ax.set(title=f'Histogram for {variable}', xlabel=variable, ylabel='Frequency')

        # Tighten the layout before saving so labels are not clipped
        fig.tight_layout()
        fig.savefig(os.path.join(target_dir, f'{variable}.png'))

        # Release the figure before the next iteration
        plt.close(fig)



In [11]:
# Show the column index of both frames, one index per printed block.
print(uk_closest_points_df.columns, original_df.columns, sep='\n')

Index(['latitude', 'longitude', 'B01.tiff', 'B02.tiff', 'B03.tiff', 'B04.tiff',
       'B05.tiff', 'B06.tiff', 'B07.tiff', 'B08.tiff', 'B8A.tiff', 'B09.tiff',
       'B11.tiff', 'B12.tiff', 'N', 'P', 'K', 'Ca', 'Mg', 'S', 'B', 'Cu', 'Fe',
       'Mn', 'Zn', 'Mo', 'Ni', 'Al', 'Se', 'Si', 'Na', 'Va', 'DRIS_N',
       'DRIS_P', 'DRIS_K', 'DRIS_Ca', 'DRIS_Mg', 'DRIS_S', 'DRIS_B', 'DRIS_Cu',
       'DRIS_Fe', 'DRIS_Mn', 'DRIS_Zn', 'IMS', 'IBN'],
      dtype='object')
Index(['latitude', 'longitude', 'N', 'P', 'K', 'Ca', 'Mg', 'S', 'B', 'Cu',
       'Fe', 'Mn', 'Zn', 'Mo', 'Ni', 'Al', 'Se', 'Si', 'Na', 'Va', 'DRIS_N',
       'DRIS_P', 'DRIS_K', 'DRIS_Ca', 'DRIS_Mg', 'DRIS_S', 'DRIS_B', 'DRIS_Cu',
       'DRIS_Fe', 'DRIS_Mn', 'DRIS_Zn', 'IMS', 'IBN'],
      dtype='object')


In [12]:
# Display the columns of interest of the uk_interpolation frame.
uk_interpolation.loc[:, columns_of_interest]

Unnamed: 0,N,P,K,Ca,Mg,S,B,Cu,Fe,Mn,...,DRIS_Ca,DRIS_Mg,DRIS_S,DRIS_B,DRIS_Cu,DRIS_Fe,DRIS_Mn,DRIS_Zn,IMS,IBN
0,46.128,5.203180,16.835147,7.674,2.631906,1.836236,29.788514,9.007810,126.0795,85.692006,...,-0.447422,-0.425223,-1.411327,-0.614796,-1.905545,2.114790,1.2325,0.005562,0.4895,12.322396
1,46.128,5.200742,16.818184,7.674,2.631391,1.835065,29.817052,9.009184,126.0795,85.751867,...,-0.444671,-0.424845,-1.397361,-0.615114,-1.970325,2.083786,1.2325,0.122104,0.4895,12.321371
2,46.128,5.198601,16.802442,7.674,2.630920,1.833759,29.844696,9.010067,126.0795,85.807593,...,-0.441563,-0.424148,-1.382734,-0.615007,-2.033757,2.051383,1.2325,0.230563,0.4895,12.319213
3,46.128,5.196839,16.788264,7.674,2.630534,1.832361,29.872312,9.010357,126.0795,85.858042,...,-0.438131,-0.423107,-1.367610,-0.614452,-2.095219,2.017724,1.2325,0.329324,0.4895,12.315704
4,46.128,5.195524,16.775950,7.674,2.630279,1.830911,29.901034,9.009967,126.0795,85.902179,...,-0.434434,-0.421709,-1.352120,-0.613425,-2.154117,1.982894,1.2325,0.417154,0.4895,12.310706
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9344,46.128,5.058060,17.159832,7.674,2.654245,1.802529,30.372621,9.178062,126.0795,86.392908,...,-0.527712,-0.325758,-1.378083,-0.644710,-3.022409,2.025470,1.2325,0.525307,0.4895,12.290992
9345,46.128,5.053368,17.145949,7.674,2.656452,1.801483,30.372854,9.179849,126.0795,86.399384,...,-0.527793,-0.325810,-1.381325,-0.645208,-3.030510,2.026048,1.2325,0.532761,0.4895,12.290604
9346,46.128,5.048595,17.128906,7.674,2.659026,1.800416,30.372321,9.181624,126.0795,86.406896,...,-0.528011,-0.325780,-1.384653,-0.645908,-3.039682,2.026265,1.2325,0.541121,0.4895,12.290291
9347,46.128,5.043893,17.109109,7.674,2.661857,1.799293,30.370827,9.183275,126.0795,86.415160,...,-0.528181,-0.325666,-1.387952,-0.646655,-3.049820,2.025823,1.2325,0.550398,0.4895,12.290027


In [14]:

# Histograms: uk closest points vs. original data
plot_two_histograms(
    df1=uk_closest_points_df[columns_of_interest],
    df2=original_df[columns_of_interest],
    output_dir=f'images/{map_name}/histograms/uk/comparing_interpolation_and_original',
)



In [15]:
# Histograms: nearest-neighbors closest points vs. original data
plot_two_histograms(
    df1=nearest_neighbors_closest_points_df[columns_of_interest],
    df2=original_df[columns_of_interest],
    output_dir=f'images/{map_name}/histograms/nearest_neighbors/comparing_interpolation_and_original',
)


In [16]:

# Histograms of the uk interpolation frame
plot_histograms_separate(
    dataframe=uk_interpolation[columns_of_interest],
    output_base_directory=f'images/{map_name}/histograms/uk/interpolation',
)

Variables to be plotted: ['N', 'P', 'K', 'Ca', 'Mg', 'S', 'B', 'Cu', 'Fe', 'Mn', 'Zn', 'Mo', 'Ni', 'Al', 'Se', 'Si', 'Na', 'Va', 'DRIS_N', 'DRIS_P', 'DRIS_K', 'DRIS_Ca', 'DRIS_Mg', 'DRIS_S', 'DRIS_B', 'DRIS_Cu', 'DRIS_Fe', 'DRIS_Mn', 'DRIS_Zn', 'IMS', 'IBN']


In [17]:
# Histograms of the nearest-neighbors interpolation frame
plot_histograms_separate(
    dataframe=nearest_neighbors_interpolation[columns_of_interest],
    output_base_directory=f'images/{map_name}/histograms/nearest_neighbors/interpolation',
)

Variables to be plotted: ['N', 'P', 'K', 'Ca', 'Mg', 'S', 'B', 'Cu', 'Fe', 'Mn', 'Zn', 'Mo', 'Ni', 'Al', 'Se', 'Si', 'Na', 'Va', 'DRIS_N', 'DRIS_P', 'DRIS_K', 'DRIS_Ca', 'DRIS_Mg', 'DRIS_S', 'DRIS_B', 'DRIS_Cu', 'DRIS_Fe', 'DRIS_Mn', 'DRIS_Zn', 'IMS', 'IBN']


In [18]:
# Histograms of the linear regression errors
plot_histograms_separate(
    dataframe=lr_errors[columns_of_interest],
    output_base_directory=f'images/{map_name}/histograms/ape/lr',
)

Variables to be plotted: ['N', 'P', 'K', 'Ca', 'Mg', 'S', 'B', 'Cu', 'Fe', 'Mn', 'Zn', 'Mo', 'Ni', 'Al', 'Se', 'Si', 'Na', 'Va', 'DRIS_N', 'DRIS_P', 'DRIS_K', 'DRIS_Ca', 'DRIS_Mg', 'DRIS_S', 'DRIS_B', 'DRIS_Cu', 'DRIS_Fe', 'DRIS_Mn', 'DRIS_Zn', 'IMS', 'IBN']


In [19]:
# Histograms of the random forest errors
plot_histograms_separate(
    dataframe=rf_errors[columns_of_interest],
    output_base_directory=f'images/{map_name}/histograms/ape/rf',
)

Variables to be plotted: ['N', 'P', 'K', 'Ca', 'Mg', 'S', 'B', 'Cu', 'Fe', 'Mn', 'Zn', 'Mo', 'Ni', 'Al', 'Se', 'Si', 'Na', 'Va', 'DRIS_N', 'DRIS_P', 'DRIS_K', 'DRIS_Ca', 'DRIS_Mg', 'DRIS_S', 'DRIS_B', 'DRIS_Cu', 'DRIS_Fe', 'DRIS_Mn', 'DRIS_Zn', 'IMS', 'IBN']


In [20]:
# Histograms of the nn (Keras) errors
plot_histograms_separate(
    dataframe=nn_errors[columns_of_interest],
    output_base_directory=f'images/{map_name}/histograms/ape/nn',
)

Variables to be plotted: ['N', 'P', 'K', 'Ca', 'Mg', 'S', 'B', 'Cu', 'Fe', 'Mn', 'Zn', 'Mo', 'Ni', 'Al', 'Se', 'Si', 'Na', 'Va', 'DRIS_N', 'DRIS_P', 'DRIS_K', 'DRIS_Ca', 'DRIS_Mg', 'DRIS_S', 'DRIS_B', 'DRIS_Cu', 'DRIS_Fe', 'DRIS_Mn', 'DRIS_Zn', 'IMS', 'IBN']


# Heatmaps

In [21]:
def kernel_density_maps_separate(dataframe, latitude_col, longitude_col, output_base_directory):
    """
    Save one map per variable: a filled 2-D kernel density of the point coordinates
    with a scatter of the same points coloured by the variable's value on top.

    Each figure is written to ``<output_base_directory>/<variable>/<variable>.png``;
    missing directories are created. The list of variables is printed first.

    Parameters:
    - dataframe: DataFrame containing the coordinate columns and the variables.
    - latitude_col: Column name for latitude information (plotted on the y axis).
    - longitude_col: Column name for longitude information (plotted on the x axis).
    - output_base_directory: Base directory that receives one sub-directory per variable.
    """
    # Point geometries built from (longitude, latitude), wrapped in a GeoDataFrame
    point_geometry = gpd.points_from_xy(dataframe[longitude_col], dataframe[latitude_col])
    gdf = gpd.GeoDataFrame(dataframe, geometry=point_geometry)

    # Every column except the two coordinate columns is a variable to plot
    coordinate_cols = (latitude_col, longitude_col)
    variables_to_plot = [name for name in dataframe.columns if name not in coordinate_cols]

    print(f'Variables to be plotted: {variables_to_plot}')

    for variable in variables_to_plot:
        # Per-variable output folder
        target_dir = os.path.join(output_base_directory, variable)
        os.makedirs(target_dir, exist_ok=True)

        fig, ax = plt.subplots(figsize=(8, 5))

        # Background: density of the sample locations themselves
        sns.kdeplot(data=gdf, x=longitude_col, y=latitude_col, fill=True, cmap="Blues",
                    levels=30, thresh=0, gridsize=100, ax=ax)

        # Foreground: the points, coloured by the current variable
        sns.scatterplot(data=gdf, x=longitude_col, y=latitude_col, hue=variable,
                        palette="viridis", alpha=0.7, s=50, ax=ax)

        ax.set(title=f'{variable}', xlabel='Longitude', ylabel='Latitude')

        # Tighten the layout before saving so labels are not clipped
        fig.tight_layout()
        fig.savefig(os.path.join(target_dir, f'{variable}.png'))

        # Release the figure before the next iteration
        plt.close(fig)



In [22]:

# Kernel density maps for the linear regression errors
kernel_density_maps_separate(
    dataframe=lr_errors[columns_of_interest + coordinates],
    latitude_col='latitude',
    longitude_col='longitude',
    output_base_directory=f'images/{map_name}/heatmaps/ape/lr/',
)

Variables to be plotted: ['N', 'P', 'K', 'Ca', 'Mg', 'S', 'B', 'Cu', 'Fe', 'Mn', 'Zn', 'Mo', 'Ni', 'Al', 'Se', 'Si', 'Na', 'Va', 'DRIS_N', 'DRIS_P', 'DRIS_K', 'DRIS_Ca', 'DRIS_Mg', 'DRIS_S', 'DRIS_B', 'DRIS_Cu', 'DRIS_Fe', 'DRIS_Mn', 'DRIS_Zn', 'IMS', 'IBN']


In [23]:

# Kernel density maps for the random forest errors
kernel_density_maps_separate(
    dataframe=rf_errors[columns_of_interest + coordinates],
    latitude_col='latitude',
    longitude_col='longitude',
    output_base_directory=f'images/{map_name}/heatmaps/ape/rf/',
)

Variables to be plotted: ['N', 'P', 'K', 'Ca', 'Mg', 'S', 'B', 'Cu', 'Fe', 'Mn', 'Zn', 'Mo', 'Ni', 'Al', 'Se', 'Si', 'Na', 'Va', 'DRIS_N', 'DRIS_P', 'DRIS_K', 'DRIS_Ca', 'DRIS_Mg', 'DRIS_S', 'DRIS_B', 'DRIS_Cu', 'DRIS_Fe', 'DRIS_Mn', 'DRIS_Zn', 'IMS', 'IBN']


In [24]:
# Kernel density maps for the nn (Keras) errors
kernel_density_maps_separate(
    dataframe=nn_errors[columns_of_interest + coordinates],
    latitude_col='latitude',
    longitude_col='longitude',
    output_base_directory=f'images/{map_name}/heatmaps/ape/nn/',
)

Variables to be plotted: ['N', 'P', 'K', 'Ca', 'Mg', 'S', 'B', 'Cu', 'Fe', 'Mn', 'Zn', 'Mo', 'Ni', 'Al', 'Se', 'Si', 'Na', 'Va', 'DRIS_N', 'DRIS_P', 'DRIS_K', 'DRIS_Ca', 'DRIS_Mg', 'DRIS_S', 'DRIS_B', 'DRIS_Cu', 'DRIS_Fe', 'DRIS_Mn', 'DRIS_Zn', 'IMS', 'IBN']


In [29]:

# Kernel density maps for the original data
kernel_density_maps_separate(
    dataframe=original_df[columns_of_interest + coordinates],
    latitude_col='latitude',
    longitude_col='longitude',
    output_base_directory=f'images/{map_name}/heatmaps/original/',
)


Variables to be plotted: ['N', 'P', 'K', 'Ca', 'Mg', 'S', 'B', 'Cu', 'Fe', 'Mn', 'Zn', 'Mo', 'Ni', 'Al', 'Se', 'Si', 'Na', 'Va', 'DRIS_N', 'DRIS_P', 'DRIS_K', 'DRIS_Ca', 'DRIS_Mg', 'DRIS_S', 'DRIS_B', 'DRIS_Cu', 'DRIS_Fe', 'DRIS_Mn', 'DRIS_Zn', 'IMS', 'IBN']


In [30]:
# Kernel density maps for the uk interpolation frame
kernel_density_maps_separate(
    dataframe=uk_interpolation[columns_of_interest + coordinates],
    latitude_col='latitude',
    longitude_col='longitude',
    output_base_directory=f'images/{map_name}/heatmaps/uk/interpolation/',
)


Variables to be plotted: ['N', 'P', 'K', 'Ca', 'Mg', 'S', 'B', 'Cu', 'Fe', 'Mn', 'Zn', 'Mo', 'Ni', 'Al', 'Se', 'Si', 'Na', 'Va', 'DRIS_N', 'DRIS_P', 'DRIS_K', 'DRIS_Ca', 'DRIS_Mg', 'DRIS_S', 'DRIS_B', 'DRIS_Cu', 'DRIS_Fe', 'DRIS_Mn', 'DRIS_Zn', 'IMS', 'IBN']


In [31]:

# Kernel density maps for the nearest-neighbors interpolation frame
kernel_density_maps_separate(
    dataframe=nearest_neighbors_interpolation[columns_of_interest + coordinates],
    latitude_col='latitude',
    longitude_col='longitude',
    output_base_directory=f'images/{map_name}/heatmaps/nearest_neighbors/interpolation/',
)


Variables to be plotted: ['N', 'P', 'K', 'Ca', 'Mg', 'S', 'B', 'Cu', 'Fe', 'Mn', 'Zn', 'Mo', 'Ni', 'Al', 'Se', 'Si', 'Na', 'Va', 'DRIS_N', 'DRIS_P', 'DRIS_K', 'DRIS_Ca', 'DRIS_Mg', 'DRIS_S', 'DRIS_B', 'DRIS_Cu', 'DRIS_Fe', 'DRIS_Mn', 'DRIS_Zn', 'IMS', 'IBN']


# Boxplot

In [28]:
# NOTE: disabled draft — every line of this cell is commented out, so nothing below runs.
# def plot_two_normalized_boxplots(df1, df2, title1="Box Plot 1", title2="Box Plot 2"):
#     # Standardise every variable of both DataFrames (subtract the mean, divide by the std)
#     normalized_df1 = (df1 - df1.mean()) / df1.std()
#     normalized_df2 = (df2 - df2.mean()) / df2.std()

#     # Melt the standardised DataFrames into the long format used by the box plot
#     normalized_df_melted1 = normalized_df1.melt(var_name='Variable', value_name='Normalized Value')
#     normalized_df_melted2 = normalized_df2.melt(var_name='Variable', value_name='Normalized Value')

#     # Create the side-by-side subplots
#     fig, axes = plt.subplots(1, 2, figsize=(16, 6))

#     # Overall figure title
#     fig.suptitle('Box Plots - Variáveis Normalizadas', fontsize=18, fontweight='bold')

#     # Box plot for the first DataFrame
#     sns.boxplot(x='Variable', y='Normalized Value', data=normalized_df_melted1, color='#00bfff', width=0.8, ax=axes[0])
#     axes[0].set_title(title1)
#     axes[0].set_xlabel('')
#     axes[0].set_ylabel('Valor Normalizado', fontsize=14)
#     axes[0].tick_params(axis='x', rotation=90)

#     # Box plot for the second DataFrame
#     sns.boxplot(x='Variable', y='Normalized Value', data=normalized_df_melted2, color='#ff6347', width=0.8, ax=axes[1])
#     axes[1].set_title(title2)
#     axes[1].set_xlabel('')
#     axes[1].set_ylabel('Valor Normalizado', fontsize=14)
#     axes[1].tick_params(axis='x', rotation=90)

#     # Adjust spacing, leaving room at the top for the figure title
#     plt.tight_layout(rect=[0, 0, 1, 0.96])

#     # Show the box plots
#     plt.show()



# Correlação

In [None]:
# NOTE: disabled draft — every line of this cell is commented out, so nothing below runs.
# from scipy.stats import zscore

# def correlation_matrix_heatmap(df):
#     # Standardise the data (z-score per column)
#     normalized_df = df.apply(zscore)

#     # Correlation matrix of the standardised data
#     correlation_matrix = normalized_df.corr()

#     # Configure the seaborn style
#     sns.set(style="white")

#     # Create the figure and the axes for the plot
#     plt.figure(figsize=(30, 30))
#     ax = plt.axes()

#     # Mask that hides the upper triangle (np.triu, so the diagonal is masked as well)
#     mask = np.triu(np.ones_like(correlation_matrix, dtype=bool))

#     # Draw the heatmap with seaborn
#     sns.heatmap(correlation_matrix, annot=True, cmap="viridis", fmt=".2f", linewidths=.5, mask=mask, ax=ax)

#     # Set the title
#     plt.title("Heatmap da Matriz de Correlação entre Variáveis", fontsize=35)

#     # Show the plot
#     plt.show()


In [None]:
# %pip install python-pptx