# Talhão 1

In [3]:
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from keras.models import Sequential
from keras.layers import Dense
from scikeras.wrappers import KerasRegressor
import pandas as pd
import numpy as np
from keras.optimizers import Adam, SGD, RMSprop
from keras.metrics import MeanAbsolutePercentageError, RootMeanSquaredError, MeanSquaredError, MeanAbsoluteError
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam
from keras.metrics import MeanAbsoluteError
from keras.metrics import MeanAbsolutePercentageError


def divide_into_parts(df, lat_col, lon_col, N):
    """
    Split a DataFrame into N consecutive chunks after sorting by position.

    The rows are ordered by ``lat_col`` first and ``lon_col`` second, then cut
    into N contiguous slices. When the row count is not a multiple of N, the
    first ``len(df) % N`` slices receive one extra row each. Slices keep the
    index labels of the sorted frame.

    Args:
        df (pd.DataFrame): DataFrame to be divided.
        lat_col (str): Column name for latitude.
        lon_col (str): Column name for longitude.
        N (int): Number of parts to divide the DataFrame into.

    Returns:
        list: N DataFrames, in sorted order. Some may be empty when N exceeds
        the number of rows.

    Raises:
        ValueError: If N is zero or negative.
    """
    if N <= 0:
        raise ValueError("N must be a positive integer.")

    ordered = df.sort_values(by=[lat_col, lon_col])

    # Every chunk gets `base_size` rows; the first `extra` chunks get one more.
    base_size, extra = divmod(len(ordered), N)
    sizes = [base_size + 1 if rank < extra else base_size for rank in range(N)]

    chunks = []
    lower = 0
    for size in sizes:
        upper = lower + size
        chunks.append(ordered.iloc[lower:upper])
        lower = upper

    return chunks

def combine_parts(part_list):
    """
    Concatenate a list of DataFrames row-wise into one DataFrame.

    The original index labels of the parts are discarded; the result gets a
    fresh 0..n-1 index.

    Args:
        part_list (list): DataFrames to be combined, in the desired row order.
            Must contain at least two entries.

    Returns:
        pd.DataFrame: Combined DataFrame.

    Raises:
        ValueError: If ``part_list`` is empty/None or has fewer than two items.
    """
    has_enough_parts = bool(part_list) and len(part_list) >= 2
    if has_enough_parts:
        return pd.concat(part_list, ignore_index=True)

    raise ValueError("At least two DataFrames are required for combination.")


def kfold_ape(X, Y, model_type='linear', k=4, random_state=42):
    """
    Perform k-fold Absolute Percentage Error (APE) evaluation for regression models.

    For every fold the features are standardized and reduced to 5 principal
    components, a model is trained on the training rows, and the per-sample
    APE ``|y - y_pred| / |y|`` is computed on the held-out rows.

    The scaler and PCA are fitted on the training rows of each fold only, so
    no statistics from the held-out rows leak into the model. A fresh model
    is created for every fold for the same reason: re-fitting a single Keras
    model would carry over weights learned from rows that later become test
    rows.

    Args:
        X (pd.DataFrame): Feature DataFrame (any 2-D array-like is accepted).
            PCA keeps 5 components, so at least 5 features and 5 training
            rows per fold are needed.
        Y (pd.DataFrame): Target DataFrame, one column per target.
        model_type (str): Type of regression model ('linear', 'random_forest',
            or 'keras'). 'linear' and 'random_forest' fit one model per target
            column; 'keras' fits one multi-output network.
        k (int): Number of folds for k-fold cross-validation. Folds are
            contiguous (no shuffling).
        random_state (int): Random seed passed to the random forest and to PCA.

    Returns:
        pd.DataFrame: One column per target in ``Y`` and one row per sample,
        ordered fold by fold, holding the APE of that sample. Targets equal
        to zero yield inf/NaN entries because of the division by ``|y|``.

    Raises:
        ValueError: If ``model_type`` is not one of the supported names.
    """
    supported_models = ('linear', 'random_forest', 'keras')
    # Fail loudly instead of silently returning an empty DataFrame.
    if model_type not in supported_models:
        raise ValueError(
            f"Unknown model_type {model_type!r}; expected one of {supported_models}."
        )

    X_values = np.asarray(X)
    n_targets = Y.shape[1]
    kf = KFold(n_splits=k, shuffle=False)

    fold_frames = []
    for train_index, test_index in kf.split(X_values):
        # Fit the preprocessing on the training rows only, then apply the
        # fitted transforms to the held-out rows.
        scaler = StandardScaler()
        pca = PCA(n_components=5, random_state=random_state)
        X_train = pca.fit_transform(scaler.fit_transform(X_values[train_index]))
        X_test = pca.transform(scaler.transform(X_values[test_index]))

        y_train = Y.iloc[train_index].to_numpy(dtype=float)
        y_test = Y.iloc[test_index].to_numpy(dtype=float)

        if model_type == 'keras':
            # New network per fold so no weights survive from earlier folds.
            model = build_simple_neural_network(X_train.shape[1], n_targets)
            model.fit(X_train, y_train, epochs=100, batch_size=32, verbose=0)
            y_test_pred = np.asarray(model.predict(X_test)).reshape(y_test.shape)
        else:
            # One single-output model per target column.
            y_test_pred = np.empty_like(y_test)
            for target_pos in range(n_targets):
                if model_type == 'random_forest':
                    model = RandomForestRegressor(random_state=random_state)
                else:
                    model = LinearRegression()
                model.fit(X_train, y_train[:, target_pos])
                y_test_pred[:, target_pos] = model.predict(X_test)

        # APE for each point (row) and target (column) in the test fold.
        fold_errors = np.abs(y_test - y_test_pred) / np.abs(y_test)
        fold_frames.append(pd.DataFrame(fold_errors, columns=Y.columns))

    return pd.concat(fold_frames, ignore_index=True)

def build_simple_neural_network(input_neurons, output_neurons, optimizer='adam', loss='mse'):
    """
    Build and compile a small fully connected regression network.

    The network has three Dense layers: an input layer and one hidden layer,
    both ``input_neurons`` wide with ReLU activation, followed by a linear
    output layer with ``output_neurons`` units.

    Args:
        input_neurons (int): Number of input features; also the width of the
            first two layers.
        output_neurons (int): Number of regression targets.
        optimizer: Optimizer passed to ``compile`` (default 'adam').
        loss: Loss passed to ``compile`` (default 'mse').

    Returns:
        Sequential: The compiled model, tracking mean squared error as metric.
    """
    layer_stack = [
        # Input layer
        Dense(input_neurons, input_dim=input_neurons, activation='relu'),
        # Hidden layer, same width as the input layer
        Dense(input_neurons, activation='relu'),
        # Output layer; linear activation because this is a regression problem
        Dense(output_neurons, activation='linear'),
    ]
    network = Sequential(layer_stack)

    network.compile(optimizer=optimizer, loss=loss, metrics=[MeanSquaredError()])
    return network


In [4]:

# Load the map1 interpolation table. Splitting it into 10 parts and joining
# them again leaves the rows ordered by (latitude, longitude) with a new index.
map1_df = pd.read_csv('data/map1/interpolation/nearest_neighbors_interpolation_df.csv')
slices = divide_into_parts(df=map1_df, lat_col='latitude', lon_col='longitude', N=10)
combined_slices = combine_parts(part_list=slices)

# First 15 columns (minus the coordinates) are features; the rest are targets.
X = combined_slices.iloc[:, :15].drop(columns=['latitude', 'longitude'])
y = combined_slices.iloc[:, 15:]

# 10-fold APE for each model family: linear regression, random forest, Keras.
map1_linear_regression_error = kfold_ape(X, y, k=10, model_type='linear')
map1_random_forest_error = kfold_ape(X, y, k=10, model_type='random_forest')
map1_keras_error = kfold_ape(X, y, k=10, model_type='keras')

# Persist the per-sample errors, one CSV per model.
map1_linear_regression_error.to_csv(path_or_buf='data/map1/errors/linear_regression_error.csv')
map1_random_forest_error.to_csv(path_or_buf='data/map1/errors/random_forest_error.csv')
map1_keras_error.to_csv(path_or_buf='data/map1/errors/keras_error.csv')







# Talhão 2

In [5]:
# Load the map2 interpolation table. Splitting it into 10 parts and joining
# them again leaves the rows ordered by (latitude, longitude) with a new index.
map2_df = pd.read_csv('data/map2/interpolation/nearest_neighbors_interpolation_df.csv')
slices = divide_into_parts(df=map2_df, lat_col='latitude', lon_col='longitude', N=10)
combined_slices = combine_parts(part_list=slices)

# First 14 columns (minus the coordinates) are features; the rest are targets.
X = combined_slices.iloc[:, :14].drop(columns=['latitude', 'longitude'])
y = combined_slices.iloc[:, 14:]

# 10-fold APE for each model family: linear regression, random forest, Keras.
map2_linear_regression_error = kfold_ape(X, y, k=10, model_type='linear')
map2_random_forest_error = kfold_ape(X, y, k=10, model_type='random_forest')
map2_keras_error = kfold_ape(X, y, k=10, model_type='keras')

# Persist the per-sample errors, one CSV per model.
map2_linear_regression_error.to_csv(path_or_buf='data/map2/errors/linear_regression_error.csv')
map2_random_forest_error.to_csv(path_or_buf='data/map2/errors/random_forest_error.csv')
map2_keras_error.to_csv(path_or_buf='data/map2/errors/keras_error.csv')


