# Expansao de testes
Esse notebook tem como objetivo:
- gerar uma lista de pontos (séries sst) a serem analisados pelos benchmarks e SVR
- comparar os resultados com estatísticas das séries

## Intro

In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
import numpy as np
import netCDF4 as nc
import xarray as xr
import netCDF4 as nc
from mpl_toolkits.basemap import Basemap
from mpl_toolkits.basemap import shiftgrid
import matplotlib.pyplot as plt
import pandas as pd
from random import randint
from tqdm import tqdm


In [4]:
from netuno import SSTHelper, SubserieDTW

Importing the dtw module. When using in academic works please cite:
  T. Giorgino. Computing and Visualizing Dynamic Time Warping Alignments in R: The dtw Package.
  J. Stat. Soft., doi:10.18637/jss.v031.i07.



In [5]:
fp = '../dados/sst.mnmean.nc'

In [6]:
ds = SSTHelper.load_dataset(fp)
df = SSTHelper.load_dataframe(ds)

In [7]:
split_date = '2021-12-01'

## Geracao de Pontos
### Checagem
Precisamos chegar se um ponto aleatório gerado é válido

In [8]:
def check_valid(df, lat, lon):
    ts = SSTHelper.get_sst_series(df, lat, lon).sst.to_list()
    if len(ts) == 0 or pd.isna(ts[0]):
        return False
    return True

In [9]:
check_valid(df, 0, -72) # o valor retornado deve ser falso

False

In [10]:
check_valid(df, -22, -72) # o valor retornado deve ser verdadeiro

True

In [11]:
def generate_even():
    return randint(-90, 90) * 2

### Geraçao de pontos

In [12]:
list_points = set()
list_not_points = set()
n_points = 0

while n_points < 100:
    lat = generate_even() 
    lon = generate_even()
    while (lat, lon) in list_not_points or (lat, lon) in list_points or not check_valid(df, lat, lon):
        list_not_points.add((lat, lon))
        lat = generate_even()
        lon = generate_even()
    list_points.add((lat, lon))
    n_points += 1

In [13]:
invalid_count = 0
for point in list_points:
    if not check_valid(df, point[0], point[1]):
        invalid_count += 1

invalid_count

0

In [14]:
list_points = list(list_points)
check_valid(df, list_points[0][0], list_points[0][1])

True

In [15]:
points_df = pd.DataFrame(list_points)

In [16]:
points_df.rename({0: 'lat', 1: 'lon'}, axis=1).to_csv('../dados/pontos.csv')

In [17]:
points_df

Unnamed: 0,0,1
0,-50,84
1,30,174
2,88,8
3,-66,32
4,-68,-20
...,...,...
95,-64,46
96,-12,-90
97,-2,158
98,34,172


### Reler os pontos salvos em csv

In [18]:
def read_list_points(filename):
    list_points = []
    with open(filename, 'r') as file:
        for line in file:
            line_split = line.split(',')
            try:
                list_points.append((int(line_split[1]), int(line_split[2])))
            except ValueError:
                pass
    return list_points

In [19]:
def read_results(filename):
    list_rmse = []
    list_mape = []
    with open(filename, 'r') as file:
        for line in file:
            line_split = line.split(',')
            try:
                list_rmse.append(float(line_split[1]))
                list_mape.append(float(line_split[2]))
            except ValueError:
                pass
    return [list_rmse, list_mape]

In [20]:
list_points = read_list_points('../dados/pontos.csv')

In [21]:
check_valid(df, list_points[35][0], list_points[35][1])

True

# Análise dos pontos recolhidos

## Média das métricas de erro

In [29]:
measure_list = {
    'Approximate Entropy': list_approximate_entropy,
    'Benford Correlation': list_benford_correlation,
    'Bin Entropy': list_bin_entropy,
    'Standard Dev': list_std_deviation
}

error_metrics = {
    'RMSE': 0, 
    'MAPE': 1
}

method_list = {
    # 'SARIMA': sarima_results
    'SVR': svr_results,
}

In [30]:
svr_results = read_results('../dados/svr_results.csv')
# sarima_results = read_results('../dados/sarima_results.csv')
svr_results[0][:5]

[1.8182888545611833,
 1.267544047783792,
 0.8871647066079104,
 0.06814168190305141,
 0.8756034207514847]

### Resultado da média das métricas de erro

In [38]:
for method in method_list:
    print(f"======{method}======")
    for metric in error_metrics:
        print(f"Mean {metric}: {np.mean(method_list[method][error_metrics[metric]])}")

Mean RMSE: 0.6264165226863616
Mean MAPE: 0.11248874207240384


## Análise dos pontos utilizando características selecionadas

Vamos utilizar a biblioteca tsfresh para calcular estatísticas dos pontos recolhidos.
As seguintes características serao consideradas:
- Entropia aproximada
- Dickey Fuller
- Correlacao de Benford
- Entropia em Bins
- Desvio Padrao


In [22]:
from tsfresh.feature_extraction import feature_calculators
from scipy.stats import pearsonr

In [23]:
list_approximate_entropy = []
list_dickey_fuller = []
list_benford_correlation = []
list_bin_entropy = []
list_std_deviation = []

for point in tqdm(list_points):
    ts = SSTHelper.get_sst_series(df, point[0], point[1]).sst.to_numpy()
    list_approximate_entropy.append(feature_calculators.approximate_entropy(ts, 2, 3))
    # list_dickey_fuller.append(feature_calculators.augmented_dickey_fuller())
    list_benford_correlation.append(feature_calculators.benford_correlation(ts))
    list_bin_entropy.append(feature_calculators.binned_entropy(ts, 12))
    list_std_deviation.append(feature_calculators.standard_deviation(ts))

  0%|          | 0/100 [00:00<?, ?it/s]

 12%|█▏        | 12/100 [00:08<01:00,  1.44it/s]

In [24]:
list_approximate_entropy

[0.014112886818138763,
 0.007971934298443388,
 0.004236244276716089,
 0.03719259914990307,
 0.06872644348650409,
 0.015578635029948745,
 0.0047509081025102606,
 0.013123363974808677,
 0.020952415837208506,
 0.011424320266474497,
 0.017714929485714258,
 0.014393590676349662,
 0.01838124380095172,
 0.04295707690097658,
 0.01852886327637708,
 0.09445157887627406,
 0.016972597365026848,
 0.019574497065578553,
 0.015201681686188698,
 0.009777450135520917,
 0.016869349716802727,
 0.04540094027663073,
 0.017443151890342477,
 0.009463622047693472,
 0.010726673038203065,
 0.021806650283604623,
 0.02554624065924163,
 0.09646643762626203,
 0.0909790032397697,
 0.01602414486354352,
 0.019179399464398852,
 0.018094423229008165,
 0.01073091571700532,
 0.016617262719807263,
 0.01845313497855955,
 0.01214350832072171,
 0.010701043695981821,
 0.01830239783034284,
 0.01297894487946804,
 0.01576099401654403,
 0.014232375914220823,
 0.014428411665719815,
 0.10790594363450373,
 0.010595941282528023,
 0.015

### Resultado de Correlaçao de Pearson com características selecionadas 

In [40]:
for method in method_list:
    print(f"======{method}======")
    for measure in measure_list:
        print(measure)
        for metric in error_metrics:
            pearson, pvalue = pearsonr(method_list[method][error_metrics[metric]], measure_list[measure])
            print(f"(VS {metric}) Obtained {pearson:.3f} with p-value {pvalue:.3f}")

Approximate Entropy
(VS RMSE) Obtained -0.010 with p-value 0.918
(VS MAPE) Obtained 0.040 with p-value 0.693
Benford Correlation
(VS RMSE) Obtained -0.038 with p-value 0.704
(VS MAPE) Obtained 0.087 with p-value 0.392
Bin Entropy
(VS RMSE) Obtained -0.037 with p-value 0.713
(VS MAPE) Obtained -0.250 with p-value 0.012
Standard Dev
(VS RMSE) Obtained -0.081 with p-value 0.422
(VS MAPE) Obtained -0.130 with p-value 0.197


## Análise dos pontos utilizando tsfresh

In [41]:
from tsfresh import extract_features, extract_relevant_features, select_features
from tsfresh.utilities.dataframe_functions import impute
from tsfresh.feature_extraction import ComprehensiveFCParameters

In [42]:
extraction_settings = ComprehensiveFCParameters()
list_features = []

for point in list_points:
    ts = pd.DataFrame(SSTHelper.get_sst_series(df, point[0], point[1]).sst)
    ts['id'] = 1
    X = extract_features(ts, 
                        column_id='id',
                        default_fc_parameters=extraction_settings,
                        # we impute = remove all NaN features automatically
                        impute_function=impute, disable_progressbar=True)
    list_features.append(X.to_numpy()[0])

In [None]:
feature_names = list(X.columns)

In [None]:
n_features = len(list_features[0])

In [None]:
list_features = np.array(list_features).T
assert len(list_features) == n_features

In [None]:
dict_correlations = {}

In [None]:
for metric in error_metrics:
    metric_correlations = []
    metric_error_list = svr_results[error_metrics[metric]]
    for features in list_features:
        corr = pearsonr(features, metric_error_list)
        metric_correlations.append(corr)
    dict_correlations[metric] = metric_correlations



In [None]:
df_correlations = pd.DataFrame(dict_correlations)

In [None]:
df_correlations.index = feature_names
df_correlations

Unnamed: 0,RMSE,MAPE
sst__variance_larger_than_standard_deviation,"(0.44731979994315746, 3.0729881762891283e-06)","(-0.0702630374779277, 0.48727103505926067)"
sst__has_duplicate_max,"(-0.2237266202878018, 0.02524832714243999)","(-0.05919774523078725, 0.5585152661851898)"
sst__has_duplicate_min,"(-0.19308001819786774, 0.05426972831146174)","(0.4686162851521978, 8.78286928239777e-07)"
sst__has_duplicate,"(-0.06041214486123031, 0.5504593942983133)","(0.24586401758492657, 0.013674486852831232)"
sst__sum_values,"(0.26743164161825345, 0.007147912651909794)","(-0.3890989849187379, 6.30946593973488e-05)"
...,...,...
sst__permutation_entropy__dimension_5__tau_1,"(0.3672714587770922, 0.00017073852592701697)","(-0.21833931785632546, 0.029084073610760968)"
sst__permutation_entropy__dimension_6__tau_1,"(0.36822982942794863, 0.00016367568314930861)","(-0.22058485581835238, 0.027429100806286678)"
sst__permutation_entropy__dimension_7__tau_1,"(0.36645049694987414, 0.00017701204284872207)","(-0.22170067461413456, 0.026636938860558085)"
sst__query_similarity_count__query_None__threshold_0.0,"(nan, nan)","(nan, nan)"


In [None]:
df_correlations.sort_values(by='MAPE')

NameError: name 'df_correlations' is not defined