## Imports

In [1]:
import pandas as pd
import numpy as np
from scipy.io import arff 
from ordpy import complexity_entropy

import functions
import export_creation

## Functions definition

In [2]:
def main(param, DATA_NAME, PLOT, values, labels, real_cp):
    """
    Given a dataset, with its respective values and labels, creates a new representation for the data, using SAX
    transformation, and exports the metrics computed on that representation.

    Steps, in order:
      1. `functions.run_sax` discretises the values into `param[0]` bins.
      2. `functions.compute_symbols_dictionary` builds a dictionary from the distinct SAX symbols and `param[1]`.
      3. `functions.create_new_representation` is applied to every row (instance) of the SAX data.
      4. `functions.calculate_js_distance` is computed on the new representation; its results are split into
         same-class and different-class pairs and saved with `export_creation.save_js_metrics`.
      5. `ordpy.complexity_entropy` is computed for every instance and plotted with
         `export_creation.plot_entropy_sc`.

    -----------
    Parameters:
    param (tuple): (number of bins, window_size)
    DATA_NAME (string): name of the folder in which to put the metrics and figures for the dataset.
    PLOT (bool): indicates if a histogram of the symbols distribution is to be plotted. Not used by this function at
        the moment; kept so existing calls keep working.
    values (pd.Dataframe): values of the dataset.
    labels (pd.Series): labels of the dataset.
    real_cp: change points of the series. Not used by this function at the moment; kept so existing calls keep
        working.

    -----------
    Returns:
    new_rep: new representation for the data values, one entry per instance (the result of the row-wise `apply`).
    """
    n_bins, window_size = param[0], param[1]

    print('Transformação SAX')
    # perform SAX transformation
    sax_values = functions.run_sax(values, n_bins=n_bins)
    symbols_dict = functions.compute_symbols_dictionary(np.unique(sax_values), window_size=window_size)
    sax_data = pd.DataFrame(sax_values.reshape(values.shape))

    print('---------------\nNova Representação')
    # create new representation, one row (instance) at a time
    new_rep = sax_data.apply(
        lambda row: functions.create_new_representation(row, window_size=window_size, dict=symbols_dict),
        axis=1,
    )

    print('---------------\nJensen Shannon')
    # calculate jensenshannon distance based on the new representation
    js_distance = functions.calculate_js_distance(new_rep, labels)
    print(js_distance)
    # A pair belongs to the same class when both class columns hold a single distinct value.
    classes_in_pair = js_distance[['Class 1', 'Class 2']].nunique(axis=1)
    eq_class = js_distance[classes_in_pair == 1].reset_index(drop=True)['JS Distance']
    diff_class = js_distance[classes_in_pair > 1].reset_index(drop=True)['JS Distance']
    # Each cell is expanded with pd.Series and stacked into one flat series of distances.
    eq_class = eq_class.apply(pd.Series).stack().reset_index(drop=True)
    diff_class = diff_class.apply(pd.Series).stack().reset_index(drop=True)
    export_creation.save_js_metrics(eq_class, diff_class, DATA_NAME, window_size, n_bins)

    print('---------------\nEntropia x Complexidade Estatística')
    # calculate entropy and statistical complexity of the data
    # Iterate over the rows of `new_rep` itself: one (entropy, complexity) pair per instance, so the result lines up
    # with `labels`. Positional access via `.iloc` avoids selecting a column when `new_rep` is a DataFrame.
    comp_entrop = [complexity_entropy(np.asarray(new_rep.iloc[i])) for i in range(len(new_rep))]
    comp_entrop = pd.DataFrame(comp_entrop, columns=['entropy', 'statistical_complexity'])
    export_creation.plot_entropy_sc(comp_entrop, labels, DATA_NAME, window_size, n_bins)

    return new_rep

In [3]:
def load_data(DATA_NAME):
    """
    Read `../data/<DATA_NAME>.arff` and split it into features and labels.

    -----------
    Parameters:
    DATA_NAME (string): path of the ARFF file, relative to `../data/` and without the `.arff` extension.

    -----------
    Returns:
    values (pd.Dataframe): every column of the file except `target`.
    labels (pd.Series): the `target` column.
    """
    # loadarff returns a (records, metadata) pair; only the records are needed here.
    records, _meta = arff.loadarff(f'../data/{DATA_NAME}.arff')
    frame = pd.DataFrame(records)
    return frame.drop(columns='target'), frame['target']

## Run databases

### Time Series Classification

In [4]:
# Active dataset selection. DATA_NAME1 and DATA_NAME2 are paths relative to ../data/ (without the
# .arff extension) that the loading cell passes to load_data and concatenates; PLOT_NAME is passed
# to main() as its DATA_NAME argument (the output folder name).
DATA_NAME1 = 'AbnormalHeartbeat/AbnormalHeartbeat_TRAIN'
DATA_NAME2 = 'AbnormalHeartbeat/AbnormalHeartbeat_TEST'
PLOT_NAME = 'AbnormalHeartbeat'

In [None]:
# DATA_NAME1 = 'ArticularyWordRecognition/ArticularyWordRecognition_TRAIN'
# DATA_NAME2 = 'ArticularyWordRecognition/ArticularyWordRecognition_TEST'
# PLOT_NAME = 'ArticularyWordRecognition'

In [None]:
# DATA_NAME1 = 'Car/Car_TEST'
# DATA_NAME2 = 'Car/Car_TRAIN'
# PLOT_NAME = 'Car'

In [None]:
# DATA_NAME1 = 'ChlorineConcentration/ChlorineConcentration_TRAIN'
# DATA_NAME2 = 'ChlorineConcentration/ChlorineConcentration_TEST'
# PLOT_NAME = 'ChlorineConcentration'

In [None]:
# DATA_NAME1 = 'ACSF1/ACSF1_TEST'
# DATA_NAME2= 'ACSF1/ACSF1_TRAIN'
# PLOT_NAME = 'ACSF1'

In [None]:
# DATA_NAME1 = 'SyntheticControl/SyntheticControl_TRAIN' 
# DATA_NAME2 = 'SyntheticControl/SyntheticControl_TEST' 
# PLOT_NAME = 'SyntheticControl'

In [None]:
# DATA_NAME1 = 'TwoPatterns/TwoPatterns_TRAIN'
# DATA_NAME2 = 'TwoPatterns/TwoPatterns_TEST'
# PLOT_NAME = 'TwoPatterns'

In [None]:
# DATA_NAME1 = 'BeetleFly/BeetleFly_TRAIN'
# DATA_NAME2 = 'BeetleFly/BeetleFly_TEST'
# PLOT_NAME = 'BeetleFly'

In [None]:
# DATA_NAME1 = 'BirdChicken/BirdChicken_TRAIN'
# DATA_NAME2 = 'BirdChicken/BirdChicken_TEST'
# PLOT_NAME = 'BirdChicken'

### HASC

In [None]:
# HASC data is read from parquet/label files below, so no ARFF dataset names are set.
DATA_NAME1 = DATA_NAME2 = ''
PLOT_NAME = 'HASC'

dados_info = "dados_2"
data_group_name = "person101"
database_name = "HASC1002"
colnames = ["x", "y", "z"]

df = pd.read_parquet("../data/01_og_HASC/{}/{}/{}_concat.parquet".format(dados_info, data_group_name, database_name))
df_label = pd.read_csv("../data/01_og_HASC/{}/{}/{}.label".format(dados_info, data_group_name, database_name), header=None)
df_label.columns = ["timestamp_start", "timestamp_end", "classe"]
# Positional row numbers are used as change-point indices below, so make the index 0..n-1.
df.reset_index(drop=True, inplace=True)

# Diagnostic output: number of samples at or after the end of every labelled segment but the last.
for tmps in df_label["timestamp_end"][:-1]:
    print(len(df[df["timestamp"] >= tmps]["timestamp"]))

# For every labelled segment, record the row right after its last sample as a change point.
real_cp = []
for tmp_start, tmp_end in zip(df_label["timestamp_start"], df_label["timestamp_end"]):
    # Print the segment being processed (the previous code printed a stale variable of the loop above).
    print(tmp_end)
    in_segment = (df["timestamp"] >= tmp_start) & (df["timestamp"] <= tmp_end)
    # Latest sample timestamp that falls inside the segment.
    last_tmp = df.loc[in_segment, "timestamp"].max()
    # First row holding that timestamp, plus one.
    index_ch = df[df["timestamp"] == last_tmp].index.values[0] + 1
    real_cp.append(index_ch)

In [None]:
df

In [None]:
# Shift every change point back by 890 rows, the same number of leading rows that the cell below
# drops from `values`. NOTE: this cell rebinds `real_cp` from itself, so running it more than once
# shifts the change points again.
real_cp = [cp - 890 for cp in real_cp]
real_cp

In [None]:
# One value per sample: the Euclidean norm of all non-timestamp columns of that row.
values = df.drop(columns='timestamp').apply(lambda sample: np.linalg.norm(sample.values), axis=1)
# Discard the first 890 rows, then renumber from 0.
values = values.drop(index=range(890)).reset_index(drop=True)
# Discard everything from the last change point onwards, then renumber again.
values = values.drop(index=range(real_cp[-1], len(values))).reset_index(drop=True)

In [None]:
values

In [None]:
df_label

In [None]:
# Keep the class of every labelled segment except the row with index 11.
# NOTE(review): 11 is hard-coded — presumably the last segment of this particular recording; confirm
# before reusing this cell with another label file.
labels = df_label.drop(index=11)['classe']
labels

## Execution

In [8]:
# Parameters

# Grid of configurations that the execution loop passes to main(), one at a time.
PARAM = [(3, 3), (3, 4), (3, 5), (4, 3), (4, 4), (4, 5)] # (n_bins, window_size)
# PARAM = [(5, 3), (5, 4), (5, 5)]
# Single configuration read by the step-by-step cells further down; the execution loop reuses the
# name `param` as its loop variable and therefore overwrites it.
param = (3, 3)
# Forwarded to main() as its PLOT argument.
PLOT = False
# When False, the loading cell reads the ARFF datasets named by DATA_NAME1 / DATA_NAME2.
HASC = False

In [9]:
# For the ARFF datasets, load DATA_NAME1 and, when DATA_NAME2 is non-empty, append it so both
# files are analysed as a single dataset.
if not HASC:
    values, labels = load_data(DATA_NAME1)

    if DATA_NAME2:
        values1, labels1 = load_data(DATA_NAME2)
        # ignore_index renumbers the rows 0..n-1 so values and labels stay aligned by position.
        values = pd.concat([values, values1], ignore_index=True)
        labels = pd.concat([labels, labels1], ignore_index=True)

In [7]:
# Run the whole pipeline once per (n_bins, window_size) configuration.
# NOTE: the loop variable is the notebook-global `param`, so after the loop the cells below see
# the last configuration of PARAM.
for param in PARAM:
    print()
    print(f'n_bins: {param[0]}, window_size: {param[1]}')
    # `real_cp` is only created by the HASC cells; referencing it unconditionally raises a
    # NameError when only the ARFF datasets were loaded, so it is passed only in HASC mode.
    main(param, PLOT_NAME, PLOT, values, labels, real_cp if HASC else None)


n_bins: 3, window_size: 3


NameError: name 'values' is not defined

In [10]:
values.values.reshape(-1, 1)

array([[-0.02087402],
       [-0.04519653],
       [-0.04394531],
       ...,
       [-0.04534912],
       [-0.0453186 ],
       [-0.04568481]])

In [11]:
# perform SAX transformation
# Same three statements as the first step of main(), run at notebook level with the global
# `param` so the intermediate objects can be inspected in the cells below.
sax_values = functions.run_sax(values, n_bins=param[0])
symbols_dict = functions.compute_symbols_dictionary(np.unique(sax_values), window_size=param[1])
# Put the flat array of symbols back into the shape of `values`.
sax_data = pd.DataFrame(sax_values.reshape(values.shape))

In [12]:
sax_values

array(['b', 'a', 'a', ..., 'a', 'a', 'a'], dtype='<U1')

In [13]:
sax_data

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,18520,18521,18522,18523,18524,18525,18526,18527,18528,18529
0,b,a,a,a,a,a,a,a,a,b,...,a,a,a,a,a,a,a,a,a,a
1,b,b,b,b,b,b,b,b,b,b,...,b,b,b,b,b,b,b,b,b,b
2,b,b,b,b,b,b,b,b,b,b,...,b,b,b,b,b,b,c,c,c,c
3,b,b,b,b,b,b,b,b,b,b,...,c,c,c,c,c,c,c,c,b,c
4,b,a,a,a,a,a,a,a,a,a,...,a,a,a,a,a,a,a,a,a,a
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
404,b,b,b,b,b,b,b,b,b,b,...,b,b,b,b,b,b,b,b,b,b
405,b,b,b,b,b,b,b,b,b,b,...,a,a,a,a,a,a,a,a,a,a
406,b,c,c,c,c,c,c,b,a,a,...,c,c,c,c,c,c,c,c,c,c
407,b,b,c,c,c,c,c,b,b,b,...,b,b,b,b,b,b,b,b,b,b


In [18]:
# Build the new representation for every row of the SAX data (axis=1 hands each row to the
# lambda), using the global `param` and the `symbols_dict` computed above.
new_rep = sax_data.apply(lambda row : functions.create_new_representation(row, window_size=param[1], dict=symbols_dict), axis=1)
new_rep

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,18518,18519,18520,18521,18522,18523,18524,18525,18526,18527
0,15,2,2,2,2,2,2,3,7,20,...,2,2,2,2,2,2,2,2,2,2
1,20,20,20,20,20,20,20,20,20,20,...,20,20,20,20,20,20,20,20,20,20
2,20,20,20,20,20,20,20,20,20,20,...,20,20,20,20,20,20,21,25,38,38
3,20,20,20,20,20,20,20,20,20,20,...,38,38,38,38,38,38,38,38,37,34
4,15,2,2,2,2,2,2,2,2,2,...,2,2,2,2,2,2,2,2,2,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
404,20,20,20,20,20,20,20,20,20,20,...,20,20,20,20,20,20,20,20,20,20
405,20,20,20,20,20,20,20,20,20,19,...,2,2,2,2,2,2,2,2,2,2
406,25,38,38,38,38,37,32,15,2,2,...,38,38,38,38,38,38,38,38,38,38
407,21,25,38,38,38,37,33,20,20,20,...,20,20,20,20,20,20,20,20,20,20


In [19]:
# Build the new representation of every instance (row of `sax_data`) and stack them in one array.
new_data = np.array([
    functions.create_new_representation(instance, window_size=param[1], dict=symbols_dict)
    for _, instance in sax_data.iterrows()
])

In [None]:
# Build one representation per segment delimited by two consecutive change points.
new_data = []
for start, stop in zip(real_cp[:-1], real_cp[1:]):
    # The slice runs up to and including row `stop`.
    segment = sax_data.iloc[start : stop + 1]
    # Transpose and squeeze the row axis to hand the segment over as a single series.
    series = segment.T.squeeze(axis=0)
    new_data.append(
        functions.create_new_representation(series, window_size=param[1], dict=symbols_dict)
    )
new_data = np.array(new_data)

In [None]:
new_data.shape