## Imports

In [1]:
import os

import numpy as np
import pandas as pd
from ordpy import complexity_entropy
from scipy.io import arff

In [2]:
import functions
import load_hasc
import export_creation

2023-08-18 14:25:06.179173: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-08-18 14:25:06.295280: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-08-18 14:25:06.298900: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
  from .autonotebook import tqdm as notebook_tqdm


In [3]:
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier  
from sklearn import metrics

## Function definitions

In [4]:
def transf_analysis(param, DATA_NAME, PLOT, values, labels):
    """
    Given a dataset, with its respective values and labels, creates a new representation for the data, using SAX transformation. After that,
    groups close symbols in the series, according to a given parameter.

    The Jensen-Shannon distances and the entropy / statistical complexity computed from the new representation are handed to
    `export_creation` (`save_js_metrics` and `plot_entropy_sc`).

    -----------
    Parameters:
    param (tuple): (number of bins, window_size)
    DATA_NAME (string): name of the folder in which to put the metrics and figures for the dataset.
    PLOT (bool): indicates if a histogram of the symbols distribution is to be plotted. Not read by the current
        implementation; kept so existing calls keep working.
    values (pd.Dataframe): values of the dataset.
    labels (pd.Series): labels of the dataset.

    -----------
    Returns:
    new_rep (pd.Dataframe): new representation for the data values.
    """
    n_bins, window_size = param[0], param[1]

    # Perform SAX transformation
    sax_values = functions.run_sax(values, n_bins=n_bins)
    sax_data = pd.DataFrame(sax_values.reshape(values.shape))

    # Compute dictionary of symbols
    symbols_dict = functions.compute_symbols_dictionary(np.unique(sax_values), window_size=window_size)

    # Create new representation using sliding windows
    new_rep = sax_data.apply(lambda row : functions.create_new_representation(row, window_size=window_size, dict=symbols_dict), axis=1)

    # Calculate jensenshannon distance based on the new representation
    pairwise_js = functions.calculate_js_distance(new_rep)
    eq_class, diff_class = functions.get_js_by_class(pairwise_js, labels)
    export_creation.save_js_metrics(eq_class, diff_class, DATA_NAME, window_size, n_bins)

    # Calculate entropy and statistical complexity of the data.
    # `.iloc[i]` always selects the i-th row (one series); plain `new_rep[i]` is
    # label-based and picks column i when `new_rep` is a DataFrame, which would
    # misalign the result with `labels`.
    comp_entrop = [complexity_entropy(new_rep.iloc[i]) for i in range(new_rep.shape[0])]
    comp_entrop = pd.DataFrame(comp_entrop, columns=['entropy', 'statistical_complexity'])
    export_creation.plot_entropy_sc(comp_entrop, labels, DATA_NAME, window_size, n_bins)

    return new_rep

In [5]:
def load_data(DATA_NAME):
    """
    Read an ARFF dataset and separate its values from its labels.

    -----------
    Parameters:
    DATA_NAME (string): path of the dataset relative to '../data', without the '.arff' extension.

    -----------
    Returns:
    values (pd.Dataframe): every column of the file except 'target'.
    labels (pd.Series): the 'target' column.
    """
    # loadarff returns (records, metadata); only the records are needed here.
    records, _ = arff.loadarff(f'../data/{DATA_NAME}.arff')
    frame = pd.DataFrame(records)
    return frame.drop(columns='target'), frame['target']

## Load databases

### Time Series Classification

In [None]:
# Dataset selected for the run; both split paths are derived from its name
# (relative to '../data', without the '.arff' extension).
PLOT_NAME = 'AbnormalHeartbeat'
DATA_NAME1 = f'{PLOT_NAME}/{PLOT_NAME}_TRAIN'
DATA_NAME2 = f'{PLOT_NAME}/{PLOT_NAME}_TEST'

In [None]:
# DATA_NAME1 = 'ArticularyWordRecognition/ArticularyWordRecognition_TRAIN'
# DATA_NAME2 = 'ArticularyWordRecognition/ArticularyWordRecognition_TEST'
# PLOT_NAME = 'ArticularyWordRecognition'

In [None]:
# DATA_NAME1 = 'Car/Car_TEST'
# DATA_NAME2 = 'Car/Car_TRAIN'
# PLOT_NAME = 'Car'

In [None]:
# DATA_NAME1 = 'ChlorineConcentration/ChlorineConcentration_TRAIN'
# DATA_NAME2 = 'ChlorineConcentration/ChlorineConcentration_TEST'
# PLOT_NAME = 'ChlorineConcentration'

In [None]:
# DATA_NAME1 = 'ACSF1/ACSF1_TEST'
# DATA_NAME2= 'ACSF1/ACSF1_TRAIN'
# PLOT_NAME = 'ACSF1'

In [None]:
# DATA_NAME1 = 'SyntheticControl/SyntheticControl_TRAIN' 
# DATA_NAME2 = 'SyntheticControl/SyntheticControl_TEST' 
# PLOT_NAME = 'SyntheticControl'

In [None]:
# DATA_NAME1 = 'TwoPatterns/TwoPatterns_TRAIN'
# DATA_NAME2 = 'TwoPatterns/TwoPatterns_TEST'
# PLOT_NAME = 'TwoPatterns'

In [None]:
# DATA_NAME1 = 'BeetleFly/BeetleFly_TRAIN'
# DATA_NAME2 = 'BeetleFly/BeetleFly_TEST'
# PLOT_NAME = 'BeetleFly'

In [None]:
# DATA_NAME1 = 'BirdChicken/BirdChicken_TRAIN'
# DATA_NAME2 = 'BirdChicken/BirdChicken_TEST'
# PLOT_NAME = 'BirdChicken'

In [None]:
# The first split is always loaded.
data, labels = load_data(DATA_NAME1)

# A falsy DATA_NAME2 means there is no second split to merge.
if DATA_NAME2:
    data1, labels1 = load_data(DATA_NAME2)
    # Stack both splits row-wise and renumber the rows from zero.
    data, labels = (
        pd.concat(parts, ignore_index=True)
        for parts in ([data, data1], [labels, labels1])
    )

### HASC

In [6]:
# Build one acceleration-magnitude series per HASC recording of a single subject.
PLOT_NAME = 'HASC'

SEQUENCE_LEN = 600
OVERLAP = 0.2
colnames = ['timestamp', 'x', 'y', 'z']
data_group_name = "person101"
HASC_ROOT = '../data/HASC'

# Sorted so the row order (and therefore every seeded split made later) does
# not depend on the arbitrary order returned by the operating system.
directories = sorted(os.listdir(HASC_ROOT))

dfs = []
df_label = []

# Each sub-folder of HASC_ROOT is used as the label of the recordings it holds.
for activity in directories:
    group_dir = os.path.join(HASC_ROOT, activity, data_group_name)
    if not os.path.isdir(group_dir):
        # Stray files in the root, or folders without data for this subject.
        continue
    for file_name in sorted(os.listdir(group_dir)):
        if not file_name.endswith('.csv'):
            continue
        # NOTE(review): read_csv treats the first line as a header; if the HASC
        # files have no header row, the first sample of each recording is lost
        # when the columns are renamed below -- confirm against the raw files.
        series = pd.read_csv(os.path.join(group_dir, file_name))
        series.columns = colnames
        axes = series.drop(columns='timestamp')
        # Euclidean norm of (x, y, z) for every row, computed in one vectorized call.
        magnitude = pd.Series(np.linalg.norm(axes.to_numpy(), axis=1), index=axes.index)
        dfs.append(magnitude)
        df_label.append(activity)

# One recording per row; shorter recordings are padded with NaN by concat.
df = pd.concat(dfs, axis=1).T.reset_index(drop=True)

df_label = pd.Series(df_label)

In [7]:
# Turn the recordings into the samples used by the rest of the notebook; the
# result is bound to `data` / `labels`, the names every later cell reads.
# NOTE(review): presumably windows of SEQUENCE_LEN points overlapping by the
# fraction OVERLAP -- confirm in load_hasc.
data, labels = load_hasc.get_samples_from_time_series(df, df_label, SEQUENCE_LEN, OVERLAP)

2023-08-18 14:25:21.111443: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:995] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2023-08-18 14:25:21.119421: W tensorflow/core/common_runtime/gpu/gpu_device.cc:1960] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...


In [8]:
# Remove NaN

# Rows with at least one missing value are dropped from the data, and the
# labels carrying the same index values are dropped with them.
na_free = data.dropna()
only_na = data[~data.index.isin(na_free.index)]
# Rebind instead of mutating in place, so the Series produced by the loading
# cell is left untouched.
labels = labels.drop(index=only_na.index)
data = na_free

In [9]:
# Renumber both objects from zero. Rebinding (rather than `inplace=True`)
# leaves any other reference to the previous objects unchanged.
labels = labels.reset_index(drop=True)
data = data.reset_index(drop=True)

## Parameters

In [10]:
# Parameters

# Grid of (n_bins_sax, window_size) pairs: every combination of 3, 4 and 5.
PARAM = [(n_bins, window_size) for n_bins in range(3, 6) for window_size in range(3, 6)]
# Single combination read by the step-by-step classifier cells.
param = (4, 4)
# Plot switches; both plots are disabled.
PLOT_DIST, PLOT_SERIES = False, False
# Not read by any other cell visible in this notebook.
HASC = True

## Run Transformation Analysis

In [None]:
# Print a summary of the loaded values before any transformation is applied.
data.info()

In [None]:
# Plot the Series

if PLOT_SERIES:
    # Imported lazily: only needed when the plots are requested.
    import matplotlib.pyplot as plt
    from pathlib import Path

    for label in np.unique(labels):
        # Only the first five series of each class are drawn.
        class_data = data[labels == label].head(5)

        # One output folder per class, created once instead of once per figure.
        out_dir = Path(f'../fig/whole_series/{PLOT_NAME}/{str(label)}')
        out_dir.mkdir(parents=True, exist_ok=True)

        for i in range(len(class_data)):
            fig, ax = plt.subplots()
            ax.plot(class_data.iloc[i])
            ax.set_title(f'Series from {label}')
            fig.savefig(out_dir / f'fig{str(i)}.png')
            # Close this figure explicitly so figures do not pile up in memory.
            plt.close(fig)

In [None]:
# Run transformations

# A dedicated loop variable keeps the notebook-level `param` chosen in the
# Parameters section intact; the classifier cells further down read it, and
# reusing the name here would silently leave it at the last grid entry.
for grid_param in PARAM:
    print()
    print(f'n_bins: {grid_param[0]}, window_size: {grid_param[1]}')
    transf_analysis(grid_param, PLOT_NAME, PLOT_DIST, data, labels)

## Run Classifier

### Changing data representation

In [None]:
# Perform SAX transformation

# `param` is the (n_bins, window_size) pair from the Parameters section.
sax_values = functions.run_sax(data, n_bins=param[0])
# The dictionary is built from the distinct SAX symbols and the window size.
symbols_dict = functions.compute_symbols_dictionary(np.unique(sax_values), window_size=param[1])
# Put the SAX output back into the shape of the original data.
sax_data = pd.DataFrame(sax_values.reshape(data.shape))

In [None]:
# Create new representation using sliding window

# Each row of SAX symbols is converted independently (axis=1), using the same
# window size and symbol dictionary as the cell above.
X = sax_data.apply(lambda row : functions.create_new_representation(row, window_size=param[1], dict=symbols_dict), axis=1)

In [None]:
# Save new representation to external file

# The path is relative to the notebook's working directory.
# NOTE(review): index=None is falsy, so presumably the row index is not
# written to the file -- confirm against the pandas version in use.
X.to_csv('./new_rep.csv', index=None)

# Read new representation from external file

# Uncomment to reuse a previously saved representation instead of recomputing it.
# X = pd.read_csv('./new_rep.csv', index_col=False)

In [None]:
# Summary of the original values, for comparison with the new representation.
data.info()

In [None]:
# Summary of the new representation built from the SAX symbols.
X.info()

In [None]:
# Splitting modified data into train, validation and test input

# First hold out 20% of the rows for testing ...
X_train, X_test, y_train, y_test = train_test_split(X, labels, test_size=0.2, random_state=42)
# ... then take 25% of the remaining 80% (20% of the total) as validation,
# which leaves 60% of the rows for training. The seed is fixed in both calls.
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=42)

In [None]:
# Splitting original unmodified data into train, validation and test input

# Same proportions and seed as the split of the transformed data: 20% test,
# then 25% of the remainder as validation.
# NOTE(review): presumably this selects the same rows as the split of `X`,
# provided `X` and `data` have the same number of rows -- confirm.
data_train, data_test, label_train, label_test = train_test_split(data, labels, test_size=0.2, random_state=42)
data_train, data_val, label_train, label_val = train_test_split(data_train, label_train, test_size=0.25, random_state=42)

### K-Neighbors

In [None]:
# Train KNeighbors model

# No hyper-parameters are passed, so the scikit-learn defaults apply.
neigh = KNeighborsClassifier()
# Fitted on the training part of the transformed data.
neigh.fit(X_train, y_train)

In [None]:
# Predicting labels for the test set

# Uses whatever `neigh` was fitted on last; the features must be the
# transformed representation for these predictions to be meaningful.
y_pred = neigh.predict(X_test)

In [None]:
# Calculate confusion matrix for predicted labels

# True labels are passed first and predictions second; the matrix is
# displayed as the cell output.
metrics.confusion_matrix(y_test, y_pred)

In [None]:
# Doing the same process for the original unmodified data

# A second estimator is used so that `neigh` stays fitted on the transformed
# data; refitting `neigh` here would make a re-run of the prediction cell
# above use a model trained on different features.
neigh_original = KNeighborsClassifier()
neigh_original.fit(data_train, label_train)
label_pred = neigh_original.predict(data_test)

In [None]:
# Calculate accuracy score for the test set

# Balanced accuracy is computed for both variants on their test splits:
# the transformed representation first, then the original values.
acc_score = metrics.balanced_accuracy_score(y_test, y_pred)
original_acc_score = metrics.balanced_accuracy_score(label_test, label_pred)

print('With the transformation:', acc_score)
print('Without the transformation:', original_acc_score)

In [11]:
# Test the classifier for multiple parameters combinations

def test_knn(data, labels, window_size, n_bins):
    """
    Compare a KNN classifier trained on the new representation with the same
    classifier trained on the original values, for one parameter combination.

    The confusion matrices and balanced accuracies of both variants are passed to
    `export_creation.save_classifier_metrics` under the notebook-level `PLOT_NAME`.

    -----------
    Parameters:
    data (pd.Dataframe): values of the dataset.
    labels (pd.Series): labels of the dataset.
    window_size (int): sliding-window size used for the symbol dictionary and the new representation.
    n_bins (int): number of bins of the SAX transformation.

    -----------
    Returns:
    acc_score (float): balanced accuracy on the test split with the transformation.
    original_acc_score (float): balanced accuracy on the test split without the transformation.
    """
    # Use the arguments rather than the notebook-level `param`, so the result
    # does not depend on what the caller's loop variable happens to be called.
    sax_values = functions.run_sax(data, n_bins=n_bins)
    symbols_dict = functions.compute_symbols_dictionary(np.unique(sax_values), window_size=window_size)
    sax_data = pd.DataFrame(sax_values.reshape(data.shape))
    X = sax_data.apply(lambda row : functions.create_new_representation(row, window_size, dict=symbols_dict), axis=1)

    # 20% of the rows are held out for testing; a further 25% of the remainder
    # is set aside as validation data (unused here), so the training set has
    # the same size as in the step-by-step cells.
    X_train, X_test, y_train, y_test = train_test_split(X, labels, test_size=0.2, random_state=42)
    X_train, _, y_train, _ = train_test_split(X_train, y_train, test_size=0.25, random_state=42)
    data_train, data_test, label_train, label_test = train_test_split(data, labels, test_size=0.2, random_state=42)
    data_train, _, label_train, _ = train_test_split(data_train, label_train, test_size=0.25, random_state=42)

    # Classifier on the new representation.
    neigh = KNeighborsClassifier()
    neigh.fit(X_train, y_train)
    y_pred = neigh.predict(X_test)
    cm = metrics.confusion_matrix(y_test, y_pred)
    acc_score = metrics.balanced_accuracy_score(y_test, y_pred)

    # Baseline: a fresh classifier on the original values.
    original_neigh = KNeighborsClassifier()
    original_neigh.fit(data_train, label_train)
    label_pred = original_neigh.predict(data_test)
    original_cm = metrics.confusion_matrix(label_test, label_pred)
    original_acc_score = metrics.balanced_accuracy_score(label_test, label_pred)

    export_creation.save_classifier_metrics(acc_score, original_acc_score, cm, original_cm, PLOT_NAME, window_size,
                                            n_bins, y_train, label_train, algorithm='KNN')
    return acc_score, original_acc_score

# Evaluate every (n_bins, window_size) pair of the grid; both score lists
# follow the order of PARAM.
acc_scores, og_acc_scores = [], []

for param in PARAM:
    acc, og_acc = test_knn(data, labels, window_size=param[1], n_bins=param[0])
    acc_scores.append(acc)
    og_acc_scores.append(og_acc)

In [12]:
# Hand the per-combination scores (with and without the transformation) to the
# plotting helper, tagged with the dataset name and the algorithm.
export_creation.plot_accuracies(acc_scores, og_acc_scores, PLOT_NAME, algorithm='KNN')