# Load and transform the TwoPatterns dataset
Dataset description: http://timeseriesclassification.com/description.php?Dataset=TwoPatterns

## Setup

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pickle
from time import time

# Unix timestamp (whole seconds) of this run; it is stored in the saved
# dataset as metadata and does not influence the output file name.
time_index = int(time())

# Name of the pickle file written by the save cell (plain string: there is
# nothing to interpolate, so no f-string is needed).
OUTPUT_FILENAME = "two_pattern_diff_lens.pkl"
# When True, a random number of trailing points is cut off every sample so
# that the series end up with different lengths.
DIFFERENT_LENS = True
# Number of samples (counted from the first loaded sample) used as the test set.
NO_TEST_SAMPLES = 4000 # there are 5000 samples in total

## Prepare and save dataset

In [2]:
# Read every series from the raw text files. Each non-empty line is one
# sample: the first number is the class label, the remaining numbers are the
# series values. "test.txt" is read first, so its samples get the lowest ids.
converted_lines = []
for filename in ["test.txt", "train.txt"]:
    with open(f"data_two_pat/{filename}", "r") as f:
        for line in f:
            floats = list(map(float, line.split()))
            if not floats:
                # Blank line (e.g. a trailing newline at the end of the file):
                # there is no label to read, so skip it instead of failing
                # later with an IndexError.
                continue
            converted_lines.append(floats)

sample_ids = list(range(len(converted_lines)))
true_labels = [int(X[0]) - 1 for X in converted_lines]  # labels should start from 0
# Each sample becomes a column vector of shape (n_points, 1).
all_X_samples = [np.array(X[1:]).reshape(-1, 1) for X in converted_lines]

# If different lengths are requested, cut 1-39 trailing points off every sample.
if DIFFERENT_LENS:
    # Fixed seed so that re-running the notebook regenerates the same dataset.
    rng = np.random.default_rng(0)
    new_samples = []
    for sample_id, X in enumerate(all_X_samples):
        cut = rng.integers(1, 40)  # upper bound is exclusive
        new = X[:-cut]
        if len(new) == 0:
            raise ValueError(
                f"Sample {sample_id} has only {len(X)} points; "
                f"cutting {cut} leaves it empty."
            )
        new_samples.append(new)
    all_X_samples = new_samples  # overwrite sample list

sample_lengths = [sample.shape[0] for sample in all_X_samples]

# The first NO_TEST_SAMPLES samples form the test set and all remaining
# samples form the training set, so the split follows the loaded data instead
# of hard-coded bounds.
if NO_TEST_SAMPLES > len(all_X_samples):
    raise ValueError(
        f"NO_TEST_SAMPLES={NO_TEST_SAMPLES} exceeds the number of loaded "
        f"samples ({len(all_X_samples)})."
    )
test_ids = list(range(NO_TEST_SAMPLES))
train_ids = list(range(NO_TEST_SAMPLES, len(all_X_samples)))

In [4]:
# Per-sample table: one row per series holding its id, its zero-based class
# label and its length.
label_columns = [
    ("sample_id", sample_ids),
    ("true_label", true_labels),
    ("sample_len", sample_lengths),
]
labels_df = pd.DataFrame(dict(label_columns))

# A single train/test split, stored as the only element of a list.
single_split = {}
single_split["test_samples_ids"] = test_ids
single_split["train_samples_ids"] = train_ids
indices_splits_lst = [single_split]

In [5]:
# Destination of the pickled dataset; the file name comes from the setup cell.
data_filename = OUTPUT_FILENAME
output_path = f"data_two_pat/{data_filename}"

# Split sizes and the range of series lengths, saved alongside the samples.
metadata = {}
metadata["N_TRAIN_SAMPLES_PER_MODEL"] = len(train_ids)
metadata["N_TEST_SAMPLES_PER_MODEL"] = len(test_ids)
metadata["MAX_SAMPLE_LEN"] = max(sample_lengths)
metadata["MIN_SAMPLE_LEN"] = min(sample_lengths)

# Everything that is written to disk, bundled into one dictionary.
data = {
    "generating_model": "SIM_TwoPat",
    "data_filename": data_filename,
    "time_index": time_index,
    "models_lst": None,  # no model objects are stored for this dataset
    "labels_df": labels_df,
    "all_X_samples": all_X_samples,
    "indices_splits_lst": indices_splits_lst,
    "metadata": metadata,
    "description": "http://timeseriesclassification.com/description.php?Dataset=TwoPatterns",
}

# Serialize the bundle and report where it was written.
with open(output_path, "wb") as out_file:
    pickle.dump(data, out_file)
    print(f"Data saved to {output_path}.")

Data saved to data_two_pat/two_pattern_diff_lens.pkl.


## EDA