# Load and transform the TwoPatterns dataset
http://timeseriesclassification.com/description.php?Dataset=TwoPatterns

## Setup

In [37]:
import os
import pickle
from time import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from tqdm import tqdm

# Unix timestamp (whole seconds) of this run; it is appended to output file names.
time_index = int(time())

# Optional seed for NumPy's global random state, which drives the random cuts
# (np.random.randint) and the pandas .sample() calls used for the split.
# None keeps the original behaviour: every run produces a different dataset.
RANDOM_SEED = None
if RANDOM_SEED is not None:
    np.random.seed(RANDOM_SEED)

DATA_DIR = "data_two_pattern"
DIFFERENT_LENS = True # if True, a random number of trailing time steps is cut from every sample
UNBALANCED = False # if True, the training split is drawn with unequal per-class fractions

TOTAL_SAMPLES = 5000 # total number of samples available (train + test files together)
NO_TEST_SAMPLES = 800
NO_TRAIN_SAMPLES = 200
if NO_TEST_SAMPLES + NO_TRAIN_SAMPLES > TOTAL_SAMPLES:
    raise ValueError(
        f"Requested {NO_TEST_SAMPLES} test + {NO_TRAIN_SAMPLES} train samples, "
        f"but only {TOTAL_SAMPLES} samples are available."
    )

# The output file name encodes the configuration of this run.
infix = "dif" if DIFFERENT_LENS else "eq"
infix2 = "unbal_" if UNBALANCED else ""
OUTPUT_FILENAME = f"two_pattern_{infix}_lens_{infix2}ntrain{NO_TRAIN_SAMPLES}.pkl"

## Prepare and save dataset

In [38]:
# Read both source files (test first, then train) into one pool of rows.
# Every non-empty line is a whitespace-separated list of numbers:
# the first value is the class label, the remaining values are the series.
converted_lines = []
for filename in ["test.txt", "train.txt"]:
    with open(f"{DATA_DIR}/{filename}", "r") as f:
        for line in f:
            floats = [float(token) for token in line.split()]
            if not floats:
                # Blank line (e.g. trailing newline at the end of the file):
                # there is no label to read, so skip it instead of crashing below.
                continue
            converted_lines.append(floats)

# The sample id is simply the position of the row in the pooled list.
sample_ids = list(range(len(converted_lines)))
true_labels = [int(X[0])-1 for X in converted_lines] # labels should start from 0
# Each series becomes a column vector of shape (n_steps, 1).
all_X_samples = [np.array(X[1:]).reshape(-1,1) for X in converted_lines]

In [39]:
# Sanity check: shape of the first loaded sample (n_steps rows, 1 column).
all_X_samples[0].shape

(128, 1)

In [40]:
# If different lengths are requested, cut a random number of trailing time steps
# off every sample.
# NOTE: this cell overwrites all_X_samples, so re-running it without re-running
# the loading cell trims the already shortened samples a second time.
if DIFFERENT_LENS:
    new_samples = []
    for position, X in enumerate(all_X_samples):
        cut = np.random.randint(1,40) # number of steps to drop: 1..39 (upper bound is exclusive)
        new = X[:-cut]
        # Validate before storing so an empty sample can never end up in the list.
        if len(new) == 0:
            raise ValueError(
                f"Sample at position {position} (length {len(X)}) is empty "
                f"after cutting {cut} time steps."
            )
        new_samples.append(new)
    all_X_samples = new_samples # overwrite sample list

sample_lengths = [sample.shape[0] for sample in all_X_samples]

# One row per sample: its id, its zero-based class label and its (possibly cut) length.
labels_df = pd.DataFrame({
    "sample_id": sample_ids,
    "true_label": true_labels,
    "sample_len": sample_lengths
})

In [41]:
# Preview the first rows of the per-sample table (id, label, length).
labels_df.head()

Unnamed: 0,sample_id,true_label,sample_len
0,0,1,99
1,1,2,99
2,2,0,126
3,3,2,119
4,4,1,104


In [42]:
# Draw the train and test ids class by class. Within each class the test ids are
# sampled only from rows that were not picked for training, so the two sets are
# disjoint. The ids collected here are index labels of labels_df.
train_samples_ids = []
test_samples_ids = []

# Per-class fractions used for the training set when UNBALANCED is set.
# NOTE(review): the fractions sum to 1 but the product is additionally divided
# by 4, so the unbalanced training set holds roughly NO_TRAIN_SAMPLES/4 samples
# in total - confirm whether that is intended.
frac = [0.1, 0.1, 0.2, 0.6]

for label, sub_df in labels_df.groupby("true_label"):
    n_train = (
        int(np.ceil(frac[label]*NO_TRAIN_SAMPLES/4))
        if UNBALANCED
        else int(NO_TRAIN_SAMPLES/4)
    )
    train_ids = sub_df.sample(n_train).index.values

    not_in_train = sub_df.drop(train_ids)
    test_ids = not_in_train.sample(int(NO_TEST_SAMPLES/4)).index.values

    train_samples_ids += list(train_ids)
    test_samples_ids += list(test_ids)

# A list with a single split; each split is a dict holding both id lists.
single_split = {"test_samples_ids": test_samples_ids, "train_samples_ids": train_samples_ids}
indices_splits_lst = [single_split]

In [43]:
# Final file path; note that the timestamp is appended after the ".pkl" suffix.
output_path = f"{DATA_DIR}/{OUTPUT_FILENAME}_{time_index}"

# Summary statistics stored alongside the data. The sample counts are totals over
# all classes, taken from the complete id lists (train_ids / test_ids only hold
# the ids of the last class processed by the split loop).
metadata = {
    "NO_TRAIN_SAMPLES": len(train_samples_ids),
    "NO_TEST_SAMPLES": len(test_samples_ids),
    "MAX_SAMPLE_LEN": max(sample_lengths),
    "MIN_SAMPLE_LEN": min(sample_lengths)
}

# Everything that is pickled: the samples, their labels, the split and provenance info.
data = {
    'generating_model': "SIM_TwoPat",
    'data_filename': output_path,
    'time_index': time_index,
    'models_lst': None,
    'labels_df': labels_df,
    'all_X_samples': all_X_samples,
    'indices_splits_lst': indices_splits_lst,
    'metadata': metadata,
    'description': "http://timeseriesclassification.com/description.php?Dataset=TwoPatterns"
}

with open(output_path, 'wb') as f:
    pickle.dump(data, f)
    print(f"Data saved to {output_path}.")

Data saved to data_two_pattern/two_pattern_dif_lens_ntrain200.pkl_1661938075.


## EDA

### Is dataset balanced?

In [44]:
# Class distribution of the training split. The collected ids are labels, not
# positions, so they are looked up with .loc on the sample_id index.
labels_df.set_index("sample_id").loc[train_samples_ids]["true_label"].value_counts()

0    50
1    50
2    50
3    50
Name: true_label, dtype: int64

In [45]:
# Class distribution of the test split. The collected ids are labels, not
# positions, so they are looked up with .loc on the sample_id index.
labels_df.set_index("sample_id").loc[test_samples_ids]["true_label"].value_counts()

0    200
1    200
2    200
3    200
Name: true_label, dtype: int64

In [46]:
# Rows of the test split, selected by label (.loc) on the sample_id index.
labels_df.set_index("sample_id").loc[test_samples_ids]

Unnamed: 0_level_0,true_label,sample_len
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1
318,0,103
1579,0,120
4286,0,97
31,0,98
611,0,97
...,...,...
2354,3,118
3495,3,95
4959,3,100
4603,3,122


In [47]:
# Shape of the first sample at this point of the notebook, i.e. after the optional random cut.
all_X_samples[0].shape

(99, 1)

### Plots of samples

In [48]:
# Plot two randomly chosen samples per class in a 2x2 grid (one panel per class).
# Only done for balanced data with equal lengths, since samples of different
# lengths cannot easily be plotted together.
if not UNBALANCED and not DIFFERENT_LENS:
    labels = ["DD", "UD", "DU", "UU"]
    plt.rcParams["figure.figsize"] = (40, 16)
    plt.rcParams['font.size'] = 30
    fig, axes = plt.subplots(2,2, sharey="all")
    plt.suptitle("Two samples of each class")
    for i, df in tqdm(labels_df.groupby("true_label"), desc = "Preparing plots"):
        ax = axes.flat[i]
        ids = df["sample_id"].sample(2).values
        # sample_id is used as the position in all_X_samples.
        samples = np.array([all_X_samples[sample_id] for sample_id in ids])
        # Reshape to (2, n_steps) and transpose so each sample is drawn as one line.
        ax.plot(samples.reshape((2, -1)).T, lw = 3)
        ax.set_title(f"({labels[i]}) label = {i}")
        ax.grid()

    # savefig does not create missing directories, so make sure the target exists.
    os.makedirs("./plots", exist_ok=True)
    plt.savefig(f"./plots/models_twoPat_{time_index}.jpg")
    plt.show()