In [None]:
import pandas as pd
import numpy as np

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=Warning)

In [None]:
# Category labels array; the bare last line displays its shape.
categories = np.load(file='/kaggle/input/timeseries/categories.npy')
categories.shape

In [None]:
# Valid-period array (used below as (start, end) pairs); the last line displays its shape.
valid_periods = np.load(file='/kaggle/input/timeseries/valid_periods.npy')
valid_periods.shape

Find the shortest and the longest valid period (end − start) among all time series.

In [None]:
min, max = (9999, -1), (0, -1)
for i, tuple in enumerate(valid_periods):
    diff = tuple[1] - tuple[0]
    if diff < min[0]: min = (diff, i)
    if diff > max[0]: max = (diff, i)
        
min, max

In [None]:
index_shortest = min[1]
index_longest = max[1]

In [None]:
# Raw training array, indexed below as [series, time step]; the last line displays its shape.
training_data = np.load(file='/kaggle/input/timeseries/training_data.npy')
training_data.shape

In [None]:
# Wrap the raw array in a DataFrame and print its dtype / memory summary.
training_data_df = pd.DataFrame(data=training_data)
training_data_df.info()

In [None]:
# Cast every column to float32, print the summary again for comparison,
# then display the descriptive statistics as the cell output.
training_data_df = training_data_df.astype('float32')
training_data_df.info()
training_data_df.describe()

In [None]:
# Pick three example series by row label. `.loc[label]` returns the same
# Series as `.T[label]` but does not transpose the whole frame on every lookup.
sample_first = training_data_df.loc[0]
sample_shortest = training_data_df.loc[index_shortest]
sample_longest = training_data_df.loc[index_longest]

In [None]:
import matplotlib.pyplot as plt

# Draw each full-length example series in its own figure.
for sample, sample_title in (
    (sample_first, 'First Sample'),
    (sample_shortest, 'Shortest Sample'),
    (sample_longest, 'Longest Sample'),
):
    plt.plot(sample)
    plt.title(sample_title)
    plt.show()

In [None]:
# Valid (start, end) bounds of the three example series.
start_first, end_first = valid_periods[0]
start_shortest, end_shortest = valid_periods[index_shortest]
start_longest, end_longest = valid_periods[index_longest]

# Keep only the valid period of each series (positional slice, end exclusive).
sample_first_trimmed = sample_first.iloc[start_first:end_first]
sample_shortest_trimmed = sample_shortest.iloc[start_shortest:end_shortest]
sample_longest_trimmed = sample_longest.iloc[start_longest:end_longest]

# One figure per trimmed series.
for trimmed_sample, trimmed_title in (
    (sample_first_trimmed, 'First Sample'),
    (sample_shortest_trimmed, 'Shortest Sample'),
    (sample_longest_trimmed, 'Longest Sample'),
):
    plt.plot(trimmed_sample)
    plt.title(trimmed_title)
    plt.show()


In [None]:
# Number of series per category: np.unique returns (labels, counts).
category_counts = np.unique(categories, return_counts=True)
unique_labels, label_counts = category_counts
for category, count in zip(unique_labels, label_counts):
    print(f"Category {category}: {count}")

In [None]:
# Length (end - start) of every valid period.
lengths = valid_periods[:, 1] - valid_periods[:, 0]

# Length buckets as (low, high) pairs; BOTH bounds are inclusive, which matches
# the "low-high" row labels built further down.
intervals = [(0, 300), (301, 600), (601, 900), (901, 1200), (1201, 1500), 
             (1501, 1800), (1801, 2100), (2101, 2400), (2401, 2776)]

# interval_counts[category][interval] -> number of series of that category
# whose length falls into that bucket.
interval_counts = {category: {interval: 0 for interval in intervals} for category in 'ABCDEF'}

for category, length in zip(categories, lengths):
    for interval in intervals:
        low, high = interval
        # The upper bound must be inclusive: with `length < high`, a length of
        # exactly 300, 600, ..., 2776 matched no bucket and was silently dropped.
        if low <= length <= high:
            interval_counts[category][interval] += 1
            break

# Rows = buckets, columns = categories.
interval_counts_df = pd.DataFrame(interval_counts)

interval_ranges = [f"{start}-{end}" for start, end in intervals]
interval_counts_df.index = interval_ranges

interval_counts_df

In [None]:
# Plot up to 10 randomly chosen time series for each category
SERIES_PER_CATEGORY = 10
unique_categories = np.unique(categories)

# A dedicated, seeded generator makes the random sample reproducible on
# Restart & Run All without touching NumPy's global random state.
sample_rng = np.random.default_rng(42)

# One row per category actually present (instead of a hard-coded 6);
# squeeze=False keeps `axes` indexable even when there is a single category.
fig, axes = plt.subplots(
    nrows=len(unique_categories),
    ncols=1,
    figsize=(12, 3 * len(unique_categories)),
    squeeze=False,
)
axes = axes[:, 0]

for i, category in enumerate(unique_categories):
    # Get the indices of the time series for this category
    indices = np.where(categories == category)[0]
    # Sampling without replacement raises if more items are requested than
    # exist, so cap the sample size at the category size.
    n_chosen = min(SERIES_PER_CATEGORY, len(indices))
    chosen_indices = sample_rng.choice(indices, n_chosen, replace=False)

    for ts_index in chosen_indices:
        # Find the valid period for this time series
        start, end = valid_periods[ts_index]
        # Plot the time series data for its valid period
        axes[i].plot(range(start, end), training_data[ts_index, start:end], label=f'Index {ts_index}')

    axes[i].set_title(f'Category {category}')
    # Label every subplot; plt.xlabel/plt.ylabel only reached the last one.
    axes[i].set_xlabel('Time')
    axes[i].set_ylabel('Value')
    axes[i].legend()

plt.tight_layout()
plt.show()

In [None]:
# Import the necessary function from statsmodels
from statsmodels.stats.multicomp import pairwise_tukeyhsd

# Prepare the data for Tukey's HSD test: the response values are the valid-period
# lengths and the groups are the category labels. They are passed directly;
# packing them into one array of (length, category) pairs turned the lengths
# into strings that then had to be parsed back into floats.
lengths_for_tukey = lengths.astype(float)
categories_for_tukey = np.asarray(categories)

# Perform Tukey's HSD test (pairwise comparison of mean length between categories)
tukey_result = pairwise_tukeyhsd(lengths_for_tukey, categories_for_tukey)
print(tukey_result)

In [None]:
# Mean valid-period length per category, keyed by category label.
mean_lengths = {
    category: lengths[categories == category].mean()
    for category in np.unique(categories)
}

print('MEAN LENGTHS')  # header typo ("LENGHTS") fixed
mean_lengths

In [None]:
from sklearn.preprocessing import OneHotEncoder

# Ask for a dense array. scikit-learn 1.2 renamed `sparse` to `sparse_output`
# and 1.4 removed the old keyword, so try the new name first and fall back
# only for releases that predate it.
try:
    encoder = OneHotEncoder(sparse_output=False)
except TypeError:  # scikit-learn < 1.2 does not know `sparse_output`
    encoder = OneHotEncoder(sparse=False)

# The encoder expects a 2-D (n_samples, n_features) input.
reshaped_categories = categories.reshape(-1, 1)
onehot_categories = encoder.fit_transform(reshaped_categories)

onehot_categories

In [None]:
from sklearn.model_selection import train_test_split

# Split row positions (not the data itself) 80/20, stratified by category,
# so the same positions can index the frame, the periods and the labels.
all_indices = np.arange(len(training_data_df))
train_indices, val_indices = train_test_split(
    all_indices,
    test_size=0.2,
    random_state=42,
    stratify=categories,
)

# Training subset
train_set = training_data_df.iloc[train_indices]
valid_periods_train = valid_periods[train_indices]
onehot_categories_train = onehot_categories[train_indices]

# Validation subset
val_set = training_data_df.iloc[val_indices]
valid_periods_val = valid_periods[val_indices]
onehot_categories_val = onehot_categories[val_indices]

(
    train_set.shape,
    valid_periods_train.shape,
    onehot_categories_train.shape,
    val_set.shape,
    valid_periods_val.shape,
    onehot_categories_val.shape,
)

In [None]:
def calculate_padding_length(sq, w, t, s):
    """Return how many elements must be added to a sequence of length ``sq``.

    After padding, the sequence length equals ``w + t`` plus a whole number of
    strides ``s``.

    Args:
        sq: current sequence length.
        w: window (input) length.
        t: telescope (output) length.
        s: stride between consecutive windows.

    Returns:
        The number of padding elements (0 when the length already fits).
    """
    excess = sq - (w + t)
    # Too short (or exactly one cycle): pad up to a single window + telescope.
    if excess <= 0:
        return -excess
    # Otherwise pad up to the next multiple of the stride beyond one cycle.
    return (-excess) % s

In [None]:
def create_sequences(df, valid_periods, categories, window, telescope, stride=1):
    """Cut every series into (input window, target telescope) training pairs.

    For row ``i`` of ``df`` only the columns ``start:end`` given by
    ``valid_periods[i]`` are used. The slice is left-padded with zeros by the
    amount returned from ``calculate_padding_length`` and then scanned with a
    sliding window that advances ``stride`` steps at a time.

    Args:
        df: DataFrame with one series per row.
        valid_periods: sequence of ``(start, end)`` column bounds, one per row.
        categories: per-row category value, copied to every window of that row.
        window: number of steps in each input sequence.
        telescope: number of steps in each output sequence.
        stride: step between the starts of consecutive windows.

    Returns:
        Tuple ``(inputs, input_categories, outputs)`` of NumPy arrays built
        from the collected windows, their categories and their targets.
    """
    collected_inputs, collected_categories, collected_outputs = [], [], []
    span = window + telescope

    for row, (start, end) in enumerate(valid_periods):
        row_category = categories[row]
        values = df.iloc[row, start:end].values

        # Zeros go in front so the most recent observations stay at the end.
        pad = calculate_padding_length(len(values), window, telescope, stride)
        values = np.pad(values, (pad, 0), mode='constant', constant_values=0)

        for offset in range(0, len(values) - span + 1, stride):
            split = offset + window
            collected_inputs.append(values[offset:split])
            collected_categories.append(row_category)
            collected_outputs.append(values[split:split + telescope])

    return (
        np.array(collected_inputs),
        np.array(collected_categories),
        np.array(collected_outputs),
    )

In [None]:
# Windowing setup: input length, forecast length, step between windows.
window, telescope, stride = 100, 9, 20

In [None]:
# The same windowing parameters are used for both subsets.
sequence_params = (window, telescope, stride)

X_train, X_train_category, y_train = create_sequences(
    train_set, valid_periods_train, onehot_categories_train, *sequence_params
)
X_val, X_val_category, y_val = create_sequences(
    val_set, valid_periods_val, onehot_categories_val, *sequence_params
)

(
    X_train.shape,
    X_train_category.shape,
    y_train.shape,
    X_val.shape,
    X_val_category.shape,
    y_val.shape,
)

In [None]:
# Model input/output shapes (the trailing 1 is the single feature channel).
input_shape, output_shape = (window, 1), (telescope, 1)
category_shape = (6, )  # presumably the width of the one-hot category vector — confirm against onehot_categories

# Training hyper-parameters.
batch_size, epochs = 64, 200

In [None]:
import tensorflow.keras.layers as tfkl
import tensorflow as tf

def build_CONV_LSTM_model_with_categories(input_shape, category_shape, output_shape):
    """Assemble and compile a two-input BiLSTM + Conv1D forecasting model.

    Args:
        input_shape: shape of one time-series sample; ``input_shape[0]`` is the
            number of time steps.
        category_shape: shape of one category vector.
        output_shape: ``(steps, channels)`` of the forecast; the network output
            is cropped at the end so that only its first ``steps`` time steps
            remain.

    Returns:
        A ``tf.keras.Model`` taking ``[time_series_input, category_input]``,
        compiled with mean-squared-error loss and the Adam optimizer.
    """
    n_steps = input_shape[0]

    # Sequence branch: bidirectional LSTM followed by a 1D convolution.
    time_series_input = tfkl.Input(shape=input_shape, name='time_series_input')
    recurrent_block = tfkl.Bidirectional(tfkl.LSTM(64, return_sequences=True), name='bidirectional_lstm')
    sequence_features = recurrent_block(time_series_input)
    sequence_features = tfkl.Conv1D(128, 3, padding='same', activation='relu')(sequence_features)

    # Category branch: dense projection, repeated once per time step so it can
    # be concatenated with the sequence features along the channel axis.
    category_input = tfkl.Input(shape=category_shape, name='category_input')
    category_features = tfkl.Dense(20, activation='relu')(category_input)
    category_per_step = tfkl.RepeatVector(n_steps)(category_features)

    merged = tfkl.Concatenate()([sequence_features, category_per_step])

    # Convolutional head producing one value per channel and time step.
    merged = tfkl.Conv1D(128, 3, padding='same', activation='relu')(merged)
    forecast = tfkl.Conv1D(output_shape[1], 3, padding='same')(merged)

    # Drop the surplus time steps from the end of the output sequence.
    surplus_steps = forecast.shape[1] - output_shape[0]
    forecast = tfkl.Cropping1D((0, surplus_steps))(forecast)

    model = tf.keras.Model(
        inputs=[time_series_input, category_input],
        outputs=forecast,
        name='CONV_LSTM_with_Categories',
    )
    model.compile(
        loss=tf.keras.losses.MeanSquaredError(),
        optimizer=tf.keras.optimizers.Adam(),
    )
    return model

In [None]:
# Instantiate the compiled network with the shapes configured above.
model = build_CONV_LSTM_model_with_categories(
    input_shape=input_shape,
    category_shape=category_shape,
    output_shape=output_shape,
)

In [None]:
from tensorflow import keras as tfk

# Print the layer table, then render the model graph as the cell output.
model.summary()
tfk.utils.plot_model(model, show_shapes=True, expand_nested=True)

In [None]:
# Stop after 15 epochs without validation improvement and restore the best weights.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss', mode='min', patience=15, restore_best_weights=True
)

# Multiply the learning rate by 0.1 after 10 stagnant epochs, never below 1e-5.
reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(
    monitor='val_loss', mode='min', patience=10, factor=0.1, min_lr=1e-5
)

training_callbacks = [early_stopping, reduce_lr]

# Train on the windowed sequences; only the per-epoch metrics dict is kept.
fit_result = model.fit(
    x=[X_train, X_train_category],
    y=y_train,
    batch_size=batch_size,
    epochs=epochs,
    validation_data=([X_val, X_val_category], y_val),
    callbacks=training_callbacks,
)
history = fit_result.history

In [None]:
# Epoch (0-based) with the lowest validation loss.
best_epoch = np.argmin(history['val_loss'])

# Training vs. validation loss, with the best epoch marked.
plt.figure(figsize=(17,4))
plt.plot(history['loss'], label='Training loss', alpha=.8, color='#ff7f0e')
plt.plot(history['val_loss'], label='Validation loss', alpha=.9, color='#5a9aa5')
plt.axvline(x=best_epoch, label='Best epoch', alpha=.3, ls='--', color='#5a9aa5')
plt.title('Mean Squared Error (Loss)')
plt.legend()
plt.grid(alpha=.3)
plt.show()

# ReduceLROnPlateau logs the learning rate as 'lr' in tf.keras (Keras 2) and
# as 'learning_rate' in Keras 3; accept either so this cell does not KeyError.
lr_key = 'lr' if 'lr' in history else 'learning_rate'

# Learning-rate schedule over the same epochs.
plt.figure(figsize=(18,3))
plt.plot(history[lr_key], label='Learning Rate', alpha=.8, color='#ff7f0e')
plt.axvline(x=best_epoch, label='Best epoch', alpha=.3, ls='--', color='#5a9aa5')
plt.legend()
plt.grid(alpha=.3)
plt.show()

In [None]:
# Forecast every validation window (series input + category input).
validation_inputs = [X_val, X_val_category]
preds = model.predict(validation_inputs)

In [None]:
# Display the shape of the prediction array as the cell output.
preds.shape

In [None]:
# Persist the trained model under the relative path 'conv_lstm_tp9_ws100'.
model.save('conv_lstm_tp9_ws100')

In [None]:
# Shell command: recursively zip the path written by the save cell
# (assumes the notebook's working directory is /kaggle/working — confirm).
!zip -r conv_lstm_tp9_ws100.zip /kaggle/working/conv_lstm_tp9_ws100