<a href="https://colab.research.google.com/github/nvinogradskaya/DL_HW4_RNN/blob/main/Hybrid_LSTM-v3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import os
import tensorflow as tf
import pandas as pd
import uuid
import shutil
import matplotlib.pyplot as plt
import pickle

from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense, Concatenate, Dropout, LayerNormalization
from tensorflow.keras.utils import Sequence
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Mount Google Drive so that the dataset and the result folders below are reachable.
from google.colab import drive
drive.mount('/content/drive')

# ======== parameters ========
MAX_USERS = 5        # default number of user folders read by load_and_preprocess_data
SEQ_LENGTH = 10      # number of time steps in each input window
EMBEDDING_DIM = 16   # size of the per-user embedding produced by the contrastive encoder
LSTM_UNITS = 64      # hidden size of the LSTM in the trajectory predictor
BATCH_SIZE = 64      # batch size used by the data generators
EPOCHS = 10          # intended number of training epochs — TODO confirm where it is consumed
TEST_SIZE = 0.3      # intended held-out fraction per user — TODO confirm where it is consumed

# Input data and output locations on the mounted Drive.
DATA_PATH = "/content/drive/My Drive/Colab Notebooks/Data/"
SAVE_PATH = "/content/drive/My Drive/Colab Notebooks/contrastive_results-v3/"
SEQ_SAVE_PATH = os.path.join(SAVE_PATH, 'sequences/')
# Create the output folders up front; exist_ok makes the cell safe to re-run.
os.makedirs(SAVE_PATH, exist_ok=True)
os.makedirs(SEQ_SAVE_PATH, exist_ok=True)

Mounted at /content/drive


In [None]:
from tqdm import tqdm
def load_and_preprocess_data(data_path, max_users=MAX_USERS):
    """Load per-user ``.plt`` trajectory files and derive the model features.

    Expects ``data_path/<user>/Trajectory/*.plt``. From each file the first
    6 rows are skipped and columns 0, 1, 3, 5, 6 are read as
    ``lat, lon, alt, date, time``.

    Args:
        data_path: Root directory that contains one sub-directory per user.
        max_users: Maximum number of user directories (in sorted order) to load.

    Returns:
        ``(df, user_ids, scaler)`` where ``df`` is sorted by user and time and
        holds MinMax-scaled ``lat/lon/alt``, cyclic ``hour_*``/``day_*``
        features and an integer ``user_id``; ``user_ids`` maps the directory
        name to that integer; ``scaler`` is the fitted ``MinMaxScaler``.

    Raises:
        ValueError: If no ``.plt`` file is found for the selected users.
    """
    # Only keep entries that really are user folders with a 'Trajectory'
    # sub-folder; stray files next to the user folders would otherwise crash
    # the os.listdir call below.
    user_dirs = [
        name for name in sorted(os.listdir(data_path))
        if os.path.isdir(os.path.join(data_path, name, 'Trajectory'))
    ][:max_users]

    frames = []
    for user in tqdm(user_dirs, desc="Loading users"):
        traj_dir = os.path.join(data_path, user, 'Trajectory')
        traj_files = sorted(f for f in os.listdir(traj_dir) if f.endswith('.plt'))
        for traj_file in traj_files:
            traj_df = pd.read_csv(
                os.path.join(traj_dir, traj_file),
                skiprows=6,
                header=None,
                usecols=[0, 1, 3, 5, 6],
                names=['lat', 'lon', 'alt', 'date', 'time']
            )
            traj_df['user'] = user
            frames.append(traj_df)

    if not frames:
        # pd.concat([]) would fail with an unhelpful "No objects to concatenate".
        raise ValueError(f"No .plt trajectory files found under {data_path!r}")

    df = pd.concat(frames, ignore_index=True)
    df['datetime'] = pd.to_datetime(df['date'] + ' ' + df['time'])
    df = df.sort_values(by=['user', 'datetime'])
    # Drop rows with a zero coordinate, then forward-fill remaining gaps.
    df = df[(df['lat'] != 0) & (df['lon'] != 0)].ffill()

    # NOTE(review): the scaler is fitted on ALL rows, i.e. before the later
    # train/test split, so test-set ranges leak into the scaling.
    scaler = MinMaxScaler()
    df[['lat', 'lon', 'alt']] = scaler.fit_transform(df[['lat', 'lon', 'alt']])

    # Cyclic encodings so that 23h/0h and Sunday/Monday end up close together.
    hour = df['datetime'].dt.hour
    weekday = df['datetime'].dt.dayofweek
    df['hour_sin'] = np.sin(2 * np.pi * hour / 24)
    df['hour_cos'] = np.cos(2 * np.pi * hour / 24)
    df['day_sin'] = np.sin(2 * np.pi * weekday / 7)
    df['day_cos'] = np.cos(2 * np.pi * weekday / 7)

    user_ids = {user: idx for idx, user in enumerate(df['user'].unique())}
    df['user_id'] = df['user'].map(user_ids)

    return df, user_ids, scaler

In [None]:
# Load the trajectories (max_users defaults to MAX_USERS) and keep the fitted scaler.
df, user_ids, scaler = load_and_preprocess_data(DATA_PATH)

Loading users: 100%|██████████| 5/5 [00:18<00:00,  3.69s/it]


In [None]:
# ======== Sequence Creation ========
def create_sequences_and_save(df, user_ids, seq_length, test_size=0.3, save_path='./seq_data'):
    """Cut every user's rows into sliding windows and store them as ``.npz`` chunks.

    Each user's rows are split chronologically: the first ``1 - test_size``
    fraction becomes the train part, the rest (prefixed with the last
    ``seq_length`` train rows as context) becomes the test part. A sample is
    ``seq_length`` consecutive feature rows (``X``) and the ``lat``/``lon`` of
    the row that follows them (``y``).

    Files are written as ``user_<uid>_<train|test>_<chunk>.npz`` with at most
    1000 samples each. ``<chunk>`` is a zero-padded running index, so sorting
    the file names reproduces the chronological order and repeated runs yield
    the same file names.

    Args:
        df: Frame with a ``user`` column and the feature columns listed below.
        user_ids: Mapping from ``user`` value to integer id.
        seq_length: Number of time steps per input window.
        test_size: Fraction of each user's rows assigned to the test part.
        save_path: Output directory (created if missing).
    """
    os.makedirs(save_path, exist_ok=True)
    features = ['lat', 'lon', 'alt', 'hour_sin', 'hour_cos', 'day_sin', 'day_cos']
    targets = ['lat', 'lon']
    window_size = seq_length + 1  # seq_length inputs + 1 row supplying the target
    chunk_size = 1000

    def save_windows(part_df, uid, suffix):
        """Window one contiguous part of a user's rows and write it in chunks."""
        values = part_df[features].values
        if len(values) < window_size:
            return
        # Result is (n_windows, 1, window_size, n_features); drop the singleton axis.
        windows = np.lib.stride_tricks.sliding_window_view(
            values, (window_size, values.shape[1])
        ).squeeze(axis=1)
        X = windows[:, :-1]                          # first seq_length rows of each window
        y = part_df[targets].values[seq_length:]     # row right after each window's inputs
        for chunk_no, start in enumerate(range(0, len(X), chunk_size)):
            np.savez_compressed(
                os.path.join(save_path, f'user_{uid}_{suffix}_{chunk_no:06d}.npz'),
                X=X[start:start + chunk_size],
                y=y[start:start + chunk_size],
                user_id=uid
            )

    for user, user_df in tqdm(df.groupby('user'), desc="Creating sequences"):
        uid = user_ids[user]
        user_df = user_df.reset_index(drop=True)
        split_idx = int(len(user_df) * (1 - test_size))
        if split_idx <= seq_length:
            # Not enough rows for a single training window.
            continue

        save_windows(user_df.iloc[:split_idx], uid, 'train')
        # Start seq_length rows early so the first test target is row split_idx.
        save_windows(user_df.iloc[split_idx - seq_length:], uid, 'test')

In [None]:
# Start from an empty directory so chunks from an earlier run cannot be mixed in.
shutil.rmtree(SEQ_SAVE_PATH, ignore_errors=True)
os.makedirs(SEQ_SAVE_PATH, exist_ok=True)
# Pass the configured split explicitly instead of relying on the function default.
create_sequences_and_save(df, user_ids, SEQ_LENGTH, test_size=TEST_SIZE, save_path=SEQ_SAVE_PATH)

Creating sequences: 100%|██████████| 5/5 [00:25<00:00,  5.03s/it]


In [None]:
# ======== Load Saved Sequences ========
def load_all_sequences_from_disk(save_path):
    """Read every ``.npz`` chunk in ``save_path`` and concatenate per split.

    The split is taken from the file name (``'train'`` in the name, otherwise
    test) and the user id from the second ``_``-separated token. Files are
    visited in sorted name order.

    Args:
        save_path: Directory that contains the ``.npz`` chunks.

    Returns:
        ``(X_train, X_test, y_train, y_test, users_train, users_test)`` as
        NumPy arrays; the ``users_*`` arrays hold one user id per sample.

    Raises:
        ValueError: If either split has no chunk at all.
    """
    # split name -> (X chunks, y chunks, per-sample user-id chunks)
    buckets = {'train': ([], [], []), 'test': ([], [], [])}
    for fname in tqdm(sorted(os.listdir(save_path)), desc="Loading sequences"):
        if not fname.endswith('.npz'):
            continue
        split_type = 'train' if 'train' in fname else 'test'
        uid = int(fname.split('_')[1])
        # The archive keeps a file handle open; close it as soon as the arrays are read.
        with np.load(os.path.join(save_path, fname)) as data:
            X, y = data['X'], data['y']
        xs, ys, users = buckets[split_type]
        xs.append(X)
        ys.append(y)
        users.append(np.full(len(X), uid))

    for split_type, (xs, _, _) in buckets.items():
        if not xs:
            # np.concatenate([]) would raise a far less helpful ValueError.
            raise ValueError(f"No '{split_type}' .npz chunks found in {save_path!r}")

    (xs_tr, ys_tr, us_tr), (xs_te, ys_te, us_te) = buckets['train'], buckets['test']
    return (
        np.concatenate(xs_tr), np.concatenate(xs_te),
        np.concatenate(ys_tr), np.concatenate(ys_te),
        np.concatenate(us_tr), np.concatenate(us_te)
    )

In [None]:
# Read all saved chunks back into memory, split into train/test arrays plus per-sample user ids.
X_train, X_test, y_train, y_test, users_train, users_test = load_all_sequences_from_disk(SEQ_SAVE_PATH)

Loading sequences: 100%|██████████| 1460/1460 [00:16<00:00, 89.85it/s] 


In [None]:
# ======== Contrastive Embedding Learning ========
def create_triplets(X, user_ids, max_pairs_per_user=100, rng=None):
    """Build (anchor, positive, negative) triplets for contrastive training.

    For every user, the anchor/positive pairs are consecutive samples of that
    user (index ``i`` and ``i + 1`` among the user's samples) and the negative
    is a uniformly drawn sample of any other user.

    Args:
        X: Array of samples, indexed along axis 0.
        user_ids: 1-D array with the user id of every sample in ``X``.
        max_pairs_per_user: Upper bound on the pairs taken per user.
        rng: Optional ``np.random.Generator`` for reproducible negatives. When
            ``None`` the global ``np.random`` state is used.

    Returns:
        ``(anchors, positives, negatives)`` as arrays of equal length. Users
        with fewer than two samples, or without any sample from another user
        to draw negatives from, contribute nothing.
    """
    pick = np.random.choice if rng is None else rng.choice
    anchors, positives, negatives = [], [], []
    for uid in tqdm(np.unique(user_ids), desc="Creating triplets"):
        same_user_idx = np.where(user_ids == uid)[0]
        diff_user_idx = np.where(user_ids != uid)[0]
        # No negatives exist when only one user is present; choice() would raise.
        if len(same_user_idx) < 2 or len(diff_user_idx) == 0:
            continue
        n_pairs = min(len(same_user_idx) - 1, max_pairs_per_user)
        for i in range(n_pairs):
            a_idx, p_idx = same_user_idx[i], same_user_idx[i + 1]
            n_idx = pick(diff_user_idx)
            anchors.append(X[a_idx])
            positives.append(X[p_idx])
            negatives.append(X[n_idx])
    return np.array(anchors), np.array(positives), np.array(negatives)

In [None]:
# Triplets are built from the training windows only.
anchors, positives, negatives = create_triplets(X_train, users_train)

Creating triplets: 100%|██████████| 5/5 [00:00<00:00, 242.49it/s]


In [None]:
def contrastive_model(input_shape, embedding_dim):
    """Build the sequence encoder used for contrastive learning.

    Args:
        input_shape: Shape of one input sample (without the batch axis).
        embedding_dim: Number of units of the final linear ``Dense`` layer.

    Returns:
        An uncompiled Keras ``Model``: ``LSTM(32)`` followed by
        ``Dense(embedding_dim)``.
    """
    sequence_in = Input(shape=input_shape)
    encoded = LSTM(32)(sequence_in)
    embedding = Dense(embedding_dim)(encoded)
    return Model(inputs=sequence_in, outputs=embedding)

def triplet_loss_fn(a, p, n, margin=1.0):
    """Mean hinge triplet loss on squared Euclidean distances.

    Args:
        a: Anchor embeddings, reduced over axis 1.
        p: Positive embeddings, same shape as ``a``.
        n: Negative embeddings, same shape as ``a``.
        margin: Amount by which the negative must be farther than the positive.

    Returns:
        Scalar tensor ``mean(max(d(a, p) - d(a, n) + margin, 0))``.
    """
    def squared_distance(u, v):
        return tf.reduce_sum(tf.square(u - v), axis=1)

    hinge = tf.maximum(squared_distance(a, p) - squared_distance(a, n) + margin, 0.0)
    return tf.reduce_mean(hinge)

In [None]:
# Encoder input shape is one training window (everything but the batch axis of X_train).
triplet_encoder = contrastive_model(X_train.shape[1:], EMBEDDING_DIM)
# Adam with learning rate 1e-3 for the manual contrastive training loop.
optimizer = tf.keras.optimizers.Adam(1e-3)

In [None]:
# Number of full-batch gradient steps: every step uses all triplets at once.
CONTRASTIVE_EPOCHS = 5

for epoch in range(CONTRASTIVE_EPOCHS):
    with tf.GradientTape() as tape:
        # training=True keeps the forward pass in training mode should the
        # encoder ever gain dropout or normalisation layers.
        emb_a = triplet_encoder(anchors, training=True)
        emb_p = triplet_encoder(positives, training=True)
        emb_n = triplet_encoder(negatives, training=True)
        loss = triplet_loss_fn(emb_a, emb_p, emb_n)
    grads = tape.gradient(loss, triplet_encoder.trainable_variables)
    optimizer.apply_gradients(zip(grads, triplet_encoder.trainable_variables))
    # The log text is plain ASCII (the leading letter used to be a Cyrillic
    # look-alike, which made the line impossible to grep for).
    print(f"contrastive epoch {epoch+1} // loss = {loss.numpy():.4f}")

сontrastive epoch 1 // loss = 0.1677
сontrastive epoch 2 // loss = 0.1434
сontrastive epoch 3 // loss = 0.1240
сontrastive epoch 4 // loss = 0.1090
сontrastive epoch 5 // loss = 0.0967


In [None]:
# Inference-only batch size for the encoder; batching bounds peak memory
# instead of pushing all windows of a user through the model in one call.
EMBED_BATCH_SIZE = 1024

# One embedding per user: the mean encoder output over that user's training windows.
train_uids = np.unique(users_train)  # sorted, which np.searchsorted below relies on
user_embeddings_matrix = {}
for uid in train_uids:
    user_embs = triplet_encoder.predict(
        X_train[users_train == uid], batch_size=EMBED_BATCH_SIZE, verbose=0
    )
    user_embeddings_matrix[uid] = user_embs.mean(axis=0)

# A test user without training windows has no embedding; fail loudly (the
# dict lookup this replaces raised KeyError in that case as well).
unknown_test_uids = np.setdiff1d(users_test, train_uids)
if unknown_test_uids.size:
    raise KeyError(f"No training embedding for user ids {unknown_test_uids.tolist()}")

# Vectorised lookup: row k of the table is the embedding of train_uids[k].
embedding_table = np.stack([user_embeddings_matrix[uid] for uid in train_uids])
user_embeddings_train = embedding_table[np.searchsorted(train_uids, users_train)]
user_embeddings_test = embedding_table[np.searchsorted(train_uids, users_test)]

In [None]:
class CombinedDataGenerator(tf.keras.utils.Sequence):
    """Batch generator that appends the user embedding to every time step.

    ``X`` is indexed as ``(sample, time, feature)`` and ``user_embs`` as
    ``(sample, embedding)``. Each batch repeats the embedding along the time
    axis and concatenates it to the features, giving inputs of shape
    ``(batch, time, feature + embedding)``.

    Args:
        X: Input windows.
        user_embs: One embedding vector per sample of ``X``.
        y: One target per sample of ``X``.
        batch_size: Maximum number of samples per batch.
        **kwargs: Forwarded to the Keras base class (for example ``workers``).

    Raises:
        ValueError: If ``X``, ``user_embs`` and ``y`` differ in length.
    """

    def __init__(self, X, user_embs, y, batch_size=64, **kwargs):
        # Newer Keras versions warn when the base constructor is not called.
        super().__init__(**kwargs)
        if not (len(X) == len(user_embs) == len(y)):
            raise ValueError(
                f"Length mismatch: X={len(X)}, user_embs={len(user_embs)}, y={len(y)}"
            )
        self.X = X
        self.user_embs = user_embs
        self.y = y
        self.batch_size = batch_size

    def __len__(self):
        """Number of batches; the last one may be smaller than ``batch_size``."""
        return int(np.ceil(len(self.X) / self.batch_size))

    def __getitem__(self, idx):
        """Return batch ``idx`` as ``(inputs, targets)`` float32 arrays."""
        # An out-of-range index must raise IndexError: Python's fallback
        # iteration protocol (used by a plain `for batch in generator` when
        # the base class defines no __iter__) only stops on that exception.
        if idx < 0 or idx >= len(self):
            raise IndexError(f"batch index {idx} out of range for {len(self)} batches")
        start = idx * self.batch_size
        end = min((idx + 1) * self.batch_size, len(self.X))
        X_batch = self.X[start:end]
        user_emb_batch = self.user_embs[start:end]
        y_batch = self.y[start:end]
        # (batch, emb) -> (batch, time, emb), so it can be joined on the feature axis.
        user_emb_expanded = np.repeat(user_emb_batch[:, np.newaxis, :], X_batch.shape[1], axis=1)
        combined_X = np.concatenate([X_batch, user_emb_expanded], axis=-1)
        return combined_X.astype(np.float32), y_batch.astype(np.float32)

def build_lstm_model(input_shape):
    """Create and compile the trajectory regressor.

    The network is LSTM(LSTM_UNITS) -> LayerNormalization -> Dropout(0.3) ->
    Dense(64, relu) -> Dense(32, relu) -> Dense(2, linear), compiled with the
    Adam optimizer, MSE loss and MAE as metric.

    Args:
        input_shape: Shape of one input sample (without the batch axis).

    Returns:
        The compiled Keras ``Model`` with a 2-unit output.
    """
    seq_input = Input(shape=input_shape)
    stack = (
        LSTM(LSTM_UNITS, return_sequences=False),
        LayerNormalization(),
        Dropout(0.3),
        Dense(64, activation='relu'),
        Dense(32, activation='relu'),
        Dense(2, activation='linear'),
    )
    hidden = seq_input
    for layer in stack:
        hidden = layer(hidden)
    regressor = Model(inputs=seq_input, outputs=hidden)
    regressor.compile(optimizer='adam', loss='mse', metrics=['mae'])
    return regressor

In [None]:
# Batch generators for the training windows and the held-out windows.
train_gen = CombinedDataGenerator(X_train, user_embeddings_train, y_train, batch_size=BATCH_SIZE)
val_gen = CombinedDataGenerator(X_test, user_embeddings_test, y_test, batch_size=BATCH_SIZE)
# Model input = (time steps, original features + appended user-embedding dimensions).
input_shape = (X_train.shape[1], X_train.shape[2] + EMBEDDING_DIM)

In [None]:
# Build and compile the trajectory regressor for the combined input shape.
model = build_lstm_model(input_shape)

In [None]:
# Keep the checkpoint with the lowest val_loss on disk and stop after 3 epochs without improvement.
# NOTE(review): val_gen wraps the test split, so model selection and early stopping use the test data.
# NOTE(review): epochs is hard-coded to 5 although the config cell defines EPOCHS = 10 — confirm which is intended.
callbacks = [
    ModelCheckpoint(os.path.join(SAVE_PATH, 'best_model.keras'), save_best_only=True, monitor='val_loss'),
    EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
]
history = model.fit(train_gen, validation_data=val_gen, epochs=5, callbacks=callbacks)

Epoch 1/5
[1m15917/15917[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m161s[0m 10ms/step - loss: 0.0025 - mae: 0.0288 - val_loss: 0.0034 - val_mae: 0.0492
Epoch 2/5
[1m15917/15917[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m169s[0m 11ms/step - loss: 6.2599e-04 - mae: 0.0124 - val_loss: 0.0027 - val_mae: 0.0467
Epoch 3/5
[1m15917/15917[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m155s[0m 10ms/step - loss: 3.4207e-04 - mae: 0.0096 - val_loss: 0.0020 - val_mae: 0.0376
Epoch 4/5
[1m15917/15917[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m163s[0m 10ms/step - loss: 2.5655e-04 - mae: 0.0081 - val_loss: 0.0021 - val_mae: 0.0381
Epoch 5/5
[1m15917/15917[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m195s[0m 10ms/step - loss: 2.1453e-04 - mae: 0.0075 - val_loss: 0.0023 - val_mae: 0.0403


In [None]:
# Restore the best checkpoint written by ModelCheckpoint during training.
model.load_weights(os.path.join(SAVE_PATH, 'best_model.keras'))
def predict_with_generator(model, X, user_embs, batch_size=64, save_path=None):
    """Predict batch by batch, optionally checkpointing the predictions to disk.

    Args:
        model: Keras model that accepts the batches of ``CombinedDataGenerator``.
        X: Input windows.
        user_embs: One embedding vector per sample of ``X``.
        batch_size: Number of samples per prediction batch.
        save_path: Optional path prefix. When given, ``<prefix>_partial_<k>.npy``
            is written every 1000 batches and ``<prefix>_final.npy`` at the end.

    Returns:
        Array with one prediction row per input sample, in input order. Rows
        of a batch whose prediction raised are filled with NaN, so the result
        always stays aligned with the targets. An empty ``(0, 2)`` array is
        returned when there is nothing to predict.
    """
    generator = CombinedDataGenerator(X, user_embs, np.zeros((len(user_embs), 2)), batch_size=batch_size)
    preds = []

    # Index explicitly over len(generator). Iterating the generator object
    # itself depends on the base class's iterator protocol and may never stop
    # when __getitem__ returns empty batches past the end instead of raising.
    for i in tqdm(range(len(generator)), total=len(generator), desc="Predicting"):
        X_batch, _ = generator[i]

        # Check for an empty batch (axis 0 is the batch axis).
        if X_batch.shape[0] == 0:
            print(f"⚠️ Пустой батч {i}, пропускаем.")
            continue

        try:
            # predict_on_batch avoids the per-call overhead of model.predict.
            batch_pred = np.asarray(model.predict_on_batch(X_batch))
        except Exception as e:
            print(f"⚠️ Ошибка на батче {i}: {e}")
            # Best effort: keep going, but keep the row count intact so later
            # rows still line up with their targets.
            batch_pred = np.full((X_batch.shape[0], 2), np.nan, dtype=np.float32)
        preds.append(batch_pred)

        # Periodic checkpoint of everything predicted so far.
        if save_path and (i + 1) % 1000 == 0:
            np.save(f"{save_path}_partial_{i+1}.npy", np.vstack(preds))

    if preds:
        final_preds = np.vstack(preds)
        if save_path:
            np.save(f"{save_path}_final.npy", final_preds)
            print(f"✅ Итоговое предсказание сохранено в: {save_path}_final.npy")
        return final_preds
    else:
        print("⚠️ Нет предсказаний — возвращаю пустой массив.")
        return np.empty((0, 2))

In [None]:
# Predict the held-out windows; partial and final predictions are also saved under SAVE_PATH with the "y_pred" prefix.
y_pred = predict_with_generator(model, X_test, user_embeddings_test, save_path=os.path.join(SAVE_PATH, "y_pred"))

In [None]:
def calculate_metrics(preds, targets, threshold=0.001):
    """Compute displacement-error metrics for single-step predictions.

    Args:
        preds: Predicted points, coordinates on the last axis.
        targets: True points, same shape as ``preds``.
        threshold: Error below which a prediction counts as a hit. It is in
            the SAME units as ``preds``/``targets``; for coordinates that were
            rescaled (for example MinMax-scaled) it is not a distance in metres.

    Returns:
        ``(ade, fde, within)``: mean Euclidean error, final displacement error
        (equal to ``ade`` because only one step is predicted) and the fraction
        of predictions whose error is below ``threshold``. All three are NaN
        for empty input.

    Raises:
        ValueError: If ``preds`` and ``targets`` differ in shape.
    """
    preds = np.asarray(preds)
    targets = np.asarray(targets)
    if preds.shape != targets.shape:
        # Silent broadcasting would produce a meaningless metric.
        raise ValueError(f"Shape mismatch: preds {preds.shape} vs targets {targets.shape}")

    errors = np.linalg.norm(preds - targets, axis=-1)  # computed once, reused below
    ade = np.mean(errors)
    fde = ade  # Final error is the same in single-step prediction
    within_threshold = np.mean(errors < threshold)
    return ade, fde, within_threshold

# Compare predictions with the held-out targets.
# NOTE(review): y_test comes from the MinMax-scaled lat/lon columns, so the errors are in
# scaled units and the "100m" label below does not hold literally — confirm or convert back first.
ade, fde, acc_within_100m = calculate_metrics(y_pred, y_test)
print(f"ADE: {ade:.4f}, FDE: {fde:.4f}, % predictions < 100m: {acc_within_100m:.4f}")

NameError: name 'y_pred' is not defined

In [None]:
# Training vs. validation loss per epoch, drawn on an explicit Axes object.
fig, ax = plt.subplots()
for history_key, curve_label in (('loss', 'Train Loss'), ('val_loss', 'Validation Loss')):
    ax.plot(history.history[history_key], label=curve_label)
ax.legend()
ax.set_title('Training Losses')
ax.set_xlabel('Epoch')
ax.set_ylabel('Loss')
plt.show()