<a href="https://colab.research.google.com/github/nvinogradskaya/DL_HW4_RNN/blob/main/geolifev2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import os
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense, Dropout, Embedding, Lambda, Concatenate, RepeatVector
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from google.colab import drive

In [10]:
# Mount Google Drive so that the dataset folder referenced by DATA_PATH is reachable.
drive.mount('/content/drive')

# Root data folder handed to load_and_preprocess_data() as `data_path`.
DATA_PATH = "/content/drive/My Drive/Colab Notebooks/Data/"
SEQ_LENGTH = 10  # number of consecutive points in each input window
EMBEDDING_DIM = 16  # size of the per-user embedding vector
BATCH_SIZE = 64  # batch size passed to model.fit
EPOCHS = 5  # maximum number of training epochs passed to model.fit

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
def load_and_preprocess_data(data_path, max_users=10):
    """Load per-user ``.plt`` trajectory files and derive the model features.

    The expected layout is ``<data_path>/<user>/Trajectory/*.plt``. The first
    six lines of every file are skipped and columns 0, 1, 3, 5 and 6 are read
    as ``lat``, ``lon``, ``alt``, ``date`` and ``time``.

    Args:
        data_path: Folder that contains one sub-folder per user.
        max_users: Maximum number of user folders to load (taken in sorted
            name order).

    Returns:
        tuple: ``(df, user_ids, scaler)`` where ``df`` is sorted by user and
        timestamp, has min-max scaled ``lat``/``lon``/``alt``, cyclic
        hour/day-of-week features and an integer ``user_id`` column;
        ``user_ids`` maps each user folder name to that integer; ``scaler``
        is the fitted ``MinMaxScaler``.

    Raises:
        ValueError: If no ``.plt`` file is found for the selected users.
    """
    # Keep only entries that really contain a Trajectory folder, so stray
    # files in data_path neither crash os.listdir below nor use up the
    # max_users budget.
    user_dirs = sorted(
        entry for entry in os.listdir(data_path)
        if os.path.isdir(os.path.join(data_path, entry, 'Trajectory'))
    )[:max_users]

    frames = []
    for user in user_dirs:
        traj_dir = os.path.join(data_path, user, 'Trajectory')
        # os.listdir order is arbitrary; sort for a deterministic read order.
        traj_files = sorted(f for f in os.listdir(traj_dir) if f.endswith('.plt'))

        for traj_file in traj_files:
            traj = pd.read_csv(
                os.path.join(traj_dir, traj_file),
                skiprows=6,
                header=None,
                usecols=[0, 1, 3, 5, 6],
                names=['lat', 'lon', 'alt', 'date', 'time']
            )
            traj['user'] = user
            frames.append(traj)

    if not frames:
        # pd.concat would raise an opaque "No objects to concatenate" here.
        raise ValueError(f"No .plt trajectory files found under {data_path!r}")

    df = pd.concat(frames, ignore_index=True)
    df['datetime'] = pd.to_datetime(df['date'] + ' ' + df['time'])
    df = df.sort_values(by=['user', 'datetime'])

    # Filtering and normalisation: drop rows with a zero coordinate, then
    # forward-fill any remaining gaps.
    # NOTE(review): the forward fill runs over the whole user-sorted frame, so
    # a gap in a user's first row is filled from the previous user's last row.
    df = df[(df['lat'] != 0) & (df['lon'] != 0)].ffill()
    # NOTE(review): the scaler is fitted on every loaded row; any train/test
    # split performed by the caller therefore happens after scaling.
    scaler = MinMaxScaler()
    df[['lat', 'lon', 'alt']] = scaler.fit_transform(df[['lat', 'lon', 'alt']])

    # Cyclic time features so that hour 23 / hour 0 and Sunday / Monday end up
    # close to each other.
    df['hour_sin'] = np.sin(2 * np.pi * df['datetime'].dt.hour / 24)
    df['hour_cos'] = np.cos(2 * np.pi * df['datetime'].dt.hour / 24)
    df['day_sin'] = np.sin(2 * np.pi * df['datetime'].dt.dayofweek / 7)
    df['day_cos'] = np.cos(2 * np.pi * df['datetime'].dt.dayofweek / 7)

    # Integer ids (0..n_users-1) used to index the user embedding table.
    user_ids = {user: idx for idx, user in enumerate(df['user'].unique())}
    df['user_id'] = df['user'].map(user_ids)

    return df, user_ids, scaler

In [5]:
def create_sequences(df, user_ids, seq_length):
    """Cut every user's track into sliding windows with a next-point target.

    For each user (rows are taken in the order they appear in ``df``), every
    run of ``seq_length`` consecutive rows becomes one input window and the
    ``lat``/``lon`` of the row right after it becomes the target. Windows
    never span two users.

    Args:
        df: Frame with a ``user`` column plus the feature columns ``lat``,
            ``lon``, ``alt``, ``hour_sin``, ``hour_cos``, ``day_sin`` and
            ``day_cos``.
        user_ids: Mapping from the values of ``df['user']`` to integer ids.
        seq_length: Number of rows per input window; must be at least 1.

    Returns:
        tuple: ``(X, y, users)`` with shapes ``(n, seq_length, 7)``,
        ``(n, 2)`` and ``(n,)``. When no user has more than ``seq_length``
        rows, correctly shaped empty arrays (``n == 0``) are returned.

    Raises:
        ValueError: If ``seq_length`` is smaller than 1.
        KeyError: If a user present in ``df`` is missing from ``user_ids``.
    """
    if seq_length < 1:
        raise ValueError("seq_length must be a positive integer")

    features = ['lat', 'lon', 'alt', 'hour_sin', 'hour_cos', 'day_sin', 'day_cos']
    targets = ['lat', 'lon']

    X_parts, y_parts, user_parts = [], [], []
    for user, group in df.groupby('user'):
        user_id = user_ids[user]
        user_data = group[features].to_numpy()
        n_windows = len(user_data) - seq_length
        if n_windows <= 0:
            # Not enough rows for even one window plus its target.
            continue

        # sliding_window_view appends the window axis last, giving
        # (len - seq_length + 1, n_features, seq_length); the final window has
        # no following row to predict, so it is dropped before transposing to
        # (n_windows, seq_length, n_features).
        windows = np.lib.stride_tricks.sliding_window_view(
            user_data, seq_length, axis=0
        )
        X_parts.append(windows[:n_windows].transpose(0, 2, 1))
        y_parts.append(group[targets].to_numpy()[seq_length:])
        user_parts.append(np.full(n_windows, user_id))

    if not X_parts:
        # Keep the documented ranks even when there is nothing to return.
        return (
            np.empty((0, seq_length, len(features))),
            np.empty((0, len(targets))),
            np.empty((0,), dtype=int),
        )

    return np.concatenate(X_parts), np.concatenate(y_parts), np.concatenate(user_parts)

In [6]:
class ContrastiveLoss(tf.keras.losses.Loss):
    """Contrastive loss over (anchor, positive) embedding pairs.

    ``y_pred[:, 0]`` and ``y_pred[:, 1]`` are treated as the two members of
    each pair. With ``d`` the squared distance between them, the per-pair
    loss is ``y_true * d + (1 - y_true) * max(margin - d, 0)``, and ``call``
    returns its mean.

    Args:
        margin: Value below which the squared distance of a ``y_true == 0``
            pair is penalised.
        **kwargs: Forwarded to ``tf.keras.losses.Loss`` (for example ``name``
            or ``reduction``), which also lets the base ``from_config``
            rebuild the loss when a model is reloaded.
    """

    def __init__(self, margin=1.0, **kwargs):
        super().__init__(**kwargs)
        self.margin = margin

    def call(self, y_true, y_pred):
        """Return the mean contrastive loss of the batch as a scalar tensor."""
        # NOTE(review): assumes y_true has one label per pair with the same
        # shape as `distances` (no trailing axis); otherwise the products
        # below broadcast - TODO confirm against the caller.
        y_true = tf.cast(y_true, tf.float32)
        anchor, positive = y_pred[:, 0], y_pred[:, 1]
        distances = tf.reduce_sum(tf.square(anchor - positive), axis=-1)
        loss = y_true * distances + (1 - y_true) * tf.maximum(self.margin - distances, 0)
        return tf.reduce_mean(loss)

    def get_config(self):
        """Return the base config extended with ``margin`` for serialization."""
        config = super().get_config()
        config.update({"margin": self.margin})
        return config

In [7]:
def build_model(seq_length, num_features, num_users, embedding_dim):
    """Build the user-conditioned stacked-LSTM regressor.

    A learned per-user embedding is repeated along the time axis and
    concatenated to every step of the trajectory window before it is fed
    through two LSTM layers and a linear 2-unit output layer.

    Args:
        seq_length: Number of time steps in each trajectory window.
        num_features: Number of features per time step.
        num_users: Size of the user embedding table.
        embedding_dim: Width of each user embedding vector.

    Returns:
        An uncompiled Keras ``Model`` taking ``[traj_input, user_input]`` (in
        that order) and producing a tensor with 2 values per sample.
    """
    # Input layers
    user_input = Input(shape=(1,), name='user_input')
    traj_input = Input(shape=(seq_length, num_features), name='traj_input')

    # Personalised embedding: (batch, 1) -> (batch, 1, embedding_dim).
    user_embedding = Embedding(num_users, embedding_dim)(user_input)
    # Drop the length-1 axis with a built-in Reshape layer rather than a
    # Lambda wrapping a Python function, so the model can be saved and
    # reloaded without serializing Python code.
    user_embedding = tf.keras.layers.Reshape((embedding_dim,))(user_embedding)
    # Repeat the user vector for every time step: (batch, seq_length, embedding_dim).
    user_embedding = RepeatVector(seq_length)(user_embedding)

    # Attach the user vector to the features of every time step.
    merged = Concatenate(axis=-1)([traj_input, user_embedding])

    # Stacked LSTM encoder
    x = LSTM(64, return_sequences=True)(merged)
    x = Dropout(0.2)(x)
    x = LSTM(32)(x)
    x = Dropout(0.2)(x)

    # Output layer: two unbounded regression outputs.
    output = Dense(2, activation='linear')(x)

    return Model(inputs=[traj_input, user_input], outputs=output)

In [8]:
def main():
    """Run the whole pipeline: load, window, split, train and evaluate.

    Reads the module-level settings ``DATA_PATH``, ``SEQ_LENGTH``,
    ``EMBEDDING_DIM``, ``BATCH_SIZE`` and ``EPOCHS``, prints the test loss and
    MAE, and hands the trained objects back to the caller.

    Returns:
        tuple: ``(model, history)`` - the trained Keras model and the
        ``History`` object returned by ``model.fit``.
    """
    # Load the data
    df, user_ids, scaler = load_and_preprocess_data(DATA_PATH)

    # Build the windows
    X, y, users = create_sequences(df, user_ids, SEQ_LENGTH)

    # Train/test split
    # NOTE(review): create_sequences emits windows with stride 1, so a random
    # split puts heavily overlapping windows into both train and test; the
    # test score is likely optimistic. Consider a chronological split.
    X_train, X_test, y_train, y_test, users_train, users_test = train_test_split(
        X, y, users, test_size=0.2, random_state=42
    )

    # Build the model
    model = build_model(
        seq_length=SEQ_LENGTH,
        num_features=X_train.shape[-1],
        num_users=len(user_ids),
        embedding_dim=EMBEDDING_DIM
    )

    # Compile the model
    model.compile(
        optimizer=Adam(learning_rate=0.001),
        loss='mse',
        metrics=['mae']
    )

    # Training
    # NOTE(review): early stopping can only trigger when EPOCHS is larger than
    # this patience value.
    early_stopping = EarlyStopping(
        monitor='val_loss',
        patience=5,
        restore_best_weights=True
    )

    history = model.fit(
        [X_train, users_train],
        y_train,
        validation_split=0.2,
        epochs=EPOCHS,
        batch_size=BATCH_SIZE,
        callbacks=[early_stopping]
    )

    # Evaluation
    test_loss, test_mae = model.evaluate([X_test, users_test], y_test)
    # The MSE on scaled coordinates is far below 1e-4, so fixed-point ".4f"
    # printed it as 0.0000; scientific notation keeps the value readable.
    print(f"Test Loss: {test_loss:.4e}, Test MAE: {test_mae:.4f}")

    return model, history

In [11]:
# Entry point: run the full pipeline only when this code is executed directly
# (notebook cell or script), not when it is imported as a module.
if __name__ == "__main__":
    main()

Epoch 1/5
[1m18459/18459[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m325s[0m 17ms/step - loss: 0.0036 - mae: 0.0313 - val_loss: 4.6659e-05 - val_mae: 0.0033
Epoch 2/5
[1m18459/18459[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m324s[0m 17ms/step - loss: 2.2284e-04 - mae: 0.0067 - val_loss: 5.5401e-05 - val_mae: 0.0038
Epoch 3/5
[1m18459/18459[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m321s[0m 17ms/step - loss: 2.0788e-04 - mae: 0.0061 - val_loss: 2.7046e-05 - val_mae: 0.0030
Epoch 4/5
[1m18459/18459[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m319s[0m 17ms/step - loss: 2.0300e-04 - mae: 0.0059 - val_loss: 3.8089e-05 - val_mae: 0.0031
Epoch 5/5
[1m18459/18459[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m319s[0m 17ms/step - loss: 2.0004e-04 - mae: 0.0058 - val_loss: 3.8482e-05 - val_mae: 0.0035
[1m11537/11537[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m53s[0m 5ms/step - loss: 2.7367e-05 - mae: 0.0030
Test Loss: 0.0000, Test MAE: 0.0030
