<a href="https://colab.research.google.com/github/nvinogradskaya/DL_HW4_RNN/blob/main/Untitled3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [32]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import numpy as np
import pandas as pd
import math
import os
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
from tqdm import tqdm
import random
from datetime import datetime
from google.colab import drive

In [2]:
drive.mount('/content/drive')

# ========================
# 2. Параметры
# ========================
MAX_USERS = 3
SEQ_LENGTH = 10
EMBEDDING_DIM = 16
LSTM_UNITS = 64
BATCH_SIZE = 128
EPOCHS = 3
TEST_SIZE = 0.3
DATA_PATH = "/content/drive/My Drive/Colab Notebooks/Data/"
GRID_SIZE_LARGE = 0.0045
GRID_SIZE_SMALL = 0.000045

Mounted at /content/drive


In [3]:
# ========================
# 3. Утилиты
# ========================
def haversine(coord1, coord2):
    R = 6371000  # Earth radius in meters
    lat1, lon1 = coord1
    lat2, lon2 = coord2
    phi1, phi2 = math.radians(lat1), math.radians(lat2)
    d_phi = phi2 - phi1
    d_lambda = math.radians(lon2 - lon1)
    a = math.sin(d_phi / 2)**2 + math.cos(phi1) * math.cos(phi2) * math.sin(d_lambda / 2)**2
    return R * 2 * math.atan2(math.sqrt(a), math.sqrt(1 - a))

def sinusoidal_time_embedding(timestamps, dim=16):
    emb = []
    for ts in timestamps:
        emb_i = []
        for i in range(dim // 2):
            angle = ts.timestamp() / (10000 ** (2 * i / dim))
            emb_i.append(math.sin(angle))
            emb_i.append(math.cos(angle))
        emb.append(emb_i)
    return torch.tensor(emb, dtype=torch.float)

def create_grid(lat, lon, grid_size):
    return int(lat / grid_size), int(lon / grid_size)

In [4]:
# ========================
# 4. Загрузка и обработка данных Geolife
# ========================
def load_and_preprocess_data(data_path, max_users=MAX_USERS):
    data = []
    user_dirs = sorted(os.listdir(data_path))[:max_users]
    for user in tqdm(user_dirs, desc="Loading users"):
        traj_dir = os.path.join(data_path, user, 'Trajectory')
        traj_files = sorted([f for f in os.listdir(traj_dir) if f.endswith('.plt')])
        for traj_file in traj_files:
            df = pd.read_csv(
                os.path.join(traj_dir, traj_file),
                skiprows=6,
                header=None,
                usecols=[0, 1, 3, 5, 6],
                names=['lat', 'lon', 'alt', 'date', 'time']
            )
            df['user'] = user
            data.append(df)

    df = pd.concat(data, ignore_index=True)
    df['datetime'] = pd.to_datetime(df['date'] + ' ' + df['time'])
    df.sort_values(by=['user', 'datetime'], inplace=True)
    df = df[(df['lat'] != 0) & (df['lon'] != 0)].ffill()

    scaler = MinMaxScaler()
    df[['lat', 'lon', 'alt']] = scaler.fit_transform(df[['lat', 'lon', 'alt']])

    df['hour_sin'] = np.sin(2 * np.pi * df['datetime'].dt.hour / 24)
    df['hour_cos'] = np.cos(2 * np.pi * df['datetime'].dt.hour / 24)
    df['day_sin'] = np.sin(2 * np.pi * df['datetime'].dt.dayofweek / 7)
    df['day_cos'] = np.cos(2 * np.pi * df['datetime'].dt.dayofweek / 7)

    user_ids = {user: idx for idx, user in enumerate(df['user'].unique())}
    df['user_id'] = df['user'].map(user_ids)

    df['grid_large_x'], df['grid_large_y'] = zip(*df.apply(lambda row: create_grid(row['lat'], row['lon'], GRID_SIZE_LARGE), axis=1))
    df['grid_small_x'], df['grid_small_y'] = zip(*df.apply(lambda row: create_grid(row['lat'], row['lon'], GRID_SIZE_SMALL), axis=1))

    return df, user_ids, scaler

In [5]:
# ========================
# 5. Подготовка последовательностей
# ========================
def prepare_sequences(df, seq_length=SEQ_LENGTH):
    grouped = df.groupby('user_id')
    data = []
    for user_id, group in grouped:
        group = group.reset_index(drop=True)
        if len(group) < seq_length + 1:
            continue
        for i in range(len(group) - seq_length):
            seq = group.iloc[i:i+seq_length+1]
            traj = list(zip(seq['lat'], seq['lon'], seq['datetime']))
            data.append((user_id, traj))
    return data

In [6]:
# ========================
# 6. Контрастивная модель и обучение
# ========================
class ContrastiveEmbeddingModel(nn.Module):
    def __init__(self, num_users, emb_dim):
        super().__init__()
        self.embeddings = nn.Embedding(num_users, emb_dim)

    def forward(self, user_ids):
        return self.embeddings(user_ids)

def contrastive_loss(z_i, z_j, temperature=0.5):
    z_i = nn.functional.normalize(z_i, dim=1)
    z_j = nn.functional.normalize(z_j, dim=1)
    sim_matrix = torch.matmul(z_i, z_j.T) / temperature
    labels = torch.arange(len(z_i)).to(z_i.device)
    return nn.CrossEntropyLoss()(sim_matrix, labels)

In [33]:
# ========================
# 7. Dataset с полным эмбеддингом
# ========================
class TrajectoryDataset(Dataset):
    def __init__(self, data, user_embeddings):
        self.data = data
        self.user_embeddings = user_embeddings

    def __getitem__(self, idx):
        user_id, traj = self.data[idx]
        coords = torch.tensor([(x[0], x[1]) for x in traj], dtype=torch.float32)
        times = [x[2] for x in traj]

        # Генерация признаков
        time_emb = sinusoidal_time_embedding(times)
        user_emb = self.user_embeddings[user_id].repeat(len(traj), 1)

        # Объединение признаков [coords(2) + time_emb(16) + user_emb(16) = 34]
        inputs = torch.cat([coords, time_emb, user_emb], dim=1)
        return inputs[:-1], coords[1:]

In [8]:
# ========================
# 8. Крупномасштабная модель (LSTM)
# ========================
class MacroLSTM(nn.Module):
    def __init__(self, input_dim, hidden_dim):
        super().__init__()
        self.lstm = nn.LSTM(input_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, 2)  # lat, lon крупной ячейки

    def forward(self, x):
        _, (h_n, _) = self.lstm(x)
        out = self.fc(h_n[-1])
        return out

In [34]:
# ========================
# 9. Мелкомасштабная модель (Transformer)
# ========================
class MicroTransformer(nn.Module):
    def __init__(self, input_dim, hidden_dim, num_heads=2, num_layers=2):  # 34 % 2 = 0
        super().__init__()
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=input_dim,
            nhead=num_heads,
            dim_feedforward=hidden_dim
        )
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers)
        self.fc = nn.Linear(input_dim, 2)

    def forward(self, x):
        x = x.permute(1, 0, 2)  # [seq_len, batch, features]
        out = self.transformer(x)
        return self.fc(out[-1])

In [10]:
# ========================
# 10. Интегрированная модель
# ========================
class DualScaleModel(nn.Module):
    def __init__(self, input_dim, hidden_dim, num_heads=4, num_layers=2):
        super().__init__()
        self.macro = MacroLSTM(input_dim, hidden_dim)
        self.micro = MicroTransformer(input_dim, hidden_dim, num_heads, num_layers)

    def forward(self, x):
        macro_out = self.macro(x)
        micro_out = self.micro(x)
        return macro_out, micro_out

In [11]:
# ========================
# 11. Функции обучения и валидации
# ========================
def train(model, dataloader, optimizer, criterion, device):
    model.train()
    total_loss = 0
    for x, y in dataloader:
        x, y = x.to(device), y.to(device)
        optimizer.zero_grad()
        macro_out, micro_out = model(x)
        loss = criterion(micro_out, y[:, -1])  # Последняя точка
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(dataloader)

def evaluate(model, dataloader, criterion, device):
    model.eval()
    total_loss = 0
    all_preds = []
    all_targets = []
    with torch.no_grad():
        for x, y in dataloader:
            x, y = x.to(device), y.to(device)
            _, micro_out = model(x)
            loss = criterion(micro_out, y[:, -1])
            total_loss += loss.item()
            all_preds.append(micro_out.cpu().numpy())
            all_targets.append(y[:, -1].cpu().numpy())
    return total_loss / len(dataloader), np.vstack(all_preds), np.vstack(all_targets)

In [12]:
# ========================
# 12. Метрики
# ========================
def compute_metrics(preds, targets):
    mse = mean_squared_error(targets, preds)
    ade = np.mean([haversine(p, t) for p, t in zip(preds, targets)])
    fde = np.mean([haversine(preds[i], targets[i]) for i in range(len(preds))])
    acc_100m = np.mean([haversine(p, t) < 100 for p, t in zip(preds, targets)]) * 100
    return {
        "MSE": mse,
        "ADE": ade,
        "FDE": fde,
        "<100m %": acc_100m
    }

In [35]:
# 13. Запуск обучения и тестирования
# ========================
from torch.utils.data import DataLoader, TensorDataset, random_split
import matplotlib.pyplot as plt

# Загрузка данных и подготовка последовательностей
df, user_ids, scaler = load_and_preprocess_data(DATA_PATH)

Loading users: 100%|██████████| 3/3 [00:06<00:00,  2.14s/it]


In [15]:
data = prepare_sequences(df)

In [23]:
def create_triplets(data):
    anchors, positives, negatives = [], [], []
    user_trajs = {}
    for uid, traj in data:
        if uid not in user_trajs:
            user_trajs[uid] = []
        user_trajs[uid].append(traj)

    for uid, trajs in user_trajs.items():
        if len(trajs) < 2:
            continue
        for i in range(len(trajs)-1):
            anchor = trajs[i]
            positive = trajs[i+1]
            other_uids = [u for u in user_trajs if u != uid]
            neg_uid = random.choice(other_uids)
            negative = random.choice(user_trajs[neg_uid])
            anchors.append(anchor)
            positives.append(positive)
            negatives.append(negative)
    return anchors, positives, negatives

In [24]:
class TripletEncoder(nn.Module):
    def __init__(self, input_dim, emb_dim):
        super().__init__()
        self.lstm = nn.LSTM(input_dim, 32, batch_first=True)
        self.fc = nn.Linear(32, emb_dim)

    def forward(self, x):
        out, _ = self.lstm(x)
        return self.fc(out[:, -1])

def triplet_loss(a, p, n, margin=1.0):
    ap_dist = (a - p).pow(2).sum(1)
    an_dist = (a - n).pow(2).sum(1)
    loss = torch.relu(ap_dist - an_dist + margin)
    return loss.mean()

In [25]:
anchors, positives, negatives = create_triplets(data)

In [27]:
def seq_to_tensor(seqs):
    tensors = []
    for seq in seqs:
        tensor = torch.tensor(
            [[x[0], x[1]] for x in seq],
            dtype=torch.float32
        )
        tensors.append(tensor)
    return torch.stack(tensors)

anchors_t = seq_to_tensor(anchors)
positives_t = seq_to_tensor(positives)
negatives_t = seq_to_tensor(negatives)

In [29]:
triplet_model = TripletEncoder(input_dim=2, emb_dim=EMBEDDING_DIM)
optimizer = torch.optim.Adam(triplet_model.parameters(), lr=1e-3)

In [31]:
triplet_loss = nn.TripletMarginLoss(margin=1.0, p=2)

# Модифицируем цикл обучения:
for epoch in range(3):
    total_loss = 0
    for a, p, n in zip(anchors_t, positives_t, negatives_t):
        a = a.unsqueeze(0)
        p = p.unsqueeze(0)
        n = n.unsqueeze(0)

        emb_a = triplet_model(a)
        emb_p = triplet_model(p)
        emb_n = triplet_model(n)

        loss = triplet_loss(emb_a, emb_p, emb_n)  # Теперь используется встроенный лосс
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    print(f"Epoch {epoch+1} | Loss: {total_loss/len(anchors_t):.4f}")

Epoch 1 | Loss: 2.6291
Epoch 2 | Loss: 1.4182


KeyboardInterrupt: 

In [None]:
user_embeddings = {}
with torch.no_grad():
    for uid in user_ids.values():
        user_seqs = [seq for seq in data if seq[0] == uid]
        embeddings = []
        for seq in user_seqs:
            coords = torch.tensor([[x[0], x[1]] for x in seq[1]], dtype=torch.float32)
            emb = triplet_model(coords.unsqueeze(0).to(device)).cpu()
            embeddings.append(emb)
        user_embeddings[uid] = torch.mean(torch.stack(embeddings), dim=0)

In [22]:
X = []
y = []
for user_id, traj in data:
    # Извлекаем все признаки из TrajectoryDataset
    inputs, targets = TrajectoryDataset([(user_id, traj)], None)[0]  # None, так как user_embeddings пока не обучены
    X.append(inputs.numpy())
    y.append(targets.numpy())

TypeError: 'NoneType' object is not subscriptable

In [17]:
X = np.array(X).squeeze()
y = np.array(y).squeeze()

In [18]:
X_tensor = torch.tensor(X, dtype=torch.float32)
y_tensor = torch.tensor(y, dtype=torch.float32)

In [19]:
dataset = TensorDataset(X_tensor, y_tensor)

In [20]:
train_size = int((1 - TEST_SIZE) * len(dataset))
test_size = len(dataset) - train_size
train_dataset, test_dataset = random_split(dataset, [train_size, test_size])
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE)

In [21]:
# Инициализация модели, оптимизатора и функции потерь
model = DualScaleModel(input_dim=X.shape[2], hidden_dim=LSTM_UNITS).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
criterion = nn.MSELoss()

AssertionError: embed_dim must be divisible by num_heads

In [None]:
for epoch in range(EPOCHS):
    train_loss = train(model, train_loader, optimizer, criterion, device)
    val_loss, preds, targets = evaluate(model, test_loader, criterion, device)
    metrics = compute_metrics(preds, targets)
    print(f"Epoch {epoch+1}/{EPOCHS} | Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f} | MSE: {metrics['MSE']:.4f} | ADE: {metrics['ADE']:.2f}m | FDE: {metrics['FDE']:.2f}m | <100m: {metrics['<100m %']:.2f}%")

In [None]:
# ========================
# 14. Сохранение модели
# ========================
torch.save(model.state_dict(), "dual_scale_model.pth")
print("Model saved as dual_scale_model.pth")

In [None]:
# ========================
# 15. Визуализация предсказаний
# ========================
plt.figure(figsize=(8, 6))
plt.scatter(targets[:, 1], targets[:, 0], c='blue', label='True', alpha=0.5)
plt.scatter(preds[:, 1], preds[:, 0], c='red', label='Predicted', alpha=0.5)
plt.title("Predicted vs True Trajectory Points")
plt.xlabel("Longitude")
plt.ylabel("Latitude")
plt.legend()
plt.grid(True)
plt.show()