<a href="https://colab.research.google.com/github/nvinogradskaya/Ant/blob/main/Untitled4-3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
!pip install tensorflow

Collecting tensorflow
  Downloading tensorflow-2.19.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.1 kB)
Collecting astunparse>=1.6.0 (from tensorflow)
  Downloading astunparse-1.6.3-py2.py3-none-any.whl.metadata (4.4 kB)
Collecting flatbuffers>=24.3.25 (from tensorflow)
  Downloading flatbuffers-25.2.10-py2.py3-none-any.whl.metadata (875 bytes)
Collecting google-pasta>=0.1.1 (from tensorflow)
  Downloading google_pasta-0.2.0-py3-none-any.whl.metadata (814 bytes)
Collecting libclang>=13.0.0 (from tensorflow)
  Downloading libclang-18.1.1-py2.py3-none-manylinux2010_x86_64.whl.metadata (5.2 kB)
Collecting tensorboard~=2.19.0 (from tensorflow)
  Downloading tensorboard-2.19.0-py3-none-any.whl.metadata (1.8 kB)
Collecting tensorflow-io-gcs-filesystem>=0.23.1 (from tensorflow)
  Downloading tensorflow_io_gcs_filesystem-0.37.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (14 kB)
Collecting wheel<1.0,>=0.23.0 (from astunparse>=1.6.0->tensorflow

In [1]:
import os
from joblib import Parallel, delayed
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import LSTM, Dense, Dropout, Input, Embedding, Lambda
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from google.colab import drive
from tensorflow.keras.layers import Input, LSTM, Dense, Dropout, Embedding, Lambda, Concatenate, RepeatVector

In [2]:
drive.mount('/content/drive')

DATA_PATH = "/content/drive/My Drive/Colab Notebooks/Data/"
SEQ_LENGTH = 10
EMBEDDING_DIM = 16
BATCH_SIZE = 64
EPOCHS = 5

Mounted at /content/drive


In [3]:
# Блок 2: Определение ContrastiveLoss
class ContrastiveLoss(tf.keras.losses.Loss):
    def __init__(self, margin=1.0):
        super().__init__()
        self.margin = margin

    def call(self, y_true, y_pred):
        y_true = tf.cast(y_true, tf.float32)
        anchor, positive = y_pred[:,0], y_pred[:,1]
        distances = tf.reduce_sum(tf.square(anchor - positive), axis=-1)
        loss = y_true * distances + (1 - y_true) * tf.maximum(self.margin - distances, 0)
        return tf.reduce_mean(loss)

In [4]:
def load_and_preprocess_data(data_path, max_users=10):
    data = []
    user_dirs = sorted(os.listdir(data_path))[:max_users]

    for user in user_dirs:
        traj_dir = os.path.join(data_path, user, 'Trajectory')
        traj_files = [f for f in os.listdir(traj_dir) if f.endswith('.plt')]

        for traj_file in traj_files:
            df = pd.read_csv(
                os.path.join(traj_dir, traj_file),
                skiprows=6,
                header=None,
                usecols=[0, 1, 3, 5, 6],
                names=['lat', 'lon', 'alt', 'date', 'time']
            )
            df['user'] = user
            data.append(df)

    df = pd.concat(data, ignore_index=True)
    df['datetime'] = pd.to_datetime(df['date'] + ' ' + df['time'])
    df.sort_values(by=['user', 'datetime'], inplace=True)

    # фильтрация и нормализация
    df = df[(df['lat'] != 0) & (df['lon'] != 0)].ffill()
    scaler = MinMaxScaler()
    df[['lat', 'lon', 'alt']] = scaler.fit_transform(df[['lat', 'lon', 'alt']])

    # временные
    df['hour_sin'] = np.sin(2 * np.pi * df['datetime'].dt.hour / 24)
    df['hour_cos'] = np.cos(2 * np.pi * df['datetime'].dt.hour / 24)
    df['day_sin'] = np.sin(2 * np.pi * df['datetime'].dt.dayofweek / 7)
    df['day_cos'] = np.cos(2 * np.pi * df['datetime'].dt.dayofweek / 7)

    # персонализированные
    user_ids = {user: idx for idx, user in enumerate(df['user'].unique())}
    df['user_id'] = df['user'].map(user_ids)

    return df, user_ids, scaler

In [5]:
def create_sequences(df, user_ids, seq_length):
    features = ['lat', 'lon', 'alt', 'hour_sin', 'hour_cos', 'day_sin', 'day_cos']
    targets = ['lat', 'lon']

    X, y, users = [], [], []
    for user, group in df.groupby('user'):
        user_data = group[features].values
        user_targets = group[targets].values
        user_id = user_ids[user]

        for i in range(len(user_data) - seq_length):
            X.append(user_data[i:i+seq_length])
            y.append(user_targets[i+seq_length])
            users.append(user_id)

    return np.array(X), np.array(y), np.array(users)

In [6]:
def generate_contrastive_pairs_optimized(X, users, num_negatives=2, n_jobs=4):
    unique_users = np.unique(users)
    user_indices = {u: np.where(users == u)[0] for u in unique_users}

    # Предварительный расчет всех возможных негативных пользователей
    user_pairs = {u: unique_users[unique_users != u] for u in unique_users}

    def process_user(user):
        indices = user_indices[user]
        pairs = []
        labels = []

        # Позитивные пары (векторизовано)
        if len(indices) > 1:
            pos_pairs = np.column_stack([indices[:-1], indices[1:]])
            pairs.extend(pos_pairs)
            labels.extend([1] * len(pos_pairs))

            # Негативные пары (оптимизированный выбор)
            for i, anchor in enumerate(indices[:-1]):
                neg_users = np.random.choice(
                    user_pairs[user],
                    size=num_negatives,
                    replace=len(user_pairs[user]) >= num_negatives
                )
                for neg_user in neg_users:
                    neg_idx = np.random.choice(user_indices[neg_user])
                    pairs.append([anchor, neg_idx])
                    labels.append(0)

        return np.array(pairs), np.array(labels)

    # Распараллеливание по пользователям
    results = Parallel(n_jobs=n_jobs)(delayed(process_user)(u) for u in unique_users)

    # Объединение результатов
    all_pairs = np.vstack([res[0] for res in results])
    all_labels = np.concatenate([res[1] for res in results])

    return all_pairs, all_labels

In [7]:
class ContrastiveModel(tf.keras.Model):
    def __init__(self, num_users, embedding_dim, seq_length, num_features):
        super().__init__()
        self.embedding = Embedding(num_users, embedding_dim)
        self.lstm = LSTM(32)
        self.dense = Dense(embedding_dim, activation='tanh')

    def call(self, inputs):
        seq, user_id = inputs
        user_emb = self.embedding(user_id)
        seq_features = self.lstm(seq)
        return self.dense(tf.concat([seq_features, user_emb], axis=-1))

In [8]:
df, user_ids, scaler = load_and_preprocess_data(DATA_PATH)
X, y, users = create_sequences(df, user_ids, SEQ_LENGTH)

# Генерация пар
pairs, labels = generate_contrastive_pairs_optimized(  # <-- Использовать оптимизированную версию
    X,
    users,
    num_negatives=1,  # Уменьшить число негативных примеров для ускорения
    n_jobs=8          # Использовать 8 ядер CPU
)
anchor_data = pairs[:, 0].astype(int)
pair_data = pairs[:, 1]

In [9]:
from google.colab import drive
import pickle
import os

SAVE_PATH = "/content/drive/My Drive/Colab Notebooks/contrastive_results/"
os.makedirs(SAVE_PATH, exist_ok=True)

with open(SAVE_PATH + 'contrastive_pairs_data.pkl', 'wb') as f:
    pickle.dump({
        'pairs': pairs,
        'labels': labels,
        'anchor_data': anchor_data,
        'pair_data': pair_data
    }, f)

print(f"Все данные сохранены в: {SAVE_PATH}contrastive_pairs_data.pkl")
print(f"Размеры данных:")
print(f"- pairs: {pairs.shape}")
print(f"- labels: {labels.shape}")
print(f"- anchor_data: {anchor_data.shape}")
print(f"- pair_data: {pair_data.shape}")

Все данные сохранены в: /content/drive/My Drive/Colab Notebooks/contrastive_results/contrastive_pairs_data.pkl
Размеры данных:
- pairs: (3691652, 2)
- labels: (3691652,)
- anchor_data: (3691652,)
- pair_data: (3691652,)


In [10]:
contrastive_model = ContrastiveModel(
    num_users=len(user_ids),
    embedding_dim=EMBEDDING_DIM,
    seq_length=SEQ_LENGTH,
    num_features=X.shape[-1]
)

contrastive_model.compile(
    optimizer=Adam(0.001),
    loss=ContrastiveLoss(),
    metrics=['accuracy']
)

In [15]:
anchor_data = np.array(anchor_data)
if anchor_data.ndim == 2:  # (batch, features), а нам нужно (batch, seq, features)
    anchor_data = np.expand_dims(anchor_data, axis=1)


In [17]:
print(f"anchor_data.shape: {anchor_data.shape}")
print(anchor_data[:5])  # Первые 5 элементов


anchor_data.shape: (3691652,)
[0 1 2 3 4]


In [19]:
print("Форма входных данных для LSTM:", X[anchor_data].shape)


Форма входных данных для LSTM: (3691650, 10, 7)


In [20]:
contrastive_model.fit(
    [X[anchor_data], users[anchor_data]],
    labels,
    epochs=5,
    batch_size=BATCH_SIZE
)

ValueError: Data cardinality is ambiguous. Make sure all arrays contain the same number of samples.'x' sizes: 3691650, 3691650
'y' sizes: 3691652
