In [2]:
import pandas as pd
import numpy as np
import os
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Dropout
from tensorflow.keras.optimizers import Adam
from sklearn.metrics import mean_squared_error, roc_auc_score

In [3]:
def extract_segmented_features(filepath, segment_duration=20.0):
    """
    Извлекает признаки из файла сессии, разделяя данные на фрагменты по segment_duration секунд.
    Возвращает список словарей с характеристиками движения для каждого фрагмента.
    """
    try:
        df = pd.read_csv(filepath)
    except (pd.errors.EmptyDataError, FileNotFoundError) as e:
        print(f"Warning: {e} for file: {filepath}")
        return None

    if df.empty or len(df) < 2:
        print(f"Warning: Not enough data in file: {filepath}")
        return None

    df = df.sort_values('record timestamp').reset_index(drop=True)
    timestamps = df['record timestamp'].values
    segments = []
    
    for start_time in np.arange(timestamps[0], timestamps[-1], segment_duration):
        end_time = start_time + segment_duration
        segment_mask = (timestamps >= start_time) & (timestamps < end_time)
        segment_df = df[segment_mask]
        
        if len(segment_df) >= 2:  # Минимум 2 точки для вычисления движения
            features = calculate_movement_features(segment_df)
            if features:
                segments.append(features)
    
    return segments if segments else None

import numpy as np
import pandas as pd
from math import atan2, sqrt, fabs

def calculate_movement_features(df):
    """Вычисляет полный набор характеристик движения, включая все параметры из изображения."""
    # Извлечение базовых данных
    x = df['x'].values
    y = df['y'].values
    timestamps = df['record timestamp'].values
    
    # Вычисление временных интервалов
    dt = np.diff(timestamps)
    dt = np.where(dt <= 0, 1e-6, dt)
    
    # 1. Перемещения и производные
    dx = np.diff(x)
    dy = np.diff(y)
    distances = np.sqrt(dx**2 + dy**2)
    speeds = distances / dt
    
    # 2. Ускорения
    if len(dt) > 1:
        accelerations = np.diff(speeds) / dt[:-1]
    else:
        accelerations = np.array([0.0])
    
    # 3. Основные характеристики траектории
    total_distance = float(np.sum(distances))
    displacement = float(np.sqrt((x[-1]-x[0])**2 + (y[-1]-y[0])**2))
    direction = np.arctan2(y[-1] - y[0], x[-1] - x[0]) if len(x) > 1 else 0.0

    # 1. Новая формула средней кривизны траектории
    curvature_sum = 0.0
    valid_curvature_points = 0
    for i in range(len(x)):
        xi, yi = x[i], y[i]
        if xi == 0 and yi == 0:
            continue
            
        # Угол между векторами (0,0)->(xi,yi) и (xi,0)->(xi,yi)
        angle = atan2(yi, xi) - atan2(yi, 0)
        distance = sqrt(xi**2 + yi**2)
        if distance > 1e-6:  # Избегаем деления на 0
            curvature_sum += fabs(angle) / distance
            valid_curvature_points += 1
    
    avg_curvature = curvature_sum / valid_curvature_points if valid_curvature_points > 0 else 0.0

    # 2. Транспортация центра масс (TCM)    
    if len(distances) > 0 and total_distance > 0:
        tcm = np.sum(timestamps[1:] * distances) / total_distance
    else:
        tcm = 0.0

    # 3. Коэффициент рассеивания (SC)
    if len(distances) > 0 and total_distance - tcm**2 > 0:
        sc = np.sum((timestamps[1:]**2) * distances) / total_distance - tcm**2
    else:
        sc = 0.0
    
    # 4. Классификация действий и вычисление направлений
    if len(dx) > 0:
        directions = np.arctan2(dy, dx)
        dir_bins = np.linspace(-np.pi, np.pi, 9)  # 8 направлений
        dir_indices = np.digitize(directions, dir_bins[:-1]) - 1
    else:
        directions = np.array([])
        dir_indices = np.array([])

    # 5. Инициализация структур для хранения статистик
    action_types = {'MM': [], 'PC': [], 'DD': []}  # Mouse Move, Point Click, Drag&Drop
    dir_speeds = {i: [] for i in range(8)}         # Скорости по направлениям
    dir_accels = {i: [] for i in range(8)}         # Ускорения по направлениям
    type_speeds = {'MM': [], 'PC': [], 'DD': []}   # Скорости по типам действий
    type_accels = {'MM': [], 'PC': [], 'DD': []}   # Ускорения по типам действий

    # 6. Заполнение статистик для каждого движения
    for i in range(len(distances)):
        if i >= len(dir_indices):  # Защита от выхода за границы
            continue
            
        direction_idx = dir_indices[i]
        current_speed = speeds[i]
        
        # Определение типа действия
        if i+1 < len(df):  # Используем следующую точку для определения состояния
            state = df['state'].iloc[i+1]
            button = df['button'].iloc[i+1]
            
            if state == 'Move' and button == 'NoButton':
                action_type = 'MM'
            elif state == 'Drag':
                action_type = 'DD'
            elif button in ['Left', 'Right']:
                action_type = 'PC'
            else:
                continue  # Пропускаем другие типы действий
        else:
            continue
        
        # Заполняем статистики по направлениям
        if 0 <= direction_idx < 8:
            dir_speeds[direction_idx].append(current_speed)
            if i < len(accelerations):
                dir_accels[direction_idx].append(accelerations[i])
        
        # Заполняем статистики по типам действий
        type_speeds[action_type].append(current_speed)
        if i < len(accelerations):
            type_accels[action_type].append(accelerations[i])

    # 7. Вычисление гистограммы направлений (MDH)
    if len(directions) > 0:
        dir_hist = np.histogram(directions, bins=dir_bins)[0]
        dir_hist = dir_hist / dir_hist.sum() if dir_hist.sum() > 0 else np.zeros(8)
    else:
        dir_hist = np.zeros(8)

    # 8. Вычисление гистограммы типов действий (ATH)
    if 'state' in df.columns and 'button' in df.columns:
        mm_count = ((df['state'] == 'Move') & (df['button'] == 'NoButton')).sum()
        pc_count = (df['button'].isin(['Left', 'Right'])).sum()
        dd_count = (df['state'] == 'Drag').sum()
        total_actions = mm_count + pc_count + dd_count
        
        ath = [
            mm_count / total_actions if total_actions > 0 else 0,
            pc_count / total_actions if total_actions > 0 else 0,
            dd_count / total_actions if total_actions > 0 else 0
        ]
    else:
        ath = [0.33, 0.33, 0.34]

    # 9. Вычисление статистик по направлениям и типам действий
    def calculate_stats(values):
        """Вычисляет min, max, mean, std для массива значений."""
        if len(values) == 0:
            return {'min': 0.0, 'max': 0.0, 'mean': 0.0, 'std': 0.0}
        return {
            'min': float(np.min(values)),
            'max': float(np.max(values)),
            'mean': float(np.mean(values)),
            'std': float(np.std(values)) if len(values) > 1 else 0.0
        }

    # Статистики скоростей и ускорений по направлениям
    mda_stats = {i: calculate_stats(dir_speeds[i]) for i in range(8)}
    aad_stats = {i: calculate_stats(dir_accels[i]) for i in range(8)}
    
    # Статистики скоростей и ускорений по типам действий
    ata_stats = {
        'MM': calculate_stats(type_speeds['MM']),
        'PC': calculate_stats(type_speeds['PC']),
        'DD': calculate_stats(type_speeds['DD'])
    }
    
    aaa_stats = {
        'MM': calculate_stats(type_accels['MM']),
        'PC': calculate_stats(type_accels['PC']),
        'DD': calculate_stats(type_accels['DD'])
    }

    # 10. Формирование результата
    features = {
        # Базовые характеристики
        'total_path_length': total_distance,
        'displacement': displacement,
#        'directness_ratio': displacement / total_distance if total_distance > 0 else 0,
        'average_curvature': float(avg_curvature),
        'mass_center_transportation': float(tcm),
        'scattering_coefficient': float(sc),
        
        # Координаты
#        'position_mean_x': float(np.mean(x)),
#        'position_mean_y': float(np.mean(y)),
#        'position_std_x': float(np.std(x)) if len(x) > 1 else 0.0,
#        'position_std_y': float(np.std(y)) if len(y) > 1 else 0.0,
        
        # Глобальные статистики скорости и ускорения
        'speed_min': float(np.min(speeds)) if len(speeds) > 0 else 0.0,
        'speed_max': float(np.max(speeds)) if len(speeds) > 0 else 0.0,
        'speed_mean': float(np.mean(speeds)) if len(speeds) > 0 else 0.0,
        'speed_std': float(np.std(speeds)) if len(speeds) > 1 else 0.0,
        
        'accel_min': float(np.min(accelerations)) if len(accelerations) > 0 else 0.0,
        'accel_max': float(np.max(accelerations)) if len(accelerations) > 0 else 0.0,
        'accel_mean': float(np.mean(accelerations)) if len(accelerations) > 0 else 0.0,
        'accel_std': float(np.std(accelerations)) if len(accelerations) > 1 else 0.0,
        
        # Гистограмма направлений (MDH)
        **{f'mdh_dir_{i}': float(dir_hist[i]) for i in range(8)},
        
        # Гистограмма типов действий (ATH)
        'ath_mouse_move': float(ath[0]),
        'ath_point_click': float(ath[1]),
        'ath_drag_drop': float(ath[2]),
        
        # Статистики скоростей по направлениям (MDA)
        **{f'mda_dir_{i}_min': mda_stats[i]['min'] for i in range(8)},
        **{f'mda_dir_{i}_max': mda_stats[i]['max'] for i in range(8)},
        **{f'mda_dir_{i}_mean': mda_stats[i]['mean'] for i in range(8)},
        **{f'mda_dir_{i}_std': mda_stats[i]['std'] for i in range(8)},
        
        # Статистики ускорений по направлениям (AAD)
        **{f'aad_dir_{i}_min': aad_stats[i]['min'] for i in range(8)},
        **{f'aad_dir_{i}_max': aad_stats[i]['max'] for i in range(8)},
        **{f'aad_dir_{i}_mean': aad_stats[i]['mean'] for i in range(8)},
        **{f'aad_dir_{i}_std': aad_stats[i]['std'] for i in range(8)},
        
        # Статистики скоростей по типам действий (ATA)
        'ata_mm_min': ata_stats['MM']['min'],
        'ata_mm_max': ata_stats['MM']['max'],
        'ata_mm_mean': ata_stats['MM']['mean'],
        'ata_mm_std': ata_stats['MM']['std'],
        
        'ata_pc_min': ata_stats['PC']['min'],
        'ata_pc_max': ata_stats['PC']['max'],
        'ata_pc_mean': ata_stats['PC']['mean'],
        'ata_pc_std': ata_stats['PC']['std'],
        
        'ata_dd_min': ata_stats['DD']['min'],
        'ata_dd_max': ata_stats['DD']['max'],
        'ata_dd_mean': ata_stats['DD']['mean'],
        'ata_dd_std': ata_stats['DD']['std'],
        
        # Статистики ускорений по типам действий (AAA)
        'aaa_mm_min': aaa_stats['MM']['min'],
        'aaa_mm_max': aaa_stats['MM']['max'],
        'aaa_mm_mean': aaa_stats['MM']['mean'],
        'aaa_mm_std': aaa_stats['MM']['std'],
        
        'aaa_pc_min': aaa_stats['PC']['min'],
        'aaa_pc_max': aaa_stats['PC']['max'],
        'aaa_pc_mean': aaa_stats['PC']['mean'],
        'aaa_pc_std': aaa_stats['PC']['std'],
        
        'aaa_dd_min': aaa_stats['DD']['min'],
        'aaa_dd_max': aaa_stats['DD']['max'],
        'aaa_dd_mean': aaa_stats['DD']['mean'],
        'aaa_dd_std': aaa_stats['DD']['std'],
    }
    
    return features

def preprocess_data(filepath):
    """
    Оптимизированная предварительная обработка данных:
    1. Удаляет полностью дублирующиеся записи
    2. Гарантирует уникальность временных меток
    3. Перезаписывает файл обработанными данными

    Оптимизации:
    - Векторизованные операции вместо итераций
    - Минимизация операций с DataFrame
    - Использование numpy для быстрых вычислений
    - Однократная сортировка данных
    """
    try:
        # Чтение с указанием dtype для ускорения загрузки
        df = pd.read_csv(filepath, dtype={
            'record timestamp': 'float64',
            'client timestamp': 'float64',
            'button': 'category',
            'state': 'category',
            'x': 'int32',
            'y': 'int32'
        })
    except (pd.errors.EmptyDataError, FileNotFoundError) as e:
        print(f"Error reading file: {e}")
        return False

    if df.empty:
        print("Warning: Empty DataFrame")
        return False

    # 1. Удаление дубликатов (быстрее чем drop_duplicates)
    unique_rows = df[~df.duplicated()]
    if len(unique_rows) < len(df):
        print(f"Removed {len(df) - len(unique_rows)} duplicate rows")
        df = unique_rows

    # 2. Обработка дубликатов временных меток
    ts_values = df['record timestamp'].values
    sorted_indices = np.argsort(ts_values)
    sorted_ts = ts_values[sorted_indices]
    
    # Находим дубликаты в отсортированном массиве
    diff = np.diff(sorted_ts)
    duplicates = np.where(diff == 0)[0]
    
    if duplicates.size > 0:
        print(f"Found {duplicates.size} duplicate timestamp groups, processing...")
        
        # Создаем маску всех дубликатов
        duplicate_mask = np.zeros_like(ts_values, dtype=bool)
        for i in duplicates:
            duplicate_mask[sorted_indices[i]] = True
            duplicate_mask[sorted_indices[i+1]] = True
        
        # Генерируем уникальные метки для дубликатов
        base_ts = sorted_ts[duplicates]
        offsets = np.arange(1, len(duplicates)+1) * 1e-6
        new_ts = base_ts + offsets
        
        # Обновляем значения в исходном массиве
        for i, idx in enumerate(duplicates):
            ts_values[sorted_indices[idx+1]] = new_ts[i]
        
        # Обновляем DataFrame одним присваиванием
        df['record timestamp'] = ts_values
        
        # Сортировка (один раз в конце)
        df = df.sort_values('record timestamp').reset_index(drop=True)

    # 3. Запись в файл с оптимизацией
    try:
        df.to_csv(filepath, index=False, float_format='%.6f')
        print(f"Successfully processed: {filepath}")
        return True
    except Exception as e:
        print(f"Error writing file: {e}")
        return False

def process_sessions_in_directory(directory, segment_duration = 5.0):
    """
    Обрабатывает все файлы сессий в указанной директории и возвращает DataFrame с признаками.
    """
    data = []
    filenames = []
    for filename in os.listdir(directory):
        if filename.startswith("session_"): # Убеждаемся что это файл csv
            filepath = os.path.join(directory, filename)
            segments = extract_segmented_features(filepath, segment_duration)
            for segment in segments:
                data.append(segment)
                filenames.append(filename)
    if not data:  # Если data пустой, возвращаем None
        return None

    df = pd.DataFrame(data)
    df['filename'] = filenames # Добавляем column filename
    return df

def prepare_dataset(directory):
    all_training_data = []
    for user_folder in os.listdir(directory):
        user_path = os.path.join(directory, user_folder)
        training_data = process_sessions_in_directory(user_path)
        all_training_data.append(training_data)
    if all_training_data:
        training_data = pd.concat(all_training_data, ignore_index=True)
        print("Shape of combined training data:", training_data.shape)  # Проверяем размерность
    else:
        print("No training data found. Check directory structure.")
        exit()
    return training_data

In [6]:
training_data = prepare_dataset('kek_train')
feature_names = training_data.columns.drop('filename') # Исключаем 'filename'
print(feature_names)

Shape of combined training data: (3182, 113)
Index(['total_path_length', 'displacement', 'average_curvature',
       'mass_center_transportation', 'scattering_coefficient', 'speed_min',
       'speed_max', 'speed_mean', 'speed_std', 'accel_min',
       ...
       'aaa_mm_mean', 'aaa_mm_std', 'aaa_pc_min', 'aaa_pc_max', 'aaa_pc_mean',
       'aaa_pc_std', 'aaa_dd_min', 'aaa_dd_max', 'aaa_dd_mean', 'aaa_dd_std'],
      dtype='object', length=112)


In [7]:
for name in feature_names:
    print(name)

total_path_length
displacement
average_curvature
mass_center_transportation
scattering_coefficient
speed_min
speed_max
speed_mean
speed_std
accel_min
accel_max
accel_mean
accel_std
mdh_dir_0
mdh_dir_1
mdh_dir_2
mdh_dir_3
mdh_dir_4
mdh_dir_5
mdh_dir_6
mdh_dir_7
ath_mouse_move
ath_point_click
ath_drag_drop
mda_dir_0_min
mda_dir_1_min
mda_dir_2_min
mda_dir_3_min
mda_dir_4_min
mda_dir_5_min
mda_dir_6_min
mda_dir_7_min
mda_dir_0_max
mda_dir_1_max
mda_dir_2_max
mda_dir_3_max
mda_dir_4_max
mda_dir_5_max
mda_dir_6_max
mda_dir_7_max
mda_dir_0_mean
mda_dir_1_mean
mda_dir_2_mean
mda_dir_3_mean
mda_dir_4_mean
mda_dir_5_mean
mda_dir_6_mean
mda_dir_7_mean
mda_dir_0_std
mda_dir_1_std
mda_dir_2_std
mda_dir_3_std
mda_dir_4_std
mda_dir_5_std
mda_dir_6_std
mda_dir_7_std
aad_dir_0_min
aad_dir_1_min
aad_dir_2_min
aad_dir_3_min
aad_dir_4_min
aad_dir_5_min
aad_dir_6_min
aad_dir_7_min
aad_dir_0_max
aad_dir_1_max
aad_dir_2_max
aad_dir_3_max
aad_dir_4_max
aad_dir_5_max
aad_dir_6_max
aad_dir_7_max
aad_dir_0_mean

In [8]:
print(training_data)

      total_path_length  displacement  average_curvature  \
0           1453.470824    273.673163           0.001934   
1            299.345261     48.052055           0.002316   
2            721.565399    177.589414           0.003675   
3          93601.085127  92325.553397           0.001973   
4           1416.459575    387.552577           0.002523   
...                 ...           ...                ...   
3177          41.850404      2.236068           0.001463   
3178        6540.403952    306.752343           0.002512   
3179        1627.005708    372.823014           0.001633   
3180        4652.499185    772.769694           0.001918   
3181         123.422597     33.241540           0.001541   

      mass_center_transportation  scattering_coefficient   speed_min  \
0                       0.705892            2.874672e-01    0.000000   
1                       7.766601            6.622196e-02    0.000000   
2                      13.721137            7.857921e-09  237.8

In [9]:
scaler = StandardScaler()
X_train = scaler.fit_transform(training_data[feature_names])
print(X_train)
input_dim = X_train.shape[1]
print(input_dim)

[[-0.10807157 -0.06003033  0.2143638  ... -0.14235095  0.21716518
  -0.1739552 ]
 [-0.27967126 -0.10000192  0.54312028 ... -0.14235095  0.21716518
  -0.1739552 ]
 [-0.216894   -0.07705276  1.71279973 ... -0.14235095  0.21716518
  -0.1739552 ]
 ...
 [-0.08226976 -0.04246469 -0.04471772 ... -0.14235095  0.21716518
  -0.1739552 ]
 [ 0.36757189  0.02839085  0.19995197 ... -0.14235095  0.21716518
  -0.1739552 ]
 [-0.3058281  -0.10262579 -0.12415583 ... -0.14235095  0.21716518
  -0.1739552 ]]
112


In [10]:
def create_autoencoder(input_dim):
    """Создает модель автокодировщика."""
    input_layer = Input(shape=(input_dim))
    # Encoder
    encoder = Dense(input_dim, activation='elu')(input_layer)
    encoder = Dropout(0.3)(encoder)
    encoder = Dense(100, activation='elu')(encoder)
    encoder = Dropout(0.2)(encoder)
    encoder = Dense(70, activation='elu')(encoder)
    encoder = Dropout(0.1)(encoder)
    #encoder = Dense(40, activation='elu')(encoder)
    encoded = Dense(40, activation='elu')(encoder)  # Сжатый вектор признаков

    # Decoder
    #decoder = Dense(40, activation='elu')(encoded)
    decoder = Dense(70, activation='elu')(encoded)
    decoder = Dense(100, activation='elu')(decoder)
    decoder = Dense(input_dim, activation='elu')(decoder)
    decoded = Dense(input_dim, activation='elu')(decoder)  # Восстановленный ввод

    autoencoder = Model(input_layer, decoded)
    return autoencoder

In [11]:
autoencoder = create_autoencoder(input_dim)
autoencoder.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 112)]             0         
                                                                 
 dense (Dense)               (None, 112)               12656     
                                                                 
 dropout (Dropout)           (None, 112)               0         
                                                                 
 dense_1 (Dense)             (None, 100)               11300     
                                                                 
 dropout_1 (Dropout)         (None, 100)               0         
                                                                 
 dense_2 (Dense)             (None, 70)                7070      
                                                                 
 dropout_2 (Dropout)         (None, 70)                0     

In [215]:
autoencoder.compile(optimizer=Adam(learning_rate=0.001), loss='huber', metrics=['mse'])
#autoencoder.fit(X_train, X_train, epochs=5, batch_size=32, shuffle=False, verbose=1)

In [4]:
import telebot
TOKEN = '5724705081:AAF78TW5FNolA_e-t2PhiTR4i_1dHV8Ak0k' 
CHAT_ID = '968613418'
# -------------------------------

bot = telebot.TeleBot(TOKEN)

def send_completion_message(message):
  try:
    bot.send_message(CHAT_ID, message)
    print("Сообщение отправлено в Telegram.")
  except Exception as e:
    print(f"Ошибка при отправке сообщения в Telegram: {e}")

In [16]:
train_directory = 'training_files'
test_directory = 'test_files'
public_labels_file = 'public_labels.csv'
public_labels = pd.read_csv(public_labels_file, index_col='filename')
segment_duration = np.arange(600, 930, 30)
mean_auc = []
for duration in segment_duration:
    message = f'Processing duration = {duration}.....'
    send_completion_message(message)
    history = []
    for user_folder in os.listdir(train_directory):
        train_path = os.path.join(train_directory, user_folder)
        print(f"Processing {train_path}")
        training_data = process_sessions_in_directory(train_path, duration)
        feature_names = training_data.columns.drop('filename')
        scaler = StandardScaler()
        X_train = scaler.fit_transform(training_data[feature_names])
        print(X_train, X_train.shape)
        
        all_test_data = []
        test_filenames = []
        test_path = os.path.join(test_directory, user_folder)
        print(f"Processing {test_path}")
        for filename in os.listdir(test_path):
            if filename.startswith("session_"): # Убеждаемся что это файл csv
                filepath = os.path.join(test_path, filename)
                segments = extract_segmented_features(filepath, duration)
                for segment in segments:
                    all_test_data.append(segment)
                    test_filenames.append(filename)
        test_data = pd.DataFrame(all_test_data)
        test_data['filename'] = test_filenames # Добавляем столбец filename
        test_data_scaled = scaler.transform(test_data[feature_names])
        test_data_scaled = pd.DataFrame(test_data_scaled, columns=feature_names)
        # 4. Добавление меток из public_labels и удаление данных без меток
        test_data_scaled['is_illegal'] = 0  # Default: not illegal
        valid_indices = []
        for index, filename in enumerate(test_data['filename']):
            if filename in public_labels.index:
                test_data_scaled.loc[index, 'is_illegal'] = public_labels.loc[filename, 'is_illegal']
                valid_indices.append(index)
        test_data_scaled = test_data_scaled.iloc[valid_indices].reset_index(drop=True)
        test_data_filenames = test_data['filename'].iloc[valid_indices].reset_index(drop=True), public_labels
        test_labels = test_data_scaled['is_illegal'].values
        print(test_data_scaled, test_data_scaled.shape)
    
        input_dim = X_train.shape[1]
        autoencoder = create_autoencoder(input_dim)
        hist = []
        print('Learning...')
        autoencoder.compile(optimizer=Adam(learning_rate=0.001), loss='mse', metrics=['mse'])
        for i in range(5):
            autoencoder.fit(X_train, X_train, epochs=1, batch_size=32, shuffle=True, verbose=0)     
            X_test = test_data_scaled[feature_names].values
            encoded_test = autoencoder.predict(X_test, verbose=0)
            mse = np.mean(np.power(X_test - encoded_test, 2), axis=1)
            #print(mse)
            #print(np.mean(mse))
            #print(np.max(mse))
            auc_roc = roc_auc_score(test_labels, mse)
            hist.append(auc_roc)
            print(f"    epoch: {i}", f"AUC-ROC: {auc_roc}", f"mse: {np.mean(mse)}")
        history.append(np.max(hist))
        print(np.max(hist))
        print(np.argmax(hist))
        message = f'User: {user_folder} Max AUC: {np.max(hist)}'
        send_completion_message(message)
    message = f'Mean AUC: {np.mean(history)}\n____________________________________'
    send_completion_message(message)
    mean_auc.append(np.mean(history))
message = f'Results\nSegments: {segment_duration}\nAUC: {mean_auc}\nFinished!'
send_completion_message(message)

Сообщение отправлено в Telegram.
Processing training_files\user12
[[-0.07092512 -0.1427431  -0.60825598 ...  4.66682398 -0.92388201
   1.14532776]
 [-0.44276446 -0.0934345  -0.37951953 ... -0.1373705   0.37706225
  -0.41455294]
 [ 0.09124719 -0.09674576 -0.54714444 ... -0.13445515  0.29157211
  -0.2503311 ]
 ...
 [ 1.69228373 -0.0386678  -1.32730908 ... -0.13708745  0.35644484
  -0.36345419]
 [ 1.13514351 -0.03895618 -0.95755847 ... -0.1373708   0.37707663
  -0.41456555]
 [-1.48703046 -0.11617719 -0.53000843 ... -0.13737087  0.37707724
  -0.41456607]] (178, 112)
Processing test_files\user12
     total_path_length  displacement  average_curvature  \
0            -0.674078     -0.085928           2.600509   
1            -1.304967     -0.042126          -0.179251   
2            -0.640846     -0.092389           3.122124   
3            -1.524380     -0.111064           4.448621   
4            -1.109401     -0.063154           1.272577   
..                 ...           ...            

In [7]:
print(np.mean(history))

0.5266760970096505


In [15]:
print(float(np.arange(600, 900, 30)))

TypeError: only length-1 arrays can be converted to Python scalars

In [216]:
hist = []
autoencoder.compile(optimizer=Adam(learning_rate=0.001), loss='mse', metrics=['mse'])
for i in range(100):
    autoencoder.fit(X_train, X_train, epochs=1, batch_size=32, shuffle=True, verbose=0)
    X_test = test_data_scaled[feature_names].values
    encoded_test = autoencoder.predict(X_test, verbose=0)
    mse = np.mean(np.power(X_test - encoded_test, 2), axis=1)
    #print(mse)
    #print(np.mean(mse))Z
    #print(np.max(mse))
    auc_roc = roc_auc_score(test_labels, mse)
    hist.append(auc_roc)
    print(f"epoch: {i}", f"AUC-ROC: {auc_roc}", f"mse: {np.mean(mse)}")

epoch: 0 AUC-ROC: 0.9128863910076275 mse: 89739280.03246246
epoch: 1 AUC-ROC: 0.9126856684062625 mse: 89512984.38186438
epoch: 2 AUC-ROC: 0.9122842232035328 mse: 89417129.94154362
epoch: 3 AUC-ROC: 0.9122842232035328 mse: 89418560.9374742
epoch: 4 AUC-ROC: 0.9120835006021678 mse: 89432718.41071698
epoch: 5 AUC-ROC: 0.9124849458048977 mse: 89401107.73327722
epoch: 6 AUC-ROC: 0.9126856684062625 mse: 89410789.2101703
epoch: 7 AUC-ROC: 0.9128863910076275 mse: 89305123.79182239
epoch: 8 AUC-ROC: 0.9128863910076275 mse: 89027834.81663172
epoch: 9 AUC-ROC: 0.9128863910076275 mse: 88956620.77748324
epoch: 10 AUC-ROC: 0.9128863910076275 mse: 88905549.33637103
epoch: 11 AUC-ROC: 0.9130871136089924 mse: 88894224.27400547
epoch: 12 AUC-ROC: 0.9130871136089924 mse: 88829914.77786809
epoch: 13 AUC-ROC: 0.9130871136089924 mse: 88760974.92668992
epoch: 14 AUC-ROC: 0.9132878362103574 mse: 88741744.43674575
epoch: 15 AUC-ROC: 0.9132878362103574 mse: 88737661.42410414
epoch: 16 AUC-ROC: 0.913488558811722

KeyboardInterrupt: 

In [199]:
print(np.max(hist))
print(np.argmax(hist))

0.9416720154043645
77


In [200]:
X_encoded = autoencoder.predict(X_train, verbose=1)
mse = np.mean(np.power(X_train - X_encoded, 2), axis=1)
print(mse)
print(np.mean(mse))
print(np.max(mse))

[0.19566618 0.22086983 0.12885637 0.23770792 0.24412217 0.07642885
 0.17189401 0.62770112 0.2204488  0.06764102 0.09596965 0.10379114
 0.22163029 0.12507322 0.06804909 0.13995673 0.12036735 0.20059925
 0.39653555 1.00629511 0.11173583 0.14371708 0.20869847 0.08636454
 0.27195431 0.14610038 0.15683826 0.08461471 0.14095925 0.07060499
 0.27171208 2.16792933 0.13280062 0.36306722 0.24582617 1.08274608
 0.05853648 0.06906507 0.28448057 0.06674617 0.11691544 0.10618392
 0.20252695 0.2078216  0.10184998 0.05815692 0.12229261 0.19696143
 0.13446107 0.1624792  0.18435863 0.14512096 0.18131071 0.14123151
 0.07636561 0.17871929 0.10131363 0.10180986 0.28975819 0.13617761
 0.14374022 0.09909409 0.22563217 0.17496674 0.15504127 0.24624993
 0.13616707 0.16464598 0.18026933 0.15348448 0.18630683 0.14354473
 0.11630771 0.16843715 0.13606663 0.1492367  0.14483658 0.28316316
 0.26694252 0.10158252 0.07976228 0.31368976 0.13759126 0.12835148
 1.23994798 0.15027306 0.0946837  0.19078879 0.22319618 0.1534

In [201]:
#60сек input/100/70/40 lr=0.001 loss='mse'epoch=100
X_test = test_data_scaled[feature_names].values
encoded_test = autoencoder.predict(X_test, verbose=1)
mse = np.mean(np.power(X_test - encoded_test, 2), axis=1)
print(mse)
print(np.mean(mse))
print(np.max(mse))
auc_roc = roc_auc_score(test_labels, mse)
print(f"AUC-ROC: {auc_roc}")

[1.76678985e+00 1.33650397e+07 2.28341848e+00 1.21258986e+00
 1.77682637e+00 1.33963486e+00 7.68841221e-01 1.97756569e+00
 1.98961166e+00 5.82201310e-01 9.54586499e-02 1.98553778e-01
 3.10718581e-01 1.22063137e-01 3.63730900e-01 2.61393866e-01
 1.13536796e-01 3.28439370e-01 2.98873520e-01 7.76442940e-02
 2.48871551e+00 2.81344864e+00 1.62706266e+00 1.85697496e+00
 3.00937311e+00 1.57323707e+00 1.78763764e+00 1.63184425e+00
 2.23450251e+00 6.07367195e-01 1.57515277e+00 1.63597237e-01
 7.80836157e-02 1.33548422e-01 1.70801926e-01 1.70855314e-01
 1.71522684e+00 9.48443813e-01 1.39422913e+00 4.24182447e+00
 8.42622461e-01 1.18160059e+00 9.10066448e+00 2.69642853e+01
 7.76224639e+00 4.63172607e+00 3.20194359e-01 2.17419415e-01
 3.91258468e-01 3.66861602e-01 4.57134722e-01 1.30277971e+00
 2.89321881e-01 3.52027467e-01 2.76932522e-01 1.78050054e-01
 3.28752681e-01 3.00546122e-01 2.91845182e-01 2.31677858e-01
 6.93984151e-01 1.02680329e+00 1.21701971e+00 2.11283672e+00
 1.20118401e+00 8.299785

In [14]:
test_data = prepare_dataset('kek_test')
X_test = scaler.transform(test_data[feature_names])

Shape of combined training data: (4062, 110)


In [15]:
print(test_data)

      total_path_length  displacement   speed_min     speed_max    speed_mean  \
0           5740.056973    192.520129    0.000000  1.603122e+08  1.405292e+07   
1           2882.156085    290.833286    0.000000  1.090183e+08  1.316400e+07   
2            543.576790    220.966061  603.203474  8.982205e+07  2.470680e+07   
3           1270.606923    231.008658    0.000000  1.037545e+08  8.390005e+06   
4           1409.746352     78.771822    0.000000  1.159741e+08  7.431056e+06   
...                 ...           ...         ...           ...           ...   
4057        2907.960543     35.510562    0.000000  1.382931e+08  1.870203e+07   
4058        1294.070098    597.800134    0.000000  7.566370e+07  1.058391e+07   
4059        1808.960314     95.131488    3.742963  6.888400e+07  7.889970e+06   
4060         182.902463     59.033889    0.000000  1.627882e+07  2.124507e+06   
4061        2228.859036    357.337376    0.000000  9.953890e+07  8.558014e+06   

         speed_std     acce

In [309]:
X_encoded = autoencoder.predict(X_test, verbose=1)
mse = np.mean(np.power(X_test - X_encoded, 2), axis=1)
print(mse)
print(np.mean(mse))
print(np.max(mse))

[1.16960315 0.34646176 2.0007533  ... 1.78019035 0.26465391 0.26529   ]
3.4499022543661098
4463.656056801737


In [130]:
def prepare_test_data(test_dir, public_labels_file, scaler, feature_names):
    """
    Подготавливает тестовые данные: извлекает признаки, масштабирует и добавляет метки из public_labels.
    """
    # 1. Загрузка меток из public_labels
    public_labels = pd.read_csv(public_labels_file, index_col='filename')

    # 2. Извлечение признаков из файлов в test_dir
    all_test_data = []
    test_filenames = []
    for user_dir in os.listdir(test_dir):
        user_path = os.path.join(test_dir, user_dir)
        for filename in os.listdir(user_path):
            if filename.startswith("session_"): # Убеждаемся что это файл csv
                filepath = os.path.join(user_path, filename)
                segments = extract_segmented_features(filepath)
                for segment in segments:
                    all_test_data.append(segment)
                    test_filenames.append(filename)

    if not all_test_data:
        print("No test data found. Check directory structure.")
        return None, None, None # Возвращаем None если нет данных

    test_data = pd.DataFrame(all_test_data)
    test_data['filename'] = test_filenames # Добавляем столбец filename

    # 3. Масштабирование тестовых данных
    test_data_scaled = scaler.transform(test_data[feature_names])
    test_data_scaled = pd.DataFrame(test_data_scaled, columns=feature_names)

    # 4. Добавление меток из public_labels
    test_data_scaled['is_illegal'] = 0  # Default: not illegal
    for filename in test_data['filename']:
        if filename in public_labels.index:
            test_data_scaled.loc[test_data['filename'] == filename, 'is_illegal'] = public_labels.loc[filename, 'is_illegal']

    return test_data_scaled, test_data['filename'], public_labels

In [159]:
def prepare_test_data(test_dir, public_labels_file, scaler, feature_names):
    """
    Подготавливает тестовые данные: извлекает признаки, масштабирует и добавляет метки из public_labels.
    """
    # 1. Загрузка меток из public_labels
    public_labels = pd.read_csv(public_labels_file, index_col='filename')

    # 2. Извлечение признаков из файлов в test_dir
    all_test_data = []
    test_filenames = []
    for user_dir in os.listdir(test_dir):
        user_path = os.path.join(test_dir, user_dir)
        for filename in os.listdir(user_path):
            if filename.startswith("session_"): # Убеждаемся что это файл csv
                filepath = os.path.join(user_path, filename)
                segments = extract_segmented_features(filepath)
                for segment in segments:
                    all_test_data.append(segment)
                    test_filenames.append(filename)

    if not all_test_data:
        print("No test data found. Check directory structure.")
        return None, None, None # Возвращаем None если нет данных

    test_data = pd.DataFrame(all_test_data)
    test_data['filename'] = test_filenames # Добавляем столбец filename

    # 3. Масштабирование тестовых данных
    test_data_scaled = scaler.transform(test_data[feature_names])
    test_data_scaled = pd.DataFrame(test_data_scaled, columns=feature_names)

    # 4. Добавление меток из public_labels и удаление данных без меток
    test_data_scaled['is_illegal'] = 0  # Default: not illegal
    valid_indices = []
    for index, filename in enumerate(test_data['filename']):
        if filename in public_labels.index:
            test_data_scaled.loc[index, 'is_illegal'] = public_labels.loc[filename, 'is_illegal']
            valid_indices.append(index)
    
    test_data_scaled = test_data_scaled.iloc[valid_indices].reset_index(drop=True)
    
    return test_data_scaled, test_data['filename'].iloc[valid_indices].reset_index(drop=True), public_labels

In [207]:
test_data_scaled, test_data_filenames, public_labels = prepare_test_data('kek_test', 'public_labels.csv', scaler, feature_names)

In [208]:
print(test_data_scaled)

     total_path_length  displacement  average_curvature  \
0            -0.732231     -0.087650           3.780391   
1            -0.760800     -0.045374          -2.464399   
2            -0.648156     -0.065788          -0.786546   
3            -0.620441     -0.126138          -0.877004   
4            -0.654066     -0.103525          -0.495441   
..                 ...           ...                ...   
148          -0.715796     -0.086655          -2.045524   
149          -0.674701     -0.134940          -0.155325   
150          -0.162538     -0.111228          -0.510927   
151          -0.586066     -0.025338          -0.910128   
152          -0.523039     -0.060106          -1.322817   

     mass_center_transportation  scattering_coefficient  speed_min  speed_max  \
0                     -1.547271               -0.238354  -0.098058  -1.734629   
1                      3.285306               -0.284347  -0.098058  -2.033739   
2                     -1.486488                0

In [209]:
print(test_data_filenames)

0      session_0233596484
1      session_0233596484
2      session_0249395771
3      session_0249395771
4      session_0249395771
              ...        
148    session_9904426178
149    session_9904426178
150    session_9916663391
151    session_9916663391
152    session_9916663391
Name: filename, Length: 153, dtype: object


In [210]:
print(public_labels)

                    is_illegal
filename                      
session_0003960194           1
session_0005840196           0
session_0025450757           0
session_0029922803           0
session_0064281061           1
...                        ...
session_9938110038           0
session_9951071945           1
session_9956793065           0
session_9973193301           0
session_9983042278           1

[816 rows x 1 columns]


In [213]:
X_test = test_data_scaled[feature_names].values
encoded_test = autoencoder.predict(X_test, verbose=1)
mse = np.mean(np.power(X_test - encoded_test, 2), axis=1)
print(mse)
print(np.mean(mse))
print(np.max(mse))

[8.79260918e+00 7.16980130e+00 4.30050566e+00 9.23803528e+00
 6.00087645e+00 6.93674000e+00 4.81131232e-01 8.33225858e-01
 1.40480594e+00 5.12001601e-01 6.00764330e-01 4.38099294e-01
 5.00930396e-01 7.35160866e+00 5.59891940e+00 7.73448944e+00
 4.85098277e+00 3.04078716e+01 1.12255396e+00 8.87425525e+00
 4.94315641e-01 8.96615301e-01 1.06339110e+00 4.25623076e+00
 2.87937019e+02 1.36876937e+02 4.45512685e+00 6.90509108e+00
 4.91412951e+01 2.61651197e+01 6.79426795e-01 6.54854953e-01
 1.79708755e+00 1.23500875e+00 5.77908029e-01 4.83939752e+01
 5.24554525e-01 1.20522979e+00 3.52653301e+00 3.45924920e+01
 6.03636093e+00 6.15703136e+00 7.59255210e+00 4.33942780e+00
 4.75590347e+00 1.56188062e+01 4.93038128e+00 9.56563918e-01
 2.27929721e+01 4.96821618e+00 4.29465650e+00 6.08871456e+00
 1.54194711e+08 3.91360240e+00 1.59493606e+01 1.73007746e+04
 5.99682315e+00 5.92674232e+00 8.83461712e+01 4.58657544e+00
 4.33098952e+00 5.78827668e+00 5.55099396e+00 8.67218293e+00
 5.24440980e+01 6.064868

In [211]:
test_labels = test_data_scaled['is_illegal'].values
print(test_labels)

[1 1 1 1 1 1 0 0 0 0 0 0 0 1 1 1 1 1 0 0 0 0 0 1 1 1 1 1 1 1 0 0 0 0 0 0 0
 0 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 1 1 0 0
 0 1 1 1 1 1 1 1 1 0 0 0 0 1 1 1 1 1 0 1 1 0 0 1 1 1 0 0 0 1 0 0 1 1 0 0 0
 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0
 0 0 1 1 1]


In [214]:
auc_roc = roc_auc_score(test_labels, mse)
print(f"AUC-ROC: {auc_roc}")

AUC-ROC: 0.9128863910076275


In [181]:
results = pd.DataFrame({'filename': test_data_filenames, 'mse': mse, 'is_illegal': test_labels})
print("\nResults (MSE and is_illegal for each test session):")
print(results)


Results (MSE and is_illegal for each test session):
               filename           mse  is_illegal
0    session_0233596484  7.819067e-01           1
1    session_0233596484  1.142483e+06           1
2    session_0233596484  5.334132e-01           1
3    session_0249395771  8.776108e-01           1
4    session_0249395771  6.220642e-01           1
..                  ...           ...         ...
393  session_9916663391  3.095743e-01           1
394  session_9916663391  4.014653e-01           1
395  session_9916663391  2.050492e+01           1
396  session_9916663391  4.331625e-01           1
397  session_9916663391  3.325061e+03           1

[398 rows x 3 columns]


In [6]:
directory = 'training_files'
for user_dir in os.listdir(directory):
    user_path = os.path.join(directory, user_dir)
    for filename in os.listdir(user_path):
        if filename.startswith("session_"):
            filepath = os.path.join(user_path, filename)
            preprocess_data(filepath)

Removed 8 duplicate rows
Found 4625 duplicate timestamp groups, processing...
Successfully processed: training_files\user12\session_2144641057
Removed 8 duplicate rows
Found 4251 duplicate timestamp groups, processing...
Successfully processed: training_files\user12\session_5265929106
Removed 13 duplicate rows
Found 5354 duplicate timestamp groups, processing...
Successfully processed: training_files\user12\session_5815391283
Removed 153 duplicate rows
Found 10792 duplicate timestamp groups, processing...
Successfully processed: training_files\user12\session_7409188284
Removed 12 duplicate rows
Found 5460 duplicate timestamp groups, processing...
Successfully processed: training_files\user12\session_8872593360
Removed 36 duplicate rows
Found 4313 duplicate timestamp groups, processing...
Successfully processed: training_files\user12\session_9031593624
Removed 17 duplicate rows
Found 3492 duplicate timestamp groups, processing...
Successfully processed: training_files\user12\session_983

In [7]:
directory = 'test_files'
for user_dir in os.listdir(directory):
    user_path = os.path.join(directory, user_dir)
    for filename in os.listdir(user_path):
        if filename.startswith("session_"):
            filepath = os.path.join(user_path, filename)
            preprocess_data(filepath)

Found 339 duplicate timestamp groups, processing...
Successfully processed: test_files\user12\session_0032069206
Found 245 duplicate timestamp groups, processing...
Successfully processed: test_files\user12\session_0126772600
Found 66 duplicate timestamp groups, processing...
Successfully processed: test_files\user12\session_0166199610
Found 287 duplicate timestamp groups, processing...
Successfully processed: test_files\user12\session_0170625567
Found 73 duplicate timestamp groups, processing...
Successfully processed: test_files\user12\session_0172860263
Found 68 duplicate timestamp groups, processing...
Successfully processed: test_files\user12\session_0172989910
Found 271 duplicate timestamp groups, processing...
Successfully processed: test_files\user12\session_0184498835
Found 328 duplicate timestamp groups, processing...
Successfully processed: test_files\user12\session_0195566274
Found 45 duplicate timestamp groups, processing...
Successfully processed: test_files\user12\sessio