In [2]:
from tensorflow.keras.layers import Input, Conv1D, MaxPooling1D, UpSampling1D, Flatten, Dense, Reshape
from tensorflow.keras.models import Model

def create_convolutional_autoencoder(input_dim=112):
    """Создает сверточный автокодировщик с выходом формы (None, 112)"""
    input_layer = Input(shape=(input_dim, 1))  # [batch, 112, 1]
    
    # Encoder
    x = Conv1D(64, kernel_size=3, activation='relu', padding='same')(input_layer)
    x = MaxPooling1D(2, padding='same')(x)  # [batch, 56, 64]
    
    x = Conv1D(32, kernel_size=3, activation='relu', padding='same')(x)
    x = MaxPooling1D(2, padding='same')(x)  # [batch, 28, 32]
    
    x = Conv1D(16, kernel_size=3, activation='relu', padding='same')(x)
    encoded = MaxPooling1D(2, padding='same')(x)  # [batch, 14, 16]
    
    # Decoder
    x = Conv1D(16, kernel_size=3, activation='relu', padding='same')(encoded)
    x = UpSampling1D(2)(x)  # [batch, 28, 16]
    
    x = Conv1D(32, kernel_size=3, activation='relu', padding='same')(x)
    x = UpSampling1D(2)(x)  # [batch, 56, 32]
    
    x = Conv1D(64, kernel_size=3, activation='relu', padding='same')(x)
    x = UpSampling1D(2)(x)  # [batch, 112, 64]
    
    # Преобразуем в нужную форму выхода
    x = Conv1D(1, kernel_size=3, activation='sigmoid', padding='same')(x)  # [batch, 112, 1]
    decoded = Reshape((input_dim,))(x)  # Удаляем последнюю размерность -> [batch, 112]
    
    autoencoder = Model(input_layer, decoded)
    return autoencoder

In [3]:
# Пример использования
autoencoder = create_convolutional_autoencoder(112)
autoencoder.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 112, 1)]          0         
                                                                 
 conv1d (Conv1D)             (None, 112, 64)           256       
                                                                 
 max_pooling1d (MaxPooling1D  (None, 56, 64)           0         
 )                                                               
                                                                 
 conv1d_1 (Conv1D)           (None, 56, 32)            6176      
                                                                 
 max_pooling1d_1 (MaxPooling  (None, 28, 32)           0         
 1D)                                                             
                                                                 
 conv1d_2 (Conv1D)           (None, 28, 16)            1552  

In [1]:
import pandas as pd
import numpy as np
import os
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Dropout
from tensorflow.keras.optimizers import Adam
from sklearn.metrics import mean_squared_error, roc_auc_score

In [4]:
def extract_segmented_features(filepath, segment_duration=20.0):
    """
    Извлекает признаки из файла сессии, разделяя данные на фрагменты по segment_duration секунд.
    Возвращает список словарей с характеристиками движения для каждого фрагмента.
    """
    try:
        df = pd.read_csv(filepath)
    except (pd.errors.EmptyDataError, FileNotFoundError) as e:
        print(f"Warning: {e} for file: {filepath}")
        return None

    if df.empty or len(df) < 2:
        print(f"Warning: Not enough data in file: {filepath}")
        return None

    df = df.sort_values('record timestamp').reset_index(drop=True)
    timestamps = df['record timestamp'].values
    segments = []
    
    for start_time in np.arange(timestamps[0], timestamps[-1], segment_duration):
        end_time = start_time + segment_duration
        segment_mask = (timestamps >= start_time) & (timestamps < end_time)
        segment_df = df[segment_mask]
        
        if len(segment_df) >= 2:  # Минимум 2 точки для вычисления движения
            features = calculate_movement_features(segment_df)
            if features:
                segments.append(features)
    
    return segments if segments else None

import numpy as np
import pandas as pd
from math import atan2, sqrt, fabs

def calculate_movement_features(df):
    """Вычисляет полный набор характеристик движения, включая все параметры из изображения."""
    # Извлечение базовых данных
    x = df['x'].values
    y = df['y'].values
    timestamps = df['record timestamp'].values
    
    # Вычисление временных интервалов
    dt = np.diff(timestamps)
    dt = np.where(dt <= 0, 1e-6, dt)
    
    # 1. Перемещения и производные
    dx = np.diff(x)
    dy = np.diff(y)
    distances = np.sqrt(dx**2 + dy**2)
    speeds = distances / dt
    
    # 2. Ускорения
    if len(dt) > 1:
        accelerations = np.diff(speeds) / dt[:-1]
    else:
        accelerations = np.array([0.0])
    
    # 3. Основные характеристики траектории
    total_distance = float(np.sum(distances))
    displacement = float(np.sqrt((x[-1]-x[0])**2 + (y[-1]-y[0])**2))
    direction = np.arctan2(y[-1] - y[0], x[-1] - x[0]) if len(x) > 1 else 0.0

    # 1. Новая формула средней кривизны траектории
    curvature_sum = 0.0
    valid_curvature_points = 0
    for i in range(len(x)):
        xi, yi = x[i], y[i]
        if xi == 0 and yi == 0:
            continue
            
        # Угол между векторами (0,0)->(xi,yi) и (xi,0)->(xi,yi)
        angle = atan2(yi, xi) - atan2(yi, 0)
        distance = sqrt(xi**2 + yi**2)
        if distance > 1e-6:  # Избегаем деления на 0
            curvature_sum += fabs(angle) / distance
            valid_curvature_points += 1
    
    avg_curvature = curvature_sum / valid_curvature_points if valid_curvature_points > 0 else 0.0

    # 2. Транспортация центра масс (TCM)    
    if len(distances) > 0 and total_distance > 0:
        tcm = np.sum(timestamps[1:] * distances) / total_distance
    else:
        tcm = 0.0

    # 3. Коэффициент рассеивания (SC)
    if len(distances) > 0 and total_distance - tcm**2 > 0:
        sc = np.sum((timestamps[1:]**2) * distances) / total_distance - tcm**2
    else:
        sc = 0.0
    
    # 4. Классификация действий и вычисление направлений
    if len(dx) > 0:
        directions = np.arctan2(dy, dx)
        dir_bins = np.linspace(-np.pi, np.pi, 9)  # 8 направлений
        dir_indices = np.digitize(directions, dir_bins[:-1]) - 1
    else:
        directions = np.array([])
        dir_indices = np.array([])

    # 5. Инициализация структур для хранения статистик
    action_types = {'MM': [], 'PC': [], 'DD': []}  # Mouse Move, Point Click, Drag&Drop
    dir_speeds = {i: [] for i in range(8)}         # Скорости по направлениям
    dir_accels = {i: [] for i in range(8)}         # Ускорения по направлениям
    type_speeds = {'MM': [], 'PC': [], 'DD': []}   # Скорости по типам действий
    type_accels = {'MM': [], 'PC': [], 'DD': []}   # Ускорения по типам действий

    # 6. Заполнение статистик для каждого движения
    for i in range(len(distances)):
        if i >= len(dir_indices):  # Защита от выхода за границы
            continue
            
        direction_idx = dir_indices[i]
        current_speed = speeds[i]
        
        # Определение типа действия
        if i+1 < len(df):  # Используем следующую точку для определения состояния
            state = df['state'].iloc[i+1]
            button = df['button'].iloc[i+1]
            
            if state == 'Move' and button == 'NoButton':
                action_type = 'MM'
            elif state == 'Drag':
                action_type = 'DD'
            elif button in ['Left', 'Right']:
                action_type = 'PC'
            else:
                continue  # Пропускаем другие типы действий
        else:
            continue
        
        # Заполняем статистики по направлениям
        if 0 <= direction_idx < 8:
            dir_speeds[direction_idx].append(current_speed)
            if i < len(accelerations):
                dir_accels[direction_idx].append(accelerations[i])
        
        # Заполняем статистики по типам действий
        type_speeds[action_type].append(current_speed)
        if i < len(accelerations):
            type_accels[action_type].append(accelerations[i])

    # 7. Вычисление гистограммы направлений (MDH)
    if len(directions) > 0:
        dir_hist = np.histogram(directions, bins=dir_bins)[0]
        dir_hist = dir_hist / dir_hist.sum() if dir_hist.sum() > 0 else np.zeros(8)
    else:
        dir_hist = np.zeros(8)

    # 8. Вычисление гистограммы типов действий (ATH)
    if 'state' in df.columns and 'button' in df.columns:
        mm_count = ((df['state'] == 'Move') & (df['button'] == 'NoButton')).sum()
        pc_count = (df['button'].isin(['Left', 'Right'])).sum()
        dd_count = (df['state'] == 'Drag').sum()
        total_actions = mm_count + pc_count + dd_count
        
        ath = [
            mm_count / total_actions if total_actions > 0 else 0,
            pc_count / total_actions if total_actions > 0 else 0,
            dd_count / total_actions if total_actions > 0 else 0
        ]
    else:
        ath = [0.33, 0.33, 0.34]

    # 9. Вычисление статистик по направлениям и типам действий
    def calculate_stats(values):
        """Вычисляет min, max, mean, std для массива значений."""
        if len(values) == 0:
            return {'min': 0.0, 'max': 0.0, 'mean': 0.0, 'std': 0.0}
        return {
            'min': float(np.min(values)),
            'max': float(np.max(values)),
            'mean': float(np.mean(values)),
            'std': float(np.std(values)) if len(values) > 1 else 0.0
        }

    # Статистики скоростей и ускорений по направлениям
    mda_stats = {i: calculate_stats(dir_speeds[i]) for i in range(8)}
    aad_stats = {i: calculate_stats(dir_accels[i]) for i in range(8)}
    
    # Статистики скоростей и ускорений по типам действий
    ata_stats = {
        'MM': calculate_stats(type_speeds['MM']),
        'PC': calculate_stats(type_speeds['PC']),
        'DD': calculate_stats(type_speeds['DD'])
    }
    
    aaa_stats = {
        'MM': calculate_stats(type_accels['MM']),
        'PC': calculate_stats(type_accels['PC']),
        'DD': calculate_stats(type_accels['DD'])
    }

    # 10. Формирование результата
    features = {
        # Базовые характеристики
        'total_path_length': total_distance,
        'displacement': displacement,
#        'directness_ratio': displacement / total_distance if total_distance > 0 else 0,
        'average_curvature': float(avg_curvature),
        'mass_center_transportation': float(tcm),
        'scattering_coefficient': float(sc),
        
        # Координаты
#        'position_mean_x': float(np.mean(x)),
#        'position_mean_y': float(np.mean(y)),
#        'position_std_x': float(np.std(x)) if len(x) > 1 else 0.0,
#        'position_std_y': float(np.std(y)) if len(y) > 1 else 0.0,
        
        # Глобальные статистики скорости и ускорения
        'speed_min': float(np.min(speeds)) if len(speeds) > 0 else 0.0,
        'speed_max': float(np.max(speeds)) if len(speeds) > 0 else 0.0,
        'speed_mean': float(np.mean(speeds)) if len(speeds) > 0 else 0.0,
        'speed_std': float(np.std(speeds)) if len(speeds) > 1 else 0.0,
        
        'accel_min': float(np.min(accelerations)) if len(accelerations) > 0 else 0.0,
        'accel_max': float(np.max(accelerations)) if len(accelerations) > 0 else 0.0,
        'accel_mean': float(np.mean(accelerations)) if len(accelerations) > 0 else 0.0,
        'accel_std': float(np.std(accelerations)) if len(accelerations) > 1 else 0.0,
        
        # Гистограмма направлений (MDH)
        **{f'mdh_dir_{i}': float(dir_hist[i]) for i in range(8)},
        
        # Гистограмма типов действий (ATH)
        'ath_mouse_move': float(ath[0]),
        'ath_point_click': float(ath[1]),
        'ath_drag_drop': float(ath[2]),
        
        # Статистики скоростей по направлениям (MDA)
        **{f'mda_dir_{i}_min': mda_stats[i]['min'] for i in range(8)},
        **{f'mda_dir_{i}_max': mda_stats[i]['max'] for i in range(8)},
        **{f'mda_dir_{i}_mean': mda_stats[i]['mean'] for i in range(8)},
        **{f'mda_dir_{i}_std': mda_stats[i]['std'] for i in range(8)},
        
        # Статистики ускорений по направлениям (AAD)
        **{f'aad_dir_{i}_min': aad_stats[i]['min'] for i in range(8)},
        **{f'aad_dir_{i}_max': aad_stats[i]['max'] for i in range(8)},
        **{f'aad_dir_{i}_mean': aad_stats[i]['mean'] for i in range(8)},
        **{f'aad_dir_{i}_std': aad_stats[i]['std'] for i in range(8)},
        
        # Статистики скоростей по типам действий (ATA)
        'ata_mm_min': ata_stats['MM']['min'],
        'ata_mm_max': ata_stats['MM']['max'],
        'ata_mm_mean': ata_stats['MM']['mean'],
        'ata_mm_std': ata_stats['MM']['std'],
        
        'ata_pc_min': ata_stats['PC']['min'],
        'ata_pc_max': ata_stats['PC']['max'],
        'ata_pc_mean': ata_stats['PC']['mean'],
        'ata_pc_std': ata_stats['PC']['std'],
        
        'ata_dd_min': ata_stats['DD']['min'],
        'ata_dd_max': ata_stats['DD']['max'],
        'ata_dd_mean': ata_stats['DD']['mean'],
        'ata_dd_std': ata_stats['DD']['std'],
        
        # Статистики ускорений по типам действий (AAA)
        'aaa_mm_min': aaa_stats['MM']['min'],
        'aaa_mm_max': aaa_stats['MM']['max'],
        'aaa_mm_mean': aaa_stats['MM']['mean'],
        'aaa_mm_std': aaa_stats['MM']['std'],
        
        'aaa_pc_min': aaa_stats['PC']['min'],
        'aaa_pc_max': aaa_stats['PC']['max'],
        'aaa_pc_mean': aaa_stats['PC']['mean'],
        'aaa_pc_std': aaa_stats['PC']['std'],
        
        'aaa_dd_min': aaa_stats['DD']['min'],
        'aaa_dd_max': aaa_stats['DD']['max'],
        'aaa_dd_mean': aaa_stats['DD']['mean'],
        'aaa_dd_std': aaa_stats['DD']['std'],
    }
    
    return features

def preprocess_data(filepath):
    """
    Оптимизированная предварительная обработка данных:
    1. Удаляет полностью дублирующиеся записи
    2. Гарантирует уникальность временных меток
    3. Перезаписывает файл обработанными данными

    Оптимизации:
    - Векторизованные операции вместо итераций
    - Минимизация операций с DataFrame
    - Использование numpy для быстрых вычислений
    - Однократная сортировка данных
    """
    try:
        # Чтение с указанием dtype для ускорения загрузки
        df = pd.read_csv(filepath, dtype={
            'record timestamp': 'float64',
            'client timestamp': 'float64',
            'button': 'category',
            'state': 'category',
            'x': 'int32',
            'y': 'int32'
        })
    except (pd.errors.EmptyDataError, FileNotFoundError) as e:
        print(f"Error reading file: {e}")
        return False

    if df.empty:
        print("Warning: Empty DataFrame")
        return False

    # 1. Удаление дубликатов (быстрее чем drop_duplicates)
    unique_rows = df[~df.duplicated()]
    if len(unique_rows) < len(df):
        print(f"Removed {len(df) - len(unique_rows)} duplicate rows")
        df = unique_rows

    # 2. Обработка дубликатов временных меток
    ts_values = df['record timestamp'].values
    sorted_indices = np.argsort(ts_values)
    sorted_ts = ts_values[sorted_indices]
    
    # Находим дубликаты в отсортированном массиве
    diff = np.diff(sorted_ts)
    duplicates = np.where(diff == 0)[0]
    
    if duplicates.size > 0:
        print(f"Found {duplicates.size} duplicate timestamp groups, processing...")
        
        # Создаем маску всех дубликатов
        duplicate_mask = np.zeros_like(ts_values, dtype=bool)
        for i in duplicates:
            duplicate_mask[sorted_indices[i]] = True
            duplicate_mask[sorted_indices[i+1]] = True
        
        # Генерируем уникальные метки для дубликатов
        base_ts = sorted_ts[duplicates]
        offsets = np.arange(1, len(duplicates)+1) * 1e-6
        new_ts = base_ts + offsets
        
        # Обновляем значения в исходном массиве
        for i, idx in enumerate(duplicates):
            ts_values[sorted_indices[idx+1]] = new_ts[i]
        
        # Обновляем DataFrame одним присваиванием
        df['record timestamp'] = ts_values
        
        # Сортировка (один раз в конце)
        df = df.sort_values('record timestamp').reset_index(drop=True)

    # 3. Запись в файл с оптимизацией
    try:
        df.to_csv(filepath, index=False, float_format='%.6f')
        print(f"Successfully processed: {filepath}")
        return True
    except Exception as e:
        print(f"Error writing file: {e}")
        return False

def process_sessions_in_directory(directory, segment_duration = 5.0):
    """
    Обрабатывает все файлы сессий в указанной директории и возвращает DataFrame с признаками.
    """
    data = []
    filenames = []
    for filename in os.listdir(directory):
        if filename.startswith("session_"): # Убеждаемся что это файл csv
            filepath = os.path.join(directory, filename)
            segments = extract_segmented_features(filepath, segment_duration)
            for segment in segments:
                data.append(segment)
                filenames.append(filename)
    if not data:  # Если data пустой, возвращаем None
        return None

    df = pd.DataFrame(data)
    df['filename'] = filenames # Добавляем column filename
    return df

def prepare_dataset(directory):
    all_training_data = []
    for user_folder in os.listdir(directory):
        user_path = os.path.join(directory, user_folder)
        training_data = process_sessions_in_directory(user_path)
        all_training_data.append(training_data)
    if all_training_data:
        training_data = pd.concat(all_training_data, ignore_index=True)
        print("Shape of combined training data:", training_data.shape)  # Проверяем размерность
    else:
        print("No training data found. Check directory structure.")
        exit()
    return training_data

In [5]:
import telebot
TOKEN = '***' 
CHAT_ID = '***'
# -------------------------------

bot = telebot.TeleBot(TOKEN)

def send_completion_message(message):
  try:
    bot.send_message(CHAT_ID, message)
    print("Сообщение отправлено в Telegram.")
  except Exception as e:
    print(f"Ошибка при отправке сообщения в Telegram: {e}")

In [6]:
train_directory = 'training_files'
test_directory = 'test_files'
public_labels_file = 'public_labels.csv'
public_labels = pd.read_csv(public_labels_file, index_col='filename')
segment_duration = np.arange(30, 1200, 30)
mean_auc = []
for duration in segment_duration:
    message = f'Processing duration = {duration}.....'
    send_completion_message(message)
    history = []
    for user_folder in os.listdir(train_directory):
        train_path = os.path.join(train_directory, user_folder)
        print(f"Processing {train_path}")
        training_data = process_sessions_in_directory(train_path, duration)
        feature_names = training_data.columns.drop('filename')
        scaler = StandardScaler()
        X_train = scaler.fit_transform(training_data[feature_names])
        print(X_train, X_train.shape)
        
        all_test_data = []
        test_filenames = []
        test_path = os.path.join(test_directory, user_folder)
        print(f"Processing {test_path}")
        for filename in os.listdir(test_path):
            if filename.startswith("session_"): # Убеждаемся что это файл csv
                filepath = os.path.join(test_path, filename)
                segments = extract_segmented_features(filepath, duration)
                for segment in segments:
                    all_test_data.append(segment)
                    test_filenames.append(filename)
        test_data = pd.DataFrame(all_test_data)
        test_data['filename'] = test_filenames # Добавляем столбец filename
        test_data_scaled = scaler.transform(test_data[feature_names])
        test_data_scaled = pd.DataFrame(test_data_scaled, columns=feature_names)
        # 4. Добавление меток из public_labels и удаление данных без меток
        test_data_scaled['is_illegal'] = 0  # Default: not illegal
        valid_indices = []
        for index, filename in enumerate(test_data['filename']):
            if filename in public_labels.index:
                test_data_scaled.loc[index, 'is_illegal'] = public_labels.loc[filename, 'is_illegal']
                valid_indices.append(index)
        test_data_scaled = test_data_scaled.iloc[valid_indices].reset_index(drop=True)
        test_data_filenames = test_data['filename'].iloc[valid_indices].reset_index(drop=True), public_labels
        test_labels = test_data_scaled['is_illegal'].values
        print(test_data_scaled, test_data_scaled.shape)
    
        input_dim = X_train.shape[1]
        autoencoder = create_convolutional_autoencoder(input_dim)
        hist = []
        print('Learning...')
        autoencoder.compile(optimizer=Adam(learning_rate=0.001), loss='mse', metrics=['mse'])
        for i in range(5):
            autoencoder.fit(X_train, X_train, epochs=1, batch_size=32, shuffle=True, verbose=0)     
            X_test = test_data_scaled[feature_names].values
            encoded_test = autoencoder.predict(X_test, verbose=0)
            mse = np.mean(np.power(X_test - encoded_test, 2), axis=1)
            #print(mse)
            #print(np.mean(mse))
            #print(np.max(mse))
            auc_roc = roc_auc_score(test_labels, mse)
            hist.append(auc_roc)
            print(f"    epoch: {i}", f"AUC-ROC: {auc_roc}", f"mse: {np.mean(mse)}")
        history.append(np.max(hist))
        print(np.max(hist))
        print(np.argmax(hist))
        message = f'User: {user_folder} Max AUC: {np.max(hist)}'
        send_completion_message(message)
    message = f'Mean AUC: {np.mean(history)}\n____________________________________'
    send_completion_message(message)
    mean_auc.append(np.mean(history))
message = f'Results\nSegments: {segment_duration}\nAUC: {mean_auc}\nFinished!'
send_completion_message(message)

Сообщение отправлено в Telegram.
Processing training_files\user12
[[ 0.32940146 -0.033697    0.52372377 ... -0.03598413  0.1073698
  -0.11743794]
 [-0.6428161  -0.05941286  1.845295   ... -0.0359908   0.11203042
  -0.12380134]
 [-0.68253443 -0.08373932  0.86941102 ... -0.0359908   0.11203042
  -0.12380134]
 ...
 [-0.31498245  0.0661058  -0.77655827 ... -0.0359908   0.11203042
  -0.12380134]
 [-0.45453093 -0.11566177 -0.15947411 ... -0.0359908   0.11203042
  -0.12380134]
 [-0.34675234 -0.09489202 -0.53792788 ... -0.0359908   0.11203042
  -0.12380134]] (2601, 112)
Processing test_files\user12
      total_path_length  displacement  average_curvature  \
0              0.009988     -0.117487           0.102972   
1             -0.258873     -0.063770           1.419743   
2             -0.123537     -0.041591           1.660296   
3              0.001748     -0.059173           1.319946   
4             -0.326235      0.008420           3.000141   
...                 ...           ...     

In [20]:
# Пример использования
autoencoder = create_convolutional_autoencoder(112)
autoencoder.summary()

Model: "model_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_6 (InputLayer)        [(None, 112, 1)]          0         
                                                                 
 conv1d_35 (Conv1D)          (None, 112, 64)           256       
                                                                 
 max_pooling1d_15 (MaxPoolin  (None, 56, 64)           0         
 g1D)                                                            
                                                                 
 conv1d_36 (Conv1D)          (None, 56, 32)            6176      
                                                                 
 max_pooling1d_16 (MaxPoolin  (None, 28, 32)           0         
 g1D)                                                            
                                                                 
 conv1d_37 (Conv1D)          (None, 28, 16)            1552

In [11]:
training_data = prepare_dataset('kek_train')
feature_names = training_data.columns.drop('filename') # Исключаем 'filename'
print(feature_names)

Shape of combined training data: (3182, 113)
Index(['total_path_length', 'displacement', 'average_curvature',
       'mass_center_transportation', 'scattering_coefficient', 'speed_min',
       'speed_max', 'speed_mean', 'speed_std', 'accel_min',
       ...
       'aaa_mm_mean', 'aaa_mm_std', 'aaa_pc_min', 'aaa_pc_max', 'aaa_pc_mean',
       'aaa_pc_std', 'aaa_dd_min', 'aaa_dd_max', 'aaa_dd_mean', 'aaa_dd_std'],
      dtype='object', length=112)


In [21]:
scaler = StandardScaler()
X_train = scaler.fit_transform(training_data[feature_names])
print(X_train)
input_dim = X_train.shape[1]
print(input_dim)

[[-0.10807157 -0.06003033  0.2143638  ... -0.14235095  0.21716518
  -0.1739552 ]
 [-0.27967126 -0.10000192  0.54312028 ... -0.14235095  0.21716518
  -0.1739552 ]
 [-0.216894   -0.07705276  1.71279973 ... -0.14235095  0.21716518
  -0.1739552 ]
 ...
 [-0.08226976 -0.04246469 -0.04471772 ... -0.14235095  0.21716518
  -0.1739552 ]
 [ 0.36757189  0.02839085  0.19995197 ... -0.14235095  0.21716518
  -0.1739552 ]
 [-0.3058281  -0.10262579 -0.12415583 ... -0.14235095  0.21716518
  -0.1739552 ]]
112


In [22]:
autoencoder.compile(optimizer=Adam(learning_rate=0.001), loss='huber', metrics=['mse'])
autoencoder.fit(X_train, X_train, epochs=1, batch_size=32, shuffle=False, verbose=1)



<keras.callbacks.History at 0x231d410d4c0>

In [23]:
X_encoded = autoencoder.predict(X_train, verbose=1)
mse = np.mean(np.power(X_train - X_encoded, 2), axis=1)
print(mse)
print(np.mean(mse))
print(np.max(mse))

[ 0.21232687  0.18909328 14.62028939 ...  0.61773482  0.32197444
  0.33393691]
1.0000154889362878
65.21808618024276


In [24]:
X_encoded.shape

(3182, 112)