# Data Exploration

## Setup

### Downloading Libraries

In [None]:
%pip install pandas
%pip install spacy
%pip install nltk
%pip install scikit-learn
%pip install tensorflow
%pip install user-agents
%pip install  matplotlib

### Importing Libraries

In [None]:
import pandas as pd
from user_agents import parse
import numpy as np

## Reading Data

### Read User Behaviors and Non-Statistical Exploration

#### Typing Data

In [None]:
# Load the free-text keystroke recordings and discard the trailing CSV column
# (presumably unused — TODO confirm what it holds).
keystroke_csv = "../DataSets/UserBehaviors/Keystroke/free-text.csv"
key_stroke = pd.read_csv(keystroke_csv).iloc[:, :-1]

# Number of missing values per column
key_stroke.isnull().sum()

#### Mouse Movement & Session Info

In [None]:
# Both exports are tab-separated.
session_info = pd.read_csv(
    "../DataSets/UserBehaviors/mousedynamics/EVTRACKINFO/EVTRACKINFO.csv",
    sep='\t',
)
mouse_movements = pd.read_csv(
    "../DataSets/UserBehaviors/mousedynamics/EVTRACKTRACK/EVTRACKTRACK.csv",
    sep='\t',
)

# Keep only the rows whose event name mentions "mouse" (case-insensitive);
# rows with a missing event name are dropped as well.
is_mouse_event = mouse_movements['event'].str.contains('mouse', case=False, na=False)
mouse_data = mouse_movements[is_mouse_event]

# Remove the columns that are not used further on.
session_info = session_info.drop(columns=['_id', 'documentw', 'documenth', 'date'])
mouse_data = mouse_data.drop(columns=['_id', 'cursor', 'key'])

# Number of missing values per remaining column
print(mouse_data.isnull().sum())
print(session_info.isnull().sum())

In [None]:
def extract_browser_info_and_os(ua_string):
    """Parse a raw user-agent string into printable browser / OS / device labels.

    Args:
        ua_string: The user-agent string handed to ``parse``.

    Returns:
        A dict with the keys "browser", "os" and "device"; each value is the
        ``str()`` of the attribute of the same name on the parsed user agent.
    """
    parsed = parse(ua_string)
    return {field: str(getattr(parsed, field)) for field in ("browser", "os", "device")}

# Expand every user-agent string into one row of browser / os / device labels
# and store the three labels as new session_info columns.
parsed_agents = session_info['ua'].apply(lambda ua: pd.Series(extract_browser_info_and_os(ua)))
session_info[['browser', 'os', 'device']] = parsed_agents

session_info

## LSTM Model

In [None]:
# Attach the browser / OS of each session to its mouse events.
# A left join on (user, session_id) keeps every mouse event.
session_columns = session_info[['user', 'session_id', 'browser', 'os']]
merged_data = mouse_data.merge(session_columns, how='left', on=['user', 'session_id'])

# The two lengths are printed side by side so they can be compared.
print("\nMerged Data with Browser and OS:")
print(len(merged_data))
print(len(mouse_data))

# Number of distinct operating systems
unique_os_count = merged_data['os'].nunique()
print(f"Number of unique operating systems: {unique_os_count}")

# Number of distinct browsers
unique_browsers_count = merged_data['browser'].nunique()
print(f"Number of unique browsers: {unique_browsers_count}")

In [None]:
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler

# Distinct OS / browser labels present in the merged data
unique_os_count = merged_data['os'].nunique()
print(f"Number of unique operating systems: {unique_os_count}")

unique_browsers_count = merged_data['browser'].nunique()
print(f"Number of unique browsers: {unique_browsers_count}")

# Rows recorded under the "Alluserspreauth" user vs. every other row
is_preauth = merged_data["user"] == "Alluserspreauth"
print(len(merged_data[is_preauth]))
print(len(merged_data[~is_preauth]))
print(len(merged_data))

print(merged_data)

# Only the rows that do not belong to "Alluserspreauth" are used for modelling
model_mouse_data = merged_data[~is_preauth]

In [None]:
key_stroke.rename(columns={'participant': 'user'}, inplace=True)

# Step 1: Take the first 11 distinct users (in order of first appearance)
# and relabel them "User1" .. "User11".
selected_users = key_stroke['user'].drop_duplicates().head(11)
for position, user in enumerate(selected_users, start=1):
    key_stroke.loc[key_stroke['user'] == user, 'user'] = "User" + str(position)

# Step 2: Keep only the relabelled users. The explicit copy makes the result an
# independent frame, so adding a column below does not write into a slice of
# the full table (which triggers pandas' SettingWithCopyWarning).
key_stroke = key_stroke[key_stroke["user"].str.contains("User", case=False, na=False)].copy()

# Step 3: Create a pseudo-timestamp for keystrokes, one row per second.
# The number of periods follows the actual row count; a hard-coded count
# raises a length-mismatch ValueError as soon as the data set changes.
key_stroke['timestamp'] = pd.date_range(
    start='2020-02-17 09:35:49.089692', periods=len(key_stroke), freq='1s'
)

key_stroke

In [None]:
# Step 1: Merge mouse and keystroke data
# We will use a time window to align keystrokes with mouse events
#time_window = 3  # Define a time window of 5 seconds

def merge_with_time_window(mouse_data, keystroke_data, time_window):
    """Pair every mouse event with the same user's keystrokes that are close in time.

    A keystroke matches a mouse event when both rows have the same ``user`` and
    the keystroke timestamp lies within ``time_window`` seconds before or after
    the mouse timestamp (both bounds inclusive). One output row is produced per
    matching (mouse event, keystroke) pair, in mouse-row order and, within a
    mouse row, in keystroke-row order.

    Neither input frame is modified: timestamps are converted on copies.

    Args:
        mouse_data: DataFrame with the columns ``user``, ``timestamp``,
            ``xpos``, ``ypos``, ``event``, ``os`` and ``browser``.
        keystroke_data: DataFrame with the columns ``user``, ``timestamp``,
            ``key1``, ``key2``, ``DU.key1.key1``, ``DD.key1.key2``,
            ``DU.key1.key2``, ``UD.key1.key2`` and ``UU.key1.key2``.
        time_window: Half-width of the matching window in seconds.

    Returns:
        DataFrame with one row per matching pair. The columns are always
        present, even when there is no match at all.

    Raises:
        ValueError: If ``time_window`` is not an int or float.
    """
    # Validate before doing any work.
    if not isinstance(time_window, (int, float)):
        raise ValueError("time_window must be an int or float representing seconds.")
    window = pd.Timedelta(seconds=time_window)

    # assign() returns new frames, so the caller's data keeps its original
    # timestamp column.
    mouse = mouse_data.assign(timestamp=pd.to_datetime(mouse_data['timestamp']))
    keystrokes = keystroke_data.assign(timestamp=pd.to_datetime(keystroke_data['timestamp']))

    # Split the keystrokes per user once instead of re-scanning the whole
    # table for every mouse row. Row order inside each group is preserved.
    keystrokes_by_user = {
        user: frame for user, frame in keystrokes.groupby('user', sort=False)
    }

    output_columns = [
        'mouse_timestamp', 'user', 'xpos', 'ypos', 'keystroke_timestamp',
        'key1', 'key2', 'DU.key1.key1', 'DD.key1.key2', 'DU.key1.key2',
        'UD.key1.key2', 'UU.key1.key2', 'event', 'os', 'browser',
    ]

    records = []
    for _, mouse_row in mouse.iterrows():
        user_id = mouse_row['user']
        user_keystrokes = keystrokes_by_user.get(user_id)
        if user_keystrokes is None:
            # This user never typed anything: nothing to pair with.
            continue

        mouse_time = mouse_row['timestamp']
        # between() is inclusive on both ends by default.
        in_window = user_keystrokes['timestamp'].between(
            mouse_time - window, mouse_time + window
        )

        for _, keystroke_row in user_keystrokes[in_window].iterrows():
            records.append({
                'mouse_timestamp': mouse_time,
                'user': user_id,
                'xpos': mouse_row['xpos'],
                'ypos': mouse_row['ypos'],
                'keystroke_timestamp': keystroke_row['timestamp'],
                'key1': keystroke_row['key1'],
                'key2': keystroke_row['key2'],
                'DU.key1.key1': keystroke_row['DU.key1.key1'],
                'DD.key1.key2': keystroke_row['DD.key1.key2'],
                'DU.key1.key2': keystroke_row['DU.key1.key2'],
                'UD.key1.key2': keystroke_row['UD.key1.key2'],
                'UU.key1.key2': keystroke_row['UU.key1.key2'],
                'event': mouse_row['event'],
                'os': mouse_row['os'],
                'browser': mouse_row['browser'],
            })

    # Passing the column list keeps the schema stable when nothing matched.
    return pd.DataFrame(records, columns=output_columns)

In [None]:
# Pair each mouse event with the keystrokes of the same user that fall
# within 3 seconds of it.
combined_data = merge_with_time_window(model_mouse_data, key_stroke, 3)
print(combined_data)

# Discard incomplete rows before encoding / scaling.
combined_data.dropna(inplace=True)

# Step 2: Feature Engineering
# Every categorical column gets its own freshly fitted LabelEncoder.
# Mapping is {target column: source column}.
categorical_columns = {
    'key_encoded_1': 'key1',
    'key_encoded_2': 'key2',
    'event': 'event',
    'browser': 'browser',
    'os': 'os',
}
for encoded_name, source_name in categorical_columns.items():
    combined_data[encoded_name] = LabelEncoder().fit_transform(combined_data[source_name].fillna(''))

# Standardise the cursor position and keystroke timing columns in place.
numeric_columns = ['xpos', 'ypos', 'DU.key1.key1', 'DD.key1.key2', 'DU.key1.key2', 'UD.key1.key2', 'UU.key1.key2']
standard_scaler = StandardScaler()
combined_data[numeric_columns] = standard_scaler.fit_transform(combined_data[numeric_columns])

# Step 3: Prepare data for LSTM
def create_sequences(data, seq_length):
    """Cut ``data`` into all overlapping windows of ``seq_length`` consecutive rows.

    Window ``i`` holds rows ``i .. i + seq_length - 1``; consecutive windows
    are shifted by one row. The last window ends on the last row of ``data``,
    so an input of exactly ``seq_length`` rows yields one window.

    Args:
        data: Array-like whose first axis is the row axis.
        seq_length: Number of rows per window; must be at least 1.

    Returns:
        numpy array of shape ``(n_windows, seq_length, *data.shape[1:])``.
        When ``data`` has fewer than ``seq_length`` rows, ``n_windows`` is 0
        but the trailing dimensions are kept, so ``result.shape[2]`` remains
        valid for 2-D input.

    Raises:
        ValueError: If ``seq_length`` is smaller than 1.
    """
    if seq_length < 1:
        raise ValueError("seq_length must be a positive integer.")

    data = np.asarray(data)
    # +1 so that the window ending on the final row is included.
    n_windows = len(data) - seq_length + 1
    if n_windows <= 0:
        return np.empty((0, seq_length) + data.shape[1:], dtype=data.dtype)

    return np.stack([data[start:start + seq_length] for start in range(n_windows)])

# Columns fed to the LSTM, in the order they appear along the feature axis
feature_columns = [
    'xpos', 'ypos', 'key_encoded_1', 'key_encoded_2', 'event', 'os', 'browser',
    'DU.key1.key1', 'DD.key1.key2', 'DU.key1.key2', 'UD.key1.key2', 'UU.key1.key2',
]
features = combined_data[feature_columns].values

# Number of consecutive rows per training sequence
SEQ_LENGTH = 12
X = create_sequences(features, SEQ_LENGTH)

# Inspect the generated sequences
print(X)

In [None]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, RepeatVector, TimeDistributed, Dense, Dropout, BatchNormalization
from tensorflow.keras.optimizers import Adam
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers.schedules import ExponentialDecay
from tensorflow.keras.regularizers import l2

def create_lstm_autoencoder(timesteps, n_features, latent_dim=64):
    """Build an (uncompiled) stacked LSTM autoencoder.

    Encoder: LSTM 256 -> 128 -> 64, each followed by batch normalisation and,
    for the first two, dropout 0.4; the last LSTM returns only its final step.
    A tanh Dense layer of ``latent_dim`` units forms the latent vector.
    Decoder: the latent vector is repeated ``timesteps`` times and passed
    through LSTM 64 -> 128 -> 256 (batch normalisation + dropout 0.4 after the
    first two), then a per-timestep Dense layer maps back to ``n_features``.

    Args:
        timesteps: Length of each input sequence.
        n_features: Number of features per timestep.
        latent_dim: Width of the latent Dense layer.

    Returns:
        A Keras ``Model`` mapping ``(timesteps, n_features)`` sequences to
        outputs of the same shape.
    """
    def lstm(units, return_sequences):
        # All LSTM layers share the activation and the L2 kernel penalty.
        return LSTM(
            units,
            activation='tanh',
            return_sequences=return_sequences,
            kernel_regularizer=l2(0.003),
        )

    inputs = Input(shape=(timesteps, n_features))

    # Encoder
    x = inputs
    for units in (256, 128):
        x = lstm(units, True)(x)
        x = BatchNormalization()(x)
        x = Dropout(0.4)(x)
    x = lstm(64, False)(x)
    x = BatchNormalization()(x)

    # Latent space representation
    latent = Dense(latent_dim, activation='tanh')(x)

    # Decoder
    x = RepeatVector(timesteps)(latent)
    for units in (64, 128):
        x = lstm(units, True)(x)
        x = BatchNormalization()(x)
        x = Dropout(0.4)(x)
    x = lstm(256, True)(x)

    # Output layer: one Dense projection applied to every timestep
    output = TimeDistributed(Dense(n_features))(x)

    return Model(inputs, output)

In [None]:
# Instantiate the global autoencoder for sequences of SEQ_LENGTH steps with
# as many features as X carries on its last axis.
latent_dim = 32
global_autoencoder = create_lstm_autoencoder(SEQ_LENGTH, X.shape[2], latent_dim)

# Learning rate starts at 0.001 and is multiplied by 0.9 every 10000 steps.
lr_schedule = ExponentialDecay(
    initial_learning_rate=0.001, decay_steps=10000, decay_rate=0.9
)

# Compile with Adam and the learning rate schedule
global_autoencoder.compile(optimizer=Adam(learning_rate=lr_schedule), loss='mae')

# Stop once val_loss has not improved for 10 epochs and roll back to the best
# weights seen so far.
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

# Train the global model to reconstruct its own input. The callback has to be
# passed to fit() to take effect; without it training always runs all 180
# epochs. validation_split provides the val_loss the callback monitors.
# The name `histogram_global` is kept because the plotting cell reads it; it
# holds the object returned by fit().
histogram_global = global_autoencoder.fit(
    X,
    X,
    epochs=180,
    batch_size=16,
    validation_split=0.2,
    callbacks=[early_stopping],
)

In [None]:
import matplotlib.pyplot as plt

# Training vs. validation loss per epoch, drawn on a single axes
loss_history = histogram_global.history
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(loss_history['loss'], label='Training Loss')
ax.plot(loss_history['val_loss'], label='Validation Loss')
ax.set_xlabel('Epoch')
ax.set_ylabel('Loss')
ax.set_title('Model Learning Curve For Transactions')
ax.legend()
plt.show()

In [None]:
from joblib import Parallel, delayed 
import joblib 
  
  
# Save the model as a pickle in a file 
joblib.dump(global_autoencoder, 'behavioral_model.pkl') 

In [None]:
import pickle


# Path to the pickle file
pickle_file = 'TrainedModels/behavioral_model.pkl'

# Load the model from the pickle file
with open(pickle_file, 'rb') as file:
    global_autoencoder = pickle.load(file)


In [None]:
import numpy as np
import pandas as pd

# Function to detect anomalies with multiple thresholds
def detect_anomalies_with_confidence(data, model, thresholds):
    """Label each sample by how far its reconstruction error exceeds the thresholds.

    The reconstruction error of a sample is the mean squared difference
    between the sample and ``model.predict``'s output, averaged over axes 1
    and 2. The errors are printed before classification.

    Args:
        data: Array of samples passed to ``model.predict``.
        model: Object exposing ``predict(data)``.
        thresholds: Mapping with the keys 'high', 'medium' and 'low'.

    Returns:
        Tuple ``(anomaly_results, reconstruction_errors)``: a list holding
        'high', 'medium', 'low' or 'none' per sample, and the per-sample
        error array.
    """
    residual = data - model.predict(data)
    reconstruction_errors = np.mean(np.square(residual), axis=(1, 2))

    print(reconstruction_errors)

    def classify(error):
        # Strictest level first: the first threshold exceeded wins.
        for level in ('high', 'medium', 'low'):
            if error > thresholds[level]:
                return level
        return 'none'

    anomaly_results = [classify(error) for error in reconstruction_errors]

    return anomaly_results, reconstruction_errors

# Reconstruction-error cut-offs used by detect_anomalies_with_confidence.
# They are checked from 'high' down to 'low': an error above a cut-off gets
# that label, and an error at or below 'low' is labelled 'none'.
# Adjust these based on your validation set or domain knowledge
thresholds = {
    'low': 6,     # Smallest cut-off: errors in (6, 8] are labelled 'low'
    'medium': 8,  # Errors in (8, 10] are labelled 'medium'
    'high': 10     # Largest cut-off: errors above 10 are labelled 'high'
}

# Classify every sequence in X with the global autoencoder
anomaly_levels, reconstruction_errors = detect_anomalies_with_confidence(X, global_autoencoder, thresholds)

# View the per-sequence labels
print(anomaly_levels)

In [None]:
def build_fraud_detection_model(autoencoder):
    """Wrap a pre-trained LSTM autoencoder as a reconstruction model with a frozen encoder.

    Every layer in front of the ``RepeatVector`` layer (the encoder, up to and
    including the latent layer) is marked non-trainable, so further training
    only updates the decoder. The returned model keeps the autoencoder's full
    reconstruction output, i.e. its output has the same shape as its input.
    That is what ``fit(X, X)`` and a reconstruction-error computation need.

    Taking the output of ``layers[-7]`` (a decoder BatchNormalization layer
    for the architecture built by ``create_lstm_autoencoder``) gives a 64-wide
    tensor that can neither be fitted against the inputs nor subtracted from
    them, and freezing ``layers[:5]`` leaves half of the encoder trainable.

    Note: the layers are shared with ``autoencoder``, so freezing them here
    also freezes them in the model that was passed in.

    Args:
        autoencoder: Keras model containing a ``RepeatVector`` layer between
            its encoder and decoder.

    Returns:
        A compiled Keras ``Model`` (Adam, learning rate 0.001, mean squared
        error loss) sharing all layers with ``autoencoder``.

    Raises:
        ValueError: If ``autoencoder`` contains no ``RepeatVector`` layer.
    """
    # Locate the encoder/decoder boundary by layer type, not by a
    # hard-coded index. Matching on the class name works for both the
    # `keras` and the `tensorflow.keras` flavour of the layer.
    decoder_start = next(
        (
            index
            for index, layer in enumerate(autoencoder.layers)
            if type(layer).__name__ == 'RepeatVector'
        ),
        None,
    )
    if decoder_start is None:
        raise ValueError(
            "autoencoder has no RepeatVector layer separating encoder and decoder."
        )

    # Freeze the whole encoder.
    for layer in autoencoder.layers[:decoder_start]:
        layer.trainable = False

    # Same input, full reconstruction output.
    fraud_detector = Model(inputs=autoencoder.input, outputs=autoencoder.output)

    # Recompile so the changed trainable flags take effect; the loss is the
    # mean squared reconstruction error.
    fraud_detector.compile(optimizer=Adam(learning_rate=0.001), loss='mean_squared_error')

    return fraud_detector

# Create the fraud detection model using the pre-trained autoencoder
fraud_detector = build_fraud_detection_model(global_autoencoder)

# Train on the inputs as their own targets (the same array is passed as x and y),
# holding out 20% of the samples for validation.
# NOTE(review): `X_train` is not assigned in any visible cell of this notebook —
# TODO define it (or pass the intended training array) before running this cell.
fraud_detector.fit(X_train, X_train, epochs=10, batch_size=16, validation_split=0.2)

In [None]:
import numpy as np

# Define a function to calculate reconstruction error
def calculate_reconstruction_error(model, data):
    """Return the mean squared reconstruction error of every sample.

    Args:
        model: Object exposing ``predict(data)``.
        data: Array of samples; the error is averaged over axes 1 and 2.

    Returns:
        Array with one error value per sample.
    """
    squared_residual = np.square(data - model.predict(data))
    return np.mean(squared_residual, axis=(1, 2))

# Calculate reconstruction error on validation data
# NOTE(review): `X_val` is not assigned in any visible cell of this notebook —
# TODO define the validation array before running this cell.
validation_errors = calculate_reconstruction_error(fraud_detector, X_val)

# Set a threshold based on the reconstruction error distribution of valid data
threshold = np.percentile(validation_errors, 95)  # 95th percentile of validation error as threshold

# Function to detect fraud based on reconstruction error
def detect_fraud(model, transaction_data, threshold):
    """Flag the samples whose reconstruction error exceeds ``threshold``.

    Args:
        model: Model handed to ``calculate_reconstruction_error``.
        transaction_data: Samples to score.
        threshold: Error value above which a sample is flagged.

    Returns:
        Tuple ``(fraud_flags, error)``: the element-wise result of
        ``error > threshold`` and the per-sample reconstruction errors.
    """
    errors = calculate_reconstruction_error(model, transaction_data)
    return errors > threshold, errors

# Use the function to detect fraud
# NOTE(review): neither `autoencoder` nor `new_transactions` is assigned at cell
# level anywhere in the visible notebook (`autoencoder` only exists as a local
# inside create_lstm_autoencoder) — TODO pass the intended model and data.
fraud_flags, transaction_errors = detect_fraud(autoencoder, new_transactions, threshold)

# `fraud_flags` will be True for transactions likely to be fraud
# `transaction_errors` gives the reconstruction error for each transaction

In [None]:
from keras.models import load_model

# Load user-specific data: the already encoded / scaled rows of "User1"
user = combined_data[combined_data['user']=='User1']
features = user[['xpos', 'ypos', 'key_encoded_1', 'key_encoded_2', 'event', 'os', 'browser', 'DU.key1.key1', 'DD.key1.key2', 'DU.key1.key2', 'UD.key1.key2', 'UU.key1.key2']].values
# create_sequences returns one array of windows (there is no separate target
# array), so the result is bound to a single name. The windows are used as
# both input and target when fitting below.
X_user = create_sequences(features, SEQ_LENGTH)  # User-specific dataset

# Load the pre-trained model
#pretrained_model = load_model('pretrained_lstm_model.h5')

# NOTE(review): `model` is not assigned in any visible cell of this notebook —
# TODO bind it to the pre-trained model that should be fine-tuned (for example
# via the commented-out load_model call above) before running this cell.

# Fine-tune the model on user-specific data
# Optionally, you can freeze some layers if the dataset is very small
for layer in model.layers[:-2]:  # Freeze all layers except the last two
    layer.trainable = False

# Compile the model again after modifying the layers
model.compile(optimizer=Adam(learning_rate=0.001), loss='mean_squared_error')

# Fine-tune the model
history_user = model.fit(X_user, X_user, epochs=5, batch_size=8, validation_split=0.2)

# Save the fine-tuned model
#pretrained_model.save('fine_tuned_model_user.h5')