# Data Exploration

## Setup

### Downloading Libraries

In [None]:
%pip install pandas
%pip install spacy
%pip install nltk
%pip install scikit-learn
%pip install tensorflow

### Importing Libraries

In [None]:
import pandas as pd
import re

## Reading Data

### Read Transactional Data and Non-statistical Exploration

Reading the data and renaming columns, as well as dropping unneeded columns

In [None]:
# Load the raw transaction log.
transactional_data = pd.read_csv("../DataSets/TransactionalData/Synthetic_Financial_datasets_log.csv")

# Drop CASH_IN rows. `.copy()` makes the result an independent frame so the column
# assignments below do not operate on a filtered slice (SettingWithCopyWarning).
transactional_data = transactional_data[transactional_data['type'] != 'CASH_IN'].copy()

# Keep only the first 4 characters of the originator id.
transactional_data['nameOrig'] = transactional_data['nameOrig'].str[:4]

# Destination ids starting with 'M' are shortened to 3 characters; all others are left as-is here.
dest_names = transactional_data['nameDest']
is_m_dest = dest_names.str.startswith('M', na=False)
transactional_data['nameDest'] = dest_names.where(~is_m_dest, dest_names.str[:3])

fraud_count = len(transactional_data[transactional_data['isFraud'] == 1])
not_fraud_count = len(transactional_data[transactional_data['isFraud'] == 0])
flagged_count = len(transactional_data[transactional_data['isFlaggedFraud'] == 1])
print("Fraud: " + str(fraud_count) + "; Not Fraud: " + str(not_fraud_count) + "; Flagged Fraud: " + str(flagged_count))

# Number of transactions per (truncated) originator id.
user_transaction_count = transactional_data.groupby('nameOrig').size().reset_index(name='transaction_count')

# Collapse every destination id starting with 'C' into the single label 'user'.
transactional_data.loc[transactional_data['nameDest'].str.startswith('C', na=False), 'nameDest'] = 'user'

unique_users_count = transactional_data['nameOrig'].nunique()
unique_merch_count = transactional_data['nameDest'].nunique()

print(f'Total distinct users: {unique_users_count}')
print(f'Total distinct merch: {unique_merch_count}')

# Sort by the number of transactions in descending order
sorted_users = user_transaction_count.sort_values(by='transaction_count', ascending=False)

top_10_users = sorted_users.head(10)
print(top_10_users)

# Assuming the simulation starts at a specific date and time
start_datetime = '2023-01-01 00:00:00'  # Replace with your actual start date and time
steps = int(transactional_data['step'].max())  # Total simulation steps (hours)

# One timestamp per step, incrementing by 1 hour to match the step unit above.
date_range = pd.date_range(start=start_datetime, periods=steps, freq='h')

# Total number of records
total_records = len(transactional_data)

# Assign a datetime to every row from its step (step 1 -> first timestamp);
# a single vectorised lookup instead of a per-row apply.
step_positions = transactional_data['step'].to_numpy() - 1
transactional_data['transaction_date'] = date_range.to_numpy()[step_positions]

# Remaining transaction types and row count after filtering
print(transactional_data['type'].unique())

print(len(transactional_data))

Ensure all the data is in the correct format for exploration

In [None]:
import numpy as np
# Remove spaces and commas from the monetary columns, then store each as float32.
chars_to_remove = [' ', ',']
money_columns = ['amount', 'oldbalanceOrg', 'newbalanceOrig']

for column in money_columns:
    cleaned = transactional_data[column]
    for char in chars_to_remove:
        cleaned = cleaned.replace(char, '', regex=True)
    transactional_data[column] = cleaned.astype(np.float32)

#sorted_data = data = transactional_data.sort_values(by=['nameOrig', 'step'])

In [None]:
# Training pool: non-fraudulent transactions only.
df = transactional_data[transactional_data['isFraud'] == 0]

# Use 5% of the non-fraud rows for training.
sample_size = int(len(df) * 0.05)

# Order by step before taking the head (sort_values returns a new frame, so the result
# must be kept). A stable sort preserves the existing row order inside each step.
df = df.sort_values('step', kind='stable')

# `.copy()` gives an independent frame that later cells can sort and transform safely.
df = df.head(sample_size).copy()

# Sanity check: the training sample should contain no fraud rows.
print(len(df[df['isFraud'] == 1]))
print(len(df))

# Evaluation slice: the rows at positions [sample_size, 4 * sample_size) of the full data.
# NOTE(review): this slices the unfiltered frame by position, so it may share a few rows
# with `df` — confirm whether a strict train/evaluation separation is required.
df_later = transactional_data[sample_size:sample_size*4].copy()

print(len(df_later[df_later['isFraud'] == 1]))
print(len(df_later))

print(len(df_later[df_later['isFlaggedFraud'] == 1]))

In [None]:
# Sort both frames chronologically. Rebinding to the returned frame (instead of
# inplace=True) avoids mutating a slice of `transactional_data`.
df = df.sort_values(by=['transaction_date', 'step'])
df_later = df_later.sort_values(by=['transaction_date', 'step'])

users = df_later["nameOrig"].unique()

# All rows of the first originator id; the first 1000 are used for per-user training
# and the remainder for per-user evaluation. Copies keep later transformations from
# touching `df_later`.
first_user_rows = df_later[df_later["nameOrig"] == users[0]]
user_data = first_user_rows[:1000].copy()
user_data_later = first_user_rows[1000:].copy()

In [None]:
import matplotlib.pyplot as plt

# Among fraudulent transactions, count how many were flagged (1) vs. not flagged (0).
# reindex guarantees both categories exist so the bars always line up with the tick labels.
flagged = (
    transactional_data.loc[transactional_data['isFraud'] == 1, 'isFlaggedFraud']
    .value_counts()
    .reindex([0, 1], fill_value=0)
)

# Plot the bar chart
fig, ax = plt.subplots(figsize=(8, 6))
ax.bar(flagged.index, flagged.values, color='skyblue')

ax.set_xticks([0, 1])
ax.set_xticklabels(['NotFlagged', 'Flagged'])

# Add labels and title; the x-axis is the flag status, not the fraud label.
ax.set_xlabel('Flag Status of Fraudulent Transactions')
ax.set_ylabel('Number of Transactions')
ax.set_title('Number of Flagged Fraudulent Transactions in Data')

# Show the graph
plt.show()

In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack

def create_data_for_model(data):
    """Return a model-ready copy of `data` with date features, encoded categories and scaled amounts.

    The input frame is not modified. On the returned copy:
      * `transaction_date` is converted to datetime if it is not already,
      * `day_of_week` (Monday=0, Sunday=6) and `month` are derived from it,
      * `nameDest` and `type` are replaced by integer codes from a `LabelEncoder`,
      * `amount`, `oldbalanceOrg` and `newbalanceOrig` are standardised with a `StandardScaler`.

    NOTE(review): the encoders and the scaler are fitted on every call, so codes and
    scaling are specific to the frame passed in — confirm that is intended when the
    same model is applied to several frames.

    Parameters:
        data: DataFrame containing at least the columns named above.

    Returns:
        The transformed copy of `data`.
    """
    # Work on a copy so the caller's frame (possibly a slice of another frame) is untouched
    # and plain column assignment can change dtypes.
    data = data.copy()

    # Only parse when needed; an explicit format is irrelevant for datetime input.
    if not pd.api.types.is_datetime64_any_dtype(data['transaction_date']):
        data['transaction_date'] = pd.to_datetime(data['transaction_date'], format='%d-%b-%y')
    data['day_of_week'] = data['transaction_date'].dt.dayofweek  # Monday=0, Sunday=6
    data['month'] = data['transaction_date'].dt.month

    # Encode categorical variables as integer codes.
    le_merchant = LabelEncoder()
    data['nameDest'] = le_merchant.fit_transform(data['nameDest'].values)

    le_transaction_type = LabelEncoder()
    data['type'] = le_transaction_type.fit_transform(data['type'].values)

    numeric_columns = ['amount', 'oldbalanceOrg', 'newbalanceOrig']

    print(data[numeric_columns])
    standard_scaler = StandardScaler()
    data[numeric_columns] = standard_scaler.fit_transform(data[numeric_columns])

    # Diagnostics: scaled values, dtypes and missing-value counts.
    print(data[numeric_columns])
    print(data[numeric_columns].dtypes)
    print(data[numeric_columns].isna().sum())

    return data

# Transform the training frame; `df` is rebound to whatever create_data_for_model returns.
df = create_data_for_model(df)

# Step 3: Prepare data for LSTM
def create_sequences(data, seq_length):
    """Build overlapping windows of `seq_length` consecutive rows from `data`.

    Window `i` is `data[i:i + seq_length]`; `len(data) - seq_length` windows are
    produced (start positions 0 .. len(data) - seq_length - 1).

    Parameters:
        data: array-like whose first axis is the row axis.
        seq_length: number of rows per window; must be positive.

    Returns:
        Array of shape `(n_windows, seq_length, *data.shape[1:])`. When `data` has
        `seq_length` rows or fewer, an empty array with that same trailing shape is
        returned so that callers can still read the feature dimension.

    Raises:
        ValueError: if `seq_length` is not positive.
    """
    if seq_length <= 0:
        raise ValueError("seq_length must be a positive integer")

    data = np.asarray(data)
    n_windows = len(data) - seq_length
    if n_windows <= 0:
        # Keep the rank consistent with the non-empty case.
        return np.empty((0, seq_length) + data.shape[1:], dtype=data.dtype)

    return np.stack([data[start:start + seq_length] for start in range(n_windows)])

# Columns fed to the LSTM; their order defines the feature axis of X.
feature_columns = [
    'amount',
    'nameDest',
    'oldbalanceOrg',
    'newbalanceOrig',
    'type',
    'step',
    'day_of_week',
    'month',
]
features = df[feature_columns].values

# Window length (rows per sequence) used throughout the notebook.
SEQ_LENGTH = 12
X = create_sequences(features, SEQ_LENGTH)

# Report the resulting (windows, timesteps, features) shape.
print("Shape of X (sequences):", X.shape)


In [None]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, RepeatVector, TimeDistributed, Dense, Dropout, BatchNormalization
from tensorflow.keras.optimizers import Adam
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.regularizers import l2
from tensorflow.keras.optimizers.schedules import ExponentialDecay

def create_lstm_autoencoder(timesteps, n_features, latent_dim=64):
    """Assemble an (uncompiled) LSTM autoencoder for inputs of shape (timesteps, n_features).

    Encoder: LSTM 256 -> 128 -> 64, each followed by batch normalisation, with dropout
    after the first two; the last LSTM returns only its final state. A dense tanh layer
    of size `latent_dim` forms the latent vector. Decoder: the latent vector is repeated
    `timesteps` times and passed through LSTM 64 -> 128 -> 256 (batch normalisation and
    dropout after the first two), then a time-distributed dense layer maps each step
    back to `n_features` values.

    Parameters:
        timesteps: sequence length of the input windows.
        n_features: number of features per timestep.
        latent_dim: size of the latent dense layer.

    Returns:
        A Keras `Model` mapping the input sequence to its reconstruction.
    """
    def regularized_lstm(units, return_sequences=True):
        # Every recurrent layer shares the same activation and L2 kernel penalty.
        return LSTM(
            units,
            activation='tanh',
            return_sequences=return_sequences,
            kernel_regularizer=l2(0.001),
        )

    inputs = Input(shape=(timesteps, n_features))

    # Encoder
    x = inputs
    for units in (256, 128):
        x = regularized_lstm(units)(x)
        x = BatchNormalization()(x)
        x = Dropout(0.3)(x)
    x = regularized_lstm(64, return_sequences=False)(x)
    x = BatchNormalization()(x)

    # Latent space representation
    latent = Dense(latent_dim, activation='tanh')(x)

    # Decoder
    x = RepeatVector(timesteps)(latent)
    for units in (64, 128):
        x = regularized_lstm(units)(x)
        x = BatchNormalization()(x)
        x = Dropout(0.3)(x)
    x = regularized_lstm(256)(x)

    # Output layer: one dense projection per timestep
    output = TimeDistributed(Dense(n_features))(x)

    return Model(inputs, output)

In [None]:
# NOTE(review): this cell uses names that are not defined or imported in the visible
# part of this notebook (`LSTMAutoencoder`, `hidden_units`, `num_layers`, `dropout`,
# `device`, `nn`, `train_loader`, `optimizer`). The `.to(device=...)`, `nn.MSELoss`,
# `zero_grad` / `backward` / `step` calls look like PyTorch, while the other cells use
# Keras — confirm whether this cell is still needed; as shown it cannot run on a
# fresh kernel.

# Define LSTM autoencoder with chosen parameters
neuralnetwork = LSTMAutoencoder(
    hidden_units=hidden_units,
    num_layers=num_layers,
    dropout=dropout
).to(device=device)

# Use reconstruction loss for an autoencoder: the prediction is compared with the input batch
criterion = nn.MSELoss()

# Training loop: a single pass over `train_loader`
for X_train_batch in train_loader:
    X_train_batch = X_train_batch.to(device=device)
    optimizer.zero_grad()
    y_batch_prediction = neuralnetwork(X_train_batch)
    loss = criterion(y_batch_prediction, X_train_batch)
    loss.backward()
    optimizer.step()

In [None]:
# Keras expects a numeric array; cast the windows to float32 up front.
X = X.astype(np.float32)

# Build the autoencoder for windows of SEQ_LENGTH steps and X.shape[2] features.
latent_dim = 64
global_autoencoder = create_lstm_autoencoder(SEQ_LENGTH, X.shape[2], latent_dim)

# Learning rate starts at 0.001 and decays by a factor of 0.9 every 10000 steps.
lr_schedule = ExponentialDecay(
    initial_learning_rate=0.001,
    decay_steps=10000,
    decay_rate=0.9,
)
global_autoencoder.compile(
    optimizer=Adam(learning_rate=lr_schedule),
    loss='mae',
)

# Stop when the validation loss has not improved for 10 epochs and keep the best weights.
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=10,
    restore_best_weights=True,
)

# Train the global model to reconstruct its own input; 20% of X is held out for validation.
histogram_global = global_autoencoder.fit(
    X,
    X,
    epochs=30,
    batch_size=16,
    validation_split=0.2,
    callbacks=[early_stopping],
)

In [None]:
import matplotlib.pyplot as plt

# Training vs. validation loss per epoch, taken from the Keras History object.
fig, ax = plt.subplots(figsize=(10, 6))
for history_key, curve_label in (('loss', 'Training Loss'), ('val_loss', 'Validation Loss')):
    ax.plot(histogram_global.history[history_key], label=curve_label)
ax.set_xlabel('Epoch')
ax.set_ylabel('Loss')
ax.set_title('Model Learning Curve')
ax.legend()
plt.show()

In [None]:
from joblib import Parallel, delayed 
import joblib 
  
  
# Save the trained autoencoder to 'contextual_model.pkl' with joblib.
# NOTE(review): a file written by joblib.dump is meant to be read back with
# joblib.load — confirm the loading code matches. Keras also offers its own
# model.save() format; consider it if pickling the model proves fragile.
joblib.dump(global_autoencoder, 'contextual_model.pkl') 

In [None]:
import pickle

import joblib

# Transform the evaluation frames the same way as the training frame.
df_later = create_data_for_model(df_later)
user_data = create_data_for_model(user_data)
user_data_later = create_data_for_model(user_data_later)

# Path to the saved model file
pickle_file = 'contextual_model.pkl'

# The file is written with joblib.dump, so it is read back with joblib.load
# (the matching reader) rather than the plain pickle module.
# SECURITY: loading a pickle-based file can execute arbitrary code — only load
# model files that this notebook (or another trusted source) produced.
global_autoencoder = joblib.load(pickle_file)

# The model is now loaded and ready to use

In [None]:
import numpy as np
import pandas as pd

# Function to detect anomalies with multiple thresholds
def detect_anomalies_with_confidence(data, model, thresholds):
    """Score windows by reconstruction error and bucket each one into a level.

    Parameters:
        data: array of windows; it is cast to float32 and passed to `model.predict`.
        model: object with a `predict` method returning an array broadcastable
            against `data`.
        thresholds: mapping with the keys 'high', 'medium' and 'low'.

    Returns:
        (levels, errors): `levels` is a list with one of 'high', 'medium', 'low' or
        'none' per window — the first threshold, checked in that order, that the
        error strictly exceeds; `errors` is the per-window mean squared
        reconstruction error (mean over axes 1 and 2).
    """
    data = data.astype(np.float32)

    # Mean squared difference between each window and its reconstruction.
    reconstructed = model.predict(data)
    squared_diff = np.square(data - reconstructed)
    reconstruction_errors = np.mean(squared_diff, axis=(1, 2))

    print(reconstruction_errors)

    def classify(error):
        # Strictest level first; fall through to 'none' when no threshold is exceeded.
        for level in ('high', 'medium', 'low'):
            if error > thresholds[level]:
                return level
        return 'none'

    anomaly_results = [classify(error) for error in reconstruction_errors]

    return anomaly_results, reconstruction_errors

# Reconstruction-error thresholds per confidence level. They must increase from
# 'low' to 'high': the detector checks 'high' first, then 'medium', then 'low',
# so a 'low' value above the others could never be reached.
# Adjust these based on your validation set or domain knowledge.
thresholds = {
    'low': 50,     # Low confidence (more points classified as anomalies)
    'medium': 53,  # Medium confidence
    'high': 55     # High confidence (only very anomalous points)
}

anomaly_levels, reconstruction_errors = detect_anomalies_with_confidence(X, global_autoencoder, thresholds)

# View the results as counts per level instead of dumping one label per window
print(pd.Series(anomaly_levels).value_counts())

In [None]:
import numpy as np
import pandas as pd

# Function to detect anomalies with multiple thresholds
def detect_anomalies_with_confidence(data, model, thresholds, seq_length=12, feature_columns=None):
    """Score windows by reconstruction error and bucket each one into a level.

    Parameters:
        data: either a DataFrame, from which windows of `seq_length` consecutive rows
            are built with `create_sequences` over `feature_columns`, or an array of
            ready-made windows that is used as-is.
        model: object with a `predict` method returning an array broadcastable
            against the windows.
        thresholds: mapping with the keys 'high', 'medium' and 'low'.
        seq_length: rows per window when `data` is a DataFrame.
        feature_columns: columns to use when `data` is a DataFrame; defaults to the
            eight feature columns used for training in this notebook.

    Returns:
        (levels, errors): `levels` is a list with one of 'high', 'medium', 'low' or
        'none' per window — the first threshold, checked in that order, that the
        error strictly exceeds; `errors` is the per-window mean squared
        reconstruction error (mean over axes 1 and 2). Both are empty when there
        are no windows.
    """
    if isinstance(data, pd.DataFrame):
        if feature_columns is None:
            feature_columns = ['amount', 'nameDest', 'oldbalanceOrg', 'newbalanceOrig',
                               'type', 'step', 'day_of_week', 'month']
        windows = create_sequences(data[feature_columns].values, seq_length)
    else:
        # Already windowed input (e.g. the training array).
        windows = np.asarray(data)

    windows = windows.astype(np.float32)

    # Nothing to score: skip the model call, which needs at least one window.
    if len(windows) == 0:
        return [], np.empty(0, dtype=np.float32)

    # Mean squared difference between each window and its reconstruction.
    reconstructed = model.predict(windows)
    reconstruction_errors = np.mean(np.square(windows - reconstructed), axis=(1, 2))

    anomaly_results = []
    for error in reconstruction_errors:
        if error > thresholds['high']:
            anomaly_results.append('high')
        elif error > thresholds['medium']:
            anomaly_results.append('medium')
        elif error > thresholds['low']:
            anomaly_results.append('low')
        else:
            anomaly_results.append('none')  # No anomaly detected

    return anomaly_results, reconstruction_errors

# Reconstruction-error thresholds per confidence level (increasing from 'low' to 'high').
# Adjust these based on your validation set or domain knowledge.
thresholds = {
    'low': 2400,     # Low confidence (more points classified as anomalies)
    'medium': 2500,  # Medium confidence
    'high': 2600     # High confidence (only very anomalous points)
}

anomaly_levels, reconstruction_errors = detect_anomalies_with_confidence(df_later, global_autoencoder, thresholds)

# There are fewer windows than rows, so pad the results to the frame length.
# Result i is stored on row i (the first row of window i); trailing rows stay missing.
n_rows = len(df_later)
n_scored = min(len(anomaly_levels), n_rows)

# An object array keeps the padding as a real missing value; concatenating strings
# with NaN would turn the padding into the literal string 'nan'.
padded_levels = np.full(n_rows, None, dtype=object)
padded_levels[:n_scored] = anomaly_levels[:n_scored]

padded_errors = np.full(n_rows, np.nan, dtype=np.float64)
padded_errors[:n_scored] = reconstruction_errors[:n_scored]

anomaly_levels = padded_levels
reconstruction_errors = padded_errors

# Assign the padded values to the DataFrame
df_later['anomaly_level'] = anomaly_levels
df_later['reconstruction_error'] = reconstruction_errors


In [None]:
from sklearn.metrics import recall_score

# Fraud label next to the anomaly level and reconstruction error
df_output = df_later[['isFraud', 'anomaly_level', 'reconstruction_error']]

# Evaluate only rows that actually received a level; padded rows are excluded.
scored = df_output[df_output['anomaly_level'].isin(['none', 'low', 'medium', 'high'])]

# One consistent decision rule for all four counts: 'medium' and 'high' are treated
# as a fraud prediction, 'none' and 'low' as legitimate.
# NOTE(review): confirm whether 'low' should count as a fraud prediction instead.
predicted_fraud = scored['anomaly_level'].isin(['medium', 'high'])
actual_fraud = scored['isFraud'] == 1

true_negative = int((~predicted_fraud & ~actual_fraud).sum())

true_positive = int((predicted_fraud & actual_fraud).sum())

false_negative = int((~predicted_fraud & actual_fraud).sum())

false_positive = int((predicted_fraud & ~actual_fraud).sum())

# Accuracy over the scored rows; NaN when nothing was scored.
acc = (true_negative + true_positive) / len(scored) if len(scored) else float('nan')


print(acc)
print(true_positive)
print(false_positive)
print(true_negative)
print(len(df_output[df_output['isFraud'] == 1]))
print(len(df_output))

Exploration of users

In [None]:
def build_fraud_detection_model(autoencoder):
    """Return a compiled copy of `autoencoder` with a frozen encoder for per-user fine-tuning.

    The copy has the same architecture and weights as `autoencoder`, so it still maps
    a window to its reconstruction and can be scored by reconstruction error. Every
    layer before the first `RepeatVector` (the encoder and latent layer) is frozen;
    the layers from `RepeatVector` onwards stay trainable. The original model is left
    untouched.

    Parameters:
        autoencoder: trained Keras autoencoder containing a `RepeatVector` layer.

    Returns:
        The compiled fine-tuning model (Adam, learning rate 0.001, mean squared error).

    Raises:
        ValueError: if `autoencoder` has no `RepeatVector` layer to mark the encoder boundary.
    """
    # Clone so that fine-tuning does not change the global model's weights.
    fraud_detector = tf.keras.models.clone_model(autoencoder)
    fraud_detector.set_weights(autoencoder.get_weights())

    # Locate the encoder/decoder boundary by layer type instead of a hard-coded index.
    decoder_start = next(
        (index for index, layer in enumerate(fraud_detector.layers) if isinstance(layer, RepeatVector)),
        None,
    )
    if decoder_start is None:
        raise ValueError("autoencoder has no RepeatVector layer; cannot locate the encoder boundary")

    for index, layer in enumerate(fraud_detector.layers):
        layer.trainable = index >= decoder_start

    # Reconstruction objective: the model is trained to reproduce its input.
    fraud_detector.compile(optimizer=Adam(learning_rate=0.001), loss='mean_squared_error')

    return fraud_detector

# Create the fraud detection model using the pre-trained autoencoder
fraud_detector = build_fraud_detection_model(global_autoencoder)

# Fine-tune on this user's transactions, windowed the same way as the training data.
user_features = user_data[['amount', 'nameDest', 'oldbalanceOrg', 'newbalanceOrig', 'type', 'step', 'day_of_week', 'month']].values
X_user = create_sequences(user_features, SEQ_LENGTH).astype(np.float32)

fraud_detector.fit(X_user, X_user, epochs=10, batch_size=16, validation_split=0.2)

In [None]:
anomaly_levels, reconstruction_errors = detect_anomalies_with_confidence(user_data_later, fraud_detector, thresholds)

# The scores belong to the rows of `user_data_later`: result i is attached to its
# i-th row (the first row of window i). Align by index label so each score lands on
# the matching row of `df_later`; every other row is left missing.
n_scored = min(len(anomaly_levels), len(user_data_later))
scored_index = user_data_later.index[:n_scored]

level_by_row = pd.Series(
    list(anomaly_levels[:n_scored]), index=scored_index, dtype=object
).reindex(df_later.index)
error_by_row = pd.Series(
    np.asarray(reconstruction_errors[:n_scored], dtype=np.float64), index=scored_index
).reindex(df_later.index)

# Keep the padded, row-aligned arrays under the original names.
anomaly_levels = level_by_row.to_numpy()
reconstruction_errors = error_by_row.to_numpy()

# Assign the aligned values to the DataFrame
df_later['anomaly_level'] = level_by_row
df_later['reconstruction_error'] = error_by_row

In [None]:
from sklearn.metrics import recall_score

# Fraud label next to the anomaly level and reconstruction error
df_output = df_later[['isFraud', 'anomaly_level', 'reconstruction_error']]

# Evaluate only rows that actually received a level; unscored rows are excluded.
scored = df_output[df_output['anomaly_level'].isin(['none', 'low', 'medium', 'high'])]

# One consistent decision rule for all four counts: 'medium' and 'high' are treated
# as a fraud prediction, 'none' and 'low' as legitimate.
# NOTE(review): confirm whether 'low' should count as a fraud prediction instead.
predicted_fraud = scored['anomaly_level'].isin(['medium', 'high'])
actual_fraud = scored['isFraud'] == 1

true_negative = int((~predicted_fraud & ~actual_fraud).sum())

true_positive = int((predicted_fraud & actual_fraud).sum())

false_negative = int((~predicted_fraud & actual_fraud).sum())

false_positive = int((predicted_fraud & ~actual_fraud).sum())

# Accuracy over the scored rows; NaN when nothing was scored.
acc = (true_negative + true_positive) / len(scored) if len(scored) else float('nan')


print(acc)
print(true_positive)
print(false_positive)
print(true_negative)
print(len(df_output[df_output['isFraud'] == 1]))
print(len(df_output))