In [1]:
import logging
import datetime

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import mean_squared_error, precision_recall_curve, auc
import tensorflow as tf
from tensorflow.keras.callbacks import TensorBoard, EarlyStopping
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense, Flatten, RepeatVector, TimeDistributed, Dropout, Embedding, Concatenate
from tensorflow.keras.optimizers import Adam

# import tensorboard 


In [17]:
filepath = 'data/financial_anomaly_data.csv'

try:
    fin_data_df = pd.read_csv(filepath, parse_dates=['Timestamp'])
    logging.info(f"Data loaded successfully from {filepath}")
except FileNotFoundError:   
    logging.error(f"File not found: {filepath}")
    exit()

In [18]:
#convert string to timestamp
fin_data_df['Timestamp'] = pd.to_datetime(fin_data_df['Timestamp'], format='%d-%m-%Y %H:%M')
type(fin_data_df['Timestamp'][0])

pandas._libs.tslibs.timestamps.Timestamp

In [19]:
# 2. Add time-based features. Convert Timestamp to datetime and extract features
fin_data_df['hour'] = fin_data_df['Timestamp'].dt.hour
fin_data_df['dayofweek'] = fin_data_df['Timestamp'].dt.dayofweek
fin_data_df['dayofmonth'] = fin_data_df['Timestamp'].dt.day
fin_data_df['month'] = fin_data_df['Timestamp'].dt.month



In [20]:
print(fin_data_df.info())
print(fin_data_df.isnull().sum())
nan_rows = fin_data_df[fin_data_df.isna().any(axis=1)]
print(nan_rows)
fin_data_df = fin_data_df.dropna()
print(fin_data_df.info())
print(fin_data_df.isnull().sum())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 217441 entries, 0 to 217440
Data columns (total 11 columns):
 #   Column           Non-Null Count   Dtype         
---  ------           --------------   -----         
 0   Timestamp        216960 non-null  datetime64[ns]
 1   TransactionID    216960 non-null  object        
 2   AccountID        216960 non-null  object        
 3   Amount           216960 non-null  float64       
 4   Merchant         216960 non-null  object        
 5   TransactionType  216960 non-null  object        
 6   Location         216960 non-null  object        
 7   hour             216960 non-null  float64       
 8   dayofweek        216960 non-null  float64       
 9   dayofmonth       216960 non-null  float64       
 10  month            216960 non-null  float64       
dtypes: datetime64[ns](1), float64(5), object(5)
memory usage: 18.2+ MB
None
Timestamp          481
TransactionID      481
AccountID          481
Amount             481
Merchant          

In [32]:
categorical_features = ['Merchant', 'TransactionType', 'Location', 'hour', 'dayofweek', 'dayofmonth', 'month'] # added time features
numerical_features = ['Amount']
identifier_features = ['AccountID_encoded'] 

In [22]:
#For each categorical feature, check the number of unique values
for feature in categorical_features:
    unique_values = fin_data_df[feature].nunique()
    print(f"Unique values in {feature}: {unique_values}")
#For each categorical feature, for each unique value, count the number of occurrences
for feature in categorical_features:
    value_counts = fin_data_df[feature].value_counts()
    print(f"Value counts in {feature}:\n{value_counts}\n")


Unique values in Merchant: 10
Unique values in TransactionType: 3
Unique values in Location: 5
Unique values in hour: 24
Unique values in dayofweek: 7
Unique values in dayofmonth: 31
Unique values in month: 5
Value counts in Merchant:
Merchant
MerchantF    21924
MerchantG    21891
MerchantD    21820
MerchantB    21766
MerchantI    21752
MerchantA    21699
MerchantJ    21654
MerchantE    21543
MerchantH    21518
MerchantC    21393
Name: count, dtype: int64

Value counts in TransactionType:
TransactionType
Transfer      72793
Purchase      72235
Withdrawal    71932
Name: count, dtype: int64

Value counts in Location:
Location
San Francisco    43613
New York         43378
London           43343
Los Angeles      43335
Tokyo            43291
Name: count, dtype: int64

Value counts in hour:
hour
8.0     9060
9.0     9060
10.0    9060
11.0    9060
12.0    9060
13.0    9060
14.0    9060
15.0    9060
16.0    9060
17.0    9060
18.0    9060
19.0    9060
20.0    9060
21.0    9060
22.0    9060
23.0

In [23]:
# Encoding AccountID
fin_data_df['AccountID_encoded'] = fin_data_df['AccountID'].astype('category').cat.codes
fin_data_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 216960 entries, 0 to 216959
Data columns (total 12 columns):
 #   Column             Non-Null Count   Dtype         
---  ------             --------------   -----         
 0   Timestamp          216960 non-null  datetime64[ns]
 1   TransactionID      216960 non-null  object        
 2   AccountID          216960 non-null  object        
 3   Amount             216960 non-null  float64       
 4   Merchant           216960 non-null  object        
 5   TransactionType    216960 non-null  object        
 6   Location           216960 non-null  object        
 7   hour               216960 non-null  float64       
 8   dayofweek          216960 non-null  float64       
 9   dayofmonth         216960 non-null  float64       
 10  month              216960 non-null  float64       
 11  AccountID_encoded  216960 non-null  int8          
dtypes: datetime64[ns](1), float64(5), int8(1), object(5)
memory usage: 20.1+ MB


In [24]:
fin_data_df = fin_data_df.drop(columns=['TransactionID', 'Timestamp', 'AccountID'], errors='ignore')
fin_data_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 216960 entries, 0 to 216959
Data columns (total 9 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   Amount             216960 non-null  float64
 1   Merchant           216960 non-null  object 
 2   TransactionType    216960 non-null  object 
 3   Location           216960 non-null  object 
 4   hour               216960 non-null  float64
 5   dayofweek          216960 non-null  float64
 6   dayofmonth         216960 non-null  float64
 7   month              216960 non-null  float64
 8   AccountID_encoded  216960 non-null  int8   
dtypes: float64(5), int8(1), object(3)
memory usage: 15.1+ MB


In [25]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_features)
    ],
    remainder='passthrough'  # Keep  AccountID
)


In [26]:
# Fit and transform the data
processed_data = preprocessor.fit_transform(fin_data_df)

In [31]:
print(processed_data.dtype)
print(processed_data.shape)

float64
(216960, 87)


In [28]:
fin_data_df.iloc[0]

Amount                95071.92
Merchant             MerchantH
TransactionType       Purchase
Location                 Tokyo
hour                       8.0
dayofweek                  6.0
dayofmonth                 1.0
month                      1.0
AccountID_encoded            9
Name: 0, dtype: object

In [33]:
feature_names = numerical_features + list(preprocessor.named_transformers_['cat'].get_feature_names_out(categorical_features)) + identifier_features
len(feature_names)

87

In [34]:
print(processed_data.dtype)
processed_df = pd.DataFrame(processed_data, columns=feature_names)
processed_df.info()

float64
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 216960 entries, 0 to 216959
Data columns (total 87 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   Amount                      216960 non-null  float64
 1   Merchant_MerchantA          216960 non-null  float64
 2   Merchant_MerchantB          216960 non-null  float64
 3   Merchant_MerchantC          216960 non-null  float64
 4   Merchant_MerchantD          216960 non-null  float64
 5   Merchant_MerchantE          216960 non-null  float64
 6   Merchant_MerchantF          216960 non-null  float64
 7   Merchant_MerchantG          216960 non-null  float64
 8   Merchant_MerchantH          216960 non-null  float64
 9   Merchant_MerchantI          216960 non-null  float64
 10  Merchant_MerchantJ          216960 non-null  float64
 11  TransactionType_Purchase    216960 non-null  float64
 12  TransactionType_Transfer    216960 non-null  float64
 13  Transa

In [None]:
processed_df.groupby('AccountID')

In [35]:
# 5. Prepare data for LSTM
def create_sequences(df, sequence_length, account_id_col='AccountID'):
    """
    Groups data by AccountID, creates sequences, and returns data and corresponding AccountID sequences.

    Args:
        df: DataFrame with time series data, must include AccountID.
        sequence_length: Length of the sequences.
        account_id_col: Name of the column containing Account IDs.

    Returns:
        A tuple of (data_sequences, account_id_sequences).
        - data_sequences: NumPy array of shape (num_sequences, sequence_length, num_features).
        - account_id_sequences: NumPy array of shape (num_sequences,).
    """
    account_groups = df.groupby(account_id_col)
    data_sequences = []
    account_id_sequences = []
    all_account_ids_encoded = []  # To store encoded Account IDs #CHANGED
    for account_id, group in account_groups:
        account_data = group.sort_index()  # Sort by time, assuming df index is time-ordered
        group = group.drop(account_id_col, axis=1, errors='ignore') # Drop AccountID before converting to values
        num_features = group.shape[1]
        account_ids = account_data[account_id_col].values[sequence_length-1:]  # Get AccountIDs for the last transaction in each sequence
        account_ids_encoded = df['AccountID_encoded'][account_data.index[sequence_length-1:]].values # Get encoded IDs #CHANGED
        for i in range(len(account_data) - sequence_length):
            sequence = group.iloc[i:i + sequence_length].values  # Exclude AccountID
            data_sequences.append(sequence)
            account_id_sequences.append(account_ids[i])
            all_account_ids_encoded.append(account_ids_encoded[i]) #CHANGED
        
    return np.array(data_sequences), np.array(account_id_sequences), np.array(all_account_ids_encoded) #CHANGED


In [36]:
sequence_length = 24  # Example sequence length
data_sequences, account_id_sequences, all_account_ids = create_sequences(processed_df, sequence_length, account_id_col='AccountID_encoded') #CHANGED

In [37]:
data_sequences.shape

(216600, 24, 86)

In [38]:
# Determine vocabulary size for AccountID embedding
num_unique_accounts = processed_df['AccountID_encoded'].nunique()
num_unique_accounts

15

In [39]:
# 6. Split data into training and testing sets, ensuring no account overlap
unique_account_ids = np.unique(account_id_sequences)
train_accounts, test_accounts = train_test_split(unique_account_ids, test_size=0.2, random_state=42)

In [40]:
train_indices = np.isin(account_id_sequences, train_accounts)
test_indices = np.isin(account_id_sequences, test_accounts)
X_train, y_train_account = data_sequences[train_indices], account_id_sequences[train_indices]
X_test, y_test_account = data_sequences[test_indices], account_id_sequences[test_indices]


In [41]:
print(f"Train & test indices shapes: {train_indices.shape}, {test_indices.shape}")
print(f"process df shape: {processed_df['AccountID_encoded'].values.shape}")

Train & test indices shapes: (216600,), (216600,)
process df shape: (216960,)


In [42]:
# Prepare separate AccountID input for the embedding layer
X_train_account_encoded = all_account_ids[train_indices] #CHANGED
X_test_account_encoded = all_account_ids[test_indices] #CHANGED

In [43]:
print(X_train_account_encoded.shape, X_test_account_encoded.shape)
X_train_account_encoded
print(type(X_train), X_train.dtype)
print(type(X_train_account_encoded), X_train_account_encoded.dtype)

(173499,) (43101,)
<class 'numpy.ndarray'> float64
<class 'numpy.ndarray'> float64


In [None]:
# 7. Build the LSTM Autoencoder Model with Embeddings
embedding_dim = 64
n_features = X_train.shape[2]

print(f"Number of features: {n_features}")

# Input layers
input_data = Input(shape=(sequence_length, n_features), name='data_input')
print(f"Input shape: {input_data.shape}")
input_account = Input(shape=(1,), name='account_embedding_input')
print(f"Account input shape: {input_account.shape}")

# AccountID Embedding
embedding_layer = Embedding(input_dim=num_unique_accounts, output_dim=embedding_dim, name='account_embedding')(input_account)
print(f"Embedding layer shape: {embedding_layer.shape}")
flatten_embedding = Flatten(name='account_embedding_flatten')(embedding_layer)
print(f"Flattened embedding shape: {flatten_embedding.shape}")

# Align shape with LSTM input (batch_size, sequence_length, embedding_dim). This will repeat the account id embedding across all the dimensions of the sequence length
repeated_embedding = RepeatVector(name='account_embedding_repeated')(flatten_embedding)  # (batch_size, sequence_length, embedding_dim)
print(f"Repeated embedding shape: {repeated_embedding.shape}")

# Concatenate data features and account embedding
merged_input = Concatenate(axis=-1, name='merged_input')( [input_data, repeated_embedding])  # Shape should be (None, sequence_length, n_features + embedding_dim)
print(f"Merged input shape: {merged_input.shape}")

# Encoder
lstm1 = LSTM(128, activation='relu', name='encoder_lstm1')(merged_input)
dropout1 = Dropout(0.2, name='encoder_dropout1')(lstm1)
encoded = Dense(64, activation='relu', name='encoder_output')(dropout1)

# Decoder
repeated_encoded = RepeatVector(name='decoder_repeat_vector')(encoded) # Added RepeatVector
lstm2 = LSTM(128, activation='relu', return_sequences=True, name='decoder_lstm2')(repeated_encoded)
dropout2 = Dropout(0.2, name='decoder_dropout2')(lstm2)
decoded_output = TimeDistributed(Dense(n_features, name='timedist_decoder_output'))(dropout2)

# Autoencoder model
autoencoder = Model(inputs=[input_data, input_account], outputs=decoded_output)
autoencoder.compile(optimizer='adam', loss='mse')
autoencoder.summary()

Number of features: 86
Input shape: (None, 24, 86)
Account input shape: (None, 1)
Embedding layer shape: (None, 1, 64)
Flattened embedding shape: (None, 64)
Repeated embedding shape: (None, 24, 64)
Merged input shape: (None, 24, 150)


In [49]:
# 8. Train the Model
epochs = 10
batch_size = 64

# Define early stopping callback
early_stopping = EarlyStopping(
    monitor='val_loss',  # Monitor validation loss
    patience=5,         # Stop after 5 epochs of no improvement
    restore_best_weights=True,  # Restore the best weights
    verbose=1
)

# Include tensorboard callback
log_dir = "logs/fit/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
tensorboard_callback = TensorBoard(log_dir=log_dir, histogram_freq=1, write_graph=True)

In [50]:
print(X_train.shape, X_test.shape)                 # Should be (num_samples, sequence_length, num_features)
print(X_train_account_encoded.shape, X_test_account_encoded.shape) # Should be (num_samples, 1)
print(X_train_account_encoded.dtype, X_test_account_encoded.dtype) # Should be int32 or int64

(173499, 24, 86) (43101, 24, 86)
(173499,) (43101,)
float64 float64


In [None]:
history = autoencoder.fit(
    [X_train, X_train_account_encoded],  # List of inputs
    X_train,                             # Target is the same as input (autoencoder)
    epochs=epochs,
    batch_size=batch_size,
    validation_data=([X_test, X_test_account_encoded], X_test),
    callbacks=[early_stopping, tensorboard_callback],
    verbose=1
)

Epoch 1/10
[1m2711/2711[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m145s[0m 53ms/step - loss: 0.0489 - val_loss: 0.0447
Epoch 2/10
[1m2711/2711[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m126s[0m 47ms/step - loss: 0.0488 - val_loss: 0.0446
Epoch 3/10
[1m2711/2711[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m114s[0m 42ms/step - loss: 0.0487 - val_loss: 0.0443
Epoch 4/10
[1m2711/2711[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m109s[0m 40ms/step - loss: 0.0485 - val_loss: 0.0442
Epoch 5/10
[1m2711/2711[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m107s[0m 40ms/step - loss: 0.0483 - val_loss: 0.0441
Epoch 6/10
[1m2711/2711[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m113s[0m 42ms/step - loss: 0.0483 - val_loss: 0.0438
Epoch 7/10
[1m2711/2711[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m106s[0m 39ms/step - loss: 0.0481 - val_loss: 0.0435
Epoch 8/10
[1m1648/2711[0m [32m━━━━━━━━━━━━[0m[37m━━━━━━━━[0m [1m39s[0m 37ms/step - loss: 0.0480

In [None]:
from tensorflow.keras.utils import plot_model
plot_model(autoencoder, to_file='model_architecture.png', show_shapes=True, show_layer_names=True)

In [None]:
! pip install shap
import shap

explainer = shap.DeepExplainer(model, background_data)  # background_data is a representative sample
shap_values = explainer.shap_values(X_test)

# Visualize summary plot for features (aggregate over time steps)
shap.summary_plot(shap_values, features=X_test)


In [None]:
fin_data_df.head()

In [None]:
fin_data_df.plot(y='Amount', kind='line', figsize=(15, 5), title='Transaction Amount Over Time')

In [None]:
fin_data_df.info()