In [7]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import GRU, Dense, RepeatVector, TimeDistributed
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

# Load ABM dataset
file_path = '/Users/mac/Desktop/abm.csv'
data = pd.read_csv(file_path)

# =================== Step 1: Data Preprocessing ===================
# Fill missing values
data['country'].fillna('Unknown', inplace=True)
data['province'].fillna('Unknown', inplace=True)
data['city'].fillna('Unknown', inplace=True)

# Convert time and date to a single datetime column
data['datetime'] = pd.to_datetime(data['transaction_date'] + ' ' + data['transaction_time'])
data = data.sort_values(by='datetime')

# Normalize numeric columns
scaler = MinMaxScaler()
data['amount_cad'] = scaler.fit_transform(data[['amount_cad']])

# Encode categorical variables (e.g., debit_credit, cash_indicator)
data = pd.get_dummies(data, columns=['debit_credit', 'cash_indicator'])

# Dynamically identify dummy-encoded column names
encoded_columns = [col for col in data.columns if col.startswith('debit_credit') or col.startswith('cash_indicator')]

# Sequence preparation (group transactions by customer_id and create time-series data)
sequence_data = []
customer_ids = data['customer_id'].unique()

for customer_id in customer_ids:
    customer_data = data[data['customer_id'] == customer_id]
    sequence = customer_data[['amount_cad'] + encoded_columns].values
    if len(sequence) > 10:  # Only include sequences longer than 10
        sequence_data.append(sequence)

# Pad sequences to the same length
max_seq_len = max(len(seq) for seq in sequence_data)
sequence_data = tf.keras.preprocessing.sequence.pad_sequences(sequence_data, maxlen=max_seq_len, dtype='float32')

# Split data into training and testing sets
X_train, X_test = train_test_split(sequence_data, test_size=0.2, random_state=42)

# =================== Step 2: GRU Model Architecture ===================
model = Sequential([
    GRU(64, activation='relu', input_shape=(max_seq_len, X_train.shape[2]), return_sequences=True),
    GRU(32, activation='relu', return_sequences=False),
    RepeatVector(max_seq_len),
    GRU(32, activation='relu', return_sequences=True),
    GRU(64, activation='relu', return_sequences=True),
    TimeDistributed(Dense(X_train.shape[2]))
])

model.compile(optimizer='adam', loss='mse')
model.summary()

# =================== Step 3: Training ===================
# Train the GRU autoencoder on normal sequences (no anomalies)
history = model.fit(X_train, X_train, epochs=50, batch_size=32, validation_split=0.1, shuffle=True)

# =================== Step 4: Anomaly Detection ===================
# Compute reconstruction error on test data
def detect_anomalies(model, data, threshold=None):
    reconstructions = model.predict(data)
    reconstruction_errors = np.mean(np.square(reconstructions - data), axis=(1, 2))
    
    if threshold is None:
        threshold = np.percentile(reconstruction_errors, 95)  # Set threshold at the 95th percentile

    anomalies = reconstruction_errors > threshold
    return anomalies, reconstruction_errors, threshold

# Detect anomalies
anomalies, errors, threshold = detect_anomalies(model, X_test)

# Output anomaly statistics
print(f"Threshold for anomaly detection: {threshold}")
print(f"Number of anomalies detected: {np.sum(anomalies)} out of {len(X_test)}")


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['country'].fillna('Unknown', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['province'].fillna('Unknown', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting val

Epoch 1/50
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 648ms/step - loss: 0.0326 - val_loss: 0.0341
Epoch 2/50
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 362ms/step - loss: 0.0325 - val_loss: 0.0339
Epoch 3/50
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 361ms/step - loss: 0.0309 - val_loss: 0.0339
Epoch 4/50
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 358ms/step - loss: 0.0319 - val_loss: 0.0338
Epoch 5/50
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 374ms/step - loss: 0.0308 - val_loss: 0.0338
Epoch 6/50
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 360ms/step - loss: 0.0328 - val_loss: 0.0338
Epoch 7/50
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 364ms/step - loss: 0.0332 - val_loss: 0.0339
Epoch 8/50
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 361ms/step - loss: 0.0320 - val_loss: 0.0338
Epoch 9/50
[1m12/12[0m [32m━━━━━━━━━