In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.model_selection import train_test_split

# ---------------------------------------------------------------------------
# Stage 1: load the raw transactions and turn them into numeric features.
#
# Result: `data` holds `customer_id` plus model features, sorted
# chronologically within each customer. The columns named below are scaled to
# [0, 1]; any other columns in the CSV are passed through untouched.
# ---------------------------------------------------------------------------

# Load dataset
data = pd.read_csv('abm.csv')  # Replace with your dataset path

# Encode categorical variables.
# Missing values are replaced with the literal 'unknown' in *every* categorical
# column (previously only province/city were filled), and values are cast to
# str so LabelEncoder never has to order a mix of NaN / str / numbers.
categorical_columns = ['debit_credit', 'cash_indicator', 'country', 'province', 'city']
for col in categorical_columns:
    filled = data[col].astype(object).where(data[col].notna(), 'unknown')
    data[col] = LabelEncoder().fit_transform(filled.astype(str))

# Normalize numerical variables.
# `scaler` stays fitted on amount_cad alone so it can still be used to map
# scaled amounts back to CAD; ravel() turns the (n, 1) result into a 1-D column.
scaler = MinMaxScaler()
data['amount_cad'] = scaler.fit_transform(data[['amount_cad']]).ravel()

# Bring the integer category codes onto the same [0, 1] range as amount_cad.
# Left unscaled, a column with many categories (e.g. city) produces squared
# errors orders of magnitude larger than amount_cad and dominates an MSE loss.
# NOTE(review): the codes are still treated as ordinal numbers; one-hot or
# embedding encodings would remove that artificial ordering.
code_scaler = MinMaxScaler()
scaled_codes = code_scaler.fit_transform(data[categorical_columns].astype('float64'))
for position, col in enumerate(categorical_columns):
    data[col] = scaled_codes[:, position]

# Combine transaction_date and transaction_time into a single datetime feature
# (used only to order each customer's transactions chronologically).
data['transaction_datetime'] = pd.to_datetime(data['transaction_date'] + ' ' + data['transaction_time'])
data = data.sort_values(by=['customer_id', 'transaction_datetime'])

# Drop unused columns (identifiers and the raw/combined timestamp columns).
data = data.drop(columns=['abm_id', 'transaction_date', 'transaction_time', 'transaction_datetime'])

# ---------------------------------------------------------------------------
# Stage 2: cut each customer's transaction history into fixed-length sliding
# windows (stride 1) and split the windows into train and test sets.
#
# Result: `sequences` has shape (n_windows, sequence_length, n_features);
# `X_train` / `X_test` are disjoint subsets of it.
# ---------------------------------------------------------------------------
sequence_length = 10
grouped = data.groupby('customer_id')
sequences = []
sequence_owner = []  # customer_id each window came from, parallel to `sequences`
for customer_id, group in grouped:
    # Explicit float cast: fails loudly here if a non-numeric column is still
    # present, instead of silently building an object array.
    values = group.drop(columns=['customer_id']).to_numpy(dtype='float32')
    # Customers with fewer than `sequence_length` rows yield no windows
    # (range() of a non-positive number is empty).
    for start in range(len(values) - sequence_length + 1):
        sequences.append(values[start:start + sequence_length])
        sequence_owner.append(customer_id)

if not sequences:
    raise ValueError(
        f"No customer has at least {sequence_length} transactions; "
        "cannot build any sequences."
    )
sequences = np.stack(sequences)
sequence_owner = np.asarray(sequence_owner)

# Train-test split.
# Consecutive windows overlap in all but one row, so splitting individual
# windows at random puts near-duplicates of every test window into the training
# set. Splitting by customer keeps the test windows genuinely unseen.
unique_customers = np.unique(sequence_owner)
if len(unique_customers) >= 2:
    train_customers, test_customers = train_test_split(
        unique_customers, test_size=0.2, random_state=42
    )
    in_train = np.isin(sequence_owner, train_customers)
    X_train, X_test = sequences[in_train], sequences[~in_train]
else:
    # A single customer cannot be split by group; fall back to a window-level
    # split (overlap between train and test is unavoidable in this case).
    X_train, X_test = train_test_split(sequences, test_size=0.2, random_state=42)

# ---------------------------------------------------------------------------
# Stage 3: LSTM autoencoder.
#
# Encoder: two LSTM layers compress each (sequence_length, n_features) window
# into a single 32-dimensional vector. Decoder: RepeatVector copies that vector
# once per time step, two LSTM layers expand it again, and a time-distributed
# Dense layer maps every step back to n_features values.
# ---------------------------------------------------------------------------
n_features = X_train.shape[2]

# LSTM Autoencoder Model
model = tf.keras.Sequential([
    # Declare the input shape with an explicit Input object rather than passing
    # `input_shape=` to the first layer, which Keras warns against.
    tf.keras.Input(shape=(sequence_length, n_features)),
    tf.keras.layers.LSTM(64, activation='relu', return_sequences=True),
    tf.keras.layers.LSTM(32, activation='relu', return_sequences=False),
    tf.keras.layers.RepeatVector(sequence_length),
    tf.keras.layers.LSTM(32, activation='relu', return_sequences=True),
    tf.keras.layers.LSTM(64, activation='relu', return_sequences=True),
    tf.keras.layers.TimeDistributed(tf.keras.layers.Dense(n_features))
])
model.compile(optimizer='adam', loss='mse')
model.summary()

# Train the model to reproduce its own input (input and target are both
# X_train). Keras takes the validation_split samples from the end of X_train
# before shuffling.
history = model.fit(X_train, X_train, epochs=20, batch_size=32, validation_split=0.2, shuffle=True)

# ---------------------------------------------------------------------------
# Stage 4: score windows by reconstruction error and flag anomalies.
#
# Each window's score is the mean squared error between the window and the
# model's reconstruction, averaged over time steps and features (axes 1 and 2).
# ---------------------------------------------------------------------------

# Reconstruction errors on the training windows: the reference distribution of
# errors for data the model was fitted on.
X_train_pred = model.predict(X_train)
train_loss = np.mean(np.square(X_train - X_train_pred), axis=(1, 2))

# Reconstruction errors on the held-out windows being screened.
X_test_pred = model.predict(X_test)
test_loss = np.mean(np.square(X_test - X_test_pred), axis=(1, 2))

# Anomaly detection threshold.
# Calibrated on the training errors rather than on test_loss itself: a
# percentile of the test errors flags the same fixed share of test windows
# no matter how normal or abnormal they actually are.
threshold = np.percentile(train_loss, 95)  # Adjust as needed
anomalies = test_loss > threshold

print(f"Threshold: {threshold}")
print(f"Number of anomalies detected: {np.sum(anomalies)}")


  super().__init__(**kwargs)


Epoch 1/20
[1m138/138[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 27ms/step - loss: 269.0851 - val_loss: 78.1304
Epoch 2/20
[1m138/138[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 22ms/step - loss: 76.0061 - val_loss: 66.5273
Epoch 3/20
[1m138/138[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 21ms/step - loss: 67.3571 - val_loss: 62.3777
Epoch 4/20
[1m138/138[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 23ms/step - loss: 57.1855 - val_loss: 47.0491
Epoch 5/20
[1m138/138[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 21ms/step - loss: 46.6031 - val_loss: 37.7296
Epoch 6/20
[1m138/138[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 22ms/step - loss: 38.5995 - val_loss: 33.8288
Epoch 7/20
[1m138/138[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 21ms/step - loss: 32.3846 - val_loss: 28.3625
Epoch 8/20
[1m138/138[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 21ms/step - loss: 28.8543 - val_loss: 25.9347
Epoch 9/20
[1