# Data Exploration

## Setup

### Downloading Librabies

In [8]:
%pip install pandas
%pip install spacy
%pip install nltk
%pip install scikit-learn
%pip install tensorflow

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.
Defaulting to user installation because normal site-packages is not writeable
Collecting numpy>=1.19.0 (from spacy)
  Using cached numpy-2.0.2-cp39-cp39-macosx_14_0_arm64.whl.metadata (60 kB)
Using cached numpy-2.0.2-cp39-cp39-macosx_14_0_arm64.whl (5.3 MB)
Installing collected packages: numpy
  Attempting uninstall: numpy
    Found existing installation: numpy 1.26.4
    Uninstalling numpy-1.26.4:
      Successfully uninstalled numpy-1.26.4
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
tensorflow 2.17.0 requires numpy<2.0.0,>=1.23.5; python_version <= "3.11", but you have numpy 2.0.2 which is incompatible.[0m[31m
[0mSuccessfully installed numpy-2.0.2
Note: you may need to restart the kernel to use updated pac

### Importing Librabies

In [2]:
import pandas as pd
import re

## Reading Data

### Read Transactional Data and Non statistical exploration

Reading data and renaming columns as well as dropping uneeded columns

In [3]:
transactional_data = pd.read_csv("../DataSets/TransactionalData/Synthetic_Financial_datasets_log.csv")

transactional_data['nameOrig'] = transactional_data['nameOrig'].str[:3]
transactional_data['nameDest'] = transactional_data['nameDest'].apply(lambda x: x[:4] if x.startswith('M') else x)

print("Fraud: " + str(len(transactional_data[transactional_data['isFraud'] == 1])) + "; Fraud: " + str(len(transactional_data[transactional_data['isFraud'] == 0])))

unique_users_count = transactional_data['nameOrig'].nunique()
unique_merch_count = transactional_data[transactional_data['nameDest'].str.startswith('M')]['nameDest'].nunique()

print(f'Total distinct users: {unique_users_count}')
print(f'Total distinct merch: {unique_merch_count}')

user_transaction_count = transactional_data.groupby('nameOrig').size().reset_index(name='transaction_count')

# Sort by the number of transactions in descending order
sorted_users = user_transaction_count.sort_values(by='transaction_count', ascending=False)

top_10_users = sorted_users.head(10)
print(top_10_users)
#del transactional_data['oldbalanceDest']
#del transactional_data['newbalanceDest']



# Assuming your simulation starts at a specific date and time
start_datetime = '2023-01-01 00:00:00'  # Replace with your actual start date and time
steps = transactional_data['step'].max()  # Total simulation steps (hours)

# Create a date range for the number of steps, incrementing by 1 hour
date_range = pd.date_range(start=start_datetime, periods=steps, freq='1d')

# Total number of records
total_records = len(transactional_data)


# Assign datetime based on the step
transactional_data['transaction_date'] = transactional_data['step'].apply(lambda x: date_range[x-1])

# Display the first few rows of the DataFrame
print(transactional_data.head())

Fraud: 8213; Fraud: 6354407
Total distinct users: 90
Total distinct merch: 900
   nameOrig  transaction_count
9       C19             329946
3       C13             329826
5       C15             329675
7       C17             329263
8       C18             329263
1       C11             329094
10      C20             329079
6       C16             328934
4       C14             328476
2       C12             328381
   step      type    amount nameOrig  oldbalanceOrg  newbalanceOrig  \
0     1   PAYMENT   9839.64      C12       170136.0       160296.36   
1     1   PAYMENT   1864.28      C16        21249.0        19384.72   
2     1  TRANSFER    181.00      C13          181.0            0.00   
3     1  CASH_OUT    181.00      C84          181.0            0.00   
4     1   PAYMENT  11668.14      C20        41554.0        29885.86   

     nameDest  oldbalanceDest  newbalanceDest  isFraud  isFlaggedFraud  \
0        M197             0.0             0.0        0               0   
1    

Insure all the data is in correct format for exploration

In [4]:
import numpy as np
chars_to_remove = [' ', ',']

for char in chars_to_remove:
    transactional_data['amount'] = transactional_data['amount'].replace(char, '', regex=True)
    transactional_data['oldbalanceOrg'] = transactional_data['oldbalanceOrg'].replace(char, '', regex=True)
    transactional_data['newbalanceOrig'] = transactional_data['newbalanceOrig'].replace(char, '', regex=True)

transactional_data['amount'] = transactional_data['amount'].astype(np.float32)
transactional_data['oldbalanceOrg'] = transactional_data['oldbalanceOrg'].astype(np.float32)
transactional_data['newbalanceOrig'] = transactional_data['newbalanceOrig'].astype(np.float32)

sorted_data = data = transactional_data.sort_values(by=['nameOrig', 'transaction_date'])

In [5]:
import pandas as pd
import numpy as np
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack

df = transactional_data[transactional_data['isFraud'] == 0]

# Step 4: Convert `transaction_date` to datetime and extract day, month, and day of week as features
df.loc[:, 'transaction_date'] = pd.to_datetime(df['transaction_date'], format='%d-%b-%y')
df.loc[:, 'day_of_week'] = df['transaction_date'].dt.dayofweek  # Monday=0, Sunday=6
df.loc[:, 'month'] = df['transaction_date'].dt.month

# Step 5: Encode categorical variables (transaction_type and merchant)
le_merchant = LabelEncoder()
df.loc[:, 'nameDest'] = le_merchant.fit_transform(df['nameDest'].values)

le_transaction_type = LabelEncoder()
df.loc[:, 'type'] = le_transaction_type.fit_transform(df['type'].values)

print(df[['amount', 'oldbalanceOrg', 'newbalanceOrig']])
standard_scaler = StandardScaler()
df[['amount', 'oldbalanceOrg', 'newbalanceOrig']] = standard_scaler.fit_transform(df[['amount', 'oldbalanceOrg', 'newbalanceOrig']])

print(df[['amount', 'oldbalanceOrg', 'newbalanceOrig']])
print(df[['amount', 'oldbalanceOrg', 'newbalanceOrig']].dtypes)
print(df[['amount', 'oldbalanceOrg', 'newbalanceOrig']].isna().sum())

# Step 3: Prepare data for LSTM
def create_sequences(data, seq_length):
    sequences = []
    for i in range(len(data) - seq_length):
        seq = data[i:i + seq_length]
        sequences.append(seq)
    return np.array(sequences)

# Select features for LSTM
features = df[['amount', 'nameDest', 'oldbalanceOrg', 'newbalanceOrig', 'nameDest', 'type', 'step', 'isFlaggedFraud']].values
SEQ_LENGTH = 50
X = create_sequences(features, SEQ_LENGTH)

# Verify the shapes of the output
print("Shape of X (sequences):", X.shape)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, 'day_of_week'] = df['transaction_date'].dt.dayofweek  # Monday=0, Sunday=6
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, 'month'] = df['transaction_date'].dt.month


                amount  oldbalanceOrg  newbalanceOrig
0          9839.639648       170136.0   160296.359375
1          1864.280029        21249.0    19384.720703
4         11668.139648        41554.0    29885.859375
5          7817.709961        53860.0    46042.289062
6          7107.770020       183195.0   176087.234375
...                ...            ...             ...
6362319    8634.290039       518802.0   510167.718750
6362320  159188.218750         3859.0        0.000000
6362321  186273.843750       168046.0        0.000000
6362322   82096.453125        13492.0        0.000000
6362323    1864.239990        20426.0    18561.759766

[6354407 rows x 3 columns]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[['amount', 'oldbalanceOrg', 'newbalanceOrig']] = standard_scaler.fit_transform(df[['amount', 'oldbalanceOrg', 'newbalanceOrig']])


           amount  oldbalanceOrg  newbalanceOrig
0       -0.282367      -0.229532       -0.237838
1       -0.295743      -0.281101       -0.286013
4       -0.279300      -0.274068       -0.282423
5       -0.285758      -0.269806       -0.276900
6       -0.286948      -0.225009       -0.232440
...           ...            ...             ...
6362319 -0.284388      -0.108767       -0.118224
6362320 -0.031881      -0.287124       -0.292641
6362321  0.013546      -0.230256       -0.292641
6362322 -0.161179      -0.283788       -0.292641
6362323 -0.295743      -0.281386       -0.286295

[6354407 rows x 3 columns]
amount            float32
oldbalanceOrg     float32
newbalanceOrig    float32
dtype: object
amount            0
oldbalanceOrg     0
newbalanceOrig    0
dtype: int64
Shape of X (sequences): (6354357, 50, 8)


In [6]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, RepeatVector, TimeDistributed, Dense
from tensorflow.keras.optimizers import Adam
from sklearn.preprocessing import MinMaxScaler


# Define LSTM Autoencoder structure
def create_lstm_autoencoder(timesteps, n_features, latent_dim):
    # Output
    inputs = Input(shape=(timesteps, n_features))

    # Encoder
    encoded = LSTM(128, activation='relu', return_sequences=True)(inputs)
    encoded = LSTM(64, activation='relu', return_sequences=False)(encoded)

    latent = Dense(latent_dim, activation='relu')(encoded)

    # Decoder
    decoded = RepeatVector(timesteps)(latent)
    decoded = LSTM(64, activation='relu', return_sequences=True)(decoded)
    decoded = LSTM(128, activation='relu', return_sequences=True)(decoded)
    output = TimeDistributed(Dense(n_features))(decoded)

    # Full autoencoder model
    autoencoder = Model(inputs, output)
    return autoencoder



In [8]:
# Instantiate and compile the model
latent_dim = 32
global_autoencoder = create_lstm_autoencoder(SEQ_LENGTH, X.shape[2], latent_dim)
global_autoencoder.compile(optimizer=Adam(learning_rate=0.001), loss='mse')

# If using a Pandas DataFrame, convert the DataFrame to a NumPy array and ensure numeric types
X = X.astype(np.float32)

# Assuming 'global_transaction_data' is the dataset for general training (preprocesse
# Train the global model
global_autoencoder.fit(X, X, epochs=5, batch_size=128, validation_split=0.2)

Epoch 1/5
[1m 1315/39715[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m30:24:37[0m 3s/step - loss: nan

KeyboardInterrupt: 

In [12]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from tensorflow import keras
from keras import layers

# Load your DataFrame here (e.g., df = pd.read_csv('your_data.csv')) 
df = transactional_data[transactional_data['isFraud'] == 0]

# Step 1: TF-IDF Vectorization for transaction details
tfidf_vectorizer = TfidfVectorizer(max_features=1000)

# Step 2: Handle missing values

# Step 3: Convert `transaction_date` to datetime and extract features
df['transaction_date'] = pd.to_datetime(df['transaction_date'], format='%d-%b-%y')
df['day_of_week'] = df['transaction_date'].dt.dayofweek  # Monday=0, Sunday=6
df['month'] = df['transaction_date'].dt.month

# Step 4: Encode categorical variables
le_merchant = LabelEncoder()
df['nameDest'] = le_merchant.fit_transform(df['nameDest'])

le_transaction_type = LabelEncoder()
df['type'] = le_transaction_type.fit_transform(df['type'])

# Step 5: Combine features into a single dataset
features = ['type', 'amount', 'oldbalanceOrg', 'day_of_week', 'month', 'nameDest', 'isFlaggedFraud', 'isFlaggedFraud' ]
X = df[features].values  # Convert to NumPy array
#X_tfidf = tfidf_df.values  # Convert TF-IDF DataFrame to NumPy array
#X_combined = np.hstack((X, X_tfidf))  # Combine features

# Step 6: Scale the data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Step 7: Split the data into training and testing sets
X_train, X_test = train_test_split(X_scaled, test_size=0.2, random_state=42)

# Step 8: Define the Autoencoder model
input_dim = X_train.shape[1]  # This should be 1007
print("Input dimension for Autoencoder:", input_dim)

autoencoder = keras.Sequential([
    layers.Input(shape=(input_dim,)),  # Input layer
    layers.Dense(64, activation='relu'),  # Encoder layer
    layers.Dense(32, activation='relu'),  # Bottleneck layer
    layers.Dense(64, activation='relu'),  # Decoder layer
    layers.Dense(input_dim, activation='sigmoid')  # Output layer
])

# Step 9: Compile the model
autoencoder.compile(optimizer='adam', loss='mean_squared_error')

# Step 10: Train the Autoencoder
autoencoder.fit(X_train, X_train, epochs=5, batch_size=16, validation_split=0.1, verbose=1)

# Step 11: Make predictions and calculate reconstruction error
reconstructed = autoencoder.predict(X_test)
mse = np.mean(np.power(X_test - reconstructed, 2), axis=1)
threshold = np.percentile(mse, 95)  # Define a threshold for anomaly detection

# Step 12: Identify anomalies
anomalies = mse > threshold

# Step 13: Output results
#df_test = pd.DataFrame(X_test, columns=[*features, *tfidf_vectorizer.get_feature_names_out()])
df_test = pd.DataFrame(X_test, columns=[*features])
df_test['mse'] = mse
df_test['anomaly'] = anomalies

print(df_test[['mse', 'anomaly']])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['transaction_date'] = pd.to_datetime(df['transaction_date'], format='%d-%b-%y')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['day_of_week'] = df['transaction_date'].dt.dayofweek  # Monday=0, Sunday=6
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['month'] = df['transaction_date'].dt.mont

Input dimension for Autoencoder: 8
Epoch 1/5
[1m285949/285949[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m151s[0m 525us/step - loss: 0.4569 - val_loss: 0.4605
Epoch 2/5
[1m285949/285949[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m157s[0m 550us/step - loss: 0.4569 - val_loss: 0.4604
Epoch 3/5
[1m285949/285949[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m179s[0m 624us/step - loss: 0.4549 - val_loss: 0.4604
Epoch 4/5
[1m285949/285949[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m164s[0m 572us/step - loss: 0.4529 - val_loss: 0.4604
Epoch 5/5
[1m285949/285949[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m157s[0m 550us/step - loss: 0.4552 - val_loss: 0.4604
[1m39716/39716[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 387us/step
              mse  anomaly
0        0.308623    False
1        0.069563    False
2        0.053525    False
3        0.057247    False
4        0.269340    False
...           ...      ...
1270877  0.147428    False
1270878  1.510477     

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from keras.models import load_model

# Step 1: Load the original dataset and fine-tuned user data
# Assuming 'df' is your original dataset and 'user_df' is the DataFrame for the specific user
# Example user_df could be something like:
user_df = df[df['nameOrig'] == 'C1603']

print(len(user_df))
# Step 5: Combine features into a single dataset
features = ['type', 'amount', 'oldbalanceOrg', 'day_of_week', 'month', 'nameDest', 'isFlaggedFraud' ]
user_X = user_df[features].values

# Step 3: Load the trained model

# Step 4: Fine-tune the model on user data
# You can train with a lower learning rate or additional epochs
autoencoder.fit(user_X, user_X, epochs=10, batch_size=16, validation_split=0.1, verbose=1)

# Step 5: Evaluate the fine-tuned model on user data
user_reconstructed = autoencoder.predict(user_X)
user_mse = np.mean(np.power(user_X - user_reconstructed, 2), axis=1)

# Adding MSE to user DataFrame
user_df['mse'] = user_mse
user_df['anomaly'] = user_df['mse'] > threshold  # Set a threshold based on your previous evaluation

# Step 6: Analyze results
print(user_df[['transaction_date', 'mse', 'anomaly']])

In [None]:
# Function to detect anomalies
def detect_anomalies(data, model, threshold):
    # Scale the input data
    data_scaled = scaler.transform(data)
    # Get reconstructed data
    reconstructed = model.predict(data_scaled)
    # Calculate reconstruction errors
    reconstruction_errors = np.mean(np.square(data_scaled - reconstructed), axis=1)
    
    # Identify anomalies
    anomalies = reconstruction_errors > threshold
    return anomalies, reconstruction_errors

# Set a threshold for anomalies (could be determined based on validation set)
threshold = 0.05  # Adjust this based on your validation results

# Use the global model to detect anomalies in the entire dataset
anomalies_global, reconstruction_errors_global = detect_anomalies(X_scaled, autoencoder, threshold)

# Use the fine-tuned model to detect anomalies in user-specific data
anomalies_user, reconstruction_errors_user = detect_anomalies(X_user_scaled, autoencoder, threshold)

# Add results to the DataFrame for better understanding
df['anomaly_global'] = anomalies_global
df['anomaly_user'] = np.nan  # Placeholder for user-specific anomalies
df.loc[df['user'] == 'user_1', 'anomaly_user'] = anomalies_user

# View the results
print(df.head(10))

exploration of users