# Data Exploration

## Setup

### Downloading Librabies

In [1]:
%pip install pandas
%pip install spacy
%pip install nltk
%pip install scikit-learn
%pip install tensorflow

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.
Defaulting to user installation because normal site-packages is not writeable
Collecting numpy>=1.19.0 (from spacy)
  Using cached numpy-2.0.2-cp39-cp39-macosx_14_0_arm64.whl.metadata (60 kB)
Using cached numpy-2.0.2-cp39-cp39-macosx_14_0_arm64.whl (5.3 MB)
Installing collected packages: numpy
  Attempting uninstall: numpy
    Found existing installation: numpy 1.26.4
    Uninstalling numpy-1.26.4:
      Successfully uninstalled numpy-1.26.4
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
tensorflow 2.17.0 requires numpy<2.0.0,>=1.23.5; python_version <= "3.11", but you have numpy 2.0.2 which is incompatible.[0m[31m
[0mSuccessfully installed numpy-2.0.2
Note: you may need to restart the kernel to use updated pac

### Importing Librabies

In [2]:
import pandas as pd
import re

## Reading Data

### Read Transactional Data and Non statistical exploration

Reading data and renaming columns as well as dropping uneeded columns

In [3]:
transactional_data = pd.read_csv("../DataSets/TransactionalData/bank.csv")

del transactional_data['CHQ.NO.']
del transactional_data['VALUE DATE']
del transactional_data['.']
#del transactional_data[' DEPOSIT AMT ']

transactional_data = transactional_data.rename(columns={
    "Account No": "user",
    "DATE": "transaction_date",
    "TRANSACTION DETAILS": "transaction_details",
    " WITHDRAWAL AMT ": "money_out",
    " DEPOSIT AMT ": "money_in",
    "BALANCE AMT": "rolling_balance"
})

Insure all the data is in correct format for exploration

In [4]:
chars_to_remove = [' ', ',']

for char in chars_to_remove:
    transactional_data['money_in'] = transactional_data['money_in'].replace(char, '', regex=True)
    transactional_data['money_out'] = transactional_data['money_out'].replace(char, '', regex=True)
    transactional_data['rolling_balance'] = transactional_data['rolling_balance'].replace(char, '', regex=True)

transactional_data['money_in'] = transactional_data['money_in'].astype(float)
transactional_data['money_out'] = transactional_data['money_out'].astype(float)
transactional_data['rolling_balance'] = transactional_data['rolling_balance'].astype(float)

sorted_data = data = transactional_data.sort_values(by=['user', 'transaction_date'])

rename users and get rid of senstive data

In [5]:
# Get the unique users
unique_users = transactional_data['user'].unique()

# Create a mapping from old usernames to p_1
user_mapping = {old_user: f'p_{i}' for i, old_user in enumerate(unique_users, start=1)}

# Replace the old usernames in the 'users' column with the new usernames
transactional_data['user'] = transactional_data['user'].map(user_mapping)

# focus is on money going out of the account
#transactional_data = transactional_data[transactional_data['money_out'].notna()]
len(transactional_data[transactional_data['transaction_details'].str.contains('transfe', case=False, na=False)])

12174

In [6]:
# Infer transaction types

def extract_transaction_type(details):
    details = details.lower()
    if 'deposit' in details:
        return "deposit"
    elif 'cashdep' in details:
        return "cash_deposit"
    elif 'transfe' or 'trf' in details:
        return "transfer"
    elif 'eft' in details:
        return "eft"
    elif 'payment' in details:
        return "payment"
    elif 'fees' in details :
        return "fees"
    elif 'pos' in details :
        return "card_purchase"
    else:
        return "other"
    
merchants = []

def extract_merchant(details, merchants_list):
    match = re.search(r'\b(?:To|From)\s+([A-Za-z ]+)', details, re.IGNORECASE)
    
    if match:
        merchant = match.group(1).strip()  # Extract merchant name
        if merchant not in merchants_list:
            merchants_list.append(merchant)  # Add to merchant list dynamically


transactional_data['transaction_type'] = transactional_data.apply(lambda row: extract_transaction_type(str(row['transaction_details'])), axis=1)

extract_merchant(str(transactional_data['transaction_details']), merchants)

# Sort merchants by length (longest first)
merchants_sorted = sorted(merchants, key=len, reverse=True)

# Now, iterate over the sorted list of merchants for the second pass
def refine_merchant(details, merchants_list):
    for merchant in merchants_list:
        if merchant.lower() in details.lower():  # Case-insensitive match
            return merchant
    return 'Unknown'

# Second pass: Apply the function again using the sorted merchant list
transactional_data['merchant'] = transactional_data['transaction_details'].apply(lambda row: refine_merchant(str(row), merchants_sorted))

len(transactional_data[transactional_data['merchant'].str.contains('unknown', case=False, na=False)]) 

#len(transactional_data)

transactional_data

Unnamed: 0,user,transaction_date,transaction_details,money_out,money_in,rolling_balance,transaction_type,merchant
0,p_1,29-Jun-17,TRF FROM Indiaforensic SERVICES,,1000000.0,1.000000e+06,transfer,Indiaforensic SERVICES
1,p_1,5-Jul-17,TRF FROM Indiaforensic SERVICES,,1000000.0,2.000000e+06,transfer,Indiaforensic SERVICES
2,p_1,18-Jul-17,FDRL/INTERNAL FUND TRANSFE,,500000.0,2.500000e+06,transfer,Unknown
3,p_1,1-Aug-17,TRF FRM Indiaforensic SERVICES,,3000000.0,5.500000e+06,transfer,Indiaforensic SERVICES
4,p_1,16-Aug-17,FDRL/INTERNAL FUND TRANSFE,,500000.0,6.000000e+06,transfer,Unknown
...,...,...,...,...,...,...,...,...
116196,p_10,5-Mar-19,TRF TO 1196428 Indiaforensic SE,117934.30,,-1.901902e+09,transfer,Unknown
116197,p_10,5-Mar-19,FDRL/INTERNAL FUND TRANSFE,,300000.0,-1.901602e+09,transfer,Unknown
116198,p_10,5-Mar-19,FDRL/INTERNAL FUND TRANSFE,,300000.0,-1.901302e+09,transfer,Unknown
116199,p_10,5-Mar-19,IMPS 05-03-20194C,109868.65,,-1.901412e+09,transfer,Unknown


In [7]:
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Assuming you have nltk installed and downloaded stopwords and wordnet
# !pip install nltk
# import nltk
# nltk.download('stopwords')
# nltk.download('wordnet')

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    # Lowercase
    text = str(text).lower()
    # Remove punctuation and special characters
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Tokenization
    tokens = text.split()
    # Remove stopwords and lemmatize
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    return ' '.join(tokens)


df = transactional_data

# Example usage
df['transaction_details'] = df['transaction_details'].apply(preprocess_text)

df

Unnamed: 0,user,transaction_date,transaction_details,money_out,money_in,rolling_balance,transaction_type,merchant
0,p_1,29-Jun-17,trf indiaforensic service,,1000000.0,1.000000e+06,transfer,Indiaforensic SERVICES
1,p_1,5-Jul-17,trf indiaforensic service,,1000000.0,2.000000e+06,transfer,Indiaforensic SERVICES
2,p_1,18-Jul-17,fdrlinternal fund transfe,,500000.0,2.500000e+06,transfer,Unknown
3,p_1,1-Aug-17,trf frm indiaforensic service,,3000000.0,5.500000e+06,transfer,Indiaforensic SERVICES
4,p_1,16-Aug-17,fdrlinternal fund transfe,,500000.0,6.000000e+06,transfer,Unknown
...,...,...,...,...,...,...,...,...
116196,p_10,5-Mar-19,trf indiaforensic se,117934.30,,-1.901902e+09,transfer,Unknown
116197,p_10,5-Mar-19,fdrlinternal fund transfe,,300000.0,-1.901602e+09,transfer,Unknown
116198,p_10,5-Mar-19,fdrlinternal fund transfe,,300000.0,-1.901302e+09,transfer,Unknown
116199,p_10,5-Mar-19,imp c,109868.65,,-1.901412e+09,transfer,Unknown


In [8]:
# import pandas as pd
# import numpy as np
# from sklearn.ensemble import IsolationForest
# from sklearn.preprocessing import LabelEncoder
# from sklearn.feature_extraction.text import TfidfVectorizer
# from scipy.sparse import hstack
 
# # Step 1: Initialize the TF-IDF Vectorizer
# tfidf_vectorizer = TfidfVectorizer(max_features=1000)
# X_tfidf_dense = tfidf_vectorizer.fit_transform(df['transaction_details']).toarray()
# tfidf_df = pd.DataFrame(X_tfidf_dense, columns=tfidf_vectorizer.get_feature_names_out())


# # Step 3: Handle missing values
# df['money_out'] = df['money_out'].fillna(0)
# df['money_in'] = df['money_in'].fillna(0)

# # Step 4: Convert `transaction_date` to datetime and extract day, month, and day of week as features
# df['transaction_date'] = pd.to_datetime(df['transaction_date'], format='%d-%b-%y')
# df['day_of_week'] = df['transaction_date'].dt.dayofweek  # Monday=0, Sunday=6
# df['month'] = df['transaction_date'].dt.month

# # Step 5: Encode categorical variables (transaction_type and merchant)
# le_merchant = LabelEncoder()
# df['merchant_encoded'] = le_merchant.fit_transform(df['merchant'].values)

# le_transaction_type = LabelEncoder()
# df['transaction_type_encoded'] = le_transaction_type.fit_transform(df['transaction_type'].values)

# # Step 2: Concatenate the original DataFrame with the TF-IDF DataFrame
# # Resetting index to avoid misalignment
# df = pd.concat([df.reset_index(drop=True), tfidf_df.reset_index(drop=True)], axis=1)

# # Step 6: Select relevant features for Isolation Forest
# # Combine TF-IDF features with numerical and categorical features
# features = ['money_out', 'money_in', 'rolling_balance', 'day_of_week', 'month', 'merchant_encoded', 'transaction_type_encoded']
# X = df[features].values  # Convert to NumPy array for consistency
# X_tfidf = tfidf_df.values  # Convert TF-IDF DataFrame to NumPy array

# # Combine numerical features with TF-IDF features
# X_combined = np.hstack((X, X_tfidf))

# # Step 7: Fit Isolation Forest
# iso_forest = IsolationForest(contamination=0.05, random_state=42)
# df['fraud_prediction'] = iso_forest.fit_predict(X_combined)

# # Step 8: Interpret results
# print(df[['transaction_date', 'fraud_prediction']])
# fraud_counts = df['fraud_prediction'].value_counts()

# # Print the results in a readable format
# print(f"Normal transactions: {fraud_counts[1]}")
# print(f"Fraudulent transactions: {fraud_counts[-1]}")

# # Optionally, print details of potential fraudulent transactions
# fraudulent_transactions = df[df['fraud_prediction'] == -1]
# print(f"\nDetails of potential fraudulent transactions:\n{fraudulent_transactions}")


In [12]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from tensorflow import keras
from keras import layers

# Load your DataFrame here (e.g., df = pd.read_csv('your_data.csv'))

# Step 1: TF-IDF Vectorization for transaction details
tfidf_vectorizer = TfidfVectorizer(max_features=1000)
X_tfidf_dense = tfidf_vectorizer.fit_transform(df['transaction_details']).toarray()
tfidf_df = pd.DataFrame(X_tfidf_dense, columns=tfidf_vectorizer.get_feature_names_out())

# Step 2: Handle missing values
df['money_out'] = df['money_out'].fillna(0)  # Fill NaNs for money_out
df['money_in'] = df['money_in'].fillna(0)    # Fill NaNs for money_in

# Step 3: Convert `transaction_date` to datetime and extract features
df['transaction_date'] = pd.to_datetime(df['transaction_date'], format='%d-%b-%y')
df['day_of_week'] = df['transaction_date'].dt.dayofweek  # Monday=0, Sunday=6
df['month'] = df['transaction_date'].dt.month

# Step 4: Encode categorical variables
le_merchant = LabelEncoder()
df['merchant_encoded'] = le_merchant.fit_transform(df['merchant'])

le_transaction_type = LabelEncoder()
df['transaction_type_encoded'] = le_transaction_type.fit_transform(df['transaction_type'])

# Step 5: Combine features into a single dataset
features = ['money_out', 'money_in', 'rolling_balance', 'day_of_week', 'month', 'merchant_encoded', 'transaction_type_encoded']
X = df[features].values  # Convert to NumPy array
X_tfidf = tfidf_df.values  # Convert TF-IDF DataFrame to NumPy array
X_combined = np.hstack((X, X_tfidf))  # Combine features

# Step 6: Scale the data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Step 7: Split the data into training and testing sets
X_train, X_test = train_test_split(X_scaled, test_size=0.2, random_state=42)

# Step 8: Define the Autoencoder model
input_dim = X_train.shape[1]  # This should be 1007
print("Input dimension for Autoencoder:", input_dim)

autoencoder = keras.Sequential([
    layers.Input(shape=(input_dim,)),  # Input layer
    layers.Dense(64, activation='relu'),  # Encoder layer
    layers.Dense(32, activation='relu'),  # Bottleneck layer
    layers.Dense(64, activation='relu'),  # Decoder layer
    layers.Dense(input_dim, activation='sigmoid')  # Output layer
])

# Step 9: Compile the model
autoencoder.compile(optimizer='adam', loss='mean_squared_error')

# Step 10: Train the Autoencoder
autoencoder.fit(X_train, X_train, epochs=5, batch_size=16, validation_split=0.1, verbose=1)

# Step 11: Make predictions and calculate reconstruction error
reconstructed = autoencoder.predict(X_test)
mse = np.mean(np.power(X_test - reconstructed, 2), axis=1)
threshold = np.percentile(mse, 95)  # Define a threshold for anomaly detection

# Step 12: Identify anomalies
anomalies = mse > threshold

# Step 13: Output results
#df_test = pd.DataFrame(X_test, columns=[*features, *tfidf_vectorizer.get_feature_names_out()])
df_test = pd.DataFrame(X_test, columns=[*features])
df_test['mse'] = mse
df_test['anomaly'] = anomalies

print(df_test[['mse', 'anomaly']])

Input dimension for Autoencoder: 7
Epoch 1/5
[1m5229/5229[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 446us/step - loss: 0.7188 - val_loss: 0.6491
Epoch 2/5
[1m5229/5229[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 444us/step - loss: 0.6853 - val_loss: 0.6489
Epoch 3/5
[1m5229/5229[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 436us/step - loss: 0.7241 - val_loss: 0.6488
Epoch 4/5
[1m5229/5229[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 445us/step - loss: 0.6392 - val_loss: 0.6498
Epoch 5/5
[1m5229/5229[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 435us/step - loss: 0.7058 - val_loss: 0.6491
[1m727/727[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 347us/step
            mse  anomaly
0      0.259718    False
1      1.140601    False
2      0.709279    False
3      2.809497     True
4      1.089082    False
...         ...      ...
23236  0.333858    False
23237  2.109684     True
23238  0.084932    False
23239  0.161488    Fals

In [16]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from keras.models import load_model

# Step 1: Load the original dataset and fine-tuned user data
# Assuming 'df' is your original dataset and 'user_df' is the DataFrame for the specific user
# Example user_df could be something like:
user_df = df[df['user'] == 'p_1']

# Step 2: Preprocess user data (same as before)
# TF-IDF Vectorization
tfidf_vectorizer = TfidfVectorizer(max_features=1000)  # Use the same vectorizer settings
user_X_tfidf_dense = tfidf_vectorizer.fit_transform(user_df['transaction_details']).toarray()
user_tfidf_df = pd.DataFrame(user_X_tfidf_dense, columns=tfidf_vectorizer.get_feature_names_out())

# Concatenate with original user DataFrame
user_df = pd.concat([user_df.reset_index(drop=True), user_tfidf_df.reset_index(drop=True)], axis=1)

# Encode categorical variables (using same label encoders)
user_df['merchant_encoded'] = le_merchant.transform(user_df['merchant'])
user_df['transaction_type_encoded'] = le_transaction_type.transform(user_df['transaction_type'])

# Select relevant features for the user
user_features = ['money_out', 'money_in', 'rolling_balance', 'day_of_week', 'month',
                 'merchant_encoded', 'transaction_type_encoded'] #+ list(user_tfidf_df.columns)
user_X = user_df[user_features]

# Step 3: Load the trained model

# Step 4: Fine-tune the model on user data
# You can train with a lower learning rate or additional epochs
autoencoder.fit(user_X, user_X, epochs=10, batch_size=5, validation_split=0.1, verbose=1)

# Step 5: Evaluate the fine-tuned model on user data
user_reconstructed = autoencoder.predict(user_X)
user_mse = np.mean(np.power(user_X - user_reconstructed, 2), axis=1)

# Adding MSE to user DataFrame
user_df['mse'] = user_mse
user_df['anomaly'] = user_df['mse'] > threshold  # Set a threshold based on your previous evaluation

# Step 6: Analyze results
print(user_df[['transaction_date', 'mse', 'anomaly']])

Epoch 1/10
[1m197/197[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 588061016064.0000 - val_loss: 262004031488.0000
Epoch 2/10
[1m197/197[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 520263991296.0000 - val_loss: 262004031488.0000
Epoch 3/10
[1m197/197[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 538233569280.0000 - val_loss: 262004031488.0000
Epoch 4/10
[1m197/197[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 451992092672.0000 - val_loss: 262004031488.0000
Epoch 5/10
[1m197/197[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 629567848448.0000 - val_loss: 262004031488.0000
Epoch 6/10
[1m197/197[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 592128245760.0000 - val_loss: 262004031488.0000
Epoch 7/10
[1m197/197[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 545160691712.0000 - val_loss: 262004031488.0000
Epoch 

In [None]:
# Function to detect anomalies
def detect_anomalies(data, model, threshold):
    # Scale the input data
    data_scaled = scaler.transform(data)
    # Get reconstructed data
    reconstructed = model.predict(data_scaled)
    # Calculate reconstruction errors
    reconstruction_errors = np.mean(np.square(data_scaled - reconstructed), axis=1)
    
    # Identify anomalies
    anomalies = reconstruction_errors > threshold
    return anomalies, reconstruction_errors

# Set a threshold for anomalies (could be determined based on validation set)
threshold = 0.05  # Adjust this based on your validation results

# Use the global model to detect anomalies in the entire dataset
anomalies_global, reconstruction_errors_global = detect_anomalies(X_scaled, autoencoder, threshold)

# Use the fine-tuned model to detect anomalies in user-specific data
anomalies_user, reconstruction_errors_user = detect_anomalies(X_user_scaled, autoencoder, threshold)

# Add results to the DataFrame for better understanding
df['anomaly_global'] = anomalies_global
df['anomaly_user'] = np.nan  # Placeholder for user-specific anomalies
df.loc[df['user'] == 'user_1', 'anomaly_user'] = anomalies_user

# View the results
print(df.head(10))

exploration of users

In [None]:
users = transactional_data.groupby('user').aggregate(
    total_money_in=('money_in', 'sum'),
    total_money_out=('money_out', 'sum'),
    initial_balance=('rolling_balance', 'first'),
    final_balance=('rolling_balance', 'last'),
)

X_combined

### Read User behaviors and Non statistical exploration

#### Typing Data

In [None]:
key_stroke = pd.read_csv("../DataSets/UserBehaviors/Keystroke/free-text.csv")

key_stroke = key_stroke.iloc[:, :-1]

# Get the unique user IDs
first_10_user_ids = key_stroke['participant'].unique()[:10]

# Filter the data for the first 10 unique users
first_10_users_data = key_stroke[key_stroke['participant'].isin(first_10_user_ids)]

# Drop unneeded columns
first_10_users_data = first_10_users_data.drop(first_10_users_data.columns[-1], axis=1)

# Display head
key_stroke.isnull().sum()

#### Mouse movement && Session Info

In [None]:
session_info = pd.read_csv("../DataSets/UserBehaviors/mousedynamics/EVTRACKINFO/EVTRACKINFO.csv", sep='\t')
mouse_movements = pd.read_csv("../DataSets/UserBehaviors/mousedynamics/EVTRACKTRACK/EVTRACKTRACK.csv", sep='\t')

# Drop unneeded columns
del session_info['_id']
del mouse_movements['_id']
del mouse_movements['cursor']


# Filter uneed data
mouse_data = mouse_movements[mouse_movements['event'].str.contains('mouse', case=False, na=False)]

# Display head
mouse_data

mouse_data.isnull().sum()

## Looking for missing values