# **1. Import Required Libraries**

In [3]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity, linear_kernel
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from scipy.sparse import csr_matrix
from sklearn.metrics import precision_score, recall_score, f1_score, mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.ensemble import IsolationForest
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import LabelEncoder
from collections import defaultdict

# **2. Load Dataset**

In [4]:
# Load dataset
# Replace 'your_dataset.csv' with your actual file
df = pd.read_csv("Bank_Transaction_Fraud_Detection.csv")

# Quick look at dataset
print(df.head())
print(df.shape)

                            Customer_ID        Customer_Name  Gender  Age  \
0  d5f6ec07-d69e-4f47-b9b4-7c58ff17c19e           Osha Tella    Male   60   
1  7c14ad51-781a-4db9-b7bd-67439c175262      Hredhaan Khosla  Female   51   
2  3a73a0e5-d4da-45aa-85f3-528413900a35       Ekani Nazareth    Male   20   
3  7902f4ef-9050-4a79-857d-9c2ea3181940  Yamini Ramachandran  Female   57   
4  3a4bba70-d9a9-4c5f-8b92-1735fd8c19e9         Kritika Rege  Female   43   

         State                City                Bank_Branch Account_Type  \
0       Kerala  Thiruvananthapuram  Thiruvananthapuram Branch      Savings   
1  Maharashtra              Nashik              Nashik Branch     Business   
2        Bihar           Bhagalpur           Bhagalpur Branch      Savings   
3   Tamil Nadu             Chennai             Chennai Branch     Business   
4       Punjab            Amritsar            Amritsar Branch      Savings   

                         Transaction_ID Transaction_Date  ...  \
0  

# **Insight**
This dataset contains detailed records of 200,000 bank customers and their transactions, including demographic information (Customer_ID, Name, Gender, Age), location data (State, City, Bank_Branch), account details (Account_Type, Account_Balance), transaction specifics (Transaction_ID, Date, Merchant_Category, Description, Device, Location, Currency), and contact information (Customer_Contact, Email). It captures both behavioral and contextual aspects of banking activity, such as the type of device used, transaction location, and fraud indicators, making it suitable for applications like fraud detection, recommender systems, and customer behavior analysis.

# **Encode categorical columns**

In [5]:
le_type = LabelEncoder()
df["type_encoded"] = le_type.fit_transform(df["Transaction_Description"])

le_nameOrig = LabelEncoder()
df["nameOrig_encoded"] = le_nameOrig.fit_transform(df["Customer_ID"])

le_nameDest = LabelEncoder()
df["nameDest_encoded"] = le_nameDest.fit_transform(df["Merchant_Category"])

le_nameDest = LabelEncoder()
df["amount_encoded"] = le_nameDest.fit_transform(df["Account_Balance"])

le_nameDest = LabelEncoder()
df["age_encoded"] = le_nameDest.fit_transform(df["Age"])

# **Balance dataset with SMOTE**

In [6]:
# Define features and target
X = df[["Account_Balance", "Age", "type_encoded"]]
y = df["Is_Fraud"]

# Apply SMOTE for balancing
smote = SMOTE(sampling_strategy="auto", random_state=42)
X_res, y_res = smote.fit_resample(X, y)

# Create balanced dataframe
df_balanced = pd.DataFrame(X_res, columns=X.columns)
df_balanced["isFraud"] = y_res

# Map back user and transaction context
df_balanced["Customer_ID"] = np.random.choice(df["Customer_ID"].unique(), size=len(df_balanced))
df_balanced["transaction_desc"] = np.random.choice(["ATM withdrawal", "Credit card payment", "Bitcoin transaction"], size=len(df_balanced))
df_balanced["context"] = np.random.choice(["Morning", "Afternoon", "Evening"], size=len(df_balanced))

# **Fraud Detection Filter**

In [7]:
fraud_model = IsolationForest(contamination=0.05, random_state=42)
fraud_model.fit(X_res)
y_pred = fraud_model.predict(X_res)
y_pred = [1 if x == -1 else 0 for x in y_pred]

print("Fraud Detection Evaluation:")
print(classification_report(y_res, y_pred))
print("ROC-AUC:", roc_auc_score(y_res, y_pred))

Fraud Detection Evaluation:
              precision    recall  f1-score   support

           0       0.49      0.93      0.64    189912
           1       0.27      0.03      0.05    189912

    accuracy                           0.48    379824
   macro avg       0.38      0.48      0.34    379824
weighted avg       0.38      0.48      0.34    379824

ROC-AUC: 0.47731580942752433


# **RS Dataset Preparation**

In [8]:
# Use balanced dataset directly
df_rs = df_balanced.sample(n=60000, random_state=42)
users = df_rs["Customer_ID"].unique()

# Train-test split per user
train_data = []
test_data = []

for user in users:
    user_data = df_rs[df_rs["Customer_ID"] == user]
    if len(user_data) < 2:  # skip users with too few transactions
        continue

    train, test = train_test_split(user_data, test_size=0.2, random_state=42)
    train_data.append(train)
    test_data.append(test)

# Combine into DataFrames
train_df = pd.concat(train_data)
test_df = pd.concat(test_data)

print("Train size:", train_df.shape)
print("Test size:", test_df.shape)
print("Unique users in train:", train_df["Customer_ID"].nunique())
print("Unique users in test:", test_df["Customer_ID"].nunique())

Train size: (8108, 7)
Test size: (7350, 7)
Unique users in train: 7350
Unique users in test: 7350


# **3. Content-Based Filtering (TF-IDF Similarity)**

In [9]:
# Content-based filtering (TF-IDF on transaction description)
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(train_df["transaction_desc"].unique())
content_sim = cosine_similarity(tfidf_matrix)

content_sim_df = pd.DataFrame(
    content_sim,
    index=train_df["transaction_desc"].unique(),
    columns=train_df["transaction_desc"].unique()
)
print("Content Similarity Table:")
display(content_sim_df.head())

Content Similarity Table:


Unnamed: 0,Credit card payment,ATM withdrawal,Bitcoin transaction
Credit card payment,1.0,0.0,0.0
ATM withdrawal,0.0,1.0,0.0
Bitcoin transaction,0.0,0.0,1.0


In [10]:
def content_rs(user_id, top_n=5):
    user_items = train_df[train_df["Customer_ID"] == user_id]["transaction_desc"].values
    scores = np.zeros(len(train_df))
    for i, item in enumerate(train_df["transaction_desc"]):
        scores[i] = sum(content_sim_df.loc[item, user_item] for user_item in user_items)
    top_idx = np.argsort(scores)[::-1][:top_n]
    return train_df["transaction_desc"].iloc[top_idx].values

# **4. Collaborative Filtering**

In [11]:
# Collaborative filtering (user-item matrix)
user_item_matrix = pd.crosstab(train_df["Customer_ID"], train_df["transaction_desc"])
collab_sim = cosine_similarity(user_item_matrix)
collab_sim_df = pd.DataFrame(collab_sim, index=user_item_matrix.index, columns=user_item_matrix.index)
print("Collaborative Similarity Table:")
display(collab_sim_df.head())

Collaborative Similarity Table:


Customer_ID,00062e09-e953-459b-b7f4-d621a21cd18e,000944b9-6e1a-4a30-a204-097c9eea03ff,000a7364-346c-4b53-bccb-a9c243a72731,000b2a9f-3055-4db8-88e1-04c43b78922e,0019a50e-551b-4686-a55e-7af7898295ea,002517a7-8d95-4719-a046-fee5dcf07c58,00268e74-0db6-4731-9d7b-1f7aadc57ba0,00304910-066c-417f-b994-4ca12e1ead34,00325227-03a2-4f82-bb94-af96853b0aed,0047737e-4e1e-4c2d-b5ee-beb1b78025ae,...,ff88e65c-010f-4eee-bbd6-7cc9a687a89c,ffaf46b5-caf7-4cf7-a42a-bf203e0068f7,ffc70a3c-e106-41c8-8e5b-c788ad3cd749,ffcd0a49-cbe9-42d1-bdfb-b418aae36ae5,ffd0065c-7c65-4d3f-9552-55984c9abb7c,ffd32030-f24a-4c3a-a127-e58d9e843400,ffd9ea17-41a0-4406-8519-787cb6aa6219,ffe0f0ae-7b31-40b6-8439-223b584835f1,ffe9db69-6987-4d18-bc33-1c26fbe7fe9a,ffec59cc-7bd7-43d7-bbab-aec753f081aa
Customer_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
00062e09-e953-459b-b7f4-d621a21cd18e,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0
000944b9-6e1a-4a30-a204-097c9eea03ff,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,...,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
000a7364-346c-4b53-bccb-a9c243a72731,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0
000b2a9f-3055-4db8-88e1-04c43b78922e,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
0019a50e-551b-4686-a55e-7af7898295ea,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,...,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0


In [12]:
def collaborative_rs(user_id, top_n=5):
    if user_id not in user_item_matrix.index:
        return []
    user_idx = list(user_item_matrix.index).index(user_id)
    user_sim_scores = collab_sim[user_idx]
    item_scores = np.zeros(user_item_matrix.shape[1])
    for i, other_user in enumerate(user_item_matrix.index):
        item_scores += user_sim_scores[i] * user_item_matrix.iloc[i].values
    top_idx = np.argsort(item_scores)[::-1][:top_n]
    return user_item_matrix.columns[top_idx].values

# 5. Context-Aware Recommender System

In [13]:
# Context similarity (one-hot on context)
context_ohe = pd.get_dummies(train_df["context"])
context_sim = cosine_similarity(context_ohe)
context_sim_df = pd.DataFrame(context_sim, index=train_df["Customer_ID"], columns=train_df["Customer_ID"])
print("Context Similarity Table:")
display(context_sim_df.head())

Context Similarity Table:


Customer_ID,c75860b1-9a16-437a-93de-8ff3b830e4de,4e9e8da8-9435-4a17-a5a0-932c8cba5858,acd4a93b-f210-47c1-96be-ca58dcdc694c,ea005991-b83e-4815-9698-f8fd6a1cb4e5,4fb1089f-8047-43e7-8b73-e264de398604,7c4bd202-0d8e-4f53-aabf-271f228d5606,ed72c295-778e-4afb-942c-12212e004e70,006dc3a2-348f-4c1b-9ef4-2e796c2639f3,803f6942-8a26-412c-ab7c-38f80e783540,d4bd58f9-5b05-4b83-862c-88c5b2113676,...,b040d934-ed1c-4f41-82c8-4d809cff93dc,d626494c-fe9c-4990-b600-a37eb9183a76,ea1cdcbb-3705-4bb5-8273-321e549448d7,c1adee8e-33b1-49f2-afc0-13452dc6b54b,094819ea-1901-48ed-a375-55afaa28357f,99ca1ec8-718a-4717-af75-34e4a29eebf1,306468dd-ad44-4e4e-a100-a52e91b88915,af2c89fb-bd05-4dfe-8666-6b68eb72cc38,2e0b03f1-609b-4a55-b45b-c95617802e4a,4e6cbc44-b3b1-4471-8e88-fc8b5c0c29fa
Customer_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
c75860b1-9a16-437a-93de-8ff3b830e4de,1.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0
4e9e8da8-9435-4a17-a5a0-932c8cba5858,1.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0
acd4a93b-f210-47c1-96be-ca58dcdc694c,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ea005991-b83e-4815-9698-f8fd6a1cb4e5,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4fb1089f-8047-43e7-8b73-e264de398604,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,...,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0


In [14]:
def context_rs(user_id, top_n=5):
    if user_id not in train_df["Customer_ID"].values:
        return []
    user_idx = list(train_df["Customer_ID"].unique()).index(user_id)
    sim_scores = context_sim[user_idx]
    # Recommend top transactions by similar users
    top_user_idx = np.argsort(sim_scores)[::-1][:top_n]
    top_transactions = []
    for idx in top_user_idx:
        top_transactions.extend(train_df.iloc[idx]["transaction_desc"])
    return np.unique(top_transactions)[:top_n]

# **6. Hybrid Recommender System(Content-Based + Collabrative)**

In [15]:
# -------------------------------
# Hybrid Content + Collaborative similarity
# -------------------------------

alpha = 0.5  # weight for content

# 1️⃣ Collaborative scores (user-user similarity @ user-item interactions)
# This gives users x items directly
collab_scores_matrix = collab_sim @ user_item_matrix.values  # shape: (num_users, num_items)

# 2️⃣ Content scores
# Map user interactions to item indices in content_sim
# Unique items
unique_items = train_df["transaction_desc"].unique()

# Mapping: item -> index in content_sim
item_to_index = {item: idx for idx, item in enumerate(unique_items)}

user_item_indices = [
    [item_to_index[item] for item in user_item_matrix.columns[user_item_matrix.iloc[u] > 0]]
    for u in range(user_item_matrix.shape[0])
]

# Compute content scores using matrix operations
content_scores_matrix = np.zeros(user_item_matrix.shape)
for u_idx, item_indices in enumerate(user_item_indices):
    if len(item_indices) > 0:
        content_scores_matrix[u_idx] = content_sim[:, item_indices].sum(axis=1)

# 3️⃣ Combine into hybrid scores
hybrid_scores_matrix = alpha * content_scores_matrix + (1 - alpha) * collab_scores_matrix

# Convert to DataFrame
hybrid_scores_matrix = pd.DataFrame(
    hybrid_scores_matrix,
    index=user_item_matrix.index,
    columns=user_item_matrix.columns
)

# Display sample
print("Hybrid Content + Collaborative Similarity Table (sample):")
display(hybrid_scores_matrix.iloc[:5, :5])

Hybrid Content + Collaborative Similarity Table (sample):


transaction_desc,ATM withdrawal,Bitcoin transaction,Credit card payment
Customer_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
00062e09-e953-459b-b7f4-d621a21cd18e,1298.768699,67.51896,52.703098
000944b9-6e1a-4a30-a204-097c9eea03ff,53.203098,66.311853,1308.915694
000a7364-346c-4b53-bccb-a9c243a72731,1298.768699,67.51896,52.703098
000b2a9f-3055-4db8-88e1-04c43b78922e,67.01896,1292.680129,66.811853
0019a50e-551b-4686-a55e-7af7898295ea,53.203098,66.311853,1308.915694


In [16]:
# -------------------------------
# Hybrid: Content + Collaborative
# -------------------------------
def hybrid_content_collab(user_id, top_n=5, alpha=0.5):
    if user_id not in user_item_matrix.index:
        return []
    user_idx = list(user_item_matrix.index).index(user_id)

    # Collaborative scores
    item_scores = collab_scores_matrix[user_idx]

    # Content scores
    content_scores = content_scores_matrix[user_idx]

    # Hybrid
    hybrid_scores = alpha * content_scores + (1 - alpha) * item_scores
    top_indices = np.argsort(hybrid_scores)[::-1][:top_n]
    return user_item_matrix.columns[top_indices].values

# **Evaluating RS Model**

In [17]:
# -------------------------------
# Sample 10 users
sample_users = train_df["Customer_ID"].unique()[:10]

# Precompute matrices
# 1️⃣ Content scores
num_users, num_items = user_item_matrix.shape
content_scores_matrix = np.zeros((num_users, num_items))
user_item_indices = [
    [item_to_index[item] for item in user_item_matrix.columns[user_item_matrix.iloc[u] > 0]]
    for u in range(num_users)
]

for u_idx, item_indices in enumerate(user_item_indices):
    if item_indices:
        content_scores_matrix[u_idx] = content_sim[:, item_indices].sum(axis=1)

# 2️⃣ Collaborative scores
collab_scores_matrix = collab_sim @ user_item_matrix.values

# 3️⃣ Context scores
user_context_matrix = pd.crosstab(train_df["Customer_ID"], train_df["context"])
context_sim = cosine_similarity(user_context_matrix)
context_scores_matrix = context_sim @ user_item_matrix.values

# 4️⃣ Hybrid scores
alpha, beta, gamma = 0.4, 0.3, 0.3
hybrid_content_collab_matrix = 0.5 * content_scores_matrix + 0.5 * collab_scores_matrix

# Map user IDs to row indices
user_to_idx = {user: i for i, user in enumerate(user_item_matrix.index)}

# -------------------------------
# Function to get top-k items from a precomputed matrix
def top_k_from_matrix(score_matrix, user_id, k=5):
    if user_id not in user_to_idx:
        return []
    user_idx = user_to_idx[user_id]
    scores = score_matrix[user_idx]
    top_indices = np.argsort(scores)[::-1][:k]
    return user_item_matrix.columns[top_indices].values

# -------------------------------
# Compute recommendations for sampled users
results = defaultdict(dict)
for user in sample_users:
    results[user]["Content-based"] = top_k_from_matrix(content_scores_matrix, user)
    results[user]["Collaborative"] = top_k_from_matrix(collab_scores_matrix, user)
    results[user]["Context-based"] = top_k_from_matrix(context_scores_matrix, user)
    results[user]["Content+Collaborative"] = top_k_from_matrix(hybrid_content_collab_matrix, user)

# Convert to DataFrame
manual_eval_df = pd.DataFrame(results).T
display(manual_eval_df)

Unnamed: 0,Content-based,Collaborative,Context-based,Content+Collaborative
c75860b1-9a16-437a-93de-8ff3b830e4de,"[ATM withdrawal, Credit card payment, Bitcoin ...","[Credit card payment, Bitcoin transaction, ATM...","[Bitcoin transaction, Credit card payment, ATM...","[Credit card payment, Bitcoin transaction, ATM..."
4e9e8da8-9435-4a17-a5a0-932c8cba5858,"[ATM withdrawal, Credit card payment, Bitcoin ...","[Credit card payment, Bitcoin transaction, ATM...","[Bitcoin transaction, Credit card payment, ATM...","[Credit card payment, Bitcoin transaction, ATM..."
acd4a93b-f210-47c1-96be-ca58dcdc694c,"[Bitcoin transaction, Credit card payment, ATM...","[ATM withdrawal, Bitcoin transaction, Credit c...","[ATM withdrawal, Bitcoin transaction, Credit c...","[ATM withdrawal, Bitcoin transaction, Credit c..."
ea005991-b83e-4815-9698-f8fd6a1cb4e5,"[Bitcoin transaction, Credit card payment, ATM...","[ATM withdrawal, Bitcoin transaction, Credit c...","[ATM withdrawal, Bitcoin transaction, Credit c...","[ATM withdrawal, Bitcoin transaction, Credit c..."
4fb1089f-8047-43e7-8b73-e264de398604,"[Bitcoin transaction, Credit card payment, ATM...","[ATM withdrawal, Bitcoin transaction, Credit c...","[Credit card payment, Bitcoin transaction, ATM...","[ATM withdrawal, Bitcoin transaction, Credit c..."
7c4bd202-0d8e-4f53-aabf-271f228d5606,"[ATM withdrawal, Credit card payment, Bitcoin ...","[Credit card payment, Bitcoin transaction, ATM...","[Bitcoin transaction, Credit card payment, ATM...","[Credit card payment, Bitcoin transaction, ATM..."
ed72c295-778e-4afb-942c-12212e004e70,"[ATM withdrawal, Credit card payment, Bitcoin ...","[Credit card payment, Bitcoin transaction, ATM...","[Bitcoin transaction, Credit card payment, ATM...","[Credit card payment, Bitcoin transaction, ATM..."
006dc3a2-348f-4c1b-9ef4-2e796c2639f3,"[ATM withdrawal, Credit card payment, Bitcoin ...","[Credit card payment, Bitcoin transaction, ATM...","[Bitcoin transaction, Credit card payment, ATM...","[Credit card payment, Bitcoin transaction, ATM..."
803f6942-8a26-412c-ab7c-38f80e783540,"[Bitcoin transaction, Credit card payment, ATM...","[ATM withdrawal, Bitcoin transaction, Credit c...","[Credit card payment, Bitcoin transaction, ATM...","[ATM withdrawal, Bitcoin transaction, Credit c..."
d4bd58f9-5b05-4b83-862c-88c5b2113676,"[Credit card payment, Bitcoin transaction, ATM...","[Bitcoin transaction, ATM withdrawal, Credit c...","[ATM withdrawal, Bitcoin transaction, Credit c...","[Bitcoin transaction, ATM withdrawal, Credit c..."
