<a href="https://colab.research.google.com/github/nissysathwika/Anamoly-detection/blob/main/DBSCAN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
from sklearn.cluster import DBSCAN
from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report, confusion_matrix, silhouette_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Step 1: Load the dataset
# The CSV is read from the notebook's working directory into `data`,
# the frame every later step operates on.
csv_path = 'transaction.csv'
data = pd.read_csv(csv_path)

In [None]:
# Step 2: Data Exploration
# Print, in order: the first rows, the per-column null counts, and the
# descriptive statistics of the numeric columns.
exploration_views = (
    data.head(),
    data.isnull().sum(),
    data.describe(),
)
for view in exploration_views:
    print(view)

          Timestamp TransactionID AccountID    Amount   Merchant  \
0  01-01-2023 08:00       TXN1127      ACC4  95071.92  MerchantH   
1  01-01-2023 08:01       TXN1639     ACC10  15607.89  MerchantH   
2  01-01-2023 08:02        TXN872      ACC8  65092.34  MerchantE   
3  01-01-2023 08:03       TXN1438      ACC6     87.87  MerchantE   
4  01-01-2023 08:04       TXN1338      ACC6    716.56  MerchantI   

  TransactionType     Location  
0        Purchase        Tokyo  
1        Purchase       London  
2      Withdrawal       London  
3        Purchase       London  
4        Purchase  Los Angeles  
Timestamp          481
TransactionID      481
AccountID          481
Amount             481
Merchant           481
TransactionType    481
Location           481
dtype: int64
              Amount
count  216960.000000
mean    50090.025108
std     29097.905016
min        10.510000
25%     25061.242500
50%     50183.980000
75%     75080.460000
max    978942.260000


In [None]:
# Step 3: Preprocessing
# The exploration step reports the same number of missing values in every
# column, which points to rows that are empty across the board (presumably
# blank records in the CSV -- confirm against the file). Such rows carry no
# transaction, yet OneHotEncoder would encode their NaNs as an extra category
# and the later mean-imputation would turn them into synthetic points that
# DBSCAN clusters. Drop them here, before any feature is derived from `data`.
# Rows that are only partially missing are kept and still handled by the
# imputer further down.
#
# The original index labels are intentionally retained so flagged rows can be
# traced back to their position in the CSV; downstream steps combine features
# positionally (NumPy arrays), so gaps in the index are harmless.
#
# Timestamp is then parsed from 'day-month-year hour:minute' text to datetime.
# `.assign` returns a fresh frame, so later column assignments on `data` do
# not act on a filtered view of the raw frame.
data = (
    data
    .dropna(how='all')
    .assign(Timestamp=lambda frame: pd.to_datetime(frame['Timestamp'], format='%d-%m-%Y %H:%M'))
)

In [None]:
# Step 4: Handling Categorical Variables
# One-hot encode the three categorical columns. drop='first' omits the first
# category of each feature; sparse_output=False (the successor of the old
# `sparse` flag) makes the encoder return a dense array.
categorical_cols = ['Merchant', 'TransactionType', 'Location']
encoder = OneHotEncoder(drop='first', sparse_output=False)
encoded_cols = encoder.fit_transform(data[categorical_cols])
encoded_df = pd.DataFrame(
    encoded_cols,
    columns=encoder.get_feature_names_out(categorical_cols),
)

In [None]:
# Step 5: Normalize Numerical Features
# Standardise 'Amount': learn the scaling statistics from the column, then
# apply them to that same column.
amount_frame = data[['Amount']]
scaler = StandardScaler()
scaler.fit(amount_frame)
scaled_amount = scaler.transform(amount_frame)

In [None]:
# Step 6: Build the feature matrix and reduce its dimensionality
# Tunable hyper-parameters, named so they can be adjusted in one place.
PCA_VARIANCE = 0.95     # fraction of total variance the PCA projection must retain
DBSCAN_EPS = 0.7        # neighbourhood radius, measured in the PCA-reduced space
DBSCAN_MIN_SAMPLES = 5  # points (the point itself included) required to form a core point

# Column order: scaled amount first, followed by the one-hot encoded columns.
combined_data = np.hstack((scaled_amount, encoded_df))

# Impute missing values with the mean of each column
imputer = SimpleImputer(strategy='mean')
combined_data = imputer.fit_transform(combined_data)

# Dimensionality Reduction: a float n_components keeps as many components as
# are needed to explain that share of the variance.
pca = PCA(n_components=PCA_VARIANCE)
reduced_data = pca.fit_transform(combined_data)

# Step 7: Apply DBSCAN
dbscan = DBSCAN(eps=DBSCAN_EPS, min_samples=DBSCAN_MIN_SAMPLES)
dbscan.fit(reduced_data)

In [None]:
# Step 8: Assign Labels (Anomalies are labeled as -1 by DBSCAN)
# The label array is attached positionally as a new column of `data`.
data.loc[:, 'DBSCAN_Label'] = dbscan.labels_

In [None]:
# Step 9: Identify Anomalies
# Rows carrying DBSCAN's noise label (-1) are the anomalous transactions.
is_noise = data['DBSCAN_Label'].eq(-1)
anomalies = data.loc[is_noise]

report_columns = [
    'Timestamp',
    'TransactionID',
    'AccountID',
    'Amount',
    'Merchant',
    'TransactionType',
    'Location',
]
print(f"Number of Anomalous Transactions: {len(anomalies)}")
print(anomalies.loc[:, report_columns].head())

Number of Anomalous Transactions: 13
               Timestamp TransactionID AccountID     Amount   Merchant  \
2775 2023-01-03 06:15:00       TXN1049     ACC14  978942.26  MerchantJ   
3448 2023-01-03 17:28:00        TXN827     ACC12  712076.97  MerchantA   
3453 2023-01-03 17:33:00       TXN1690      ACC8  489492.30  MerchantF   
4148 2023-01-04 05:08:00        TXN625      ACC1  187344.37  MerchantD   
4194 2023-01-04 05:54:00        TXN667      ACC3  272990.11  MerchantE   

     TransactionType       Location  
2775        Transfer       New York  
3448      Withdrawal          Tokyo  
3453        Purchase          Tokyo  
4148        Purchase  San Francisco  
4194        Transfer    Los Angeles  


In [None]:
# Step 10: Evaluate cluster quality with the silhouette score
# DBSCAN marks noise with the label -1. Noise is not a cluster, so it is left
# out both when counting clusters and when scoring; otherwise all noise points
# would be scored as if they formed one cluster of their own, and a result of
# "one cluster plus noise" would wrongly pass the two-cluster requirement.
cluster_labels = dbscan.labels_
is_clustered = cluster_labels != -1
n_clusters = len(np.unique(cluster_labels[is_clustered]))

if n_clusters > 1:
  silhouette_avg = silhouette_score(reduced_data[is_clustered], cluster_labels[is_clustered])
  print(f'Silhouette Score: {silhouette_avg:.2f}')
else:
  # Covers both "everything is noise" and "a single cluster".
  print("Silhouette Score cannot be calculated with fewer than two clusters.")

Silhouette Score: 0.25
