<a href="https://colab.research.google.com/github/nissysathwika/Anamoly-detection/blob/main/AutoEncoder.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, recall_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow.keras import layers, models



# Step 1: Read the raw transaction table from a CSV file in the working directory.
DATA_PATH = 'transaction.csv'
data = pd.read_csv(DATA_PATH)

In [None]:
# Impute missing values column by column (mutates `data` in place).

# Numeric columns: replace NaN with the column median.
for col in data.select_dtypes(include=np.number):
    data[col] = data[col].fillna(data[col].median())

# Non-numeric columns: replace NaN with the most frequent value (mode).
# NOTE(review): if 'Timestamp' is still a string column at this point it is
# imputed here too, so rows with a missing timestamp receive the most common
# one instead of being dropped by the later `dropna` -- confirm this is intended.
for col in data.select_dtypes(exclude=np.number):
    modes = data[col].mode()
    if modes.empty:
        # Column is entirely missing: there is no mode to fill with, and
        # indexing the empty result would raise. Leave the column untouched.
        continue
    data[col] = data[col].fillna(modes.iloc[0])

In [None]:
# Parse Timestamp into datetimes; values that cannot be parsed become NaT.
parsed_timestamps = pd.to_datetime(data['Timestamp'], errors='coerce')
data['Timestamp'] = parsed_timestamps
# Rows whose timestamp failed to parse are removed in place.
data.dropna(subset=['Timestamp'], inplace=True)

# Derive calendar features from the parsed timestamps.
timestamp_parts = data['Timestamp'].dt
data['Hour'] = timestamp_parts.hour
data['DayOfWeek'] = timestamp_parts.dayofweek

In [None]:
# Timestamp (already expanded into Hour/DayOfWeek) and TransactionID are removed
# in place; they are not used as model inputs.
data.drop(['Timestamp', 'TransactionID'], axis=1, inplace=True)


In [None]:
# Show which columns remain after dropping Timestamp and TransactionID.
remaining_columns = data.columns
print(remaining_columns)

Index(['AccountID', 'Amount', 'Merchant', 'TransactionType', 'Location',
       'Hour', 'DayOfWeek'],
      dtype='object')


In [None]:
# Expand the categorical columns into indicator columns; drop_first=True omits
# the first level of each column.
categorical_columns = ['AccountID', 'Merchant', 'TransactionType', 'Location']
data_encoded = pd.get_dummies(data, columns=categorical_columns, drop_first=True)

# Amount is used as the target (y); every other encoded column is a feature (X).
y = data_encoded['Amount']
X = data_encoded.drop('Amount', axis=1)

In [None]:
# Step 3: Feature selection with Recursive Feature Elimination (RFE).
# A LinearRegression estimator ranks the columns of X against the target y
# (the Amount column); RFE keeps the N_FEATURES_TO_SELECT top-ranked columns.
N_FEATURES_TO_SELECT = 10  # number of features kept for the autoencoder input

model = LinearRegression()
rfe = RFE(model, n_features_to_select=N_FEATURES_TO_SELECT)
# `fit` is the fitted selector; its `support_` mask is read by the next cell.
fit = rfe.fit(X, y)

In [None]:
# Names of the columns kept by RFE (boolean mask `support_` applied to X's columns).
selected_features = X.columns[fit.support_]
print("Selected Features: ", selected_features)

# Restrict the feature matrix to the selected columns only.
X_selected = X.loc[:, selected_features]

Selected Features:  Index(['AccountID_ACC10', 'AccountID_ACC11', 'AccountID_ACC13',
       'AccountID_ACC5', 'AccountID_ACC9', 'Merchant_MerchantE',
       'Merchant_MerchantF', 'Location_Los Angeles', 'Location_New York',
       'Location_San Francisco'],
      dtype='object')


In [None]:
# Step 4: Data Preprocessing - split first, then normalize the selected features.
# The split happens BEFORE scaling so the scaler's mean/std are estimated from
# the training rows only; fitting it on the full dataset would leak test-set
# statistics into training. The same random_state selects the same rows as
# splitting the already-scaled array would.
X_train_raw, X_test_raw = train_test_split(X_selected, test_size=0.2, random_state=42)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)  # fit on train only
X_test = scaler.transform(X_test_raw)        # reuse the train statistics

# Full feature matrix scaled with the train-fitted scaler, kept so the
# `X_scaled` name remains available.
X_scaled = scaler.transform(X_selected)


In [None]:
# Step 5: Autoencoder dimensions
encoding_dim = 8                 # width of the bottleneck layer
input_dim = X_train.shape[-1]    # one input unit per selected feature (column of X_train)

In [None]:
# Define the Autoencoder architecture
# Seed the Python, NumPy and TensorFlow random generators before any weights
# are created so that initialization and training are repeatable across
# Restart-and-Run-All. (Some GPU ops may still be non-deterministic.)
tf.keras.utils.set_random_seed(42)

# Symmetric dense network: input_dim -> 16 -> encoding_dim -> 16 -> input_dim.
autoencoder = models.Sequential([
    layers.Input(shape=(input_dim,)),
    layers.Dense(16, activation='relu'),
    layers.Dense(encoding_dim, activation='relu'),  # Bottleneck layer
    layers.Dense(16, activation='relu'),
    layers.Dense(input_dim, activation='linear')  # Linear output: reconstructs the scaled inputs
])

In [None]:
# Configure training: mean squared error between input and reconstruction,
# minimized with the Adam optimizer.
autoencoder.compile(loss='mse', optimizer='adam')

In [None]:
# Train the Autoencoder to reproduce its own input: X_train is both the input
# and the target. X_test is passed as validation data, and verbose=0 suppresses
# the per-epoch log. The returned History object is kept in `history`.
history = autoencoder.fit(
    X_train,
    X_train,
    epochs=50,
    batch_size=256,
    validation_data=(X_test, X_test),
    verbose=0,
)

In [None]:
# Step 6: Reconstruct both splits with the trained autoencoder
# (train first, then test, so the progress output keeps that order).
X_train_pred, X_test_pred = (
    autoencoder.predict(split) for split in (X_train, X_test)
)


420/420 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step
105/105 ━━━━━━━━━━━━━━━━━━━━ 0s 3ms/step


In [None]:
# Per-row reconstruction error: mean of the squared differences between each
# row and its reconstruction, taken across the feature axis.
train_residual = X_train - X_train_pred
test_residual = X_test - X_test_pred
train_reconstruction_error = np.square(train_residual).mean(axis=1)
test_reconstruction_error = np.square(test_residual).mean(axis=1)

In [None]:
# Step 7: Anomaly threshold = mean + 3 standard deviations of the training
# reconstruction error.
train_error_mean = train_reconstruction_error.mean()
train_error_std = train_reconstruction_error.std()
threshold = train_error_mean + 3 * train_error_std

In [None]:
# Boolean mask over the test rows: True where the reconstruction error is
# strictly greater than the threshold.
anomalies = np.greater(test_reconstruction_error, threshold)

In [None]:
# Summing the boolean mask counts the flagged test rows.
anomaly_count = anomalies.sum()

print(f"Number of anomalies detected: {anomaly_count}")

Number of anomalies detected: 101


In [None]:
# Integer labels for the test rows: 1 where the reconstruction error exceeds
# the threshold (anomaly), 0 otherwise (normal).
y_test_pred = (test_reconstruction_error > threshold).astype(int)

In [None]:
# No ground-truth labels are available, so every test row is assumed normal (0).
# Because these labels are all zero, any metric computed against them only
# reflects the share of rows the model flags, not real detection quality.
y_true = np.zeros(y_test_pred.shape, dtype=y_test_pred.dtype)

In [None]:
# Macro-averaged recall of the thresholded predictions against y_true.
# zero_division=0 makes recall 0 (without a warning) for any class that has
# no true samples.
recall = recall_score(y_true, y_test_pred, average='macro', zero_division=0)

In [None]:
# Step 8: Calculate accuracy and print classification report
# y_true is the all-zero placeholder built above (no real labels exist), so
# accuracy here equals the fraction of test rows NOT flagged as anomalies.
accuracy = accuracy_score(y_true, y_test_pred)

# labels=[0, 1] pins both classes so the report still works when no anomaly is
# flagged; without it, two target_names against a single observed class raises
# ValueError. zero_division=0 reports 0 for undefined metrics instead of
# emitting UndefinedMetricWarning.
report = classification_report(
    y_true,
    y_test_pred,
    labels=[0, 1],
    target_names=["Normal", "Anomaly"],
    zero_division=0,
)

print(f"Accuracy: {accuracy}")
print(report)

Accuracy: 0.9699404761904762
              precision    recall  f1-score   support

      Normal       1.00      0.97      0.98      3360
     Anomaly       0.00      0.00      0.00         0

    accuracy                           0.97      3360
   macro avg       0.50      0.48      0.49      3360
weighted avg       1.00      0.97      0.98      3360



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
