<a href="https://colab.research.google.com/github/monkrus/anomaly_detect/blob/main/Anomaly1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os
import zipfile
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import IsolationForest
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, precision_recall_curve, roc_curve, auc
import matplotlib.pyplot as plt
import seaborn as sns
import glob


In [None]:
# Set up the Kaggle API
# Credentials are secrets and must not be hard-coded in the notebook. Values
# already present in the environment are kept; anything missing is requested
# interactively so it never ends up in the saved file.
# NOTE(review): the API key that was previously committed in this cell should
# be treated as compromised and regenerated from the Kaggle account settings.
from getpass import getpass

if not os.environ.get('KAGGLE_USERNAME'):
    os.environ['KAGGLE_USERNAME'] = input('Kaggle username: ').strip()
if not os.environ.get('KAGGLE_KEY'):
    # getpass keeps the key out of the cell output.
    os.environ['KAGGLE_KEY'] = getpass('Kaggle API key: ').strip()


In [None]:

# Download the dataset
# Run the Kaggle CLI without a shell and fail loudly: os.system() only returns
# the exit status, so a failed download used to surface later as a confusing
# "file not found" error when opening the zip archive.
import subprocess

subprocess.run(
    ['kaggle', 'datasets', 'download', '-d', 'mlg-ulb/creditcardfraud'],
    check=True,  # raise CalledProcessError on a non-zero exit status
)

# Unzip the dataset into the current working directory
with zipfile.ZipFile('creditcardfraud.zip', 'r') as zip_ref:
    zip_ref.extractall()

# Load the dataset
data = pd.read_csv('creditcard.csv')  # Update the filename if needed



In [None]:
# Inspect the data
print(data.head())
# DataFrame.info() writes its summary itself and returns None, so wrapping it
# in print() only adds a stray "None" line to the output.
data.info()
print(data.describe())

In [None]:
# Pull the 'Class' column out as the target; every other column is a feature
y = data['Class']
X = data.drop(columns='Class')

In [None]:
# Split the data into training and testing sets
# stratify=y keeps the class ratio of y the same in both partitions, so a rare
# positive class cannot end up under-represented in the 20% test set by chance
# (the cross-validation further down is stratified for the same reason).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


In [None]:
# Standardise the features; the scaler is fitted on the training split only
# and the same fitted transform is then applied to both splits
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
# Create the anomaly detection model
# Fix the seed so the forest (and therefore every metric computed from it) is
# reproducible, matching the random_state=42 used for the split and the CV.
model = IsolationForest(random_state=42)

In [None]:
# Hyperparameter candidates for the search: three values of `contamination`
grid = dict(contamination=[0.001, 0.01, 0.1])


In [None]:
# Perform grid search
from sklearn.metrics import f1_score


def _fraud_f1(estimator, X, y):
    """Score an outlier detector by F1 on the positive (y == 1) class.

    ``estimator.predict`` follows the scikit-learn outlier convention
    (-1 = outlier, 1 = inlier), so outliers are mapped to 1 and inliers to 0
    before being compared with the 0/1 ground-truth labels.
    """
    flagged = (estimator.predict(X) == -1).astype(int)
    return f1_score(y, flagged, zero_division=0)


# scoring='roc_auc' cannot tune `contamination`: that parameter only moves the
# decision threshold, which leaves the ranking of the scores (and so the AUC)
# unchanged, and decision_function is higher for inliers while y == 1 marks
# the anomalies. A metric on the thresholded predictions is used instead.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
grid_search = GridSearchCV(model, grid, scoring=_fraud_f1, cv=cv)
grid_search.fit(X_train, y_train)


In [None]:
# Get the best model
# Rebinds `model` to the estimator selected by the grid search above.
# NOTE(review): GridSearchCV was built without a refit argument; its documented
# default (refit=True) re-fits this estimator on all of X_train — confirm.
model = grid_search.best_estimator_

In [None]:
# Train the model on the training dataset
# Unsupervised fit: only the features are passed, no labels.
# NOTE(review): if `model` is the already re-fitted best_estimator_ from the
# grid search, this call trains it a second time — presumably redundant; confirm.
model.fit(X_train)

In [None]:
# Score the held-out split with the fitted model:
#   predictions - the model's hard labels from predict()
#   scores_test - the continuous decision_function() values for the same rows
predictions = model.predict(X_test)
scores_test = model.decision_function(X_test)

In [None]:
# Adjust the labels for the confusion matrix and classification report
# The model's predictions already use the outlier-detector convention
# (-1 = anomaly, 1 = normal), so only the ground truth (1 = positive class,
# 0 = normal) has to be converted into that convention. The previous code also
# rewrote the predictions to 0/1, which left the two arrays in different label
# spaces ({-1, 1} vs {0, 1}) and made the report and confusion matrix
# meaningless.
y_test_adj = y_test.map({1: -1, 0: 1})


In [None]:
# Print the per-class metrics report, then the raw confusion counts,
# each under its own heading
print("Overall Metrics:", classification_report(y_test_adj, predictions), sep="\n")
print("Confusion Matrix:", confusion_matrix(y_test_adj, predictions), sep="\n")


In [None]:

# Plot precision-recall curve
# Evaluate the anomaly class rather than the normal one: y_test_adj marks
# anomalies with -1, and decision_function is lower for more anomalous
# samples, so the scores are negated and pos_label is set to -1. With the
# default pos_label (1) the curve described how well *normal* rows are
# recognised, which is trivially close to perfect.
precision, recall, _ = precision_recall_curve(y_test_adj, -scores_test, pos_label=-1)
plt.figure()
plt.step(recall, precision, color='b', alpha=0.2, where='post')
plt.fill_between(recall, precision, color='b', alpha=0.2)
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-Recall Curve')
plt.show()


In [None]:
# Plot ROC curve
# Treat the anomaly class (-1 in y_test_adj) as the positive class so that the
# true/false positive rates on the axes refer to anomalies. decision_function
# is lower for more anomalous samples, hence the negated scores. The area under
# the curve is the same as before; only the meaning of the axes is corrected.
fpr, tpr, _ = roc_curve(y_test_adj, -scores_test, pos_label=-1)
roc_auc = auc(fpr, tpr)
plt.figure()
plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (area = {roc_auc:0.2f})')
# Diagonal reference line: performance of a random classifier
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic')
plt.legend(loc="lower right")
plt.show()

In [None]:
# Render the confusion matrix as an annotated heatmap (integer cell counts,
# no colour bar)
cm = confusion_matrix(y_test_adj, predictions)
plt.figure(figsize=(8, 6))
ax = sns.heatmap(cm, annot=True, cmap='Blues', fmt='d', cbar=False)
ax.set_xlabel('Predicted Labels')
ax.set_ylabel('True Labels')
ax.set_title('Confusion Matrix')
plt.show()