# In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix, accuracy_score
from keras.models import Sequential
from keras.layers import Conv1D, MaxPooling1D, LSTM, Dense, Dropout
from imblearn.over_sampling import SMOTE
from geopy.distance import great_circle
# Read the two CSV partitions of the transaction dataset.
fraud_train = pd.read_csv('fraudTrain.csv')
fraud_test = pd.read_csv('fraudTest.csv')

# Stack both partitions under a fresh 0..n-1 row index, then discard the
# first CSV column (presumably a saved row counter - confirm against the files).
data = pd.concat([fraud_train, fraud_test], ignore_index=True)
data.drop(columns=data.columns[:1], inplace=True)

# Quick visual sanity check of the combined frame.
print("Original DataFrame:")
print(data.head())


# Remove the columns that are not used as model inputs.
unused_columns = [
    'trans_date_trans_time', 'merchant', 'category', 'gender',
    'first', 'last', 'street', 'city', 'state',
    'job', 'dob', 'trans_num',
]
data.drop(columns=unused_columns, inplace=True)


def _great_circle_km(lat1, lon1, lat2, lon2, radius_km=6371.009):
    """Return great-circle distances in kilometres for arrays of coordinates.

    Vectorised replacement for calling ``geopy.distance.great_circle`` once
    per row, which needs one Python-level call (and one Series) per row.
    Uses the arctan2 form of the spherical distance formula with the mean
    earth radius geopy documents (6371.009 km), so results should agree
    with geopy to floating-point rounding for valid coordinates.
    NOTE(review): unlike geopy, no latitude range validation or longitude
    normalisation is performed - spot-check a sample against geopy.

    Args:
        lat1: Latitudes of the first points, in degrees.
        lon1: Longitudes of the first points, in degrees.
        lat2: Latitudes of the second points, in degrees.
        lon2: Longitudes of the second points, in degrees.
        radius_km: Sphere radius in kilometres.

    Returns:
        NumPy array of distances in kilometres, one per input pair.
    """
    lat1, lon1, lat2, lon2 = (
        np.radians(np.asarray(values, dtype=float))
        for values in (lat1, lon1, lat2, lon2)
    )
    delta_lon = lon2 - lon1
    sin_lat1, cos_lat1 = np.sin(lat1), np.cos(lat1)
    sin_lat2, cos_lat2 = np.sin(lat2), np.cos(lat2)
    sin_dlon, cos_dlon = np.sin(delta_lon), np.cos(delta_lon)

    # arctan2 of (|cross product|, dot product) stays accurate for both
    # very small and near-antipodal separations.
    central_angle = np.arctan2(
        np.hypot(
            cos_lat2 * sin_dlon,
            cos_lat1 * sin_lat2 - sin_lat1 * cos_lat2 * cos_dlon,
        ),
        sin_lat1 * sin_lat2 + cos_lat1 * cos_lat2 * cos_dlon,
    )
    return radius_km * central_angle


# Distance between the (lat, long) and (merch_lat, merch_long) points, in km.
data['distance'] = _great_circle_km(
    data['lat'].to_numpy(),
    data['long'].to_numpy(),
    data['merch_lat'].to_numpy(),
    data['merch_long'].to_numpy(),
)

# Model inputs: location columns, timestamp and the derived distance.
features = [
    'zip',
    'lat',
    'long',
    'merch_lat',
    'merch_long',
    'unix_time',
    'distance',
]
y = data['is_fraud']  # binary target column
X = data[features]

# Per-class views of the full frame.
fraud = data[data['is_fraud'] == 1]
normal = data[data['is_fraud'] == 0]

# Class frequencies of the target.
fraud_summary = y.value_counts()
print("Fraud Summary:", fraud_summary, sep="\n")

# Same numbers again with explicit labels.
non_fraud_count = fraud_summary[0]
fraud_count = fraud_summary[1]
print(f"\nNumber of non-fraudulent transactions: {non_fraud_count}")
print(f"Number of fraudulent transactions: {fraud_count}")
# Preprocessing pipeline: window -> split -> scale -> SMOTE (training only).
#
# The scaler and SMOTE are fitted AFTER the train/test split and only on the
# training rows. Fitting either of them on the full dataset leaks test-set
# statistics into training and places synthetic samples in the evaluation
# set, which can make the reported metrics look better than they are.


def _to_windows(rows, labels, window):
    """Group consecutive rows into non-overlapping fixed-size windows.

    Args:
        rows: 2-D array-like of shape (n_rows, n_features).
        labels: 1-D array-like holding one label per row.
        window: Number of consecutive rows per window.

    Returns:
        Tuple ``(row_windows, label_windows)`` with shapes
        ``(n_windows, window, n_features)`` and ``(n_windows, window)``.
        Trailing rows that do not fill a complete window are dropped.
    """
    rows = np.asarray(rows)
    labels = np.asarray(labels)
    n_windows = rows.shape[0] // window
    usable = n_windows * window
    row_windows = rows[:usable].reshape((n_windows, window, rows.shape[1]))
    label_windows = labels[:usable].reshape((n_windows, window))
    return row_windows, label_windows


sequence_length = 5  # rows per window fed to the CNN
num_features = X.shape[1]

# Window the unscaled rows first so every window holds consecutive rows of X.
raw_windows, raw_label_windows = _to_windows(X, y, sequence_length)

# Hold out 20% of the windows before anything is fitted.
(train_windows, test_windows,
 train_label_windows, test_label_windows) = train_test_split(
    raw_windows, raw_label_windows, test_size=0.2, random_state=42
)

# Standardise with statistics taken from the training rows only.
scaler = StandardScaler()
train_rows = scaler.fit_transform(train_windows.reshape(-1, num_features))
test_rows = scaler.transform(test_windows.reshape(-1, num_features))

# Balance the classes at row level, on training rows only.
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(
    train_rows, train_label_windows.reshape(-1)
)

# Re-window the balanced training rows; a window's label is the label of its
# last row. NOTE(review): this relies on SMOTE returning the original rows
# first, in order, followed by the synthetic ones - confirm for the installed
# imbalanced-learn version.
X_train, resampled_label_windows = _to_windows(
    X_resampled, y_resampled, sequence_length
)
y_train = resampled_label_windows[:, -1]

# The test windows contain only real rows, at their natural class ratio.
X_test = test_rows.reshape(test_windows.shape)
y_test = test_label_windows[:, -1]

import numpy as np

# Class balance of the training labels, printed as {label: count}.
unique_train, counts_train = np.unique(y_train, return_counts=True)
train_class_counts = dict(zip(unique_train, counts_train))
print("Counts in y_train:", train_class_counts, sep="\n")

# Same report for the held-out labels.
unique_test, counts_test = np.unique(y_test, return_counts=True)
test_class_counts = dict(zip(unique_test, counts_test))
print("\nCounts in y_test:", test_class_counts, sep="\n")

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import (
    roc_curve, auc, classification_report, accuracy_score, confusion_matrix,
    precision_score, recall_score, f1_score
)
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Flatten, Dense, Dropout
from tensorflow.keras.utils import to_categorical

# # Reshape the data for CNN input (samples, time steps, features)
# sequence_length = 5  # Define the sequence length
# n_samples = X_scaled.shape[0] // sequence_length
# num_features = X_scaled.shape[1]

# X_reshaped = X_scaled[:n_samples * sequence_length].reshape((n_samples, sequence_length, num_features))
# y_array = y_resampled.to_numpy()[:n_samples * sequence_length].reshape((n_samples, sequence_length))
# y_final = y_array[:, -1]

# # Split the reshaped data into training and testing sets
# X_train, X_test, y_train, y_test = train_test_split(X_reshaped, y_final, test_size=0.2, random_state=42)

# Convert labels to categorical format for multi-class problems (not needed for binary classification)
# num_classes = len(np.unique(y_train))
# y_train_cat = to_categorical(y_train, num_classes)
# y_test_cat = to_categorical(y_test, num_classes)

# Assemble a small 1-D convolutional binary classifier, one layer at a time.
window_steps = X_train.shape[1]   # second axis of X_train
feature_dim = X_train.shape[2]    # third axis of X_train

cnn_model = Sequential()
cnn_model.add(
    Conv1D(
        filters=64,
        kernel_size=2,
        activation='relu',
        input_shape=(window_steps, feature_dim),
    )
)
cnn_model.add(MaxPooling1D(pool_size=2))
cnn_model.add(Dropout(0.2))
cnn_model.add(Flatten())
cnn_model.add(Dense(64, activation='relu'))
cnn_model.add(Dropout(0.2))
# Single sigmoid unit: the output is read as a probability for class 1.
cnn_model.add(Dense(1, activation='sigmoid'))

cnn_model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# Train the CNN, report test metrics and plot the diagnostics.
EPOCHS = 20
BATCH_SIZE = 32
DECISION_THRESHOLD = 0.5  # probability above which a sample is labelled 1


class _TrainF1History(tf.keras.callbacks.Callback):
    """Record the training-set F1 score at the end of every epoch.

    Scoring the finished model repeatedly after training yields the same
    value each time; the score has to be captured while training is still
    in progress to obtain a real per-epoch curve.
    """

    def __init__(self, features, labels, batch_size, threshold):
        super().__init__()
        self._features = features
        self._labels = labels
        self._batch_size = batch_size
        self._threshold = threshold
        self.scores = []  # one F1 value per completed epoch

    def on_epoch_end(self, epoch, logs=None):
        probabilities = self.model.predict(
            self._features, batch_size=self._batch_size, verbose=0
        )
        predictions = (probabilities > self._threshold).astype(int).flatten()
        self.scores.append(f1_score(self._labels, predictions))


def _plot_history_pair(fit_history, train_key, val_key, quantity):
    """Plot one training/validation metric pair from a Keras History."""
    plt.plot(fit_history.history[train_key], label=f'Training {quantity}', marker='o')
    plt.plot(fit_history.history[val_key], label=f'Validation {quantity}', marker='o')
    plt.xlabel('Epochs')
    plt.ylabel(quantity)
    plt.title(f'Training and Validation {quantity}')
    plt.legend()
    plt.grid()
    plt.show()


f1_history = _TrainF1History(X_train, y_train, BATCH_SIZE, DECISION_THRESHOLD)

# NOTE(review): the test split doubles as validation data here, so the
# validation curves are not an estimate that is independent of the test set.
history = cnn_model.fit(
    X_train,
    y_train,
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    validation_data=(X_test, y_test),
    callbacks=[f1_history],
    verbose=1,
)

# Evaluate the model
loss, accuracy = cnn_model.evaluate(X_test, y_test, verbose=0)
print("\nEvaluation Metrics for CNN:")
print(f"Accuracy: {accuracy:.15f}")

# Make predictions: probabilities first, then hard labels via the threshold.
y_pred_prob = cnn_model.predict(X_test)
y_pred = (y_pred_prob > DECISION_THRESHOLD).astype(int).flatten()

# Calculate metrics
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print(f'Precision: {precision:.15f}')
print(f'Recall: {recall:.15f}')
print(f'F1 Score: {f1:.15f}')

# Confusion Matrix
conf_matrix = confusion_matrix(y_test, y_pred)
class_names = ['Class 0', 'Class 1']
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues',
            xticklabels=class_names, yticklabels=class_names)
plt.title('Confusion Matrix - CNN Model')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()

# ROC Curve (scores flattened to the 1-D vector roc_curve expects)
fpr, tpr, _ = roc_curve(y_test, y_pred_prob.ravel())
roc_auc = auc(fpr, tpr)

plt.plot(fpr, tpr, label=f'ROC Curve (AUC = {roc_auc:.2f})', color='blue')
plt.plot([0, 1], [0, 1], 'k--', label='Random Guess')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend()
plt.grid()
plt.show()

# Training and Validation Accuracy / Loss Curves
_plot_history_pair(history, 'accuracy', 'val_accuracy', 'Accuracy')
_plot_history_pair(history, 'loss', 'val_loss', 'Loss')

# F1 Score Over Epochs, as recorded by the callback during training.
train_f1_scores = f1_history.scores
epoch_numbers = range(1, len(train_f1_scores) + 1)

plt.plot(epoch_numbers, train_f1_scores, label='Training F1 Score', marker='o', color='red')
plt.xlabel('Epochs')
plt.ylabel('F1 Score')
plt.title('Training F1 Score Over Epochs')
plt.legend()
plt.grid()
plt.show()

