<a href="https://colab.research.google.com/github/mdazad173824/PDS-Project/blob/main/credit_card_fraud_detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Upload the Kaggle API token (kaggle.json) from the local machine.
from google.colab import files

# files.upload() returns a dict mapping each uploaded file name to its raw
# bytes. Printing the whole dict would write the secret API key from
# kaggle.json into the saved notebook output, so only the file names are shown.
uploaded = files.upload()
print("Uploaded files:", list(uploaded.keys()))

In [None]:
# Create the ~/.kaggle directory (no error if it already exists) and copy the
# uploaded kaggle.json into it; presumably this is where the kaggle CLI reads
# its credentials from.
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
# Restrict the token file to owner read/write only.
!chmod 600 ~/.kaggle/kaggle.json


In [None]:
# Download the credit-card fraud dataset archive with the kaggle CLI.
!kaggle datasets download -d nishitsanghvi/credit-card-fraud-detection-logistic-regression

In [None]:
import zipfile
from pathlib import Path

# The download cell fetches the dataset
# 'nishitsanghvi/credit-card-fraud-detection-logistic-regression', and the
# kaggle CLI names the archive after the dataset slug. The originally
# hard-coded 'creditcardfraud.zip' is still tried first so sessions that
# already have that file behave exactly as before.
ARCHIVE_CANDIDATES = [
    Path('/content/creditcardfraud.zip'),
    Path('/content/credit-card-fraud-detection-logistic-regression.zip'),
]

archive_path = next((path for path in ARCHIVE_CANDIDATES if path.is_file()), None)
if archive_path is None:
    raise FileNotFoundError(
        "Dataset archive not found; looked for: "
        + ", ".join(str(path) for path in ARCHIVE_CANDIDATES)
    )

# Unpack everything into 'data_folder' (relative to the working directory).
with zipfile.ZipFile(archive_path, 'r') as zip_ref:
    zip_ref.extractall('data_folder')

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt

In [None]:
# Read the extracted CSV into a DataFrame, confirm the load, and print the
# column/dtype/non-null summary.
CSV_PATH = "/content/data_folder/creditcard.csv"

data = pd.read_csv(CSV_PATH)
print("Dataset Loaded Successfully!")
data.info()

In [None]:
# Preview the first rows of the loaded frame as plain text.
print(data.head())

In [None]:
# Collect the distinct labels that occur in the target column 'Class'
unique_classes = pd.unique(data['Class'])
print("Unique values in the 'Class' column:", unique_classes)

In [None]:
# 'Amount' and 'Time' have different ranges from the other features, so they
# are standardized and the raw columns are replaced by the scaled versions.

from sklearn.preprocessing import StandardScaler

RAW_COLUMNS = ['Amount', 'Time']
SCALED_COLUMNS = ['Normalized_Amount', 'Normalized_Time']

# This cell rebinds `data` to a frame without the raw columns. Without a guard,
# running the cell a second time raises KeyError because 'Amount' and 'Time'
# are already gone. Treat "scaled columns present, raw columns absent" as
# "already done" so the cell is safe to re-run.
already_scaled = (
    set(SCALED_COLUMNS).issubset(data.columns)
    and not set(RAW_COLUMNS).intersection(data.columns)
)

if not already_scaled:
    # NOTE(review): the scalers are fit on the full dataset, before the
    # train/test split done later in the notebook, so test-set statistics
    # influence the transform. Consider fitting on the training rows only.
    data['Normalized_Amount'] = StandardScaler().fit_transform(data[['Amount']])
    data['Normalized_Time'] = StandardScaler().fit_transform(data[['Time']])

    # Drop the original 'Amount' and 'Time' columns
    data = data.drop(columns=RAW_COLUMNS)

print(data.head())

In [None]:
import matplotlib.pyplot as plt

# Absolute and relative frequency of each label in 'Class'
class_counts = data['Class'].value_counts()
class_proportions = data['Class'].value_counts(normalize=True)

# Bar chart of the raw counts; keep the Axes handle for the labelling below
ax = class_counts.plot(kind='bar', color=['blue', 'red'])
print(class_proportions)

# Title, axis labels and readable tick names for the two bars
ax.set_title('Class Distribution')
ax.set_xlabel('Class')
ax.set_ylabel('Count')
ax.set_xticks([0, 1])
ax.set_xticklabels(['Non-Fraud', 'Fraud'], rotation=0)
plt.show()

In [None]:
from sklearn.model_selection import train_test_split

# Target vector is the 'Class' column; every other column is a feature
y = data['Class']
X = data.drop(columns=['Class'])

# Report how imbalanced the labels are before any resampling
print("Class distribution before SMOTE:\n", y.value_counts())

# Hold out 20% of the rows for testing; stratify on y so both splits keep the
# same class ratio, and fix the seed for reproducibility
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [None]:
from imblearn.over_sampling import SMOTE

# Resample the training split only; the test split is not passed to SMOTE
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

# Report the label counts of the resampled training target
smote_class_counts = pd.Series(y_train_smote).value_counts()
print("Class distribution after SMOTE:\n", smote_class_counts)

In [None]:
# Compare the training-feature dimensions before and after resampling
for stage, features in (("before", X_train), ("after", X_train_smote)):
    print(f"Shape of X_train {stage} SMOTE:", features.shape)

# Show the first rows of the resampled features, then of the resampled target
for resampled in (X_train_smote, y_train_smote):
    print(resampled.head())

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score


# Fit a logistic regression classifier on the SMOTE-resampled training data
model_lr = LogisticRegression(max_iter=1000, random_state=42)
model_lr.fit(X_train_smote, y_train_smote)

# Score the held-out test split: hard labels plus class-1 probabilities
y_pred = model_lr.predict(X_test)
fraud_probabilities = model_lr.predict_proba(X_test)[:, 1]

# Report the evaluation metrics
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nROC-AUC Score:", roc_auc_score(y_test, fraud_probabilities))

# Recap of the label counts: full dataset vs. resampled training target
for heading, labels in (
    ("\nClass Distribution Before SMOTE:", y),
    ("\nClass Distribution After SMOTE:", pd.Series(y_train_smote)),
):
    print(heading)
    print(labels.value_counts())

In [None]:
# Confusion matrix of the test-set predictions, drawn as an annotated heatmap
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# Rows are actual classes, columns are predicted classes
cm = confusion_matrix(y_test, y_pred)
class_names = ['Non-Fraud', 'Fraud']

# Draw on an explicit Axes so the labels are attached to the right panel
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
    ax=ax,
)
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
ax.set_title('Confusion Matrix')
plt.show()

In [None]:
# Precision-recall curve and the area under it (AUPRC) on the test split

from sklearn.metrics import precision_recall_curve, auc
import matplotlib.pyplot as plt

# Class-1 probabilities for every test row
y_scores = model_lr.predict_proba(X_test)[:, 1]

# Precision/recall pairs across the score thresholds, then the area under them
precision, recall, thresholds = precision_recall_curve(y_test, y_scores)
auprc = auc(recall, precision)
print(f"Area Under the Precision-Recall Curve (AUPRC): {auprc:.2f}")

# Draw the curve with the AUPRC value in the legend
fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(
    recall,
    precision,
    label=f'Precision-Recall Curve (AUC = {auprc:.2f})',
    color='blue',
)
ax.set_xlabel('Recall')
ax.set_ylabel('Precision')
ax.set_title('Precision-Recall Curve')
ax.legend(loc='best')
plt.show()

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, precision_recall_curve

# Predicted probabilities for the test set; column 1 is taken as the fraud class
y_scores = model_lr.predict_proba(X_test)[:, 1]  # Get fraud probabilities

# 20 evenly spaced decision thresholds between 0 and 1
thresholds = np.linspace(0, 1, 20)  # From 0 to 1 in 20 steps

# Function to plot confusion matrices
def plot_confusion_matrices(thresholds, y_test, y_scores, ncols=5):
    """Plot one confusion-matrix heatmap per decision threshold.

    Parameters
    ----------
    thresholds : sequence of float
        Probability cut-offs. A score greater than or equal to the threshold
        is predicted as class 1, otherwise class 0.
    y_test : array-like
        True binary labels (0/1), aligned with ``y_scores``.
    y_scores : array-like
        Predicted positive-class scores for the same rows as ``y_test``.
    ncols : int, default 5
        Number of panels per row. The number of rows is derived from
        ``len(thresholds)``, so any number of thresholds can be plotted.
        With 20 thresholds and the default this gives a 4x5 grid.

    Raises
    ------
    ValueError
        If ``ncols`` is smaller than 1.
    """
    if ncols < 1:
        raise ValueError("ncols must be at least 1")

    n_plots = len(thresholds)
    if n_plots == 0:
        return  # nothing to draw

    # Size the grid from the number of thresholds instead of assuming 20
    nrows = -(-n_plots // ncols)  # ceiling division
    fig, axes = plt.subplots(
        nrows, ncols, figsize=(4 * ncols, 5 * nrows), squeeze=False
    )
    axes = axes.ravel()  # Flatten the axes for easy iteration

    scores = np.asarray(y_scores)

    for ax, threshold in zip(axes, thresholds):
        # Generate predictions based on the threshold
        y_pred = (scores >= threshold).astype(int)

        # Pin the label order so every panel is a 2x2 matrix with the same
        # layout, even when one class is absent from labels and predictions
        cm = confusion_matrix(y_test, y_pred, labels=[0, 1])

        # Plot the confusion matrix
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
        ax.set_title(f"Threshold: {threshold:.2f}")
        ax.set_xlabel("Predicted")
        ax.set_ylabel("Actual")

    # Hide panels left over when the grid has more cells than thresholds
    for ax in axes[n_plots:]:
        ax.set_visible(False)

    plt.tight_layout()
    plt.show()

# Draw one confusion-matrix heatmap for each threshold in `thresholds`
plot_confusion_matrices(thresholds, y_test, y_scores)