In [None]:
# Install the `datasets` package; the import cell below takes `load_dataset` from it.
# NOTE(review): the install is unpinned and uses the `!pip` shell escape — consider
# `%pip install datasets==<version>` so it targets the kernel's environment reproducibly.
!pip install datasets

# **Importing the DataSet**

In [None]:
# Importing necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datasets import load_dataset
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
import time
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, roc_auc_score,
    classification_report, confusion_matrix, ConfusionMatrixDisplay, RocCurveDisplay
)
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Conv1D, MaxPooling1D, Flatten, LSTM, GRU, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
import sklearn.metrics as metrics
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score



In [None]:
# Load the "tanzuhuggingface/creditcardfraudtraining" dataset from Hugging Face.
# The returned object is indexed by split name; the 'train' split is read from it later.
dataset = load_dataset("tanzuhuggingface/creditcardfraudtraining")

In [None]:
# Convert the 'train' split into a pandas DataFrame.
# `Dataset.to_pandas()` is the library's own conversion and works from the
# underlying Arrow table, instead of having pandas consume the split as a
# generic Python sequence the way `pd.DataFrame(dataset['train'])` does.
df = dataset['train'].to_pandas()

In [None]:
# Preview the first rows of the DataFrame to see its columns and some sample values
df.head()  # last expression in the cell, so the notebook renders it as a table

In [None]:
# Print a structural summary of the DataFrame
df.info()  # column names, dtypes and non-null counts

In [None]:
# Absolute number of rows per distinct value of the "is_fraud" column
df["is_fraud"].value_counts()  # shows how the rows are split between the classes

In [None]:
# Check for missing values in the dataset
df.isnull().sum()  # per-column count of null entries

In [None]:
# Basic descriptive statistics of the dataset

df.describe()  # count, mean, std, min, quartiles and max for the numerical columns

In [None]:
# Relative frequency of each "is_fraud" value, scaled from a fraction to a percentage
fraud_percentage = df['is_fraud'].value_counts(normalize=True).mul(100)

# Echo the Series so the degree of class imbalance is visible in the cell output
fraud_percentage

# **Drop the specified columns**

In [None]:
# Identifier-style columns that are removed before analysis and modelling
columns_to_drop = ['id', 'index']

# Drop them from the DataFrame. This cell rebinds `df`, so on a second run the
# columns are already gone; `errors="ignore"` makes the drop a no-op in that
# case instead of raising KeyError, keeping the cell safe to re-run.
df = df.drop(columns=columns_to_drop, errors="ignore")

# Display confirmation message
print("Columns dropped successfully.")

In [None]:
# Re-inspect the DataFrame structure after the column drop in the previous cell
df.info()

# **EDA (Exploratory Data Analysis)**

**Class Distribution**

In [None]:
# Square canvas for the class-balance chart
plt.figure(figsize=(6, 6))

# One bar per class. `hue` is set to the same column as `x` because seaborn
# 0.13+ deprecates passing `palette` without `hue`; `dodge=False` keeps each
# bar centred on its tick, exactly as in a hue-less count plot.
ax = sns.countplot(
    data=df,
    x='is_fraud',
    hue='is_fraud',
    palette=["#FF9999", "#66B2FF"],  # Custom colors for better distinction
    dodge=False,
)

# A hue legend would only repeat the x-axis labels, so remove it if one was drawn
legend = ax.get_legend()
if legend is not None:
    legend.remove()

# Add title and labels for better understanding
plt.title("Class Distribution (Fraud vs. Non-Fraud)", fontsize=14, fontweight='bold')
plt.xlabel("Fraud Status (0 = Non-Fraud, 1 = Fraud)", fontsize=12)
plt.ylabel("Count", fontsize=12)

# Replace the raw 0/1 tick labels with readable class names
plt.xticks(ticks=[0, 1], labels=["Non-Fraud", "Fraud"], fontsize=11)

# Render the figure
plt.show()

In [None]:
# Legend text for the hue levels, listed in the order seaborn puts them in its
# legend (ascending: 0 first, then 1).
# NOTE(review): assumes "is_fraud" is encoded as 0 = Non-Fraud, 1 = Fraud,
# as the axis labels elsewhere in this notebook state.
fraud_labels = ["Non-Fraud", "Fraud"]

# Draw one figure per feature, split by fraud status. No figure is opened
# outside the loop: each iteration creates its own, so an extra one up front
# would only be rendered as an empty plot.
for column in df.columns:
    if column == "is_fraud":
        continue  # the target is the hue variable, not a feature to plot

    plt.figure(figsize=(6, 4))  # Set individual plot size

    # Categorical or low-cardinality columns get a count plot; everything else a histogram
    if df[column].dtype == 'object' or df[column].nunique() < 10:
        ax = sns.countplot(data=df, x=column, hue="is_fraud", palette=["#66B2FF", "#FF9999"])
    else:
        ax = sns.histplot(data=df, x=column, hue="is_fraud", element="step", palette=["#66B2FF", "#FF9999"], bins=30, kde=True)

    # Set titles and labels
    plt.title(f"Distribution of {column} by Fraud Status", fontsize=12, fontweight='bold')
    plt.xlabel(column, fontsize=11)
    plt.ylabel("Count", fontsize=11)

    # Relabel seaborn's own legend rather than calling plt.legend(labels=...):
    # the latter throws seaborn's legend away and re-pairs the labels with
    # whatever artists matplotlib finds, in draw order, which can attach the
    # class names to the wrong colours.
    legend = ax.get_legend()
    if legend is not None:
        legend.set_title("Fraud Status")
        for text, label in zip(legend.get_texts(), fraud_labels):
            text.set_text(label)

    plt.xticks(rotation=45)  # Rotate labels for better readability if necessary
    plt.show()


In [None]:
# Set figure size for better readability
plt.figure(figsize=(8, 8))

# Compute the pairwise correlation matrix over numeric (and boolean) columns only.
# Recent pandas versions raise on non-numeric columns in `DataFrame.corr()`
# instead of silently skipping them, so the selection is made explicit here.
corr_matrix = df.select_dtypes(include=["number", "bool"]).corr()

# Draw the annotated heatmap
sns.heatmap(
    corr_matrix,
    cmap="viridis",  # colour map for the correlation values
    annot=True,  # write each coefficient inside its cell
    fmt=".2f",  # two decimal places
    annot_kws={"size": 14, "weight": "bold"},  # font size and weight for annotations
    linewidths=0.5,  # gridlines between cells for better separation
    linecolor="white",  # grid color
    cbar_kws={"shrink": 0.75}  # shrink the color bar relative to the axes
)

# Set title and tick styling
plt.title("Correlation Heatmap of Features", fontsize=18, fontweight="bold", color="#333333")
plt.xticks(fontsize=14, fontweight='bold', rotation=45, color="#444444")  # rotate x labels for clarity
plt.yticks(fontsize=14, fontweight='bold', color="#444444")

# Show the plot
plt.show()


In [None]:
# Split the frame into the label vector and the predictor matrix
y = df['is_fraud']  # target: fraud labels (0 = Non-Fraud, 1 = Fraud)
X = df.drop(['is_fraud'], axis=1)  # predictors: every column except the target



In [None]:
# Display the feature matrix built above (all columns of df except 'is_fraud')
X

In [None]:
# (rows, columns) of the feature matrix
X.shape

In [None]:
# Shape of the target vector; X and y come from the same df, so its length matches X's row count
y.shape