In [None]:
# Import necessary libraries
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, roc_auc_score
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler  # Import StandardScaler

# Step 1: Load the dataset from the specified path
file_path = r'C:\Users\saira\Downloads\Credit-Card-Fraud-Detection\data\creditcard.csv'  # Update with the correct file path

# Try strict UTF-8 first: it rejects byte sequences it cannot decode, so a
# wrong guess is detectable. ISO-8859-1 maps every possible byte to a
# character and therefore never raises UnicodeDecodeError; it has to be the
# last resort, because any fallback placed after it can never run.
try:
    data = pd.read_csv(file_path, encoding='utf-8')
    print("Dataset Loaded Successfully with UTF-8 encoding")
except UnicodeDecodeError:
    # Not valid UTF-8: fall back to the permissive single-byte encoding.
    data = pd.read_csv(file_path, encoding='ISO-8859-1')
    print("Dataset Loaded Successfully with ISO-8859-1 encoding")

# Steps 2-3: Show the column index and the first rows so the target column
# can be identified by eye. Each heading is printed on its own line, followed
# by the object's normal printed form.
print("Columns in the dataset:", data.columns, sep="\n")
print("Dataset Head:", data.head(), sep="\n")

# Step 4: Name of the label column (0: Non-fraud, 1: Fraud)
target_column = 'Class'

# Step 5: Validate the target column and handle missing labels
if target_column not in data.columns:
    # Stop here with a clear message: every later step indexes this column,
    # so continuing would only fail further down with a less helpful error.
    raise KeyError(f"Column '{target_column}' not found in the dataset. Please check the column name.")

# Rows without a label cannot be used for supervised learning. Filling them
# with the column mean would invent a fractional value that is neither class,
# so those rows are dropped instead and the labels are kept as integers.
data = data.dropna(subset=[target_column]).astype({target_column: int})

# Step 6: Remove columns that are not used as model inputs.
# 'Time' is dropped here as a modeling choice; extend the list if other
# columns should be excluded as well.
data = data.drop(columns=['Time'])

# Step 7: Separate the target variable (y) from the feature set (X)
y = data[target_column]  # Target variable (fraud detection)
X = data.drop(columns=[target_column])  # Independent variables (every remaining column)

# Step 8: Train-Test Split (80% training, 20% testing)
# Split BEFORE scaling so that no statistic of the held-out rows can influence
# preprocessing. stratify=y keeps the class ratio the same in both partitions,
# which matters when one class is rare.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Step 9: Feature Scaling
# The scaler learns mean/std from the training rows only; the test rows are
# transformed with those same training statistics.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Full feature matrix scaled with the training statistics, kept so that later
# code referring to X_scaled continues to work.
X_scaled = scaler.transform(X)

# Step 10: Model Selection and Training
# A Random Forest with a fixed seed; fit() returns the estimator itself, so
# construction and training are chained. Other classifiers can be swapped in.
model = RandomForestClassifier(random_state=42).fit(X_train, y_train)

# Step 11: Model Evaluation
# Keep both the positive-class probability (second predict_proba column) and
# the hard class predictions for the held-out rows.
y_pred_proba = model.predict_proba(X_test)[:, 1]
y_pred = model.predict(X_test)

# Confusion matrix of actual vs. predicted labels on the test split
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:", conf_matrix, sep="\n")

# Per-class text report produced by classification_report
class_report = classification_report(y_test, y_pred)
print("Classification Report:", class_report, sep="\n")

# Overall accuracy (from hard predictions) and ROC-AUC (from the
# positive-class probabilities)
accuracy = accuracy_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred_proba)
print(f"Accuracy: {accuracy}", f"ROC-AUC Score: {roc_auc}", sep="\n")

# Step 12: Visualization (Confusion Matrix Heatmap)
# The same tick labels are used on both axes; rows are actual classes and
# columns are predicted classes.
class_labels = ['Non-Fraud', 'Fraud']
ax = sns.heatmap(
    conf_matrix,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_labels,
    yticklabels=class_labels,
)
ax.set(xlabel='Predicted', ylabel='Actual', title='Confusion Matrix')
plt.show()

# Optional: Cross-validation (for model evaluation)
# The scaler is placed inside a pipeline and the raw features are passed in,
# so within every fold the scaling statistics come from that fold's training
# portion only. cross_val_score clones the pipeline, so the already fitted
# `model` is left untouched.
cv_pipeline = make_pipeline(StandardScaler(), model)

# ROC-AUC is used instead of accuracy: when one class is rare, a model that
# always predicts the majority class still gets a very high accuracy.
cross_val_scores = cross_val_score(cv_pipeline, X, y, cv=5, scoring='roc_auc')
print(f"Cross-validation ROC-AUC scores: {cross_val_scores}")
print(f"Mean cross-validation ROC-AUC score: {cross_val_scores.mean()}")


Dataset Loaded Successfully with ISO-8859-1 encoding
Columns in the dataset:
Index(['Time', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10',
       'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20',
       'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'Amount',
       'Class'],
      dtype='object')
Dataset Head:
   Time        V1        V2        V3        V4        V5        V6        V7  \
0   0.0 -1.359807 -0.072781  2.536347  1.378155 -0.338321  0.462388  0.239599   
1   0.0  1.191857  0.266151  0.166480  0.448154  0.060018 -0.082361 -0.078803   
2   1.0 -1.358354 -1.340163  1.773209  0.379780 -0.503198  1.800499  0.791461   
3   1.0 -0.966272 -0.185226  1.792993 -0.863291 -0.010309  1.247203  0.237609   
4   2.0 -1.158233  0.877737  1.548718  0.403034 -0.407193  0.095921  0.592941   

         V8        V9  ...       V21       V22       V23       V24       V25  \
0  0.098698  0.363787  ... -0.018307  0.277838 -0.110474  0.066928  0.128539 