In [None]:
# import the necessary packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Fetch the credit-card dataset (only when it is not already on disk) and load
# it into a DataFrame. A pure-Python download replaces the `!wget` shell escape
# so the cell also runs where wget is unavailable, and the existence check
# avoids re-downloading the file on every "Run All".
from pathlib import Path
from urllib.request import urlretrieve

import pandas as pd

DATA_URL = "https://huggingface.co/datasets/JEFFREY-VERDIERE/Creditcard/resolve/main/creditcard.csv"
DATA_PATH = Path("creditcard.csv")

if not DATA_PATH.exists():
    # Download under a temporary name first so an interrupted transfer never
    # leaves a truncated creditcard.csv that a later run would trust.
    partial_path = DATA_PATH.with_name(DATA_PATH.name + ".part")
    urlretrieve(DATA_URL, partial_path)
    partial_path.replace(DATA_PATH)

df = pd.read_csv(DATA_PATH)

In [None]:
# Show every column name of the loaded frame, one per line
for column_name in df.columns:
    print(column_name)

In [None]:
# Dimensions of the frame, followed by pandas' descriptive statistics for it
dataset_shape = df.shape
print("Shape of the dataset:", dataset_shape)

print("\nSummary statistics:")
summary_stats = df.describe()
print(summary_stats)

In [None]:
# Per-column count of missing values before cleaning
print("Null values in each column:")
null_counts_before = df.isna().sum()
print(null_counts_before)

# Keep only complete rows: any row holding at least one null is discarded
df = df.dropna()

# Same per-column count again, to confirm nothing is missing any more
print("\nAfter dropping nulls, null values in each column:")
null_counts_after = df.isna().sum()
print(null_counts_after)

In [None]:
# Histogram grid for the frame's columns; the 20x20 figure size can be adjusted
hist_axes = df.hist(figsize=(20, 20))
plt.show()

In [None]:
# Label convention for the 'Class' column: 1 = fraud, 0 = valid transaction
is_fraud = df['Class'] == 1
is_valid = df['Class'] == 0

# Rows labelled as fraud
fraud_cases = df[is_fraud]
print("Number of Fraud Cases:", len(fraud_cases))

# Rows labelled as valid
valid_cases = df[is_valid]
print("Number of Valid Cases:", len(valid_cases))

In [None]:
## Correlation matrix EXAMPLE
# Pairwise correlations between all columns, drawn as an annotated heatmap.
corrmat = df.corr()
plt.figure(figsize=(36, 25))

# `linewidths` (plural) is seaborn's documented keyword for the spacing between
# cells; it is also the spelling used by the confusion-matrix heatmap later in
# this notebook. The colour scale is capped at 0.8 via `vmax`.
sns.heatmap(
    corrmat,
    vmax=.8,
    square=True,
    annot=True,
    cmap="coolwarm",
    linewidths=2,
)
plt.show()

In [None]:
# Name of the label column; every other column is used as a feature
target = 'Class'

# Full column list, then the same list with the label column filtered out
all_columns = df.columns.tolist()
columns = [name for name in all_columns if name != 'Class']

X = df[columns]   # feature matrix: all columns except 'Class'
Y = df[target]    # label vector: only the 'Class' column

print("Shape of X (features):", X.shape)
print("Shape of Y (target):", Y.shape)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing, with a fixed seed for reproducibility.
# The fraud class is presumably rare (compare the class counts printed earlier),
# so the split is stratified on the label: both partitions then keep the same
# fraud/valid ratio, instead of leaving the number of fraud rows in the test
# set to chance.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.20,
    random_state=42,
    stratify=Y,
)

In [None]:
# Sanity check: report the shape of each of the four partitions
for split_name, split_data in (
    ("X_train", X_train),
    ("X_test", X_test),
    ("Y_train", Y_train),
    ("Y_test", Y_test),
):
    print(f"{split_name} shape:", split_data.shape)

In [None]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training split only, then apply the
# same fitted transform to both splits.
sc = StandardScaler()
sc.fit(X_train)
X_train_sc = sc.transform(X_train)
X_test_sc = sc.transform(X_test)

# Cell output: dtype of the scaled test array (expected to be a float type)
X_test_sc.dtype

In [None]:
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

In [None]:
# Baseline model: decision tree fitted on the unscaled training features
from sklearn.tree import DecisionTreeClassifier

dt_model = DecisionTreeClassifier(random_state=42)
dt_model = dt_model.fit(X_train, Y_train)  # fit() returns the same estimator

# Predict the held-out rows and report the fraction classified correctly
y_pred = dt_model.predict(X_test)
accuracy = accuracy_score(y_true=Y_test, y_pred=y_pred)
print(f"Accuracy Score: {accuracy:.10f}")

In [None]:
# Decision tree again, this time on features standardized with StandardScaler
from sklearn.preprocessing import StandardScaler

# Scaling statistics are learned from the training split; the test split is
# transformed with those same statistics.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_sc = scaler.transform(X_train)
X_test_sc = scaler.transform(X_test)

dt_model_scaled = DecisionTreeClassifier(random_state=42)
dt_model_scaled = dt_model_scaled.fit(X_train_sc, Y_train)  # fit() returns the same estimator

# Predict the scaled held-out rows and report accuracy
y_pred_scaled = dt_model_scaled.predict(X_test_sc)
accuracy_scaled = accuracy_score(y_true=Y_test, y_pred=y_pred_scaled)
print(f"Scaled Accuracy Score: {accuracy_scaled:.10f}")

In [None]:
# Random forest (20 trees, entropy criterion) fitted on the unscaled training split
from sklearn.ensemble import RandomForestClassifier

rf_clf = RandomForestClassifier(
    n_estimators=20,
    criterion="entropy",
    random_state=5,
)
rf_clf.fit(X_train, Y_train)
y_pred_rf = rf_clf.predict(X_test)

# Cell output: accuracy on the held-out rows
accuracy_score(Y_test, y_pred_rf)

In [None]:
# Same random forest configuration, trained and evaluated on the StandardScaler
# outputs (X_train_sc / X_test_sc) instead of the raw features
rf_clf_sc = RandomForestClassifier(
    n_estimators=20,
    criterion="entropy",
    random_state=5,
)
rf_clf_sc.fit(X_train_sc, Y_train)
y_pred_rf_sc = rf_clf_sc.predict(X_test_sc)

# Cell output: accuracy on the scaled held-out rows
accuracy_score(Y_test, y_pred_rf_sc)

In [None]:
# AdaBoost over entropy-criterion decision trees, fitted on the unscaled features
from sklearn.ensemble import AdaBoostClassifier

# Tree handed to AdaBoost as the estimator to boost
abd_base_tree = DecisionTreeClassifier(criterion="entropy", random_state=20)

abd_clf = AdaBoostClassifier(
    abd_base_tree,
    n_estimators=200,
    learning_rate=0.1,
    algorithm="SAMME",
    random_state=1,
)
abd_clf.fit(X_train, Y_train)
y_pred_abd = abd_clf.predict(X_test)

# Cell output: accuracy on the held-out rows
accuracy_score(Y_test, y_pred_abd)

In [None]:
# Same AdaBoost configuration, trained and evaluated on the StandardScaler outputs

# Tree handed to AdaBoost as the estimator to boost
abd_sc_base_tree = DecisionTreeClassifier(criterion="entropy", random_state=20)

abd_clf_sc = AdaBoostClassifier(
    abd_sc_base_tree,
    n_estimators=200,
    learning_rate=0.1,
    algorithm="SAMME",
    random_state=1,
)
abd_clf_sc.fit(X_train_sc, Y_train)
y_pred_abd_sc = abd_clf_sc.predict(X_test_sc)

# Cell output: accuracy on the scaled held-out rows
accuracy_score(Y_test, y_pred_abd_sc)

In [None]:
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Train XGBoost on the unscaled features.
# `use_label_encoder` is intentionally not passed: recent XGBoost releases
# deprecated the option and now ignore it (it only triggers an "unused
# parameter" warning). The target already uses 0/1 integer labels, so no label
# encoding is needed.
xgb_model = XGBClassifier(eval_metric='logloss', random_state=42)
xgb_model.fit(X_train, Y_train)

# Predict the held-out rows
y_pred_xgb = xgb_model.predict(X_test)

# Fraction of held-out rows classified correctly
accuracy_xgb = accuracy_score(Y_test, y_pred_xgb)
print(f"XGBoost Accuracy (no scaling): {accuracy_xgb:.10f}")

In [None]:
# Train XGBoost on the StandardScaler outputs.
# `use_label_encoder` is intentionally not passed: recent XGBoost releases
# deprecated the option and now ignore it (it only triggers an "unused
# parameter" warning). The target already uses 0/1 integer labels, so no label
# encoding is needed.
xgb_model_scaled = XGBClassifier(eval_metric='logloss', random_state=42)
xgb_model_scaled.fit(X_train_sc, Y_train)

# Predict the scaled held-out rows
y_pred_xgb_scaled = xgb_model_scaled.predict(X_test_sc)

# Fraction of held-out rows classified correctly
accuracy_xgb_scaled = accuracy_score(Y_test, y_pred_xgb_scaled)
print(f"XGBoost Accuracy (with scaling): {accuracy_xgb_scaled:.10f}")

In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# Confusion matrix for the XGBoost predictions made on the unscaled test features
cm = confusion_matrix(y_true=Y_test, y_pred=y_pred_xgb)

In [None]:
# Draw the confusion matrix as an annotated heatmap with integer cell labels
plt.figure(figsize=(6, 4))
cm_ax = sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    linewidths=1,
    linecolor='black',
)

# Axis labels and title, set on the axes object that heatmap() returns
cm_ax.set_xlabel('Predicted Labels')
cm_ax.set_ylabel('True Labels')
cm_ax.set_title('Confusion Matrix')

plt.show()

In [None]:
from sklearn.metrics import classification_report

# Text report of per-class metrics for the XGBoost predictions on the unscaled test features
xgb_report = classification_report(Y_test, y_pred_xgb, target_names=["Valid", "Fraud"])
print(xgb_report)