# Imports

In [1]:
try:
    import pandas as pd # for data
    import numpy as np # for num ops

    from sklearn.model_selection import train_test_split  # for splitting the data
    from sklearn.preprocessing import StandardScaler      # for scaling features like 'Amount'

    from imblearn.over_sampling import SMOTE  # for synthetic oversampling

    from sklearn.linear_model import LogisticRegression  # baseline model
    from sklearn.ensemble import RandomForestClassifier  # tree based model

    from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score  # for key metrics

    # for mainly visualization (optional but imported just for precaution)
    import matplotlib.pyplot as plt  # for plotting metrics and distributions
    import seaborn as sns            # for heatmaps and other plots
    print("all imports worked")
except ImportError as e:
    print(f"error: {e}")

Matplotlib is building the font cache; this may take a moment.


all imports worked


# Data

In [2]:
path = "creditcard.csv"
df = pd.read_csv(path)

In [3]:
print(df.shape) # rows and cols

(284807, 31)


In [4]:
print(df.head()) # checking the rows and cols

   Time        V1        V2        V3        V4        V5        V6        V7  \
0   0.0 -1.359807 -0.072781  2.536347  1.378155 -0.338321  0.462388  0.239599   
1   0.0  1.191857  0.266151  0.166480  0.448154  0.060018 -0.082361 -0.078803   
2   1.0 -1.358354 -1.340163  1.773209  0.379780 -0.503198  1.800499  0.791461   
3   1.0 -0.966272 -0.185226  1.792993 -0.863291 -0.010309  1.247203  0.237609   
4   2.0 -1.158233  0.877737  1.548718  0.403034 -0.407193  0.095921  0.592941   

         V8        V9  ...       V21       V22       V23       V24       V25  \
0  0.098698  0.363787  ... -0.018307  0.277838 -0.110474  0.066928  0.128539   
1  0.085102 -0.255425  ... -0.225775 -0.638672  0.101288 -0.339846  0.167170   
2  0.247676 -1.514654  ...  0.247998  0.771679  0.909412 -0.689281 -0.327642   
3  0.377436 -1.387024  ... -0.108300  0.005274 -0.190321 -1.175575  0.647376   
4 -0.270533  0.817739  ... -0.009431  0.798278 -0.137458  0.141267 -0.206010   

        V26       V27       V28 

In [6]:
print(df['Class'].value_counts()) # checking the number of fraudulant transactions and non-fraudulant transactions

Class
0    284315
1       492
Name: count, dtype: int64


In [7]:
print(df.isnull()) # checking for missing values

         Time     V1     V2     V3     V4     V5     V6     V7     V8     V9  \
0       False  False  False  False  False  False  False  False  False  False   
1       False  False  False  False  False  False  False  False  False  False   
2       False  False  False  False  False  False  False  False  False  False   
3       False  False  False  False  False  False  False  False  False  False   
4       False  False  False  False  False  False  False  False  False  False   
...       ...    ...    ...    ...    ...    ...    ...    ...    ...    ...   
284802  False  False  False  False  False  False  False  False  False  False   
284803  False  False  False  False  False  False  False  False  False  False   
284804  False  False  False  False  False  False  False  False  False  False   
284805  False  False  False  False  False  False  False  False  False  False   
284806  False  False  False  False  False  False  False  False  False  False   

        ...    V21    V22    V23    V24

In [8]:
print(df.duplicated) # checking for any dupes (unlikely but good practice)

<bound method DataFrame.duplicated of             Time         V1         V2        V3        V4        V5  \
0            0.0  -1.359807  -0.072781  2.536347  1.378155 -0.338321   
1            0.0   1.191857   0.266151  0.166480  0.448154  0.060018   
2            1.0  -1.358354  -1.340163  1.773209  0.379780 -0.503198   
3            1.0  -0.966272  -0.185226  1.792993 -0.863291 -0.010309   
4            2.0  -1.158233   0.877737  1.548718  0.403034 -0.407193   
...          ...        ...        ...       ...       ...       ...   
284802  172786.0 -11.881118  10.071785 -9.834783 -2.066656 -5.364473   
284803  172787.0  -0.732789  -0.055080  2.035030 -0.738589  0.868229   
284804  172788.0   1.919565  -0.301254 -3.249640 -0.557828  2.630515   
284805  172788.0  -0.240440   0.530483  0.702510  0.689799 -0.377961   
284806  172792.0  -0.533413  -0.189733  0.703337 -0.506271 -0.012546   

              V6        V7        V8        V9  ...       V21       V22  \
0       0.462388  0.23

# Splitting the Data and Training

In [9]:
X = df.drop(columns=['Class']) # drops the class col as its not used for predictions 
y = df['Class'] # setting y as the output/target variable 

# (80/20 test and training)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y)

## Training on the imbalanced set

In [14]:
# scaling the model amount and time might have values on a much larger scale the than V1-V28 PCA features. improves convergence (optional)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

model = LogisticRegression(random_state=42, max_iter=7000) # Instantiating the model
model.fit(X_train, y_train) # training the model

### The Results after evaluating using a confusion matrix

In [16]:
y_pred = model.predict(X_test_scaled) # make predictions on test set
# Evaluate on test set: check confusion matrix, precision, recall, and F1.
print("confusion matrix:\n", confusion_matrix(y_test, y_pred))
'''
TN - True Negatives Non Fraud Correctly Pred as Fraud
FP - False Positives Non Fraud Incorrectly Pred as Fraud
FN - False Negatives Fraud Cases Incorrectly Pred as Non Fraud
TP - True Positives Fraud Cases Correctly Pred as Fraud
[[TN, FP],
 [FN, TP]]
'''

confusion matrix:
 [[56837    27]
 [   41    57]]




## Classification Report

In [20]:
print(classification_report(y_test, y_pred))

'''
predictied 100% of the non fraud cases
predicted 68% with precision, and 58% with recall, missing 42% of them
it struggles to detect fraud effectively
'''

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     56864
           1       0.68      0.58      0.63        98

    accuracy                           1.00     56962
   macro avg       0.84      0.79      0.81     56962
weighted avg       1.00      1.00      1.00     56962



## Calculating the ROC-AUC Score

In [21]:
y_prob = model.predict_proba(X_test_scaled)[:, 1] # getting probs for class 1 which is fraud

# calculation
roc_auc = roc_auc_score(y_test, y_prob)
print("roc-auc score:\n", roc_auc)

roc-auc score:
 0.9432776592629173


