In [24]:
import kagglehub
import pandas as pd
import numpy as np

from IPython.display import Markdown as md

In [25]:
RAND = 42

In [26]:
def calc_abnormal_rate(y, round=0):
    return np.round((y.mean()), round)

def load_creditcard():
    path = kagglehub.dataset_download("mlg-ulb/creditcardfraud")
    path = f"{path}/creditcard.csv"
    data = pd.read_csv(path)
    X = data.iloc[:, :-1]
    y = data.iloc[:, -1]
    return X, y

In [27]:
data, target = load_creditcard()
unique, counts = np.unique(target, return_counts=True)

In [28]:
N_NORMAL = 25000
selected_idc = data[target.to_numpy() == 0].iloc[:N_NORMAL].index
data_normal = data.loc[selected_idc]
target_normal = target.loc[selected_idc]
data_normal_abnormal_rate = calc_abnormal_rate(target_normal)

In [29]:
N_NORMAL_ADDITIONAL = 5000
selected_idc = data[target.to_numpy() == 0].iloc[N_NORMAL:].sample(N_NORMAL_ADDITIONAL, random_state=RAND).index
data_additional_normal = data.loc[selected_idc]
target_additional_normal = target.loc[selected_idc]
data_additional_normal_abnormal_rate = calc_abnormal_rate(target_normal)

assert (len(set(data_normal.index).intersection(data_additional_normal.index)) == 0)

In [30]:
selected_idc = data[target.to_numpy() == 1].index
data_abnormal = data.loc[selected_idc]
target_abnormal = target.loc[selected_idc]
data_abnormal_abnormal_rate = calc_abnormal_rate(target_abnormal)

In [31]:
# Construct Quarto output

## Original dataset

In [32]:
#| output: true
print(f"The dataset has the following shape: {data.shape} / {target.shape}")

The dataset has the following shape: (284807, 30) / (284807,)


In [33]:
#| output: true
#| label: tbl-data
#| tbl-cap: "Predictors"
display(data)

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,0.251412,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62
1,0.0,1.191857,0.266151,0.166480,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.069083,-0.225775,-0.638672,0.101288,-0.339846,0.167170,0.125895,-0.008983,0.014724,2.69
2,1.0,-1.358354,-1.340163,1.773209,0.379780,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.524980,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.208038,-0.108300,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.50
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,0.408542,-0.009431,0.798278,-0.137458,0.141267,-0.206010,0.502292,0.219422,0.215153,69.99
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
284802,172786.0,-11.881118,10.071785,-9.834783,-2.066656,-5.364473,-2.606837,-4.918215,7.305334,1.914428,...,1.475829,0.213454,0.111864,1.014480,-0.509348,1.436807,0.250034,0.943651,0.823731,0.77
284803,172787.0,-0.732789,-0.055080,2.035030,-0.738589,0.868229,1.058415,0.024330,0.294869,0.584800,...,0.059616,0.214205,0.924384,0.012463,-1.016226,-0.606624,-0.395255,0.068472,-0.053527,24.79
284804,172788.0,1.919565,-0.301254,-3.249640,-0.557828,2.630515,3.031260,-0.296827,0.708417,0.432454,...,0.001396,0.232045,0.578229,-0.037501,0.640134,0.265745,-0.087371,0.004455,-0.026561,67.88
284805,172788.0,-0.240440,0.530483,0.702510,0.689799,-0.377961,0.623708,-0.686180,0.679145,0.392087,...,0.127434,0.265245,0.800049,-0.163298,0.123205,-0.569159,0.546668,0.108821,0.104533,10.00


In [34]:
#| output: true
#| label: tbl-target-unique-counts
#| tbl-cap: "Unique target values and counts"
pd.Series(counts, index=["Normal (0)", "Abnormal (1)"], name="count").to_frame().style \
	.format(thousands=",")

Unnamed: 0,count
Normal (0),284315
Abnormal (1),492


## Preprocessing

Collecting the following records in separate dataframes:

1. Normal observations from the beginning of the time-ordered data, for training
2. Additional randomly selected normal observations to hold out
3. All the abnormal observations to hold out

In [35]:
#| output: true
md(f"### Normal observations for training")

### Normal observations for training

In [36]:
#| output: true
#| label: tbl-data-normal
#| tbl-cap: "Normal observations"

display(data_normal)
display(md(f"<div style='background-color: #40c9a24D; padding: 1rem'>Abnormal class rate: {data_normal_abnormal_rate:.2%}</div>"))

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,0.251412,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62
1,0.0,1.191857,0.266151,0.166480,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.069083,-0.225775,-0.638672,0.101288,-0.339846,0.167170,0.125895,-0.008983,0.014724,2.69
2,1.0,-1.358354,-1.340163,1.773209,0.379780,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.524980,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.208038,-0.108300,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.50
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,0.408542,-0.009431,0.798278,-0.137458,0.141267,-0.206010,0.502292,0.219422,0.215153,69.99
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25083,33512.0,1.086018,-0.435632,-0.290246,-1.055857,0.248559,0.913232,-0.295144,0.509301,1.186803,...,-0.392583,0.130192,0.665504,-0.056260,-1.030128,0.552438,-0.432086,0.091953,-0.017752,1.00
25084,33513.0,-0.617200,0.336665,1.723695,-1.928427,-0.301154,-0.427726,0.169817,0.164416,-1.586744,...,0.175974,0.088898,0.183499,-0.049288,0.209785,-0.158160,-0.562962,0.333607,0.159949,7.95
25085,33513.0,1.291304,-0.607151,0.516463,-0.766320,-1.007609,-0.444194,-0.704371,0.097925,-0.962729,...,0.000237,0.041503,-0.067952,0.133799,0.182377,0.145284,-0.416937,0.009512,0.007952,19.95
25086,33513.0,1.276512,-0.612617,0.509852,-0.764929,-0.985441,-0.426368,-0.676299,0.082089,-1.005510,...,0.034571,0.049276,-0.050830,0.124063,0.203041,0.152193,-0.419270,0.009725,0.010038,27.85


<div style='background-color: #40c9a24D; padding: 1rem'>Abnormal class rate: 0.00%</div>

In [37]:
#| output: true
md(f"### Additional normal observations")

### Additional normal observations

In [38]:
#| output: true
#| label: tbl-data-normal-additional
#| tbl-cap: "Additional normal observations, randomly selected"

display(data_additional_normal)
display(md(f"<div style='background-color: #40c9a24D; padding: 1rem'>Abnormal class rate: {data_additional_normal_abnormal_rate:.2%}</div>"))

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
270215,163968.0,-0.329265,0.936263,-1.501363,-0.157131,3.434361,3.590494,0.772506,0.762990,-0.733110,...,0.123419,0.121151,0.508647,-0.224248,0.718017,0.073811,-0.369039,0.321240,0.249078,20.23
237723,149365.0,1.987844,-0.990349,-1.071402,-2.249845,-0.577344,-0.336126,-0.698753,0.061308,2.364275,...,-0.086090,0.287380,0.967960,0.029287,0.320236,0.040949,-0.772158,0.063625,-0.037944,43.99
69148,53290.0,-1.512849,1.591712,-1.035610,-1.230774,1.556277,3.337519,-0.432556,1.779576,-0.487230,...,0.230815,-0.270784,-0.965462,0.096365,0.976411,0.122179,0.102461,0.257855,0.092698,44.99
245394,152734.0,0.242185,0.887620,-0.309447,-0.591291,1.035734,-0.760411,1.129676,-0.280716,0.221198,...,0.051153,-0.407659,-0.891387,0.058176,0.451953,-0.457317,0.093062,0.052742,-0.141193,5.99
74221,55473.0,-0.407640,1.024358,1.558869,-0.029263,-0.111501,-0.908901,0.673450,-0.041586,-0.328826,...,0.064224,-0.234225,-0.630000,-0.021049,0.354486,-0.209896,0.072678,0.263135,0.119861,1.29
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
170102,120016.0,-0.254285,0.022598,-1.205879,-3.205432,3.185703,3.021180,0.461720,0.570733,-1.402025,...,0.206502,-0.005822,-0.083932,-0.189200,0.596602,0.061300,-0.395335,-0.061841,-0.073239,25.00
122152,76453.0,1.325066,0.376343,-0.086458,0.459260,0.159826,-0.502476,0.110844,-0.179217,-0.000976,...,-0.007034,-0.343779,-0.954470,0.001541,-0.608061,0.367856,0.152343,-0.020030,0.022406,1.79
34758,37811.0,-0.881077,1.204745,1.748039,0.895185,-0.199700,-0.186360,0.437241,0.136618,-0.123505,...,0.250744,-0.146617,-0.130225,-0.072515,0.518046,-0.135993,-0.510301,0.373038,0.275097,5.66
229824,146068.0,2.056561,-0.050105,-1.061507,0.419619,-0.140627,-1.215369,0.196336,-0.342874,0.523066,...,-0.211789,-0.285717,-0.689385,0.337552,0.049620,-0.296644,0.194707,-0.070473,-0.059129,4.49


<div style='background-color: #40c9a24D; padding: 1rem'>Abnormal class rate: 0.00%</div>

In [39]:
#| output: true
md(f"### All the abnormal observations")

### All the abnormal observations

In [40]:
#| output: true
#| label: tbl-data-abnormal
#| tbl-cap: "All the abnormal observations"

display(data_abnormal)
display(md(f"<div style='background-color: #613dc14D; padding: 1rem'>Abnormal class rate: {data_abnormal_abnormal_rate:.2%}</div>"))

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
541,406.0,-2.312227,1.951992,-1.609851,3.997906,-0.522188,-1.426545,-2.537387,1.391657,-2.770089,...,0.126911,0.517232,-0.035049,-0.465211,0.320198,0.044519,0.177840,0.261145,-0.143276,0.00
623,472.0,-3.043541,-3.157307,1.088463,2.288644,1.359805,-1.064823,0.325574,-0.067794,-0.270953,...,2.102339,0.661696,0.435477,1.375966,-0.293803,0.279798,-0.145362,-0.252773,0.035764,529.00
4920,4462.0,-2.303350,1.759247,-0.359745,2.330243,-0.821628,-0.075788,0.562320,-0.399147,-0.238253,...,-0.430022,-0.294166,-0.932391,0.172726,-0.087330,-0.156114,-0.542628,0.039566,-0.153029,239.93
6108,6986.0,-4.397974,1.358367,-2.592844,2.679787,-1.128131,-1.706536,-3.496197,-0.248778,-0.247768,...,-0.171608,0.573574,0.176968,-0.436207,-0.053502,0.252405,-0.657488,-0.827136,0.849573,59.00
6329,7519.0,1.234235,3.019740,-4.304597,4.732795,3.624201,-1.357746,1.713445,-0.496358,-1.282858,...,0.009061,-0.379068,-0.704181,-0.656805,-1.632653,1.488901,0.566797,-0.010016,0.146793,1.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
279863,169142.0,-1.927883,1.125653,-4.518331,1.749293,-1.566487,-2.010494,-0.882850,0.697211,-2.064945,...,1.252967,0.778584,-0.319189,0.639419,-0.294885,0.537503,0.788395,0.292680,0.147968,390.00
280143,169347.0,1.378559,1.289381,-5.004247,1.411850,0.442581,-1.326536,-1.413170,0.248525,-1.127396,...,0.226138,0.370612,0.028234,-0.145640,-0.081049,0.521875,0.739467,0.389152,0.186637,0.76
280149,169351.0,-0.676143,1.126366,-2.213700,0.468308,-1.120541,-0.003346,-2.234739,1.210158,-0.652250,...,0.247968,0.751826,0.834108,0.190944,0.032070,-0.739695,0.471111,0.385107,0.194361,77.89
281144,169966.0,-3.113832,0.585864,-5.399730,1.817092,-0.840618,-2.943548,-2.208002,1.058733,-1.632333,...,0.306271,0.583276,-0.269209,-0.456108,-0.183659,-0.328168,0.606116,0.884876,-0.253700,245.00


<div style='background-color: #613dc14D; padding: 1rem'>Abnormal class rate: 100.00%</div>

In [41]:
#| output: true
md("## Constructing the desired dataset")

## Constructing the desired dataset

In [42]:
X_train = data_normal.copy()
X_holdout = pd.concat((data_additional_normal.copy(), data_abnormal.copy()))
y_holdout = pd.concat((target_additional_normal.copy(), target_abnormal.copy()))

In [43]:
#| output: true
#| label: tbl-desired-dataset-details
#| tbl-cap: "Details of the obtained data sets"

df_set_info = pd.DataFrame(
    {
		'Name': ["Training set", "Holdout set"],
		'Description': ["For training a model", "For validation and/or test purposes"],
		'Length': [len(X_train), len(X_holdout)],
        'Characteristics': ["Only normal observations", f"{len(data_normal):,} normal and all {len(data_abnormal):,} abnormal observations"]
    },
    columns=['Name', 'Description', 'Length', 'Characteristics']).set_index('Name')

df_set_info.T.style \
	.format(thousands=",")

Name,Training set,Holdout set
Description,For training a model,For validation and/or test purposes
Length,25000,5492
Characteristics,Only normal observations,"25,000 normal and all 492 abnormal observations"


In [44]:
""" X_train.to_csv("data/X_train")
X_holdout.to_csv("data/X_holdout")
y_holdout.to_csv("data/y_holdout") """

' X_train.to_csv("data/X_train")\nX_holdout.to_csv("data/X_holdout")\ny_holdout.to_csv("data/y_holdout") '