In [31]:
# load libraries
import pandas as pd
from sklearn.model_selection import train_test_split
import pickle

# local model helpers
from model_utils import imb_ratio

# Index of the fold being prepared; the seed below and the name of the
# exported pickle file are both derived from it.
current_k_fold = 1

# Fold-specific random seed: the cube of the fold index.
SEED = pow(current_k_fold, 3)

# Lift pandas' display limits so frames are rendered with every column and row.
pd.options.display.max_columns = None
pd.options.display.max_rows = None

### Pre-process Credit Card data

In [32]:
# Location of the raw credit card CSV (from Pozzolo et al. 2015, p. 7).
full_data_path = "./creditcard.csv"

# Column names applied to the CSV: "Time", the PCA-transformed features
# V1..V28 (anonymized for confidentiality, hence not interpretable), "Amount",
# and "label" (1 if fraud, 0 otherwise).
col_names = ["Time", *(f"V{i}" for i in range(1, 29)), "Amount", "label"]

# Load the data, replacing the file's own header row with col_names, then
# remove duplicate rows and finally the "Time" column.
# NOTE(review): index_col=0 together with the later drop of "Time" presumably
# relies on the CSV carrying a leading index column -- confirm against the file.
# NOTE(review): duplicates are detected before "Time" is removed, so rows that
# differ only in "Time" are kept -- confirm this is intended.
df = (
    pd.read_csv(full_data_path, names=col_names, index_col=0, header=0)
    .drop_duplicates()
    .drop(columns="Time")
)

IR = imb_ratio(df["label"].value_counts())
print(f"Imbalance ratio in full data: {IR}")

# Separate the features (X) from the target (y).
y = df["label"]
X = df.drop(columns="label")

# Partition the data set. Every split is stratified on the label so that the
# class distribution is preserved in each partition.

# Step 1: 80/20 split of the full data into training data and a held-out part.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=SEED,
    stratify=y,
)

# Step 2: halve the held-out part into a validation set and the final test set
# (X_test / y_test are re-bound to the final test half).
X_val, X_test, y_val, y_test = train_test_split(
    X_test,
    y_test,
    test_size=0.5,
    random_state=SEED,
    stratify=y_test,
)

# Imbalance ratio of each partition; they should be (nearly) the same.
IR_train = imb_ratio(y_train.value_counts())
IR_val = imb_ratio(y_val.value_counts())
IR_test = imb_ratio(y_test.value_counts())

print(f"Imbalance ratio in training data: {IR_train}")
print(f"Imbalance ratio in validation data: {IR_val}")
print(f"Imbalance ratio in test data: {IR_test}")

# Size of each partition.
print(f"\nNumber of samples in training data: {len(y_train)}")
print(f"Number of samples in validation data: {len(y_val)}")
print(f"Number of samples in test data: {len(y_test)}")

# Total count of missing cells in the pre-processed frame.
print(f"\nMissing values in overall data: {df.isna().sum().sum()}")


Imbalance ratio in full data: 598.84
Imbalance ratio in training data: 599.48
Imbalance ratio in validation data: 602.68
Imbalance ratio in test data: 590.1

Number of samples in training data: 226980
Number of samples in validation data: 28373
Number of samples in test data: 28373

Missing values in overall data: 0


### Export pre-processed data via serialization (pickle)

In [33]:
# Bundle the full data, the train/validation/test partitions and the column
# names into a single dictionary so they can be exported together.
# NOTE(review): col_names still lists "Time" although that column was dropped
# from the data -- confirm that consumers of the pickle expect this.
cc13_preprocessed = dict(
    # full data
    X=X,
    y=y,
    # training partition
    X_train=X_train,
    y_train=y_train,
    # validation partition
    X_val=X_val,
    y_val=y_val,
    # test partition
    X_test=X_test,
    y_test=y_test,
    # column names as passed to the CSV reader
    col_names=col_names,
)

# Write the bundle to a pickle file whose name carries the current fold index.
with open(f'cc13_preprocessed_k{current_k_fold}.pkl', 'wb') as f:
    pickle.dump(cc13_preprocessed, f)
    print(f'Data serialized to cc13_preprocessed_k{current_k_fold}.pkl')

Data serialized to cc13_preprocessed_k1.pkl
