In [None]:
import numpy as np
from pathlib import Path
import re

import matplotlib.pyplot as plt
import seaborn as sns

from data_preparation.preprocessing.cleaner import Cleaner
from data_preparation.preprocessing.outlier_handler import OutlierHandler
from data_preparation.preprocessing.scaler import Scaler

In [2]:
# Project data locations. The paths are relative, so they resolve against the
# kernel's current working directory.
DATA_DIR = Path("../data")
RAW_DATA_DIR = DATA_DIR / "raw"  # input: creditcard.csv is read from here
PROCESSED_DATA_DIR = DATA_DIR / "processed"  # output: creditcard_processed.csv is written here

In [3]:
# Load the raw CSV into `cols`: a dict mapping each header entry to the list of
# raw string values found in that column position.
cols = {}

# One match per field: a run of characters that are not a comma or quote, or a
# complete double-/single-quoted section (which may itself contain commas).
# NOTE: the pattern needs at least one character, so an empty field (",,")
# yields no match and would shift the remaining values of that row one column
# to the left.
pattern = re.compile(r"""((?:[^,"']|"[^"]*"|'[^']*')+)""")

# The context manager guarantees the handle is closed, even if parsing fails.
with open(RAW_DATA_DIR / "creditcard.csv", "r", encoding="utf-8") as file:
    # The first line is the header; each entry becomes a column key.
    first_line = file.readline()
    for entry in pattern.findall(first_line.strip()):
        cols[entry] = []

    # Resolve the column order once instead of rebuilding the key list for
    # every single field of every row.
    column_values = list(cols.values())

    for line in file:
        entries = pattern.findall(line.strip())

        # Indexing (rather than zip) keeps the IndexError for rows that carry
        # more fields than the header declares.
        for i, entry in enumerate(entries):
            column_values[i].append(entry)

# 1. Làm sạch dữ liệu thô

## 1.1 Làm sạch tên cột

In [4]:
def clean_column_name(cols):
    """Return a new dict with every key of ``cols`` run through ``Cleaner.clean_string``.

    The values are carried over as-is (the same objects, not copies). If two
    original keys clean to the same string, the value of the later key wins.
    """
    return {Cleaner.clean_string(name): values for name, values in cols.items()}


cols = clean_column_name(cols)

## 1.2 Chuyển tất cả các cột thành kiểu dữ liệu số

In [5]:
# Rebind `cols` to the result of Cleaner.convert_cols_to_numeric; the recorded
# output shows each column as a NumPy array afterwards.
cols = Cleaner.convert_cols_to_numeric(cols)
# Bare expression: the notebook renders the whole mapping as the cell output.
cols

{'time': array([0.00000e+00, 0.00000e+00, 1.00000e+00, ..., 1.72788e+05,
        1.72788e+05, 1.72792e+05], shape=(284807,)),
 'v1': array([-1.35980713,  1.19185711, -1.35835406, ...,  1.91956501,
        -0.24044005, -0.53341252], shape=(284807,)),
 'v2': array([-0.07278117,  0.26615071, -1.34016307, ..., -0.30125385,
         0.53048251, -0.18973334], shape=(284807,)),
 'v3': array([ 2.53634674,  0.16648011,  1.77320934, ..., -3.24963981,
         0.70251023,  0.70333737], shape=(284807,)),
 'v4': array([ 1.37815522,  0.44815408,  0.37977959, ..., -0.55782812,
         0.68979917, -0.50627124], shape=(284807,)),
 'v5': array([-0.33832077,  0.06001765, -0.50319813, ...,  2.63051512,
        -0.37796113, -0.01254568], shape=(284807,)),
 'v6': array([ 0.46238778, -0.08236081,  1.80049938, ...,  3.0312601 ,
         0.62370772, -0.64961669], shape=(284807,)),
 'v7': array([ 0.23959855, -0.07880298,  0.79146096, ..., -0.29682653,
        -0.68617999,  1.57700625], shape=(284807,)),
 'v8':

# 2. Chuẩn hoá dữ liệu (Normalization)

In [6]:
# Based on the EDA results.
# NOTE(review): threshold_percentage=10 presumably selects columns in which
# more than 10% of the values are outliers — confirm against OutlierHandler.

cols_with_outliers = OutlierHandler.get_cols_with_outliers(
    cols,
    threshold_percentage=10,
)
cols_with_outliers

['v27', 'v28', 'amount']

## 2.1 Xử lí các giá trị ngoại lai

In [7]:
# Two clipping passes are applied, in this order, to every flagged column:
#   1. clip to the 1st / 99th percentiles
#   2. clip to the IQR bounds
# Each pass runs over all flagged columns before the next pass starts, and the
# outlier count is printed before and after each clip.
clipping_passes = (
    lambda values: OutlierHandler.clip_outliers(
        col=values,
        lower_percentile=1,
        upper_percentile=99,
    ),
    OutlierHandler.clip_outliers_by_iqr,
)

for clip in clipping_passes:
    for col in cols_with_outliers:
        print(f"Clipping outliers in column: {col}")

        outliers_before = OutlierHandler.count_outliers(cols[col])
        print(f"Before clipping outliers: {outliers_before}")

        # Replace the stored column with its clipped version.
        cols[col] = clip(cols[col])

        outliers_after = OutlierHandler.count_outliers(cols[col])
        print(f"After clipping outliers: {outliers_after}")
        print()

Clipping outliers in column: v27
Before clipping outliers: 39163
After clipping outliers: 39163

Clipping outliers in column: v28
Before clipping outliers: 30342
After clipping outliers: 30342

Clipping outliers in column: amount
Before clipping outliers: 31904
After clipping outliers: 31904

Clipping outliers in column: v27
Before clipping outliers: 39163
After clipping outliers: 0

Clipping outliers in column: v28
Before clipping outliers: 30342
After clipping outliers: 0

Clipping outliers in column: amount
Before clipping outliers: 31904
After clipping outliers: 0



## 2.2 Scale dữ liệu

In [8]:
# Columns that get robust scaling; this list is also consulted further down to
# work out which columns are left for standard scaling.
cols_to_robust_scale = ["time", "amount"]

for column_name in cols_to_robust_scale:
    print(f"Robust scaling column: {column_name}")
    # Overwrite the stored column with its scaled counterpart.
    scaled_values = Scaler.robust_scale_column(cols[column_name])
    cols[column_name] = scaled_values
    print()

Robust scaling column: time

Robust scaling column: amount



### Lưu ý:
Dữ liệu phải được scale trước khi chạy PCA transformation. Vì vậy, các cột còn lại (ngoài `time`, `amount` và `class`) không cần scale thêm nữa, như đã nói trong phần EDA; cell bên dưới được vô hiệu hoá bằng `%%script true`.

In [9]:
%%script true
# NOTE: the `%%script true` cell magic hands this cell's text to the `true`
# command instead of the Python kernel, so nothing below is executed.
# Kept for reference: it would pass every column except the robust-scaled
# ones and "class" through Scaler.standardize_column.

cols_to_standard_scale = list(set(cols.keys()) - set(cols_to_robust_scale) - {"class"})
cols_to_standard_scale

for standard_scale_col in cols_to_standard_scale:
    print(f"Standard scaling column: {standard_scale_col}")
    cols[standard_scale_col] = Scaler.standardize_column(cols[standard_scale_col])
    print()

In [10]:
# How many leading values of each column to show in the preview.
nums_rows_to_display = 5

# Sanity check after scaling: print the head of every column.
for col, values in cols.items():
    print(f"First {nums_rows_to_display} entries in column '{col}':")
    print(values[:nums_rows_to_display])
    print()

First 5 entries in column 'time':
[-0.99498349 -0.99498349 -0.99497175 -0.99497175 -0.99496   ]

First 5 entries in column 'v1':
[-1.35980713  1.19185711 -1.35835406 -0.96627171 -1.15823309]

First 5 entries in column 'v2':
[-0.07278117  0.26615071 -1.34016307 -0.18522601  0.87773675]

First 5 entries in column 'v3':
[2.53634674 0.16648011 1.77320934 1.79299334 1.54871785]

First 5 entries in column 'v4':
[ 1.37815522  0.44815408  0.37977959 -0.86329128  0.40303393]

First 5 entries in column 'v5':
[-0.33832077  0.06001765 -0.50319813 -0.01030888 -0.40719338]

First 5 entries in column 'v6':
[ 0.46238778 -0.08236081  1.80049938  1.24720317  0.09592146]

First 5 entries in column 'v7':
[ 0.23959855 -0.07880298  0.79146096  0.23760894  0.59294075]

First 5 entries in column 'v8':
[ 0.0986979   0.08510165  0.24767579  0.37743587 -0.27053268]

First 5 entries in column 'v9':
[ 0.36378697 -0.25542513 -1.51465432 -1.38702406  0.81773931]

First 5 entries in column 'v10':
[ 0.09079417 -0.1669

# 3. Lưu dữ liệu đã được xử lí

In [11]:
# Write the processed columns back out as a CSV: one header line with the
# column names, then one line per row with the values joined by commas.

# Create the output directory if it is missing, so the notebook also runs on a
# fresh checkout.
PROCESSED_DATA_DIR.mkdir(parents=True, exist_ok=True)

# The context manager closes (and flushes) the file even if a write fails
# part-way through.
with open(PROCESSED_DATA_DIR / "creditcard_processed.csv", "w", encoding="utf-8") as file:
    header = ",".join(cols.keys())
    file.write(header + "\n")

    # Resolve the column order once; it matches the header order above.
    columns = list(cols.values())

    # The row count is taken from the "class" column; a shorter column would
    # raise IndexError rather than silently truncate the output.
    for i in range(len(cols["class"])):
        row = ",".join(str(column[i]) for column in columns)
        file.write(row + "\n")