In [None]:
import numpy as np
from pathlib import Path
import re
import ast

import matplotlib.pyplot as plt
import seaborn as sns

from preprocessor import cleaner, outlier_handler, scaler

In [2]:
# Root of the data tree, expressed relative to the notebook's working directory.
DATA_DIR = Path("..") / "data"
# Directory the raw input CSV is read from.
RAW_DATA_DIR = DATA_DIR.joinpath("raw")
# Presumably the destination for processed outputs; not used in the cells shown here.
PROCESSED_DATA_DIR = DATA_DIR.joinpath("processed")

In [3]:
# Load the raw CSV into a column-oriented dict: {header field: [raw string, ...]}.
# Fields are stored exactly as matched (surrounding quotes included); renaming
# and numeric conversion are done by later cells.
cols = {}

# A field is a run of non-comma, non-quote characters and/or complete quoted
# sections, so a comma inside quotes does not split the field.
# NOTE(review): an empty field (two adjacent commas) yields no match at all, so
# the remaining values of that row would shift one column to the left.
pattern = re.compile(r"""((?:[^,"']|"[^"]*"|'[^']*')+)""")

# `with` guarantees the handle is closed, even if parsing raises part-way through.
with open(RAW_DATA_DIR / "creditcard.csv", "r", encoding="utf-8") as file:
    # The first line is the header: one empty list per column name.
    first_line = file.readline()
    entries = pattern.findall(first_line.strip())

    for entry in entries:
        cols[entry] = []

    # Position -> column name, built once instead of once per field
    # (dicts preserve insertion order, so this matches the header order).
    column_names = list(cols)

    # Remaining lines are data rows; the i-th field goes to the i-th column.
    for line in file:
        entries = pattern.findall(line.strip())

        for i, entry in enumerate(entries):
            cols[column_names[i]].append(entry)

# 1. Làm sạch dữ liệu thô

## 1.1 Làm sạch tên cột

In [4]:
def clean_column_name(cols):
    """Return a new dict whose keys are the cleaned versions of ``cols``' keys.

    Each key is passed through ``Preprocessor.clean_string``; the values are
    carried over as the same objects (not copied) and the input dict is left
    unmodified. If two keys clean to the same string, the later one replaces
    the earlier one.

    NOTE(review): ``Preprocessor`` is not among the names imported in the
    notebook's import cell -- confirm it is available on a fresh kernel.

    Args:
        cols: Mapping of column name to column values.

    Returns:
        dict: Mapping of cleaned column name to the original column values.
    """
    return {Preprocessor.clean_string(name): values for name, values in cols.items()}


# Rebind `cols` to the mapping keyed by cleaned names; the original dict is not
# modified in place, but it is no longer reachable through this name.
cols = clean_column_name(cols)

## 1.2 Chuyển tất cả các cột thành kiểu dữ liệu số

In [5]:
# NOTE(review): `Preprocessor` is not among the names imported in the first cell
# (`cleaner`, `outlier_handler`, `scaler`) -- confirm it is defined on a fresh kernel.
# Presumably converts every column's raw strings to numbers; the recorded cell
# output shows NumPy arrays as the resulting values.
cols = Preprocessor.convert_cols_to_numeric(cols)
# Bare last expression: renders the whole dict as this cell's output.
cols

{'time': array([0.00000e+00, 0.00000e+00, 1.00000e+00, ..., 1.72788e+05,
        1.72788e+05, 1.72792e+05]),
 'v1': array([-1.35980713,  1.19185711, -1.35835406, ...,  1.91956501,
        -0.24044005, -0.53341252]),
 'v2': array([-0.07278117,  0.26615071, -1.34016307, ..., -0.30125385,
         0.53048251, -0.18973334]),
 'v3': array([ 2.53634674,  0.16648011,  1.77320934, ..., -3.24963981,
         0.70251023,  0.70333737]),
 'v4': array([ 1.37815522,  0.44815408,  0.37977959, ..., -0.55782812,
         0.68979917, -0.50627124]),
 'v5': array([-0.33832077,  0.06001765, -0.50319813, ...,  2.63051512,
        -0.37796113, -0.01254568]),
 'v6': array([ 0.46238778, -0.08236081,  1.80049938, ...,  3.0312601 ,
         0.62370772, -0.64961669]),
 'v7': array([ 0.23959855, -0.07880298,  0.79146096, ..., -0.29682653,
        -0.68617999,  1.57700625]),
 'v8': array([ 0.0986979 ,  0.08510165,  0.24767579, ...,  0.70841718,
         0.67914546, -0.41465041]),
 'v9': array([ 0.36378697, -0.25542

# 2. Chuẩn hoá dữ liệu (Normalization)

In [6]:
# Based on the EDA results.

# Ask the helper which columns to treat as outlier-heavy.
# `threshold_percentage=10` is presumably the minimum share (in %) of outlying
# values a column needs in order to be selected -- TODO confirm against the helper.
cols_with_outliers = Preprocessor.get_cols_with_outliers(cols, threshold_percentage=10)
# Bare last expression: displays the selected column names.
cols_with_outliers

['v27', 'v28', 'amount']

## 2.1 Xử lí các giá trị ngoại lai

In [7]:
# Two clipping passes are applied, in this order, to every flagged column.
# The full first pass runs over all columns before the second pass starts.
# NOTE(review): in the recorded output the outlier count is unchanged by the
# first pass and drops to 0 only after the second -- presumably because
# `count_outliers` uses IQR bounds; confirm against the helper.
clipping_passes = (
    # Pass 1: clip to the 1st-99th percentile range.
    lambda values: Preprocessor.clip_outliers(
        col=values,
        lower_percentile=1,
        upper_percentile=99,
    ),
    # Pass 2: clip to the IQR bounds.
    lambda values: Preprocessor.clip_outliers_by_iqr(values),
)

for clip in clipping_passes:
    for col in cols_with_outliers:
        print(f"Clipping outliers in column: {col}")

        outliers_before = Preprocessor.count_outliers(cols[col])
        print(f"Before clipping outliers: {outliers_before}")

        # Replace the column with its clipped version.
        cols[col] = clip(cols[col])

        outliers_after = Preprocessor.count_outliers(cols[col])
        print(f"After clipping outliers: {outliers_after}")
        print()

Clipping outliers in column: v27
Before clipping outliers: 39163
After clipping outliers: 39163

Clipping outliers in column: v28
Before clipping outliers: 30342
After clipping outliers: 30342

Clipping outliers in column: amount
Before clipping outliers: 31904
After clipping outliers: 31904

Clipping outliers in column: v27
Before clipping outliers: 39163
After clipping outliers: 0

Clipping outliers in column: v28
Before clipping outliers: 30342
After clipping outliers: 0

Clipping outliers in column: amount
Before clipping outliers: 31904
After clipping outliers: 0

