<a href="https://colab.research.google.com/github/maulidacy/Preprocessing-Data_Data-Mining/blob/main/Preprocessing_Data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import Libraries

In [66]:
# pandas & numpy buat olah data
# sklearn buat preprocessing, encoding, scaling, dan split data
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

# Load Dataset

In [67]:
# The dataset is hosted by UCI; this URL points straight at the crx.data file
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/credit-screening/crx.data"

# Column names follow the UCI documentation: A1..A15 are features, Class is the target
cols = ["A" + str(i) for i in range(1, 16)]
cols.append("Class")

# The file has no header row, so names are supplied explicitly.
# A '?' entry is read as a missing value (NaN).
df = pd.read_csv(url, names=cols, header=None, na_values="?")

# Quick sanity checks: dataset size, the first few rows, and missing counts per column
print("Ukuran dataset:", df.shape)

print("\n5 baris pertama:")
print(df.head())

print("\nJumlah missing value per kolom:")
print(df.isna().sum())

Ukuran dataset: (690, 16)

5 baris pertama:
  A1     A2     A3 A4 A5 A6 A7    A8 A9 A10  A11 A12 A13    A14  A15 Class
0  b  30.83  0.000  u  g  w  v  1.25  t   t    1   f   g  202.0    0     +
1  a  58.67  4.460  u  g  q  h  3.04  t   t    6   f   g   43.0  560     +
2  a  24.50  0.500  u  g  q  h  1.50  t   f    0   f   g  280.0  824     +
3  b  27.83  1.540  u  g  w  v  3.75  t   t    5   t   g  100.0    3     +
4  b  20.17  5.625  u  g  w  v  1.71  t   f    0   f   s  120.0    0     +

Jumlah missing value per kolom:
A1       12
A2       12
A3        0
A4        6
A5        6
A6        9
A7        9
A8        0
A9        0
A10       0
A11       0
A12       0
A13       0
A14      13
A15       0
Class     0
dtype: int64


# Handle Missing Values

In [68]:
# Some columns in this dataset are numeric; every other feature column is categorical
numeric_cols = ["A2", "A3", "A8", "A11", "A14", "A15"]
categorical_cols = df.columns.drop(numeric_cols + ["Class"]).tolist()

# Numeric columns are sometimes parsed as strings, so force them to numbers
# (anything unparseable becomes NaN)
for col in numeric_cols:
    df[col] = pd.to_numeric(df[col], errors="coerce")

print("Missing value sebelum imputasi:\n", df.isna().sum())

# Fill missing values:
#   numeric     -> median (robust to outliers)
#   categorical -> mode (the most frequent value)
# NOTE(review): these statistics are computed over every row of df, i.e. before
# the train/test split performed in a later cell — confirm that is intended.
for col in numeric_cols + categorical_cols:
    if col in numeric_cols:
        fill_value = df[col].median()
    else:
        fill_value = df[col].mode()[0]
    df[col] = df[col].fillna(fill_value)

print("\nMissing value sesudah imputasi:\n", df.isna().sum())

Missing value sebelum imputasi:
 A1       12
A2       12
A3        0
A4        6
A5        6
A6        9
A7        9
A8        0
A9        0
A10       0
A11       0
A12       0
A13       0
A14      13
A15       0
Class     0
dtype: int64

Missing value sesudah imputasi:
 A1       0
A2       0
A3       0
A4       0
A5       0
A6       0
A7       0
A8       0
A9       0
A10      0
A11      0
A12      0
A13      0
A14      0
A15      0
Class    0
dtype: int64


# Handle Outliers

In [69]:
# Outliers are capped with the IQR (interquartile range) method:
# take Q1 and Q3, then pull any value lying beyond 1.5 * IQR from the
# quartiles back to the lower/upper bound.
for col in numeric_cols:
    q1, q3 = df[col].quantile(0.25), df[col].quantile(0.75)
    iqr = q3 - q1
    lower, upper = q1 - 1.5 * iqr, q3 + 1.5 * iqr

    # Number of values outside the bounds before capping
    before_outliers = ((df[col] > upper) | (df[col] < lower)).sum()

    # Clamp extreme values to the bounds
    df[col] = df[col].clip(lower=lower, upper=upper)

    # Re-count against the same bounds to confirm nothing is left outside
    after_outliers = ((df[col] > upper) | (df[col] < lower)).sum()
    print(f"{col}: {before_outliers} nilai dicap, tersisa {after_outliers} outlier")

A2: 18 nilai dicap, tersisa 0 outlier
A3: 17 nilai dicap, tersisa 0 outlier
A8: 63 nilai dicap, tersisa 0 outlier
A11: 79 nilai dicap, tersisa 0 outlier
A14: 13 nilai dicap, tersisa 0 outlier
A15: 113 nilai dicap, tersisa 0 outlier


# Split Data Train & Test

In [70]:
# The target (Class) holds '+' and '-'; convert it to numbers for easier processing:
# '+' means approved (1), '-' means rejected (0)
y = df["Class"].map({"+": 1, "-": 0})

# Series.map yields NaN for any label that is not a key of the dict. Fail loudly
# here instead of letting NaN targets flow into the stratified split below.
if y.isna().any():
    raise ValueError("Column 'Class' contains missing or unexpected labels (expected only '+' or '-')")

X = df.drop(columns=["Class"])

# Split the dataset into 80% train and 20% test.
# stratify=y keeps the class proportions the same in train and test;
# random_state fixes the shuffle so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print("Ukuran data train:", X_train.shape)
print("Ukuran data test :", X_test.shape)
print("Distribusi kelas (train):")
print(y_train.value_counts(normalize=True))

Ukuran data train: (552, 15)
Ukuran data test : (138, 15)
Distribusi kelas (train):
Class
0    0.554348
1    0.445652
Name: proportion, dtype: float64


# Encode Categorical & Scale Numeric Features

In [71]:
# To make the data usable by an ML model, categorical columns are turned into
# numbers with one-hot encoding, while numeric columns are standardized so they
# share a common scale (StandardScaler: mean = 0, std = 1).
# NOTE(review): with drop="first" combined with handle_unknown="ignore", the
# scikit-learn docs say a category unseen during fit is encoded as all zeros,
# which is the same encoding as the dropped first category — confirm this is acceptable.
preprocessor = ColumnTransformer(
    [
        ("num", StandardScaler(), numeric_cols),
        ("cat", OneHotEncoder(handle_unknown="ignore", drop="first"), categorical_cols),
    ]
)

# Fit on the training split only, then apply the fitted transform to train and test
X_train_proc = preprocessor.fit_transform(X_train)
X_test_proc = preprocessor.transform(X_test)

print("Shape setelah preprocessing (train):", X_train_proc.shape)
print("Shape setelah preprocessing (test) :", X_test_proc.shape)

# Look at the first 5 rows of the result; densify first when the output
# exposes .toarray() (i.e. it came back as a sparse matrix)
print("\nPreview 5 baris pertama (array):")
print(
    X_train_proc[:5].toarray()
    if hasattr(X_train_proc, "toarray")
    else X_train_proc[:5]
)

Shape setelah preprocessing (train): (552, 37)
Shape setelah preprocessing (test) : (138, 37)

Preview 5 baris pertama (array):
[[-0.46914938  0.87158219 -0.17371658 -0.68058115 -0.11090688 -0.25207941
   0.          1.          0.          0.          0.          0.
   0.          0.          0.          0.          0.          0.
   0.          0.          0.          0.          0.          0.
   0.          0.          0.          0.          0.          0.
   1.          0.          1.          0.          0.          0.
   0.        ]
 [-0.21320063 -0.92300003 -0.73737193 -0.68058115  0.30812527 -0.27884697
   1.          1.          0.          0.          0.          1.
   0.          0.          0.          0.          0.          0.
   0.          0.          0.          0.          0.          0.
   0.          0.          0.          0.          0.          0.
   1.          0.          0.          0.          0.          0.
   0.        ]
 [-0.94229409 -0.89594059 -0.38164