<a href="https://colab.research.google.com/github/maulidacy/Preprocessing-Data_Data-Mining/blob/main/Preprocessing_Data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import os
import sys
import json
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import mutual_info_classif


# LOAD DATASET

In [2]:
# ------------------------------
# 1) LOAD DATASET
# ------------------------------
# Column names following the UCI documentation (16 columns: 15 features + 1 label).
# The features are simply numbered A1..A15, so they are generated rather than typed out.
COLUMN_NAMES = [f"A{i}" for i in range(1, 16)] + ["label"]

# Location of the local file and the official UCI URL.
LOCAL_FILE = "crx.data"
UCI_URL = (
    "https://archive.ics.uci.edu/ml/machine-learning-databases/"
    "credit-screening/crx.data"
)

def load_dataset(local_file=None, url=None, column_names=None):
    """Load the dataset from a local file if present, otherwise from a URL.

    The file has no header row, and missing values are written as '?';
    they are converted to NaN while reading.

    Parameters
    ----------
    local_file : str, optional
        Path checked first. Defaults to the module-level ``LOCAL_FILE``.
    url : str, optional
        Fallback location used when ``local_file`` does not exist.
        Defaults to the module-level ``UCI_URL``.
    column_names : list of str, optional
        Names assigned to the columns. Defaults to the module-level
        ``COLUMN_NAMES``.

    Returns
    -------
    pandas.DataFrame
        The parsed data with '?' replaced by NaN.

    Raises
    ------
    RuntimeError
        If the chosen source cannot be read or parsed. The underlying
        exception is chained as ``__cause__`` so the full traceback stays
        visible in the notebook (unlike ``sys.exit``, which raises
        ``SystemExit`` and hides the real failure).
    """
    # Resolve defaults at call time so the module constants remain the single
    # source of truth while still letting callers override them.
    local_file = LOCAL_FILE if local_file is None else local_file
    url = UCI_URL if url is None else url
    column_names = COLUMN_NAMES if column_names is None else column_names

    # Prefer the local copy; only touch the network when it is absent.
    source = local_file if os.path.exists(local_file) else url

    try:
        return pd.read_csv(source, header=None, names=column_names, na_values='?')
    except (OSError, ValueError) as e:
        # OSError covers missing files and network/HTTP failures; ValueError
        # covers pandas parsing and decoding errors.
        raise RuntimeError(
            f"Gagal memuat dataset dari '{source}'. Pastikan file '{local_file}' "
            "ada di direktori kerja atau koneksi internet tersedia."
        ) from e

# Load the dataset once into the notebook-level frame `df` that the later cells use.
df = load_dataset()


# EKSPLORASI DATA SINGKAT (EDA)

In [3]:
# ------------------------------
# 2) QUICK EXPLORATORY DATA ANALYSIS (EDA)
# ------------------------------
print("\n=== SHAPE DATA ===")
print(df.shape)

print("\n=== 5 BARIS PERTAMA ===")
print(df.head())

print("\n=== INFO DATA ===")
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() would append a stray "None" line to the output.
df.info()

print("\n=== JUMLAH MISSING PER KOLOM ===")
print(df.isna().sum())

# Summary statistics for the numeric columns only. The coercion to numeric is
# done on a copy so that this preview never modifies `df`.
preview_numeric_cols = ["A2","A3","A8","A11","A14","A15"]
numeric_preview = df.copy()
numeric_preview[preview_numeric_cols] = numeric_preview[preview_numeric_cols].apply(
    pd.to_numeric, errors='coerce'
)
print("\n=== DESKRIPSI NUMERIK (preview) ===")
print(numeric_preview[preview_numeric_cols].describe())



=== SHAPE DATA ===
(690, 16)

=== 5 BARIS PERTAMA ===
  A1     A2     A3 A4 A5 A6 A7    A8 A9 A10  A11 A12 A13    A14  A15 label
0  b  30.83  0.000  u  g  w  v  1.25  t   t    1   f   g  202.0    0     +
1  a  58.67  4.460  u  g  q  h  3.04  t   t    6   f   g   43.0  560     +
2  a  24.50  0.500  u  g  q  h  1.50  t   f    0   f   g  280.0  824     +
3  b  27.83  1.540  u  g  w  v  3.75  t   t    5   t   g  100.0    3     +
4  b  20.17  5.625  u  g  w  v  1.71  t   f    0   f   s  120.0    0     +

=== INFO DATA ===
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 690 entries, 0 to 689
Data columns (total 16 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   A1      678 non-null    object 
 1   A2      678 non-null    float64
 2   A3      690 non-null    float64
 3   A4      684 non-null    object 
 4   A5      684 non-null    object 
 5   A6      681 non-null    object 
 6   A7      681 non-null    object 
 7   A8      690 non-null    float64
 

# PERSIAPAN TARGET & TIPE KOLOM

In [4]:
# ------------------------------
# 3) PREPARE TARGET & COLUMN TYPES
# ------------------------------
# Label: '+' (approved) -> 1, '-' (rejected) -> 0
label_map = {'+': 1, '-': 0}

# Guard against re-running this cell: once the labels are already 0/1, mapping
# them again would turn every value into NaN and silently destroy the target.
if not df['label'].isin(list(label_map.values())).all():
    encoded_label = df['label'].map(label_map)
    unmapped = df.loc[encoded_label.isna(), 'label'].unique()
    if len(unmapped) > 0:
        # Fail loudly instead of carrying NaN labels into the split.
        raise ValueError(
            f"Unexpected label values {list(unmapped)}; expected only "
            f"{list(label_map)}. Reload the dataset before re-running this cell."
        )
    df['label'] = encoded_label

# Numeric and categorical columns as given by the dataset documentation.
numeric_features = ["A2","A3","A8","A11","A14","A15"]
non_categorical = set(numeric_features) | {'label'}
categorical_features = [c for c in df.columns if c not in non_categorical]

# Make sure the numeric columns really are numeric; anything unparsable becomes NaN.
for col in numeric_features:
    df[col] = pd.to_numeric(df[col], errors='coerce')


# SPLIT DATA (sebelum transformasi, agar tidak bocor)

In [5]:
# ------------------------------
# 4) SPLIT DATA (before any transformation, to avoid leakage)
# ------------------------------
y = df['label']
X = df.drop(columns=['label'])

# Hold out 20% of the rows, stratified on the label, with a fixed seed so the
# split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

print("\n=== SHAPE SPLIT ===")
print(f"X_train: {X_train.shape} X_test: {X_test.shape}")
print(f"y_train: {y_train.shape} y_test: {y_test.shape}")



=== SHAPE SPLIT ===
X_train: (552, 15) X_test: (138, 15)
y_train: (552,) y_test: (138,)


# PIPELINE PREPROCESSING

In [6]:
# ------------------------------
# 5) PREPROCESSING PIPELINE
# ------------------------------
# Numeric columns: fill gaps with the median, then standardise.
numeric_steps = [
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
]
numeric_transformer = Pipeline(steps=numeric_steps)

# Categorical columns: fill gaps with the mode, then one-hot encode.
# Categories unseen during fit are ignored at transform time.
categorical_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
]
categorical_transformer = Pipeline(steps=categorical_steps)

# Route each column group to its transformer; any column not listed is dropped.
column_routes = [
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
]
preprocessor = ColumnTransformer(transformers=column_routes, remainder='drop')

# Fit on the training split only, then apply the fitted transform to the test split.
X_train_prep = preprocessor.fit_transform(X_train)
X_test_prep = preprocessor.transform(X_test)

print("\n=== SHAPE SETELAH PREPROCESSING ===")
print(f"X_train_prep: {X_train_prep.shape} X_test_prep: {X_test_prep.shape}")



=== SHAPE SETELAH PREPROCESSING ===
X_train_prep: (552, 46) X_test_prep: (138, 46)


# FEATURE SELECTION SEDERHANA (Opsional)

In [7]:
# ------------------------------
# 6) (OPTIONAL) SIMPLE FEATURE SELECTION
# ------------------------------
# Example: keep the best features according to mutual information.
# top_k = min(20, number of features) so the slice is always valid.
n_features = X_train_prep.shape[1]
top_k = min(20, n_features)

# For a dense matrix mutual_info_classif's default discrete_features='auto'
# treats every column as continuous. The one-hot columns are discrete, so mark
# them explicitly; the column range produced by the 'cat' transformer is read
# from the fitted ColumnTransformer.
discrete_mask = np.zeros(n_features, dtype=bool)
discrete_mask[preprocessor.output_indices_['cat']] = True

mi_scores = mutual_info_classif(
    X_train_prep,
    y_train,
    discrete_features=discrete_mask,
    random_state=42,
)
# Feature indices ordered by MI score, highest first.
top_idx = np.argsort(mi_scores)[::-1][:top_k]

# Apply the same column selection to both splits.
X_train_sel = X_train_prep[:, top_idx]
X_test_sel  = X_test_prep[:, top_idx]

print("\n=== FEATURE SELECTION ===")
print(f"Total fitur setelah OHE: {n_features}")
print(f"Ambil TOP-{top_k} fitur berdasarkan Mutual Information.")
print("Shape terpilih:", X_train_sel.shape, X_test_sel.shape)



=== FEATURE SELECTION ===
Total fitur setelah OHE: 46
Ambil TOP-20 fitur berdasarkan Mutual Information.
Shape terpilih: (552, 20) (138, 20)


# SIMPAN HASIL (opsional, memudahkan latihan)

In [9]:
# ------------------------------
# 7) SAVE RESULTS (optional, handy for practice)
# ------------------------------
outdir = "preprocessed_output"
os.makedirs(outdir, exist_ok=True)

# Feature matrices are written as .npy files.
feature_arrays = {
    "X_train_prep.npy": X_train_prep,
    "X_test_prep.npy": X_test_prep,
    "X_train_sel.npy": X_train_sel,
    "X_test_sel.npy": X_test_sel,
}
for filename, array in feature_arrays.items():
    np.save(os.path.join(outdir, filename), array)

# Targets are written as CSV without the index column.
label_series = {"y_train.csv": y_train, "y_test.csv": y_test}
for filename, labels in label_series.items():
    labels.to_csv(os.path.join(outdir, filename), index=False)

# Column metadata kept for reference: numeric and categorical feature names,
# the selected feature indices, and the post-preprocessing shapes.
meta = dict(
    numeric_features=numeric_features,
    categorical_features=categorical_features,
    selected_feature_indices=top_idx.tolist(),
    train_shape_after_preprocessing=list(X_train_prep.shape),
    test_shape_after_preprocessing=list(X_test_prep.shape),
)
metadata_path = os.path.join(outdir, "metadata.json")
with open(metadata_path, "w") as fh:
    json.dump(meta, fh, indent=2)

print("\nSelesai. File hasil preprocessing disimpan ke folder:", outdir)


Selesai. File hasil preprocessing disimpan ke folder: preprocessed_output
