In [1]:
import pandas as pd
import numpy as np
import re
import openpyxl

# mengubah data transaksi list menjadi matriks biner (0 dan 1) agar bisa dibaca mesin
from mlxtend.preprocessing import TransactionEncoder

# fpgrowth: frequent-itemset mining algorithm; association_rules: builds "if-then" rules
from mlxtend.frequent_patterns import fpgrowth, association_rules

In [2]:
# Source workbook; a relative path, resolved against the current working directory.
raw_data_path = "./data/Data-Sintetis_Pengeluaran-Harian-Mahasiswa.xlsx"
df_raw = pd.read_excel(raw_data_path)

# Preview the first five rows (5 is the default for DataFrame.head).
df_raw.head()

Unnamed: 0,Tanggal,Jenis Pengeluaran,Label,Kategori,Nominal (IDR)
0,01/03/2025,Top Up E-Wallet,Hiburan Digital,Keinginan,25000
1,01/03/2025,Print Tugas,Pendidikan,Kebutuhan,17000
2,01/03/2025,Nasi Padang,Makanan,Kebutuhan,28000
3,02/03/2025,Sunscreen,Skincare,Keinginan,85000
4,02/03/2025,Hangout di Mall,Nongkrong,Keinginan,132000


# Filter Dataset Keinginan

In [3]:
# Boolean mask of the rows whose "Kategori" value is exactly "Keinginan".
is_keinginan = df_raw["Kategori"].eq("Keinginan")

# .copy() makes the subset an independent frame rather than a slice of df_raw.
df_desireCategory = df_raw.loc[is_keinginan].copy()
df_desireCategory.head()

Unnamed: 0,Tanggal,Jenis Pengeluaran,Label,Kategori,Nominal (IDR)
0,01/03/2025,Top Up E-Wallet,Hiburan Digital,Keinginan,25000
3,02/03/2025,Sunscreen,Skincare,Keinginan,85000
4,02/03/2025,Hangout di Mall,Nongkrong,Keinginan,132000
5,02/03/2025,Moisturizer,Skincare,Keinginan,31000
7,03/03/2025,Kentang Goreng,Makanan,Keinginan,21000


# Preprocessing Data

## Data Cleaning

### Cek Missing Values

In [8]:
# Per-column count of null entries; the next cleaning cell branches on this total.
missing_values = df_desireCategory.isna().sum()
print(missing_values)

Tanggal              0
Jenis Pengeluaran    0
Label                0
Kategori             0
Nominal (IDR)        0
dtype: int64


In [11]:
# Decide once, from the null counts computed earlier, whether any row must go.
has_missing = missing_values.sum() > 0

# Either way the result is a new frame, so df_desireCategory stays untouched.
df_missingValueClean = df_desireCategory.dropna() if has_missing else df_desireCategory.copy()

if has_missing:
  dropped_rows = len(df_desireCategory) - len(df_missingValueClean)
  print(f"Data berhasil dibersihkan. {dropped_rows} baris dihapus.")
else:
  print("Tidak ada missing values")

Tidak ada missing values


In [12]:
# Preview the frame after the missing-value step (head() shows 5 rows by default).
df_missingValueClean.head()

Unnamed: 0,Tanggal,Jenis Pengeluaran,Label,Kategori,Nominal (IDR)
0,01/03/2025,Top Up E-Wallet,Hiburan Digital,Keinginan,25000
3,02/03/2025,Sunscreen,Skincare,Keinginan,85000
4,02/03/2025,Hangout di Mall,Nongkrong,Keinginan,132000
5,02/03/2025,Moisturizer,Skincare,Keinginan,31000
7,03/03/2025,Kentang Goreng,Makanan,Keinginan,21000


### Symbol Removal

In [6]:
def removal_symbols(text):
  """Strip every character that is not an ASCII letter, a digit, or whitespace.

  Args:
    text: Value to clean. Missing values (whatever ``pd.isna`` reports as
      missing, e.g. ``None`` or ``NaN``) are returned unchanged. Values that
      are not ``str`` are converted with ``str()`` first, so a numeric cell
      no longer makes ``re.sub`` raise ``TypeError``.

  Returns:
    The cleaned string, or the original value when it is missing.
  """
  if pd.isna(text):
    return text

  # re.sub only accepts str/bytes; coerce other scalars (e.g. numbers) to text.
  if not isinstance(text, str):
    text = str(text)

  # Matches are deleted, not replaced by a space, so "E-Wallet" -> "EWallet".
  return re.sub(r'[^a-zA-Z0-9\s]', '', text)

In [14]:
# Work on an independent (deep) copy so df_missingValueClean is left unchanged.
df_removalSymbolsClean = df_missingValueClean.copy(deep=True)

In [15]:
# Run removal_symbols over every value of the expense-name column.
expense_column = "Jenis Pengeluaran"
df_removalSymbolsClean[expense_column] = df_removalSymbolsClean[expense_column].apply(removal_symbols)

In [17]:
# Preview the frame after symbol removal (head() shows 5 rows by default).
df_removalSymbolsClean.head()

Unnamed: 0,Tanggal,Jenis Pengeluaran,Label,Kategori,Nominal (IDR)
0,01/03/2025,Top Up EWallet,Hiburan Digital,Keinginan,25000
3,02/03/2025,Sunscreen,Skincare,Keinginan,85000
4,02/03/2025,Hangout di Mall,Nongkrong,Keinginan,132000
5,02/03/2025,Moisturizer,Skincare,Keinginan,31000
7,03/03/2025,Kentang Goreng,Makanan,Keinginan,21000


### Standardisasi Teks (lowercase & strip)

In [19]:
def textStandardization(df, targetColumn):
  """Return a copy of ``df`` with the given text columns lower-cased and trimmed.

  For every requested column that exists, values are converted to lower case
  and leading/trailing whitespace is removed. The input frame is not modified.

  Args:
    df: DataFrame to standardize.
    targetColumn: A column name, or an iterable of column names, to process.
      A single string is treated as one column name rather than being
      iterated character by character.

  Returns:
    A new DataFrame. Requested columns that are missing from ``df`` are
    skipped and reported with a printed message.

  Note:
    The pandas ``.str`` accessor is used, so the selected columns are
    expected to hold string data.
  """
  # A bare string is itself iterable; wrap it so "Label" means the column
  # "Label" and not the columns "L", "a", "b", "e", "l".
  if isinstance(targetColumn, str):
    targetColumn = [targetColumn]

  df_clean = df.copy()

  for column in targetColumn:
    if column not in df_clean.columns:
      print(f"Kolom '{column}' tidak ditemukan dalam DataFrame.")
      continue

    df_clean[column] = df_clean[column].str.lower().str.strip()

  return df_clean

In [21]:
# Start the standardization stage from an independent (deep) copy of the previous stage.
df_textStandardization = df_removalSymbolsClean.copy(deep=True)

In [25]:
# Text columns handed to the standardization step.
target_cols = [
  "Jenis Pengeluaran",
  "Label",
  "Kategori",
]

In [26]:
# Lower-case and trim the selected text columns; the function returns a new frame.
df_textStandardization = textStandardization(
  df=df_textStandardization,
  targetColumn=target_cols,
)

In [27]:
# Preview the first ten standardized rows.
df_textStandardization.head(n=10)

Unnamed: 0,Tanggal,Jenis Pengeluaran,Label,Kategori,Nominal (IDR)
0,01/03/2025,top up ewallet,hiburan digital,keinginan,25000
3,02/03/2025,sunscreen,skincare,keinginan,85000
4,02/03/2025,hangout di mall,nongkrong,keinginan,132000
5,02/03/2025,moisturizer,skincare,keinginan,31000
7,03/03/2025,kentang goreng,makanan,keinginan,21000
10,03/03/2025,masker wajah,skincare,keinginan,29000
11,04/03/2025,top up ewallet,hiburan digital,keinginan,74000
12,04/03/2025,top up game pubg,hiburan digital,keinginan,95000
14,04/03/2025,jus buah,minuman,keinginan,20000
15,04/03/2025,burger,makanan,keinginan,36000


## Metadata Extraction
-----

### Goals
- Membuat referensi/kamus yang menghubungkan Label dengan Jenis Pengeluaran

- Agar nanti saat hasil FP-Growth menunjukkan "makanan", kita bisa tahu Jenis Pengeluaran apa saja yang termasuk dalam Label makanan

## Transaction Aggregation (Pengelompokan Keranjang)

## One-Hot Encoding (Transformasi Ke Matriks Biner)

# Frequent Pattern Mining (Algoritma FP-Growth)

# Creating Association Rules (Rule Generation)

# Visualisasi Report

---

- Interactive Network Graph
- The Recommendation Cards
- Sankey Diagram
- Bubble Chart Cluster
- Bar Chart "Top Pairs"