In [1]:
# import library
import pandas as pd
import re

In [2]:
# Load the source workbook into a DataFrame. The path is relative, so it is
# resolved against the notebook's current working directory.
excel_file_path = 'data/DATA2024.xlsx'
excel_data = pd.read_excel(excel_file_path)
# Bare last expression: lets the notebook render the loaded frame.
excel_data

Unnamed: 0,TANGGAL,NO TRANSAKSI,NAMA BARANG,QTY
0,2024-01-01,2401011010001,GULA ROSE BRAND 1KG,1
1,2024-01-01,2401011010002,RIZKI MG 850ML/900ML BTL,3
2,2024-01-01,2401011010003,ULTRA MIMI VNL 125ML,1
3,2024-01-01,2401011010003,BEAR BRAND 189ML,1
4,2024-01-01,2401011010003,TINI WINI BITI ASIN 20 GR,1
...,...,...,...,...
167618,2024-12-31,2412311020122,WALLS POPULAIRE CKL,1
167619,2024-12-31,2412311020123,THERMAL 80X50 TRUST PAPER,1
167620,2024-12-31,2412311020124,SAKINAH 600ML,1
167621,2024-12-31,2412311020124,WALLS PP RAINBOW P,1


# **Pre-processing**

### **1. Periksa Kolom**

In [3]:
def check_columns(df, expected_columns):
    """Verify that every required column is present in the DataFrame.

    Args:
        df: DataFrame whose ``columns`` are inspected.
        expected_columns: Iterable of column names that must exist.

    Returns:
        True when no expected column is absent, otherwise False. A status
        line is printed in both cases; on failure it lists the absent names
        in the order they were requested.
    """
    absent = []
    for name in expected_columns:
        if name not in df.columns:
            absent.append(name)

    # Happy path first: nothing missing.
    if not absent:
        print("✅ Format kolom sesuai.")
        return True

    print(f"⚠️ Kolom berikut tidak ditemukan dalam data: {absent}")
    return False

### **2. Periksa Missing Value** 

In [4]:
def check_missing_values(df):
    """Report whether the DataFrame contains any missing values.

    Null cells are counted across all columns of ``df``.

    Args:
        df: DataFrame to inspect.

    Returns:
        True when no cell is null, otherwise False. A status line is printed
        in both cases; on failure it includes the total number of null cells.
    """
    # Per-column null counts, then summed into one grand total.
    n_missing = df.isnull().sum().sum()
    has_missing = n_missing > 0

    if has_missing:
        print(f"⚠️ Terdapat {n_missing} missing values dalam data!")
    else:
        print("✅ Tidak ada missing values.")

    return not has_missing

### **3. Ubah Tipe Data** 

In [5]:
def fix_data_types(df, expected_dtypes):
    """Convert columns of ``df`` to the dtypes listed in ``expected_dtypes``.

    Supported targets are ``'datetime64[ns]'``, ``'int64'`` (stored as pandas'
    nullable ``'Int64'``) and ``'object'`` (values cast with ``str``). Columns
    whose dtype already equals the expected one are left untouched.

    Conversions use ``errors='coerce'``, so values that cannot be parsed
    become missing. Because that would otherwise happen silently, the number
    of values lost this way is reported per column, and the final success
    message is only printed when every requested conversion was clean.

    Args:
        df: DataFrame to fix. Converted columns are assigned back onto this
            same object (the input is modified, not copied).
        expected_dtypes: Mapping of column name -> expected dtype string.

    Returns:
        The same DataFrame object that was passed in.

    Raises:
        KeyError: If a column named in ``expected_dtypes`` is not in ``df``.
    """
    all_clean = True

    for col, expected_dtype in expected_dtypes.items():
        current = df[col]
        if current.dtype == expected_dtype:
            continue

        print(f"🔄 Mengonversi kolom '{col}' dari {current.dtype} ke {expected_dtype}")
        try:
            if expected_dtype == 'datetime64[ns]':
                converted = pd.to_datetime(current, errors='coerce')
            elif expected_dtype == 'int64':
                # Nullable Int64 so coerced (missing) entries can be represented.
                converted = pd.to_numeric(current, errors='coerce').astype('Int64')
            elif expected_dtype == 'object':
                converted = current.astype(str)
            else:
                # Previously an unknown target was skipped without any notice.
                print(f"⚠️ Tipe data '{expected_dtype}' untuk kolom '{col}' tidak didukung; kolom dilewati.")
                all_clean = False
                continue
        except Exception as e:
            # Best effort: keep the original column and move on to the next one.
            print(f"❌ Error saat mengonversi kolom '{col}': {e}")
            all_clean = False
            continue

        # Values that were present before but are null now were dropped by coercion.
        lost = int(converted.isna().sum()) - int(current.isna().sum())
        if lost > 0:
            print(f"⚠️ {lost} nilai pada kolom '{col}' tidak dapat dikonversi dan menjadi missing value.")
            all_clean = False

        df[col] = converted

    if all_clean:
        print("✅ Semua tipe data telah sesuai.")
    else:
        print("⚠️ Sebagian kolom belum sesuai; periksa pesan di atas.")
    return df

### **4. Text Preprocessing**

In [6]:
def preprocess_description_column(df, column_name='NAMA BARANG'):
    """Clean the free-text values stored in ``column_name``.

    Each non-null value is processed as follows:

    1. Everything from the first run of three or more whitespace characters
       to the end of that line is dropped.
    2. Every character that is not a word character, whitespace or ``/`` is
       replaced by a space.
    3. Whitespace-separated words containing five or more consecutive digits
       are removed.
    4. Remaining words are joined with single spaces (which also trims the
       ends).

    Null values are passed through unchanged. Non-string values are converted
    with ``str`` first so a mixed-type column does not raise.

    Args:
        df: DataFrame holding the column. The cleaned values are assigned
            back onto this same object (the input is modified, not copied).
        column_name: Name of the column to clean; the result is written back
            to this same column.

    Returns:
        The same DataFrame object that was passed in.

    Raises:
        KeyError: If ``column_name`` is not a column of ``df``.
    """
    # Compiled once here instead of being looked up again for every row.
    gap_and_rest = re.compile(r'\s{3,}.*')
    punctuation = re.compile(r'[^\w\s/]')
    long_digit_run = re.compile(r'\d{5,}')

    def preprocess_text(text):
        if pd.isnull(text):  # leave missing values as they are
            return text
        if not isinstance(text, str):
            # e.g. a purely numeric item name read from Excel as a number
            text = str(text)

        text = gap_and_rest.sub('', text)
        text = punctuation.sub(' ', text)
        kept_words = (word for word in text.split() if not long_digit_run.search(word))
        # split() + join already collapses whitespace runs and trims both
        # ends, so no further whitespace normalisation is required.
        return ' '.join(kept_words)

    # Write back to the column that was actually requested, not a fixed name.
    df[column_name] = df[column_name].apply(preprocess_text)
    print(f"✅ Kolom '{column_name}' telah dibersihkan.")

    return df

### **Pemanggilan Fungsi** 

In [7]:
# Fungsi utama untuk memproses data
def checking_data(df):
    """Validate ``df`` and return it with its column dtypes corrected.

    The steps run in order and stop at the first failing gate:

    1. ``check_columns`` - all required columns must be present.
    2. ``check_missing_values`` - the frame must contain no null cells.
    3. ``fix_data_types`` - columns are converted to their expected dtypes.

    Args:
        df: DataFrame to validate.

    Returns:
        The result of ``fix_data_types`` when both gates pass, otherwise
        None.
    """
    # Insertion order of this mapping is also the required-column order.
    dtype_by_column = {
        'TANGGAL': 'datetime64[ns]',
        'NO TRANSAKSI': 'int64',
        'NAMA BARANG': 'object',
        'QTY': 'int64',
    }
    required_columns = list(dtype_by_column)

    # `and` short-circuits, so the missing-value check only runs once the
    # column layout has been confirmed.
    gates_passed = check_columns(df, required_columns) and check_missing_values(df)
    if not gates_passed:
        return None

    return fix_data_types(df, dtype_by_column)

# Work on a copy: the functions below assign converted/cleaned columns back
# onto the frame they receive, and the raw `excel_data` should stay as loaded
# so this cell gives the same result every time it is re-run.
df_check_fix = checking_data(excel_data.copy())

# checking_data returns None when the column or missing-value check fails.
# Stop here with an explicit message rather than letting the next call fail
# with an unrelated-looking TypeError on None.
if df_check_fix is None:
    raise ValueError("Validasi data gagal; periksa pesan di atas sebelum melanjutkan.")

df_final = preprocess_description_column(df_check_fix)

✅ Format kolom sesuai.
✅ Tidak ada missing values.
✅ Semua tipe data telah sesuai.
✅ Kolom 'NAMA BARANG' telah dibersihkan.


In [8]:
# Bare expression: render the cleaned frame so the preprocessing result can be inspected.
df_final

Unnamed: 0,TANGGAL,NO TRANSAKSI,NAMA BARANG,QTY
0,2024-01-01,2401011010001,GULA ROSE BRAND 1KG,1
1,2024-01-01,2401011010002,RIZKI MG 850ML/900ML BTL,3
2,2024-01-01,2401011010003,ULTRA MIMI VNL 125ML,1
3,2024-01-01,2401011010003,BEAR BRAND 189ML,1
4,2024-01-01,2401011010003,TINI WINI BITI ASIN 20 GR,1
...,...,...,...,...
167618,2024-12-31,2412311020122,WALLS POPULAIRE CKL,1
167619,2024-12-31,2412311020123,THERMAL 80X50 TRUST PAPER,1
167620,2024-12-31,2412311020124,SAKINAH 600ML,1
167621,2024-12-31,2412311020124,WALLS PP RAINBOW P,1


In [None]:
# Save the cleaned frame to a new workbook (relative path, resolved against
# the working directory); index=False leaves the row index out of the file.
df_final.to_excel('data/final_data2024.xlsx', index=False)