In [1]:
# import library
import pandas as pd
import re

In [2]:
# Load the source workbook into a DataFrame. The path is relative, so it is
# resolved against the notebook's current working directory.
excel_file_path = 'DATA2023.xlsx'
excel_data = pd.read_excel(excel_file_path)
# Bare last expression: Jupyter renders the frame as this cell's output.
excel_data

Unnamed: 0,TANGGAL,NO TRANSAKSI,NAMA BARANG,QTY
0,2023-01-01,2301011000001,FORVITA MARG 200GR,2
1,2023-01-01,2301011000002,SASA SANTAN KLPA 65ML,3
2,2023-01-01,2301011000003,CHEERS 1500ML GREEN,1
3,2023-01-01,2301011000004,SUN KARA 65ML,1
4,2023-01-01,2301011000004,OREO PIKACHU 165.6GR,1
...,...,...,...,...
162975,2023-12-31,2312311020112,KONIDIN 4'S,2
162976,2023-12-31,2312311020113,GOLDA CAPPUCINO 200ML,1
162977,2023-12-31,2312311020113,FLORIDINA ORANGE 360ML,1
162978,2023-12-31,2312311020114,WALLS PP TRICO/48 65298,1


# **Pre-processing**

### **1. Periksa Kolom**

In [3]:
def check_columns(df, expected_columns):
    """Report whether every required column is present in ``df``.

    Args:
        df: DataFrame whose ``columns`` are inspected.
        expected_columns: Iterable of column names that must exist.

    Returns:
        True when no expected column is missing, otherwise False. A status
        line is printed in both cases; the failure line lists the missing
        names in the order they were requested.
    """
    missing_columns = []
    for name in expected_columns:
        if name not in df.columns:
            missing_columns.append(name)

    if not missing_columns:
        print("✅ Format kolom sesuai.")
        return True

    print(f"⚠️ Kolom berikut tidak ditemukan dalam data: {missing_columns}")
    return False

### **2. Periksa Missing Value** 

In [4]:
def check_missing_values(df):
    """Report whether ``df`` contains any null cell.

    Args:
        df: DataFrame to scan.

    Returns:
        True when the frame has no nulls, otherwise False. A status line is
        printed in both cases; the failure line includes the total number of
        null cells summed over all columns.
    """
    # Per-column null counts, then collapsed into one grand total.
    null_total = df.isna().sum().sum()
    has_missing = null_total > 0

    if not has_missing:
        print("✅ Tidak ada missing values.")
        return True

    print(f"⚠️ Terdapat {null_total} missing values dalam data!")
    return False

### **3. Ubah Tipe Data** 

In [5]:
def fix_data_types(df, expected_dtypes):
    """Coerce the columns of ``df`` to the dtypes named in ``expected_dtypes``.

    Args:
        df: DataFrame to convert. It is left untouched; the conversion is
            applied to a copy so re-running the calling cell starts from the
            same input every time.
        expected_dtypes: Mapping of column name -> target dtype string.
            ``'datetime64[ns]'``, ``'int64'`` and ``'object'`` get dedicated
            handling; any other dtype is passed to ``Series.astype``.

    Returns:
        A new DataFrame with the converted columns. A column whose conversion
        raised keeps its original values.

    Raises:
        KeyError: if a column named in ``expected_dtypes`` is absent from ``df``.
    """
    df = df.copy()  # never mutate the caller's frame
    problem_columns = []

    for col, expected_dtype in expected_dtypes.items():
        needs_conversion = df[col].dtype != expected_dtype
        if not needs_conversion:
            continue

        print(f"🔄 Mengonversi kolom '{col}' dari {df[col].dtype} ke {expected_dtype}")
        nulls_before = int(df[col].isna().sum())
        try:
            if expected_dtype == 'datetime64[ns]':
                converted = pd.to_datetime(df[col], errors='coerce')
            elif expected_dtype == 'int64':
                # Go through nullable Int64 so unparseable values become <NA>
                # instead of raising ...
                converted = pd.to_numeric(df[col], errors='coerce').astype('Int64')
                # ... but hand back the plain int64 that was actually requested
                # whenever nothing is missing.
                if not converted.isna().any():
                    converted = converted.astype('int64')
            elif expected_dtype == 'object':
                converted = df[col].astype(str)
            else:
                converted = df[col].astype(expected_dtype)
        except Exception as e:
            print(f"❌ Error saat mengonversi kolom '{col}': {e}")
            problem_columns.append(col)
            continue

        # errors='coerce' turns unparseable values into NaT/NA without raising;
        # surface that instead of letting new nulls slip through silently.
        coerced = int(converted.isna().sum()) - nulls_before
        if coerced > 0:
            print(f"⚠️ {coerced} nilai pada kolom '{col}' tidak dapat dikonversi dan menjadi kosong.")
            problem_columns.append(col)
        df[col] = converted

    # Only claim success when every conversion went through cleanly.
    if problem_columns:
        print(f"⚠️ Kolom berikut bermasalah saat konversi tipe data: {problem_columns}")
    else:
        print("✅ Semua tipe data telah sesuai.")
    return df

### **4. Text Preprocessing**

In [6]:
def preprocess_description_column(df, column_name='NAMA BARANG'):
    """Clean the free-text item names stored in ``column_name``.

    Args:
        df: DataFrame holding the text column. It is left untouched; the
            cleaning is applied to a copy.
        column_name: Name of the column to clean. The cleaned text is written
            back to this same column.

    Returns:
        A new DataFrame in which ``column_name`` holds the cleaned text.
        Null cells are passed through unchanged.
    """

    def preprocess_text(text):
        """Normalise a single cell value; nulls are returned as-is."""
        if pd.isnull(text):
            return text

        # Cells in an object column may hold numbers; re.sub needs a str.
        text = str(text)
        # Drop everything from the first run of three or more whitespace
        # characters onward.
        text = re.sub(r'\s{3,}.*', '', text)
        # Strip punctuation but keep '/'.
        # NOTE(review): '.' is stripped too, so a decimal size such as
        # '165.6GR' becomes '1656GR' - confirm this is intended.
        text = re.sub(r'[^\w\s/]', '', text)
        # Drop words containing five or more consecutive digits. split()/join
        # also collapses repeated whitespace and trims both ends.
        return ' '.join(word for word in text.split() if not re.search(r'\d{5,}', word))

    df = df.copy()  # never mutate the caller's frame
    df[column_name] = df[column_name].apply(preprocess_text)
    print(f"✅ Kolom '{column_name}' telah dibersihkan.")

    return df

### **Pemanggilan Fungsi** 

In [7]:
# Main entry point of the validation pipeline
def checking_data(df):
    """Validate ``df`` and normalise its column dtypes.

    Runs the column check, then the missing-value check, then the dtype fix.

    Args:
        df: Raw transaction DataFrame.

    Returns:
        The frame returned by ``fix_data_types``, or None when either the
        column check or the missing-value check fails.
    """
    dtype_by_column = {
        'TANGGAL': 'datetime64[ns]',
        'NO TRANSAKSI': 'int64',
        'NAMA BARANG': 'object',
        'QTY': 'int64',
    }
    # The required columns are exactly the ones that have a target dtype.
    required_columns = list(dtype_by_column)

    # Short-circuit: the missing-value check only runs once the columns are right.
    passed_checks = check_columns(df, required_columns) and check_missing_values(df)
    if not passed_checks:
        return None

    return fix_data_types(df, dtype_by_column)

# Run the validation pipeline. checking_data returns None when the column or
# missing-value check fails, so stop here with a clear error instead of
# passing None into the text-cleaning step.
df_check_fix = checking_data(excel_data)
if df_check_fix is None:
    raise ValueError("Validasi data gagal; perbaiki data sumber sebelum melanjutkan.")
df_final = preprocess_description_column(df_check_fix)

✅ Format kolom sesuai.
✅ Tidak ada missing values.
✅ Semua tipe data telah sesuai.
✅ Kolom 'NAMA BARANG' telah dibersihkan.


In [8]:
# Show the frame returned by preprocess_description_column as the cell output.
df_final

Unnamed: 0,TANGGAL,NO TRANSAKSI,NAMA BARANG,QTY
0,2023-01-01,2301011000001,FORVITA MARG 200GR,2
1,2023-01-01,2301011000002,SASA SANTAN KLPA 65ML,3
2,2023-01-01,2301011000003,CHEERS 1500ML GREEN,1
3,2023-01-01,2301011000004,SUN KARA 65ML,1
4,2023-01-01,2301011000004,OREO PIKACHU 1656GR,1
...,...,...,...,...
162975,2023-12-31,2312311020112,KONIDIN 4S,2
162976,2023-12-31,2312311020113,GOLDA CAPPUCINO 200ML,1
162977,2023-12-31,2312311020113,FLORIDINA ORANGE 360ML,1
162978,2023-12-31,2312311020114,WALLS PP TRICO,1


### **1. Merging Data**

In [8]:
# df_list = [df for df in excel_sheets.values()]
# merged_df = pd.concat(df_list, ignore_index=True)
# merged_df 

Unnamed: 0,NO.TRANSAKSI,DESCRIPTION,QTY
0,2.301011e+12,MADU NSTR SUPER 650ML.,1
1,,STELLA MTC F/C 225ML.REF /12,1
2,,GELAS KOPI BOLA@50,2
3,,MIKA BX IV,3
4,2.301011e+12,LAGIE GOLD CM 75G.W/30 LG07507,2
...,...,...,...
21929,,INDOMIE SOTO MIE/40 SM,1
21930,,INDOMIE KARI AYAM/40 IKA,1
21931,,INDOMIE KALDU AYAM/40 KA75,3
21932,,SEDAAP MIE KOREAN SPCY40-20234,1


### **2. Handling Missing Values**

In [7]:
# print(merged_df.isnull().sum())

TANGGAL         0
NO TRANSAKSI    0
NAMA BARANG     0
QTY             0
dtype: int64


In [8]:
# merged_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 162980 entries, 0 to 162979
Data columns (total 4 columns):
 #   Column        Non-Null Count   Dtype         
---  ------        --------------   -----         
 0   TANGGAL       162980 non-null  datetime64[ns]
 1   NO TRANSAKSI  162980 non-null  int64         
 2   NAMA BARANG   162980 non-null  object        
 3   QTY           162980 non-null  int64         
dtypes: datetime64[ns](1), int64(2), object(1)
memory usage: 5.0+ MB


In [11]:
# merged_df['NO.TRANSAKSI'] = merged_df['NO.TRANSAKSI'].ffill()
# merged_df

Unnamed: 0,NO.TRANSAKSI,DESCRIPTION,QTY
0,2.301011e+12,MADU NSTR SUPER 650ML.,1
1,2.301011e+12,STELLA MTC F/C 225ML.REF /12,1
2,2.301011e+12,GELAS KOPI BOLA@50,2
3,2.301011e+12,MIKA BX IV,3
4,2.301011e+12,LAGIE GOLD CM 75G.W/30 LG07507,2
...,...,...,...
21929,2.312311e+12,INDOMIE SOTO MIE/40 SM,1
21930,2.312311e+12,INDOMIE KARI AYAM/40 IKA,1
21931,2.312311e+12,INDOMIE KALDU AYAM/40 KA75,3
21932,2.312311e+12,SEDAAP MIE KOREAN SPCY40-20234,1


### **3. Extract Features**

In [12]:
# print(merged_df.dtypes)

NO.TRANSAKSI    float64
DESCRIPTION      object
QTY              object
dtype: object


In [13]:
# def convert_to_datetime(transaction_no):
#     if pd.isna(transaction_no):
#         return None
#     transaction_no = str(transaction_no)
#     date_str = '20' + transaction_no[:6] 
#     return pd.to_datetime(date_str, format='%Y%m%d')

# merged_df['DATE'] = merged_df['NO.TRANSAKSI'].apply(convert_to_datetime)
# merged_df = merged_df[['NO.TRANSAKSI', 'DATE', 'DESCRIPTION', 'QTY']]
# merged_df

Unnamed: 0,NO.TRANSAKSI,DATE,DESCRIPTION,QTY
0,2.301011e+12,2023-01-01,MADU NSTR SUPER 650ML.,1
1,2.301011e+12,2023-01-01,STELLA MTC F/C 225ML.REF /12,1
2,2.301011e+12,2023-01-01,GELAS KOPI BOLA@50,2
3,2.301011e+12,2023-01-01,MIKA BX IV,3
4,2.301011e+12,2023-01-01,LAGIE GOLD CM 75G.W/30 LG07507,2
...,...,...,...,...
21929,2.312311e+12,2023-12-31,INDOMIE SOTO MIE/40 SM,1
21930,2.312311e+12,2023-12-31,INDOMIE KARI AYAM/40 IKA,1
21931,2.312311e+12,2023-12-31,INDOMIE KALDU AYAM/40 KA75,3
21932,2.312311e+12,2023-12-31,SEDAAP MIE KOREAN SPCY40-20234,1


### **4. Text Preprocessing**

In [14]:
# def preprocess_description(text):
#     # Hapus spasi double dan teks setelahnya
#     text = re.sub(r'\s{1,}.*', '', text)
#     # Hapus teks setelah tanda '/'
#     text = re.sub(r'/.*', '', text)
#     # Hapus tanda baca
#     text = re.sub(r'[^\w\s]', '', text)
#     # Hapus kata dengan lebih dari atau sama dengan 5 angka
#     text = ' '.join(word for word in text.split() if not re.search(r'\d{5,}', word))
#     # Hapus spasi ganda
#     text = re.sub(r'\s+', ' ', text)
#     # Hapus spasi di awal dan akhir kalimat
#     text = text.strip()
    
#     return text

# merged_df["DESCRIPTION_CLEANED"] = merged_df["DESCRIPTION"].apply(preprocess_description)
# merged_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged_df["DESCRIPTION_CLEANED"] = merged_df["DESCRIPTION"].apply(preprocess_description)


Unnamed: 0,NO.TRANSAKSI,DATE,DESCRIPTION,QTY,DESCRIPTION_CLEANED
0,2.301011e+12,2023-01-01,MADU NSTR SUPER 650ML.,1,MADU NSTR SUPER 650ML
1,2.301011e+12,2023-01-01,STELLA MTC F/C 225ML.REF /12,1,STELLA MTC F
2,2.301011e+12,2023-01-01,GELAS KOPI BOLA@50,2,GELAS KOPI BOLA50
3,2.301011e+12,2023-01-01,MIKA BX IV,3,MIKA BX IV
4,2.301011e+12,2023-01-01,LAGIE GOLD CM 75G.W/30 LG07507,2,LAGIE GOLD CM 75GW
...,...,...,...,...,...
21929,2.312311e+12,2023-12-31,INDOMIE SOTO MIE/40 SM,1,INDOMIE SOTO MIE
21930,2.312311e+12,2023-12-31,INDOMIE KARI AYAM/40 IKA,1,INDOMIE KARI AYAM
21931,2.312311e+12,2023-12-31,INDOMIE KALDU AYAM/40 KA75,3,INDOMIE KALDU AYAM
21932,2.312311e+12,2023-12-31,SEDAAP MIE KOREAN SPCY40-20234,1,SEDAAP MIE KOREAN


# **Save Final Data**

In [15]:
# final_df = merged_df[['NO.TRANSAKSI', 'DATE', 'DESCRIPTION_CLEANED', 'QTY']]
# final_df

Unnamed: 0,NO.TRANSAKSI,DATE,DESCRIPTION_CLEANED,QTY
0,2.301011e+12,2023-01-01,MADU NSTR SUPER 650ML,1
1,2.301011e+12,2023-01-01,STELLA MTC F,1
2,2.301011e+12,2023-01-01,GELAS KOPI BOLA50,2
3,2.301011e+12,2023-01-01,MIKA BX IV,3
4,2.301011e+12,2023-01-01,LAGIE GOLD CM 75GW,2
...,...,...,...,...
21929,2.312311e+12,2023-12-31,INDOMIE SOTO MIE,1
21930,2.312311e+12,2023-12-31,INDOMIE KARI AYAM,1
21931,2.312311e+12,2023-12-31,INDOMIE KALDU AYAM,3
21932,2.312311e+12,2023-12-31,SEDAAP MIE KOREAN,1


In [16]:
# final_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21934 entries, 0 to 21933
Data columns (total 4 columns):
 #   Column               Non-Null Count  Dtype         
---  ------               --------------  -----         
 0   NO.TRANSAKSI         21934 non-null  float64       
 1   DATE                 21934 non-null  datetime64[ns]
 2   DESCRIPTION_CLEANED  21934 non-null  object        
 3   QTY                  21934 non-null  object        
dtypes: datetime64[ns](1), float64(1), object(2)
memory usage: 685.6+ KB


In [17]:
# final_df.to_excel('final_data.xlsx', index=False)