# Data Cleaning

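This notebook loads the main transaction data, keeps only the columns needed for analysis, converts `Amount in USD` to a numeric type, and exports the result to `../processed_data/clean_transactions.csv`.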

In [1]:
from pathlib import Path

import numpy as np
import pandas as pd

## Load Raw Data

In [2]:
# Load the main dataset.
# low_memory=False makes pandas parse the file in one pass, so each column's
# dtype is inferred from all of its values instead of chunk by chunk. Chunked
# inference can leave a single column holding a mix of Python types.
# NOTE(review): the saved output of this cell echoes the read_csv line, which
# is how a warning raised there is rendered — presumably a mixed-dtype
# DtypeWarning; confirm it is gone after re-running.
df_main = pd.read_csv(
    '../Data/Datathon Dataset.xlsx - Data - Main.csv',
    low_memory=False,
)
print(f"Original data shape: {df_main.shape}")
print(f"\nOriginal columns ({len(df_main.columns)}):")
print(df_main.columns.tolist())

Original data shape: (84528, 28)

Original columns (28):
['File Month', 'Name', 'Period', 'Account', 'Cost Ctr', 'Profit Ctr', 'WBS element', 'Ref.key (header) 1', 'Document Header Text', 'DocumentNo', 'Clrng doc.', 'Type', 'PK', 'Offst.acct', 'Name of offsetting account', 'Reference', 'Assignment', 'Pstng Date', 'Doc..Date', 'Amount in doc. curr.', 'Amt in loc.cur.', 'Curr.', 'LCurr', 'Category', 'Rate (USD)', 'Amount in USD', 'Posting Period', 'Category Index']


  df_main = pd.read_csv('../Data/Datathon Dataset.xlsx - Data - Main.csv')


## Select Essential Columns

In [3]:
# Columns carried forward into the clean dataset, in output order
columns_to_keep = [
    'Name', 'Period', 'Account', 'PK',
    'Offst.acct', 'Name of offsetting account',
    'Pstng Date', 'Doc..Date',
    'Amount in USD', 'LCurr', 'Category',
]

# Report any requested column that the loaded frame does not contain
missing_cols = [name for name in columns_to_keep if name not in df_main.columns]
print(f"⚠️ Missing columns: {missing_cols}" if missing_cols else "✅ All columns found!")

✅ All columns found!


In [4]:
# Build an independent copy of the raw frame restricted to the chosen columns
df_clean = df_main.loc[:, columns_to_keep].copy()
print(f"Clean data shape: {df_clean.shape}")
print("\nColumns in clean dataset:")
print(list(df_clean.columns))

Clean data shape: (84528, 11)

Columns in clean dataset:
['Name', 'Period', 'Account', 'PK', 'Offst.acct', 'Name of offsetting account', 'Pstng Date', 'Doc..Date', 'Amount in USD', 'LCurr', 'Category']


## Clean Amount in USD Column

In [5]:
# Convert 'Amount in USD' from comma-containing strings to float.
# The dtype guard keeps this cell safe to re-run: once the column is numeric,
# the .str accessor would raise AttributeError, so the comma stripping is
# skipped and only the (no-op) float cast is applied.
amount_usd = df_clean['Amount in USD']
if not pd.api.types.is_numeric_dtype(amount_usd):
    # regex=False: treat ',' as a literal character on every pandas version
    amount_usd = amount_usd.str.replace(',', '', regex=False)
df_clean['Amount in USD'] = amount_usd.astype(float)
print("✅ Converted 'Amount in USD' to numeric")
print(f"\nAmount in USD stats:")
print(df_clean['Amount in USD'].describe())

✅ Converted 'Amount in USD' to numeric

Amount in USD stats:
count    8.452800e+04
mean    -4.634487e+01
std      5.187387e+04
min     -3.415425e+06
25%     -3.649840e+01
50%     -1.533510e+01
75%     -4.003220e+00
max      3.018910e+06
Name: Amount in USD, dtype: float64


## Preview Clean Data

In [6]:
# Show each column's dtype, then a rich rendering of the leading rows
print("Data types:", df_clean.dtypes, sep="\n")
print("\nFirst 5 rows:")
display(df_clean.head(5))

Data types:
Name                           object
Period                          int64
Account                         int64
PK                              int64
Offst.acct                     object
Name of offsetting account     object
Pstng Date                     object
Doc..Date                      object
Amount in USD                 float64
LCurr                          object
Category                       object
dtype: object

First 5 rows:


Unnamed: 0,Name,Period,Account,PK,Offst.acct,Name of offsetting account,Pstng Date,Doc..Date,Amount in USD,LCurr,Category
0,TW10,8,19500100,50,19500102,House Bank 1 - Clearing - EFT Payments,8/1/2025,8/1/25 0:00,-13.8533,TWD,AP
1,TW10,8,19500100,50,19500102,House Bank 1 - Clearing - EFT Payments,8/1/2025,8/1/25 0:00,-15.80906,TWD,AP
2,TW10,8,19500100,50,19500102,House Bank 1 - Clearing - EFT Payments,8/1/2025,8/1/25 0:00,-14.99416,TWD,AP
3,TW10,8,19500100,50,19500102,House Bank 1 - Clearing - EFT Payments,8/1/2025,8/1/25 0:00,-11.89754,TWD,AP
4,TW10,8,19500100,50,19500102,House Bank 1 - Clearing - EFT Payments,8/1/2025,8/1/25 0:00,-15.15714,TWD,AP


In [7]:
# Summarise nulls per column: absolute count and share of all rows (in %)
print("Missing values:")
missing = df_clean.isna().sum()
missing_pct = missing.div(len(df_clean)).mul(100)
missing_df = missing.to_frame('Count').assign(Percentage=missing_pct)
display(missing_df)

Missing values:


Unnamed: 0,Count,Percentage
Name,0,0.0
Period,0,0.0
Account,0,0.0
PK,0,0.0
Offst.acct,0,0.0
Name of offsetting account,0,0.0
Pstng Date,0,0.0
Doc..Date,0,0.0
Amount in USD,0,0.0
LCurr,0,0.0


## Export Clean Data

In [9]:
# Save to processed_data folder
output_path = '../processed_data/clean_transactions.csv'
# Create the destination folder first: to_csv does not create missing parent
# directories, so the export would otherwise fail when the folder is absent.
Path(output_path).parent.mkdir(parents=True, exist_ok=True)
# index=False: the row index is not written to the file
df_clean.to_csv(output_path, index=False)
print(f"✅ Clean data saved to: {output_path}")
print(f"   Rows: {len(df_clean):,}")
print(f"   Columns: {len(df_clean.columns)}")

✅ Clean data saved to: ../processed_data/clean_transactions.csv
   Rows: 84,528
   Columns: 11
