# Data Cleaning

This notebook processes and cleans the transaction data to prepare it for FP-Growth mining and training the recommender model.

- **Cleaning steps**:
  - Remove canceled transactions (invoices starting with 'C')
  - Drop rows with a missing `CustomerID` or `Description`
  - Keep only rows with a positive `Quantity` and `UnitPrice`
  - Convert `InvoiceDate` to datetime
  - Normalize item names (lowercase, strip spaces, etc.)
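- **Output**:
  - `transactions_fpgrowth.csv`: one row per invoice with its comma-joined list of normalized item names, ready for FP-Growth mining
  - `user_item_dl.csv`: user–item pairs, ready for training the recommender model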

In [1]:
import pandas as pd

In [None]:
# Read the raw retail export into a DataFrame; the delimiter and the encoding
# are spelled out explicitly rather than left implicit.
df = pd.read_csv(
    "../data/online_retail.csv",
    sep=",",
    encoding="utf-8",
)

FileNotFoundError: [Errno 2] No such file or directory: '../data/Online Retail.xlsx'

In [None]:
# Build the cleaned line-item frame that the export cells below read from.
# Every filter is evaluated against the frame produced by the previous step
# (callable `.loc`), so the cancellation mask no longer depends on the index of
# the unfiltered `df` lining up with the rows that survived the earlier filters.
cleaned = (
    df.dropna(subset=["CustomerID", "Description"])  # a row needs both a customer and an item name
      .query("Quantity > 0 and UnitPrice > 0")       # keep only positive quantities and prices
      .loc[lambda frame: ~frame["InvoiceNo"].astype(str).str.startswith("C")]  # drop canceled invoices ("C" prefix)
)

# Export transaction data for FP-Growth: one row per (InvoiceNo, CustomerID)
# pair whose `items` column holds the comma-joined, sorted set of normalized
# item descriptions belonging to that pair.
def _basket_string(descriptions):
    """Return one group's unique stripped, lower-cased descriptions, sorted and joined with ','."""
    normalized = descriptions.str.strip().str.lower()
    return ",".join(sorted(set(normalized)))


# NOTE(review): a description that itself contains "," cannot be told apart from
# the separator in the joined string — confirm how the downstream reader parses it.
baskets = cleaned.groupby(["InvoiceNo", "CustomerID"])["Description"].apply(_basket_string)
transactions = baskets.reset_index().rename(
    columns={"InvoiceNo": "transaction_id", "CustomerID": "user_id", "Description": "items"}
)
transactions.to_csv("../data/transactions_fpgrowth.csv", index=False)

# Export user-item data for Deep Learning.
# Item names are normalized BEFORE de-duplication: dropping duplicates on the
# raw text first would leave repeated (user_id, item_id) rows whenever two raw
# descriptions differ only by letter case or surrounding whitespace.
user_item = (
    cleaned[["CustomerID", "Description"]]
      .assign(Description=lambda frame: frame["Description"].str.strip().str.lower())
      .drop_duplicates()
      .rename(columns={"CustomerID": "user_id", "Description": "item_id"})
)
user_item.to_csv("../data/user_item_dl.csv", index=False)