## **Import libraries and data**

In [1]:
import numpy as np
import pandas as pd

## **Task: Preprocess the data for further analysis**

In [2]:
# Read data

data = pd.read_excel("Invoices-with-Merged-Categories-and-Merged-Amounts.xlsx")
data

Unnamed: 0,Order ID,Category,Amount
0,CA-2011-167199,Binders | Art | Phones | Fasteners | Paper,609.98 | 5.48 | 391.98 | 755.96 | 31.12
1,CA-2011-149020,Office Supplies | Furniture,2.98 | 51.94
2,CA-2011-131905,Office Supplies | Technology | Technology,7.2 | 42.0186 | 42.035
3,CA-2011-127614,Accessories | Tables | Binders,234.45 | 1256.22 | 17.46


### **The issues include:**

- Multiple categories in one cell: Categories are combined in a single cell, separated by |.
- Multiple amounts in one cell: Corresponding amounts are also combined in a single cell, separated by |.
- Misalignment: Each category must align with its corresponding amount.
- Duplicates: Some categories are repeated in a single row

### **Solution:**
- Split the Category column on `|` to get individual categories.
- Split the Amount column on `|` to get individual amounts.
- Create a row for each Order ID with its corresponding Category and Amount.
- Ensure the Amount column is in a numeric format for further analysis.
- Operations regarding duplicated categories.

In [3]:
# Split columns

# We will use assign 
# Split the data in box by "|" and assign it to existing column/plcae

data= data.assign(Category = data['Category'].str.split("|"), 
                  Amount = data['Amount'].str.split("|"))

# Explode values
# explode each row into column for "Category" and similarly "Amount"
 
data = data.explode("Category").explode("Amount")
data

Unnamed: 0,Order ID,Category,Amount
0,CA-2011-167199,Binders,609.98
0,CA-2011-167199,Binders,5.48
0,CA-2011-167199,Binders,391.98
0,CA-2011-167199,Binders,755.96
0,CA-2011-167199,Binders,31.12
0,CA-2011-167199,Art,609.98
0,CA-2011-167199,Art,5.48
0,CA-2011-167199,Art,391.98
0,CA-2011-167199,Art,755.96
0,CA-2011-167199,Art,31.12


In [None]:
# Convert Amount to numeric

data['Amount'] = pd.to_numeric(data['Amount'], errors= 'coerce')
data

Unnamed: 0,Order ID,Category,Amount
0,CA-2011-167199,Binders,609.98
0,CA-2011-167199,Binders,5.48
0,CA-2011-167199,Binders,391.98
0,CA-2011-167199,Binders,755.96
0,CA-2011-167199,Binders,31.12
0,CA-2011-167199,Art,609.98
0,CA-2011-167199,Art,5.48
0,CA-2011-167199,Art,391.98
0,CA-2011-167199,Art,755.96
0,CA-2011-167199,Art,31.12


In [5]:
# Reset Index

data.reset_index(drop= True, inplace= True)
data

Unnamed: 0,Order ID,Category,Amount
0,CA-2011-167199,Binders,609.98
1,CA-2011-167199,Binders,5.48
2,CA-2011-167199,Binders,391.98
3,CA-2011-167199,Binders,755.96
4,CA-2011-167199,Binders,31.12
5,CA-2011-167199,Art,609.98
6,CA-2011-167199,Art,5.48
7,CA-2011-167199,Art,391.98
8,CA-2011-167199,Art,755.96
9,CA-2011-167199,Art,31.12


In [6]:
# Check duplicated

data.duplicated().sum()

0

**Note:** There aren't any duplicated values in the dataset

In [7]:
# Save the transformed sheet in the original datasheet

with pd.ExcelWriter("Invoices-with-Merged-Categories-and-Merged-Amounts.xlsx", engine= 'openpyxl', mode='a') as writer:
    data.to_excel(writer, sheet_name= "Transformed_data", index= False)