### **Problem 2: Sales Tax Compliance and Anomaly Detection**

The accounting department suspects some receipts may have incorrect tax calculations. Your tasks:

* Verify if `Tax` equals `Price × Quantity × 0.07`. Round to 2 decimals.
* Flag any entries where the computed tax doesn't match the stored value.
* Clean all invalid tax records by re-calculating them from `Price` and `Quantity`.
* Generate a summary: count of corrected entries, and average difference in tax.
* Split receipt numbers and count unique customers per store location.
* From those records, extract and standardize the store’s street name and zip code.

*Hint: A mismatch may come from rounding or bad input. Treat zip code as text.*

In [1]:
import pandas as pd
import numpy as np
import re

In [2]:
data = pd.read_csv('fila_heat_filament_sales_april2025.csv')

In [3]:
df = pd.DataFrame(data)

In [4]:
df.head(3)

Unnamed: 0,Date Purchased,Receipt Number,Customer Name,Customer Address,Phone Number,Email,Store Location,Product Name,Product Code,Bar Code,Material Name,Color,Weight,Supplier,Lot Number,Price,Quantity,Tax,Total Price
0,2025-04-01,1ff49b78-8946-4e85-b59c-de66bacfb3d0,Danielle Johnson,"3321 Brittany Bypass, North Jefferyhaven, 79408",8386379402,danielle.johnson@hotmail.com,"5423 Garcia Light, West Melanieview, 06196",Standard PLA Filament,PLA-792,6184960000000.0,PLA,Blue,500,3DFilaments,L5012,26.69,1,1.87,28.56
1,2025-04-01,434308bc-89fa-4a68-8fb5-d27bbeb79919,Tracie Wyatt,"64752 Kelly Skyway, Jacquelineland, 80341",+1-283-276-4835x0305,tracie.wyatt@yahoo.com,"1395 Diana Locks, Thomasberg, 32826",Flexible TPU Filament,TPU-338,9696530000000.0,TPU,Purple,500,ProtoPolymers,L1520,20.88,2,2.92,44.68
2,2025-04-01,52fbe43b-9954-4eb4-8025-7ad1eb2263dd,Eric Moore,"691 James Mountain, Tashatown, 89667",001-184-514-6270x4828,eric.moore@gmail.com,"489 Eric Track, New Stephanie, 70015",Flexible TPU Filament,TPU-325,7015430000000.0,TPU,Purple,1000,PrintPro,L4257,41.47,4,11.61,177.49


In [5]:
# 1. Verify if `Tax` equals `Price × Quantity × 0.07`. Round to 2 decimals.
price_qty_tax = df[['Price', 'Quantity', 'Tax']].copy()

In [6]:
price_qty_tax['Tax Rate'] = round(price_qty_tax['Tax']/(price_qty_tax['Price'] * price_qty_tax['Quantity']), 2)

In [7]:
price_qty_tax.head(3)

Unnamed: 0,Price,Quantity,Tax,Tax Rate
0,26.69,1,1.87,0.07
1,20.88,2,2.92,0.07
2,41.47,4,11.61,0.07


In [8]:
price_qty_tax[price_qty_tax['Tax Rate'] != 0.07].value_counts()

Price  Quantity  Tax   Tax Rate
31.21  2         5.0   0.08        1
32.70  4         10.0  0.08        1
46.96  1         4.0   0.09        1
Name: count, dtype: int64

In [9]:
# 2. Flag any entries where the computed tax doesn't match the stored value.
price_qty_tax['FLAGGED Tax Rate'] = price_qty_tax['Tax Rate'].apply(lambda x: 'FLAGGED' if x != 0.07 else '')

In [10]:
mask = price_qty_tax['FLAGGED Tax Rate'] == 'FLAGGED'

In [11]:
flagged_taxes = price_qty_tax[mask].copy()

In [12]:
flagged_taxes

Unnamed: 0,Price,Quantity,Tax,Tax Rate,FLAGGED Tax Rate
20,31.21,2,5.0,0.08,FLAGGED
30,32.7,4,10.0,0.08,FLAGGED
52,46.96,1,4.0,0.09,FLAGGED


In [13]:
index_ = price_qty_tax[mask].index

In [14]:
index_

Index([20, 30, 52], dtype='int64')

In [15]:
# 3. Clean all invalid tax records by re-calculating them from `Price` and `Quantity`.
price_qty_tax.loc[index_, 'Tax'] = price_qty_tax.loc[index_].apply(
    lambda row: round(row['Price'] * row['Quantity'] * 0.07, 2),
    axis=1
)

In [16]:
price_qty_tax['Tax Rate'] = round(price_qty_tax['Tax']/(price_qty_tax['Price'] * price_qty_tax['Quantity']), 2)

In [17]:
price_qty_tax['FLAGGED Tax Rate'] = price_qty_tax['Tax Rate'].apply(lambda x: 'FLAGGED' if x != 0.07 else '')

In [18]:
adjusted_tax = price_qty_tax.loc[index_].copy()

In [19]:
adjusted_tax

Unnamed: 0,Price,Quantity,Tax,Tax Rate,FLAGGED Tax Rate
20,31.21,2,4.37,0.07,
30,32.7,4,9.16,0.07,
52,46.96,1,3.29,0.07,


In [20]:
# 4. Generate a summary: count of corrected entries, and average difference in tax.
adjusted_tax.describe()

Unnamed: 0,Price,Quantity,Tax,Tax Rate
count,3.0,3.0,3.0,3.0
mean,36.956667,2.333333,5.606667,0.07
std,8.695115,1.527525,3.124297,0.0
min,31.21,1.0,3.29,0.07
25%,31.955,1.5,3.83,0.07
50%,32.7,2.0,4.37,0.07
75%,39.83,3.0,6.765,0.07
max,46.96,4.0,9.16,0.07


In [21]:
flagged_taxes.describe()

Unnamed: 0,Price,Quantity,Tax,Tax Rate
count,3.0,3.0,3.0,3.0
mean,36.956667,2.333333,6.333333,0.083333
std,8.695115,1.527525,3.21455,0.005774
min,31.21,1.0,4.0,0.08
25%,31.955,1.5,4.5,0.08
50%,32.7,2.0,5.0,0.08
75%,39.83,3.0,7.5,0.085
max,46.96,4.0,10.0,0.09


In [22]:
flagged_taxes['Tax'].count()

np.int64(3)

In [23]:
average_difference_taxes = round(abs(flagged_taxes['Tax'].mean() - adjusted_tax['Tax'].mean()), 2)

In [24]:
average_difference_taxes

np.float64(0.73)

In [25]:
# 5. Split receipt numbers and count unique customers per store location.
receipt_customer_store_location = df[['Receipt Number', 'Customer Name', 'Store Location']].copy()

In [26]:
receipt_customer_store_location['Receipt Number'] = receipt_customer_store_location['Receipt Number'].str.split('-')

In [27]:
receipt_customer_store_location.head(3)

Unnamed: 0,Receipt Number,Customer Name,Store Location
0,"[1ff49b78, 8946, 4e85, b59c, de66bacfb3d0]",Danielle Johnson,"5423 Garcia Light, West Melanieview, 06196"
1,"[434308bc, 89fa, 4a68, 8fb5, d27bbeb79919]",Tracie Wyatt,"1395 Diana Locks, Thomasberg, 32826"
2,"[52fbe43b, 9954, 4eb4, 8025, 7ad1eb2263dd]",Eric Moore,"489 Eric Track, New Stephanie, 70015"


In [28]:
unique_customers_per_store = (
    receipt_customer_store_location
    .groupby('Store Location')['Customer Name']
    .nunique()
    .sort_values(ascending=False)
)

In [29]:
unique_customers_per_store.head(3)

Store Location
9995 Noah Road, East Mark, 44560                    1
00000 Marcus Throughway, North Haileyfort, 13380    1
0042 Koch Turnpike, North Jennifer, 19197           1
Name: Customer Name, dtype: int64

In [30]:
receipt_customer_store_location.drop_duplicates(subset=['Store Location', 'Customer Name']).head(3)

Unnamed: 0,Receipt Number,Customer Name,Store Location
0,"[1ff49b78, 8946, 4e85, b59c, de66bacfb3d0]",Danielle Johnson,"5423 Garcia Light, West Melanieview, 06196"
1,"[434308bc, 89fa, 4a68, 8fb5, d27bbeb79919]",Tracie Wyatt,"1395 Diana Locks, Thomasberg, 32826"
2,"[52fbe43b, 9954, 4eb4, 8025, 7ad1eb2263dd]",Eric Moore,"489 Eric Track, New Stephanie, 70015"


In [31]:
# 6. From those records, extract and standardize the store’s street name and zip code.
customer_data = pd.read_csv('customer_info.csv')

In [32]:
customer_info = pd.DataFrame(customer_data)

In [33]:
customer_info.head(3)

Unnamed: 0,House Number,Street Name,City,Zip Code
0,3321,Brittany Bypass,North Jefferyhaven,79408
1,64752,Kelly Skyway,Jacquelineland,80341
2,691,James Mountain,Tashatown,89667


In [34]:
# pattern_street_name = r'([A-Za-z\s]+)'
pattern_street_name = r'^\d+\s+(.*?),'

In [35]:
pattern_zip_code = r'(\d{5})'

In [36]:
df['Customer Address'].str.extract(pattern_street_name).head(3)

Unnamed: 0,0
0,Brittany Bypass
1,Kelly Skyway
2,James Mountain


In [37]:
df['Customer Address'].astype(str).str.extract(pattern_zip_code).head(3)

Unnamed: 0,0
0,79408
1,64752
2,89667
