### **Problem 1: Customer Location Normalization and Analysis**

You have been asked to standardize customer addresses and create a mailing list for a regional campaign. Your goals are:

* Extract house number, street name, city, and zip code from the full customer address.
* Identify duplicate customers based on full name, email, and zip code.
* Create a clean contact list without duplicates.
* Standardize zip codes to 5-digit strings and ensure all contain only digits.
* Find and report the top 3 cities with the most customers.
* Detect and remove any malformed or incomplete email addresses.

*Hint: You may need to clean zip codes using string operations and validate emails using pattern rules.*

In [36]:
import pandas as pd
import numpy as np
import re

In [37]:
# Load the raw sales CSV; the path is relative to the notebook's working directory.
data = pd.read_csv('fila_heat_filament_sales_april2025.csv')

In [38]:
# read_csv already returns a DataFrame, so this wraps the same data under the
# shorter name `df` used by the rest of the notebook.
df = pd.DataFrame(data)

In [39]:
# Preview the first three rows to check which columns were loaded.
df.head(3)

Unnamed: 0,Date Purchased,Receipt Number,Customer Name,Customer Address,Phone Number,Email,Store Location,Product Name,Product Code,Bar Code,Material Name,Color,Weight,Supplier,Lot Number,Price,Quantity,Tax,Total Price
0,2025-04-01,1ff49b78-8946-4e85-b59c-de66bacfb3d0,Danielle Johnson,"3321 Brittany Bypass, North Jefferyhaven, 79408",8386379402,danielle.johnson@hotmail.com,"5423 Garcia Light, West Melanieview, 06196",Standard PLA Filament,PLA-792,6184960000000.0,PLA,Blue,500,3DFilaments,L5012,26.69,1,1.87,28.56
1,2025-04-01,434308bc-89fa-4a68-8fb5-d27bbeb79919,Tracie Wyatt,"64752 Kelly Skyway, Jacquelineland, 80341",+1-283-276-4835x0305,tracie.wyatt@yahoo.com,"1395 Diana Locks, Thomasberg, 32826",Flexible TPU Filament,TPU-338,9696530000000.0,TPU,Purple,500,ProtoPolymers,L1520,20.88,2,2.92,44.68
2,2025-04-01,52fbe43b-9954-4eb4-8025-7ad1eb2263dd,Eric Moore,"691 James Mountain, Tashatown, 89667",001-184-514-6270x4828,eric.moore@gmail.com,"489 Eric Track, New Stephanie, 70015",Flexible TPU Filament,TPU-325,7015430000000.0,TPU,Purple,1000,PrintPro,L4257,41.47,4,11.61,177.49


In [40]:
# 1. Extract house number, street name, city, and zip code from the full customer address.
# Work on the address column alone; the preview of the data shows each value
# in the form "<number> <street>, <city>, <zip>".
addresses = df['Customer Address']

In [41]:
# Preview the raw address strings before splitting them.
addresses.head(3)

0    3321 Brittany Bypass, North Jefferyhaven, 79408
1          64752 Kelly Skyway, Jacquelineland, 80341
2               691 James Mountain, Tashatown, 89667
Name: Customer Address, dtype: object

In [42]:
# Holds one [house_number, street_name, city, zip_code] row per address.
customer_sep = []

In [43]:
def split_address(entry):
    """Split one 'number street, city, zip' address into its four parts.

    Returns [house_number, street_name, city, zip_code]. Components that are
    absent from the text come back as '', and a non-string entry (for example
    a missing value) yields four None values.
    """
    if not isinstance(entry, str):
        # Handle unparseable input explicitly instead of hiding every possible
        # error behind a broad `except Exception`.
        return [None, None, None, None]

    # Split on the comma alone and trim each piece, so inconsistent spacing
    # around the separators does not leak into the extracted values.
    parts = [part.strip() for part in entry.split(',')]
    house_number, _, street_name = parts[0].partition(' ')
    city = parts[1] if len(parts) > 1 else ''
    zip_code = parts[2] if len(parts) > 2 else ''
    return [house_number, street_name.strip(), city, zip_code]


# Rebuild the list from scratch so that re-running this cell cannot append a
# second copy of every row.
customer_sep = [split_address(entry) for entry in addresses]

In [44]:
# Turn the parsed rows into a table with one named column per address part.
address_columns = ['House Number', 'Street Name', 'City', 'Zip Code']
customer_info = pd.DataFrame(customer_sep, columns=address_columns)

In [45]:
# Save the parsed address parts; index=False keeps the row index out of the file.
customer_info.to_csv('customer_info.csv', index=False)

In [46]:
# Spot-check that the address parts landed in the right columns.
customer_info.head(3)

Unnamed: 0,House Number,Street Name,City,Zip Code
0,3321,Brittany Bypass,North Jefferyhaven,79408
1,64752,Kelly Skyway,Jacquelineland,80341
2,691,James Mountain,Tashatown,89667


In [47]:
# 2. Identify duplicate customers based on full name, email, and zip code.
# Gather the three identifying fields side by side; name and email come from
# the sales data, the zip code from the parsed address table.
customer_key_columns = {
    'Name': df['Customer Name'],
    'Email': df['Email'],
    'Zip Code': customer_info['Zip Code'],
}
customers = pd.DataFrame(customer_key_columns)

In [48]:
# Preview the identifying fields used for duplicate detection.
customers.head(3)

Unnamed: 0,Name,Email,Zip Code
0,Danielle Johnson,danielle.johnson@hotmail.com,79408
1,Tracie Wyatt,tracie.wyatt@yahoo.com,80341
2,Eric Moore,eric.moore@gmail.com,89667


In [49]:
# Tally rows that repeat an earlier (Name, Email, Zip Code) combination:
# True marks a duplicate, False marks a first occurrence.
customers.duplicated(subset=['Name', 'Email', 'Zip Code']).value_counts()

False    360
Name: count, dtype: int64

In [50]:
# 3. Create a clean contact list without duplicates.
# Keep only the columns needed to reach a customer.
contact_columns = ['Customer Name', 'Customer Address', 'Phone Number', 'Email']
contact_list = pd.DataFrame(df[contact_columns])

In [51]:
# Preview the contact columns.
contact_list.head(3)

Unnamed: 0,Customer Name,Customer Address,Phone Number,Email
0,Danielle Johnson,"3321 Brittany Bypass, North Jefferyhaven, 79408",8386379402,danielle.johnson@hotmail.com
1,Tracie Wyatt,"64752 Kelly Skyway, Jacquelineland, 80341",+1-283-276-4835x0305,tracie.wyatt@yahoo.com
2,Eric Moore,"691 James Mountain, Tashatown, 89667",001-184-514-6270x4828,eric.moore@gmail.com


In [52]:
# Tally rows that exactly repeat an earlier row across all contact columns.
contact_list.duplicated().value_counts()

False    360
Name: count, dtype: int64

In [53]:
# Drop exact duplicate contacts and keep the result: drop_duplicates() returns
# a new frame and leaves contact_list untouched, so without the assignment the
# clean list would be lost.
contact_list_clean = contact_list.drop_duplicates()
contact_list_clean

Unnamed: 0,Customer Name,Customer Address,Phone Number,Email
0,Danielle Johnson,"3321 Brittany Bypass, North Jefferyhaven, 79408",8386379402,danielle.johnson@hotmail.com
1,Tracie Wyatt,"64752 Kelly Skyway, Jacquelineland, 80341",+1-283-276-4835x0305,tracie.wyatt@yahoo.com
2,Eric Moore,"691 James Mountain, Tashatown, 89667",001-184-514-6270x4828,eric.moore@gmail.com
3,Lisa Spence,"227 Joseph Well, Brandtside, 99495",001-346-578-7133x1509,lisa.spence@hotmail.com
4,Rodney Owens,"010 Chandler Union, East Jamesside, 59301",(624)731-7810,rodney.owens@hotmail.com
...,...,...,...,...
355,Brian Wilson,"103 Linda Street, Spencerville, 59395",(417)268-0808x1571,brian.wilson@gmail.com
356,Jane Johnson,"625 Griffin Squares, Carlsonfort, 04391",180.533.9627,jane.johnson@yahoo.com
357,Sergio Tucker,"1400 Richards Plains, Port Jenniferview, 75758",5315204998,sergio.tucker@gmail.com
358,Todd Cisneros,"7943 Mariah Track, Samuelborough, 97096",001-899-499-1970x142,todd.cisneros@yahoo.com


In [54]:
# 4. Standardize zip codes to 5-digit strings and ensure all contain only digits.
# Start from the zip codes parsed out of the customer addresses.
zip_codes = customer_info['Zip Code']

In [55]:
# Number of zip code entries (one per row of customer_info).
zip_codes.shape

(360,)

In [56]:
# Capture a run of five consecutive digits. The pattern is not anchored, so a
# longer digit run contributes only its first five digits.
pattern = r'([0-9]{5})'

In [57]:
# Preview the five-digit extraction. With one capture group, extract() returns
# a one-column frame, hence the [0]; the result is displayed here, not stored.
zip_codes.astype(str).str.extract(pattern)[0]

0      79408
1      80341
2      89667
3      99495
4      59301
       ...  
355    59395
356    04391
357    75758
358    97096
359    04503
Name: 0, Length: 360, dtype: object

In [58]:
# Apply the five-digit extraction before filling, so stray characters are
# stripped rather than passed through. Values with no five-digit match
# (including missing ones) become the '00000' placeholder, and the original
# column name is restored because extract() labels its result 0.
zip_codes_cleaned = (
    zip_codes.astype(str)
    .str.extract(pattern)[0]
    .fillna('00000')
    .rename(zip_codes.name)
)

In [59]:
# Preview the standardized zip codes.
zip_codes_cleaned.head(3)

0    79408
1    80341
2    89667
Name: Zip Code, dtype: object

In [60]:
# 5. Find and report the top 3 cities with the most customers.
# Use the city parsed out of each customer address.
cities = customer_info['City']

In [61]:
# Preview the parsed city names.
cities.head(3)

0    North Jefferyhaven
1        Jacquelineland
2             Tashatown
Name: City, dtype: object

In [62]:
# Three most frequent cities. This counts rows of customer_info, so a customer
# who appears on several rows is counted once per row.
# NOTE(review): cities tied on count are cut off by head(3) - confirm how ties should be reported.
cities.value_counts().head(3)

City
Port James       2
North Brianna    2
Williamsmouth    2
Name: count, dtype: int64

In [63]:
# 6. Detect and remove any malformed or incomplete email addresses.
# Validate the email column taken straight from the sales data.
emails = df['Email']

In [64]:
# Preview the raw email values.
emails.head(3)

0    danielle.johnson@hotmail.com
1          tracie.wyatt@yahoo.com
2            eric.moore@gmail.com
Name: Email, dtype: object

In [65]:
# local-part @ domain . top-level-domain. The top-level domain accepts two or
# more letters, because valid TLDs can be longer than four characters
# (for example .museum). Written in upper case; meant to be matched
# case-insensitively.
email_pattern = r'([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\.([A-Z]{2,})'

In [66]:
# Boolean mask over the email column: True where the whole value matches the
# email pattern (ignoring case), False where it is malformed.
emails_mask = (
    emails
    .astype(str)
    .str.fullmatch(email_pattern, flags=re.IGNORECASE)
)

In [67]:
# Keep only the addresses flagged as valid by the mask.
emails_valid = emails.loc[emails_mask]

In [68]:
# Number of email addresses that passed validation.
emails_valid.shape

(360,)

In [69]:
# Preview the validated email addresses.
emails_valid.head(3)

0    danielle.johnson@hotmail.com
1          tracie.wyatt@yahoo.com
2            eric.moore@gmail.com
Name: Email, dtype: object

In [70]:
# Apply the same validity mask to the full sales table, keeping every column
# for the rows whose email is well formed.
df_valid_emails = df.loc[emails_mask]

In [71]:
# Preview the rows that remain after email validation.
df_valid_emails.head(3)

Unnamed: 0,Date Purchased,Receipt Number,Customer Name,Customer Address,Phone Number,Email,Store Location,Product Name,Product Code,Bar Code,Material Name,Color,Weight,Supplier,Lot Number,Price,Quantity,Tax,Total Price
0,2025-04-01,1ff49b78-8946-4e85-b59c-de66bacfb3d0,Danielle Johnson,"3321 Brittany Bypass, North Jefferyhaven, 79408",8386379402,danielle.johnson@hotmail.com,"5423 Garcia Light, West Melanieview, 06196",Standard PLA Filament,PLA-792,6184960000000.0,PLA,Blue,500,3DFilaments,L5012,26.69,1,1.87,28.56
1,2025-04-01,434308bc-89fa-4a68-8fb5-d27bbeb79919,Tracie Wyatt,"64752 Kelly Skyway, Jacquelineland, 80341",+1-283-276-4835x0305,tracie.wyatt@yahoo.com,"1395 Diana Locks, Thomasberg, 32826",Flexible TPU Filament,TPU-338,9696530000000.0,TPU,Purple,500,ProtoPolymers,L1520,20.88,2,2.92,44.68
2,2025-04-01,52fbe43b-9954-4eb4-8025-7ad1eb2263dd,Eric Moore,"691 James Mountain, Tashatown, 89667",001-184-514-6270x4828,eric.moore@gmail.com,"489 Eric Track, New Stephanie, 70015",Flexible TPU Filament,TPU-325,7015430000000.0,TPU,Purple,1000,PrintPro,L4257,41.47,4,11.61,177.49
