In [1]:
import pandas as pd
import re

### **Uncleaned Data**


In [2]:
df = pd.read_csv('Customer Call List.csv')
df

Unnamed: 0,CustomerID,First_Name,Last_Name,Phone_Number,Address,Paying Customer,Do_Not_Contact,Not_Useful_Column
0,1001,Frodo,Baggins,123-545-5421,"123 Shire Lane, Shire",Yes,No,True
1,1002,Abed,Nadir,123/643/9775,93 West Main Street,No,Yes,False
2,1003,Walter,/White,7066950392,298 Drugs Driveway,N,,True
3,1004,Dwight,Schrute,123-543-2345,"980 Paper Avenue, Pennsylvania, 18503",Yes,Y,True
4,1005,Jon,Snow,876|678|3469,123 Dragons Road,Y,No,True
5,1006,Ron,Swanson,304-762-2467,768 City Parkway,Yes,Yes,True
6,1007,Jeff,Winger,,1209 South Street,No,No,False
7,1008,Sherlock,Holmes,876|678|3469,98 Clue Drive,N,No,False
8,1009,Gandalf,,N/a,123 Middle Earth,Yes,,False
9,1010,Peter,Parker,123-545-5421,"25th Main Street, New York",Yes,No,True


#### Rename columns


In [3]:
df = df.rename(columns={'CustomerID': 'id'})
df = df.rename(columns=lambda col: col.lower().replace(' ', '_'))

#### Drop useless columns


In [4]:
df = df.drop(columns=['not_useful_column'])

#### Drop duplicates


In [5]:
df = df.drop_duplicates()

#### Format Yes/No values


In [6]:
def format_yn(value):
    # If it's already formatted then just do nothing
    if type(value) is bool:
        return value

    if type(value) is str and 'Y' in value.upper():
        return True
    else:
        return False


df['do_not_contact'] = df['do_not_contact'].map(format_yn)
df['paying_customer'] = df['paying_customer'].map(format_yn)

#### Handle missing values


In [7]:
def handle_missing_values(col: pd.Series):
    # Create a copy to make it immutable
    col = col.copy()

    na_indexes = col.loc[col.isna()].index.to_list()

    if col.dtype == 'object':
        # Blank na values
        col.loc[na_indexes] = ''

        # Blank other na values
        for i, value in enumerate(col):
            if 'N/A' in value.upper():
                col.iloc[i] = ''

    return col


df = df.apply(handle_missing_values, axis=0)

#### Clean names


In [8]:
def clean_name(name): return re.sub(r"[^a-zA-Z' ]", '', name)


df['first_name'] = df['first_name'].map(clean_name)
df['last_name'] = df['last_name'].map(clean_name)

#### Clean phone numbers


In [9]:
def clean_phone_number(phone):
    # If it's blank then just do nothing
    if phone != '':
        phone = re.sub(r"[^0-9]", '', phone)
        phone = f'{phone[:3]}-{phone[3:6]}-{phone[6:]}'

    return phone


df['phone_number'] = df['phone_number'].map(clean_phone_number)

#### Split addresses


In [10]:
# Run this cell only if addresses aren't split yet
if type(df.columns) is not pd.MultiIndex:
    columns = []

    for col in df.columns:

        if col == 'address':
            columns.append((col, 'street'))
            columns.append((col, 'state'))
            columns.append((col, 'zip'))
        else:
            columns.append((col, ''))

    newdf = pd.DataFrame(
        {},
        columns=pd.MultiIndex.from_tuples(columns)
    )

    for col in df.columns:
        if col != 'address':
            newdf[col] = df[col]
        else:
            # Split addresses
            address = df[col].str.split(',', n=2, expand=True)
            newdf[col, 'street'] = address[0]
            newdf[col, 'state'] = address[1]
            newdf[col, 'zip'] = address[2]

    # Fill 'na' values with blank
    newdf['address'] = newdf['address'].fillna('')

    # Re-assign df
    df = newdf

### **Cleaned Data**


In [11]:
df

Unnamed: 0_level_0,id,first_name,last_name,phone_number,address,address,address,paying_customer,do_not_contact
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,street,state,zip,Unnamed: 8_level_1,Unnamed: 9_level_1
0,1001,Frodo,Baggins,123-545-5421,123 Shire Lane,Shire,,True,False
1,1002,Abed,Nadir,123-643-9775,93 West Main Street,,,False,True
2,1003,Walter,White,706-695-0392,298 Drugs Driveway,,,False,False
3,1004,Dwight,Schrute,123-543-2345,980 Paper Avenue,Pennsylvania,18503.0,True,True
4,1005,Jon,Snow,876-678-3469,123 Dragons Road,,,True,False
5,1006,Ron,Swanson,304-762-2467,768 City Parkway,,,True,True
6,1007,Jeff,Winger,,1209 South Street,,,False,False
7,1008,Sherlock,Holmes,876-678-3469,98 Clue Drive,,,False,False
8,1009,Gandalf,,,123 Middle Earth,,,True,False
9,1010,Peter,Parker,123-545-5421,25th Main Street,New York,,True,False


#### Convert the cleaned DataFrame to a CSV file


In [12]:
df.to_csv('Customer Call List (Cleaned).csv', index=False)

#### Display the converted CSV file


In [13]:
cleaned_df = pd.read_csv('Customer Call List (Cleaned).csv', header=[0, 1])

# Fill unnamed columns with blank
cleaned_df = cleaned_df.rename(
    columns=lambda col: '' if 'Unnamed:' in col else col
)

# Fill 'na' values with blank
cleaned_df = cleaned_df.fillna('')

cleaned_df

Unnamed: 0_level_0,id,first_name,last_name,phone_number,address,address,address,paying_customer,do_not_contact
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,street,state,zip,Unnamed: 8_level_1,Unnamed: 9_level_1
0,1001,Frodo,Baggins,123-545-5421,123 Shire Lane,Shire,,True,False
1,1002,Abed,Nadir,123-643-9775,93 West Main Street,,,False,True
2,1003,Walter,White,706-695-0392,298 Drugs Driveway,,,False,False
3,1004,Dwight,Schrute,123-543-2345,980 Paper Avenue,Pennsylvania,18503.0,True,True
4,1005,Jon,Snow,876-678-3469,123 Dragons Road,,,True,False
5,1006,Ron,Swanson,304-762-2467,768 City Parkway,,,True,True
6,1007,Jeff,Winger,,1209 South Street,,,False,False
7,1008,Sherlock,Holmes,876-678-3469,98 Clue Drive,,,False,False
8,1009,Gandalf,,,123 Middle Earth,,,True,False
9,1010,Peter,Parker,123-545-5421,25th Main Street,New York,,True,False
