In [12]:
import pandas as pd

dtype_map = {
    "customer_id": "int64",
    "email": "string",
    "first_name": "string",
    "last_name": "string",
    "phone": "string",
    "date_of_birth": "string",
    "gender": "string",
    "country": "string",
    "city": "string",
    "postal_code": "string",
    "address": "string",
    "registration_date": "string",
    "last_login": "string",
    "is_active": "string",
    "customer_segment": "string",
    "marketing_consent": "string",
}

df_customers = pd.read_csv("../data/raw/customers.csv", dtype=dtype_map)
df_customers.head()

Unnamed: 0.1,Unnamed: 0,customer_id,email,first_name,last_name,phone,date_of_birth,gender,country,city,postal_code,address,registration_date,last_login,is_active,customer_segment,marketing_consent
0,0,1,uthomas@example.net,Kayla,Smith,329-672-0449x89707,1948-05-20,F,Brazil,south michelle,70351,,2022-09-22T09:40:47.913063,2025-01-07T12:09:24.323425,True,Regular,True
1,1,2,caldwellandrew@example.com,Carolyn,Gray,(710)769-1350x042,1983-03-19,F,France,Lake Shaneville,65372,"247 Allison Overpass Suite 960 North Dillon, A...",2022-11-14T23:20:08.324356,2024-08-25T17:36:03.417619,True,Budget,False
2,2,3,jon32@example.org,Deborah,Martin,+1-694-797-6792,1940-05-23,M,USA,Lisaborough,32855,"61358 Mills Spur Lake Tiffany, MS 52073",2022-10-14T15:49:17.728606,2023-04-10T06:15:20.412198,True,budget,True
3,3,4,ksmith@example.org,Rachel,Ellison,,1969-12-21,M,Australia,south michaelborough,11530,undefined,2024-03-16T09:18:53.251566,2025-03-20T12:06:38.326651,True,undefined,False
4,4,5,andrewsjessica@example.net,Lisa,Phelps,001-800-821-0991,1940-05-18,M,Japan,Herrerabury,61228,"6184 King Trail Lake Trevor, IA 55611",2023-07-21T10:07:22.968497,2025-04-19T11:40:01.441135,True,Regular,True


Vamos a descartar las columnas que son innecesarias para nuestro análisis.

In [13]:
df_customers.drop(
    columns=[
        "Unnamed: 0",
        # "first_name", Keep it for the second query of the proyect
        "email",
        "last_name",
        "phone",
        "address",
    ],
    inplace=True,
)

In [14]:
df_customers_clean = df_customers.set_index("customer_id")

# DATES

for col in ["date_of_birth", "registration_date", "last_login"]:
    df_customers_clean[col] = df_customers_clean[col].str.strip()
    df_customers_clean[col] = pd.to_datetime(df_customers_clean[col], errors="coerce")

# BOOLEANS

for col in ["is_active", "marketing_consent"]:
    df_customers_clean[col] = df_customers_clean[col].str.strip().str.lower()
    df_customers_clean[col] = (
        df_customers_clean[col]
        .map({"true": True, "false": False, "1": True, "0": False})
        .astype("boolean")
    )

# NARROWED CATEGORIES

valid_segments = ["Regular", "Premium", "Budget"]
df_customers_clean["customer_segment"] = (
    df_customers_clean["customer_segment"].str.strip().str.title()
)
df_customers_clean["customer_segment"] = df_customers_clean["customer_segment"].where(
    df_customers_clean["customer_segment"].isin(valid_segments)
)

valid_genders = ["M", "F"]
df_customers_clean["gender"] = df_customers_clean["gender"].str.strip().str.upper()
df_customers_clean["gender"] = df_customers_clean["gender"].where(
    df_customers_clean["gender"].isin(valid_genders)
)

# CATEGORICAL

df_customers_clean["country"] = (
    df_customers_clean["country"].str.strip().str.title().astype("category")
)
df_customers_clean["city"] = (
    df_customers_clean["city"].str.strip().str.title().astype("category")
)

# STRINGS

df_customers_clean["postal_code"] = df_customers_clean["postal_code"].str.strip()
df_customers_clean["postal_code"] = df_customers_clean["postal_code"].where(
    df_customers_clean["postal_code"].str.match(r"^\d+$"), pd.NA
)
df_customers_clean["postal_code"] = df_customers_clean["postal_code"].astype("string")

df_customers_clean["first_name"] = df_customers_clean["first_name"].str.strip()

df_customers_clean.head()

Unnamed: 0_level_0,first_name,date_of_birth,gender,country,city,postal_code,registration_date,last_login,is_active,customer_segment,marketing_consent
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,Kayla,1948-05-20,F,Brazil,South Michelle,70351,2022-09-22 09:40:47.913063,2025-01-07 12:09:24.323425,True,Regular,True
2,Carolyn,1983-03-19,F,France,Lake Shaneville,65372,2022-11-14 23:20:08.324356,2024-08-25 17:36:03.417619,True,Budget,False
3,Deborah,1940-05-23,M,Usa,Lisaborough,32855,2022-10-14 15:49:17.728606,2023-04-10 06:15:20.412198,True,Budget,True
4,Rachel,1969-12-21,M,Australia,South Michaelborough,11530,2024-03-16 09:18:53.251566,2025-03-20 12:06:38.326651,True,,False
5,Lisa,1940-05-18,M,Japan,Herrerabury,61228,2023-07-21 10:07:22.968497,2025-04-19 11:40:01.441135,True,Regular,True


Exportamos el dataset limpio para reutilizarlo.

In [15]:
df_customers_clean.to_pickle("../data/clean/customers.pkl")