In [1]:
import sys
import os

project_root = os.path.abspath("..")
if project_root not in sys.path:
    sys.path.append(project_root)


from src.cleaning.cleaning import DataCleaner
from src.schema import ALLOWED_GENDERS, ALLOWED_COUNTRIES, ALLOWED_PLANS, ALLOWED_CHANNELS, ALLOWED_ISSUE_TYPES, customers_schema, support_schema, transactions_schema


In [2]:
# Initialize cleaner
cleaner = DataCleaner()

# Chain operations for each dataset
(
    cleaner
    .drop_duplicates("customers", "customer_id")
    .convert_date("customers", "signup_date")
    .filter_categorical("customers", {
        "country": ALLOWED_COUNTRIES,
        "plan_type": ALLOWED_PLANS, 
        "gender": ALLOWED_GENDERS,
    })
    .convert_to_int("customers", "age")
)

(
    cleaner
    .convert_date("transactions", "date")
    .remove_rows("transactions", "customer_id == 'C999999'")
    .remove_negatives("transactions")
)

(
    cleaner
    .filter_categorical("support_interactions", {
        "channel": ALLOWED_CHANNELS,
        "issue_type": ALLOWED_ISSUE_TYPES,
    })
    .remove_negatives("support_interactions")
    .drop_duplicates("support_interactions", "ticket_id")
    .convert_date("support_interactions", "timestamp")
    .convert_to_int("support_interactions", "resolution_time_min")
    .convert_to_int("support_interactions", "was_resolved")
)

C:\Programming\Ironhack\projects\Customer-Churn-Prediction\data\1_raw\customers.csv
C:\Programming\Ironhack\projects\Customer-Churn-Prediction\data\1_raw\support_interactions.csv
C:\Programming\Ironhack\projects\Customer-Churn-Prediction\data\1_raw\transactions.csv


<src.cleaning.cleaning.DataCleaner at 0x1dae654a4b0>

In [58]:
# Save all data
cleaner.save_df("customers", "customers_clean")
cleaner.save_df("transactions", "transactions_clean")
cleaner.save_df("support_interactions", "support_interactions_clean")

In [4]:
print(cleaner.get_cleaning_summary())

                                            operation  \
0                Removed 20 duplicates from customers   
1      Converted signup_date to datetime in customers   
2   Filtered 137 invalid values from country in cu...   
3   Filtered 596 invalid values from plan_type in ...   
4   Filtered 599 invalid values from gender in cus...   
5               Converted age to integer in customers   
6          Converted date to datetime in transactions   
7   Removed 60 rows from transactions where custom...   
8   Removed 3735 rows with negative values from tr...   
9   Filtered 491 invalid values from channel in su...   
10  Filtered 1026 invalid values from issue_type i...   
11  Removed 1005 rows with negative values from su...   
12    Removed 12 duplicates from support_interactions   
13  Converted timestamp to datetime in support_int...   
14  Converted resolution_time_min to integer in su...   
15  Converted was_resolved to integer in support_i...   
16                    Validatio