In [1]:
import numpy as np
import pandas as pd
from feature_engine.datetime import DatetimeFeatures
from sklearn.compose import make_column_selector
from sklearn.pipeline import Pipeline

from src.column_data_frame_transformer import ColumnDataFrameTransformer
from src.column_selector import ColumnSelector
from src.encoder import OneHotDataFrameEncoder
from src.high_cardinality_dropper import HighCardinalityDroppper
from src.imputer import SimpleDataFrameImputer
from src.nan_dropper import NaNColumnsDropper
from src.replacer import Replacer
from src.scaler import StandardDataFrameScaler

## Load data set

In [2]:
# Read the raw insurance-claims table (path is relative to the notebook's
# working directory) and preview its first rows.
claims_csv_path = "./data/insurance_claims.csv"
car_insurance_df = pd.read_csv(claims_csv_path)
car_insurance_df.head()

Unnamed: 0,months_as_customer,age,policy_number,policy_bind_date,policy_state,policy_csl,policy_deductable,policy_annual_premium,umbrella_limit,insured_zip,...,police_report_available,total_claim_amount,injury_claim,property_claim,vehicle_claim,auto_make,auto_model,auto_year,fraud_reported,_c39
0,328,48,521585,2014-10-17,OH,250/500,1000,1406.91,0,466132,...,YES,71610,6510,13020,52080,Saab,92x,2004,Y,
1,228,42,342868,2006-06-27,IN,250/500,2000,1197.22,5000000,468176,...,?,5070,780,780,3510,Mercedes,E400,2007,Y,
2,134,29,687698,2000-09-06,OH,100/300,2000,1413.14,5000000,430632,...,NO,34650,7700,3850,23100,Dodge,RAM,2007,N,
3,256,41,227811,1990-05-25,IL,250/500,2000,1415.74,6000000,608117,...,NO,63400,6340,6340,50720,Chevrolet,Tahoe,2014,Y,
4,228,44,367455,2014-06-06,IL,500/1000,1000,1583.91,6000000,610706,...,NO,6500,1300,650,4550,Accura,RSX,2009,N,


In [3]:
# Show the distinct year-month periods in which policies were bound.
policy_bind_dates = pd.to_datetime(car_insurance_df["policy_bind_date"])
policy_bind_months = policy_bind_dates.dt.to_period("M")
policy_bind_months.unique()

<PeriodArray>
['2014-10', '2006-06', '2000-09', '1990-05', '2014-06', '2006-10', '2000-06',
 '1990-02', '1997-02', '2011-07',
 ...
 '2006-07', '1993-11', '1990-12', '2012-09', '2013-02', '1998-07', '1996-03',
 '2000-01', '2013-05', '2011-11']
Length: 286, dtype: period[M]

# Column sets

In [4]:
# Label column, kept as a one-element list (indexed with [0] where a single
# name is required).
target_variable = ["fraud_reported"]

# Date-like columns that receive calendar-feature extraction.
date_features = ["incident_date", "policy_bind_date"]

# Columns left out of every derived column list in this notebook.
excluded = ["policy_number", "insured_hobbies", "auto_make", "auto_model"]


In [5]:
# Every column of the raw frame except the excluded ones, as a plain list,
# followed by how many there are.
kept_columns_index = car_insurance_df.columns.difference(excluded)
selected_columns = kept_columns_index.tolist()
print(selected_columns, len(selected_columns), sep="\n")

['_c39', 'age', 'authorities_contacted', 'auto_year', 'bodily_injuries', 'capital-gains', 'capital-loss', 'collision_type', 'fraud_reported', 'incident_city', 'incident_date', 'incident_hour_of_the_day', 'incident_location', 'incident_severity', 'incident_state', 'incident_type', 'injury_claim', 'insured_education_level', 'insured_occupation', 'insured_relationship', 'insured_sex', 'insured_zip', 'months_as_customer', 'number_of_vehicles_involved', 'police_report_available', 'policy_annual_premium', 'policy_bind_date', 'policy_csl', 'policy_deductable', 'policy_state', 'property_claim', 'property_damage', 'total_claim_amount', 'umbrella_limit', 'vehicle_claim', 'witnesses']
36


In [6]:
# Names of the object-dtype columns that are not excluded.
# NOTE(review): the printed result still contains the target
# ("fraud_reported") and both date columns, because they are stored as
# strings; filter them out before using this list for encoding.
object_typed_frame = car_insurance_df.select_dtypes(include=["object"])
categorical_columns = object_typed_frame.columns.difference(excluded).tolist()
print(categorical_columns)

['authorities_contacted', 'collision_type', 'fraud_reported', 'incident_city', 'incident_date', 'incident_location', 'incident_severity', 'incident_state', 'incident_type', 'insured_education_level', 'insured_occupation', 'insured_relationship', 'insured_sex', 'police_report_available', 'policy_bind_date', 'policy_csl', 'policy_state', 'property_damage']


In [7]:
# Names of the int64/float64 columns that are not excluded.
number_typed_frame = car_insurance_df.select_dtypes(include=["int64", "float64"])
numeric_columns = number_typed_frame.columns.difference(excluded).tolist()
print(numeric_columns)

['_c39', 'age', 'auto_year', 'bodily_injuries', 'capital-gains', 'capital-loss', 'incident_hour_of_the_day', 'injury_claim', 'insured_zip', 'months_as_customer', 'number_of_vehicles_involved', 'policy_annual_premium', 'policy_deductable', 'property_claim', 'total_claim_amount', 'umbrella_limit', 'vehicle_claim', 'witnesses']


# Column transformers

In [8]:
# Per-column replacement table: for every selected column, map the "?"
# placeholder to NaN. Each column gets its own inner dict.
mapping_dict = {column: {"?": np.nan} for column in selected_columns}
print(mapping_dict)
nan_corrector = Replacer(mapper=mapping_dict)

{'_c39': {'?': nan}, 'age': {'?': nan}, 'authorities_contacted': {'?': nan}, 'auto_year': {'?': nan}, 'bodily_injuries': {'?': nan}, 'capital-gains': {'?': nan}, 'capital-loss': {'?': nan}, 'collision_type': {'?': nan}, 'fraud_reported': {'?': nan}, 'incident_city': {'?': nan}, 'incident_date': {'?': nan}, 'incident_hour_of_the_day': {'?': nan}, 'incident_location': {'?': nan}, 'incident_severity': {'?': nan}, 'incident_state': {'?': nan}, 'incident_type': {'?': nan}, 'injury_claim': {'?': nan}, 'insured_education_level': {'?': nan}, 'insured_occupation': {'?': nan}, 'insured_relationship': {'?': nan}, 'insured_sex': {'?': nan}, 'insured_zip': {'?': nan}, 'months_as_customer': {'?': nan}, 'number_of_vehicles_involved': {'?': nan}, 'police_report_available': {'?': nan}, 'policy_annual_premium': {'?': nan}, 'policy_bind_date': {'?': nan}, 'policy_csl': {'?': nan}, 'policy_deductable': {'?': nan}, 'policy_state': {'?': nan}, 'property_claim': {'?': nan}, 'property_damage': {'?': nan}, 'to

In [9]:
# Selector configured with the non-excluded column names computed earlier.
# Presumably it keeps only those columns; confirm in src.column_selector.
column_selector = ColumnSelector(selected_columns=selected_columns)

In [10]:
# NOTE(review): threshold=0.4 is presumably the NaN fraction above which a
# column is dropped; confirm the exact semantics in src.nan_dropper.
nan_column_dropper = NaNColumnsDropper(threshold=0.4)

In [11]:
# Replacement table for the target column: "Y" -> "Yes", "N" -> "No".
target_label_map = {"Y": "Yes", "N": "No"}
target_corrector = Replacer(mapper={target_variable[0]: target_label_map})

In [12]:
# Dropper for high-cardinality columns; the date columns are passed as
# `exclude` so they are available to the later date-feature step.
# NOTE(review): the meaning of threshold=0.9 is defined in
# src.high_cardinality_dropper; confirm there.
high_cardinality_droppper = HighCardinalityDroppper(exclude=date_features, threshold=0.9)

In [13]:
# Calendar components to derive from each date column.
calendar_parts = ["month", "day_of_week", "day_of_month", "day_of_year"]
date_featurizer = DatetimeFeatures(
    variables=date_features,
    features_to_extract=calendar_parts,
)

In [14]:
# Imputer configured with the "median" strategy; used for the numeric
# columns in the column transformer below.
numerical_imputer = SimpleDataFrameImputer(strategy="median")

In [15]:
# Imputer configured with the "mode" strategy; currently referenced only from
# a commented-out transformer entry.
# NOTE(review): scikit-learn's SimpleImputer calls this strategy
# "most_frequent"; confirm that the project wrapper accepts "mode".
categorical_imputer = SimpleDataFrameImputer(strategy="mode")

In [16]:
# Scaler with default settings; currently referenced only from a
# commented-out transformer entry.
numeric_transformer = StandardDataFrameScaler()

In [17]:
# One-hot encoder with default settings; currently referenced only from a
# commented-out transformer entry.
one_hot_transformer = OneHotDataFrameEncoder()

# Data preprocessing steps

In [18]:
# Cleaning pipeline; steps run in the listed order:
#   1. map "?" placeholders to NaN,
#   2. apply the column selector,
#   3. apply the NaN-column dropper,
#   4. relabel the target values,
#   5. apply the high-cardinality dropper (date columns excluded).
# Step names are the keys used by `named_steps` / `set_params`, so the
# misspelled "high cadrdinality dropper" key is corrected here.
data_cleaner = Pipeline(
    steps=[
        ("nan corrector", nan_corrector),
        ("business rules selector", column_selector),
        ("nan column dropper", nan_column_dropper),
        ("target corrector", target_corrector),
        ("high cardinality dropper", high_cardinality_droppper),
    ]
)

In [19]:
# Fit the cleaning pipeline on the raw frame and display the cleaned result.
# Note: the name `output_df` is assigned again later by the full preprocessor.
output_df = data_cleaner.fit_transform(car_insurance_df)
output_df

Unnamed: 0,age,authorities_contacted,auto_year,bodily_injuries,capital-gains,capital-loss,collision_type,fraud_reported,incident_city,incident_date,...,policy_bind_date,policy_csl,policy_deductable,policy_state,property_claim,property_damage,total_claim_amount,umbrella_limit,vehicle_claim,witnesses
0,48,Police,2004,1,53300,0,Side Collision,Yes,Columbus,2015-01-25,...,2014-10-17,250/500,1000,OH,13020,YES,71610,0,52080,2
1,42,Police,2007,0,0,0,,Yes,Riverwood,2015-01-21,...,2006-06-27,250/500,2000,IN,780,,5070,5000000,3510,0
2,29,Police,2007,2,35100,0,Rear Collision,No,Columbus,2015-02-22,...,2000-09-06,100/300,2000,OH,3850,NO,34650,5000000,23100,3
3,41,Police,2014,1,48900,-62400,Front Collision,Yes,Arlington,2015-01-10,...,1990-05-25,250/500,2000,IL,6340,,63400,6000000,50720,2
4,44,,2009,0,66000,-46000,,No,Arlington,2015-02-17,...,2014-06-06,500/1000,1000,IL,650,NO,6500,6000000,4550,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,38,Fire,2006,0,0,0,Front Collision,No,Northbrook,2015-02-22,...,1991-07-16,500/1000,1000,OH,8720,YES,87200,0,61040,1
996,41,Fire,2015,2,70900,0,Rear Collision,No,Northbend,2015-01-24,...,2014-01-05,100/300,1000,IL,18080,YES,108480,0,72320,3
997,34,Police,1996,2,35100,0,Side Collision,No,Arlington,2015-01-23,...,2003-02-17,250/500,500,OH,7500,,67500,3000000,52500,3
998,62,Other,1998,0,0,0,Rear Collision,No,Arlington,2015-02-26,...,2011-11-18,500/1000,2000,IL,5220,,46980,5000000,36540,1


In [20]:
# set(selected_columns).difference(output_df.columns)

In [21]:
def update_column_list(columns_list, transformer):
    """Keep the entries of ``columns_list`` that ``transformer`` still reports.

    Args:
        columns_list: Iterable of column names. Order and duplicates are
            preserved in the result.
        transformer: Object exposing ``get_columns()``, which returns an
            iterable of (hashable) column names.

    Returns:
        list: The names from ``columns_list`` that also appear in
        ``transformer.get_columns()``.
    """
    # Query the transformer once instead of once per candidate column, and
    # use a set so each membership test is constant time.
    remaining_columns = set(transformer.get_columns())
    return [column for column in columns_list if column in remaining_columns]

In [22]:
# Column-wise preprocessing applied after cleaning:
#   - calendar features are extracted from the date columns;
#   - numeric columns are median-imputed.
column_transformer = ColumnDataFrameTransformer(
    transformers=[
        ("date_feature", date_featurizer, date_features),
        (
            "numerical_imputer",
            numerical_imputer,
            # Select every numeric dtype, not only int64: int64 columns cannot
            # hold NaN, so an int64-only selection made the imputer a no-op
            # and skipped float columns such as `policy_annual_premium`.
            make_column_selector(dtype_include=np.number),
        ),
        # TODO: steps below are disabled; re-enable once the column lists are
        # filtered to the columns that survive the cleaning pipeline.
        # ("categorical imputer", categorical_imputer, categorical_columns),
        # ("numeric scaler", numeric_transformer, numeric_columns),
        # ("categorical encoder", one_hot_transformer, categorical_columns),
    ],
)

In [23]:
# Full preprocessing chain: the cleaning pipeline first, then the
# column-wise transformer.
preprocessing_steps = [
    ("cleaner", data_cleaner),
    ("column_transformer", column_transformer),
]
preprocessor = Pipeline(preprocessing_steps)

In [24]:
# Fit the full preprocessor on the raw frame and display the result.
# NOTE(review): the recorded run of this cell ends with
# "RecursionError: maximum recursion depth exceeded". The cause is not visible
# in this notebook (likely inside one of the project transformers); until it
# is fixed, `output_df` keeps the value assigned by the cleaning-only run.
output_df = preprocessor.fit_transform(car_insurance_df)
output_df

RecursionError: maximum recursion depth exceeded

In [None]:
# Selected columns that are absent from the transformed frame.
set(selected_columns) - set(output_df.columns)