# Imports

In [26]:
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
import pandas as pd

# Read both data splits from the working directory.
df_train = pd.read_csv("./train.csv")
df_test = pd.read_csv("./test.csv")

# Replace the spaces in every header with underscores, on both frames.
for _frame in (df_train, df_test):
    _frame.columns = _frame.columns.str.replace(' ', '_')

# Date format

In [27]:
# Reference day used as "day zero" for the day-count feature below.
POLICY_START_ORIGIN = pd.Timestamp("2019-08-17")

# Parse the policy start date and derive the whole number of days between it
# and the reference day. The identical transformation is applied to the test
# frame so that both frames carry the same derived column; previously only
# df_train received it.
# NOTE(review): assumes test.csv also has a 'Policy_Start_Date' column - confirm.
for _frame in (df_train, df_test):
    _frame['Policy_Start_Date'] = pd.to_datetime(_frame['Policy_Start_Date'])
    _frame['Policy_Start_Date_Int'] = (
        (_frame['Policy_Start_Date'] - POLICY_START_ORIGIN) // pd.Timedelta('1D')
    )

# Convert the date to the number of days elapsed since the first record (2019-08-17).

# Columns

In [51]:
# Names of the identifier column and of the prediction target.
# NOTE: `id` shadows the Python builtin of the same name; it is kept because
# it is a notebook-level variable.
id = 'id'
target = 'Premium_Amount'

# Numeric feature columns: every float/int column of the training frame
# except the identifier and the target.
num_cols = list(df_train.select_dtypes(include=['float', 'int']).columns)
for _excluded in (id, target):
    # list.remove raises ValueError if the column is not among the numeric ones.
    num_cols.remove(_excluded)

# Categorical columns that get one-hot encoded.
onehot_cols = [
    'Gender',
    'Marital_Status',
    'Occupation',
    'Location',
    'Policy_Type',
    'Smoking_Status',
    'Property_Type',
]
# Categorical columns that get integer (ordinal) codes.
ordinal_cols = [
    'Education_Level',
    'Exercise_Frequency',
]
print(num_cols)

['Age', 'Annual_Income', 'Number_of_Dependents', 'Health_Score', 'Previous_Claims', 'Vehicle_Age', 'Credit_Score', 'Insurance_Duration']


# Pipelines

In [52]:
# Pipeline for the one-hot encoded categorical columns: missing values are
# replaced by the literal category 'unknown', then each column is one-hot
# encoded. Categories not seen during fit are ignored at transform time
# rather than raising an error.
onehot_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='unknown')),
    # Step key corrected from the copy-pasted 'ordinal': this step is the
    # OneHotEncoder, so its parameters are now addressed as 'onehot__...'.
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])
# Pipeline for the ordinal columns: missing values become the literal category
# 'unknown', then every category is mapped to an integer code. Values not seen
# during fit are encoded as -1 at transform time.
# NOTE(review): no explicit `categories=` is passed, so the integer codes
# follow the encoder's automatic (sorted) category order, and 'unknown' is
# itself one of the fitted categories whenever a value was missing - confirm
# that this order is the intended ranking.
_ordinal_imputer = SimpleImputer(fill_value='unknown', strategy='constant')
_ordinal_encoder = OrdinalEncoder(
    unknown_value=-1,
    handle_unknown='use_encoded_value',
)
ordinal_pipe = Pipeline(steps=[
    ('imputer', _ordinal_imputer),
    ('ordinal', _ordinal_encoder),
])
# Pipeline for the numeric columns: missing values are filled with the
# constant 0, then each column is standardised with StandardScaler.
_numeric_imputer = SimpleImputer(fill_value=0, strategy='constant')
_numeric_scaler = StandardScaler()
numerical_pipe = Pipeline(steps=[
    ('imputer', _numeric_imputer),
    ('scaler', _numeric_scaler),
])

# Route each group of columns through its own pipeline. Columns that are not
# listed in any group are handled by ColumnTransformer's default `remainder`
# setting.
_column_routes = [
    ('num', numerical_pipe, num_cols),
    ('onehot', onehot_pipe, onehot_cols),
    ('ordinal', ordinal_pipe, ordinal_cols),
]
preprocessor = ColumnTransformer(transformers=_column_routes)

In [53]:
# Learn the imputers, encoders and scaler from the training frame only.
preprocessor.fit(X=df_train)