# Imports

In [5]:
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
import pandas as pd

# Load both competition splits and normalise the headers so that every
# space in a column name becomes an underscore.
df_train = pd.read_csv("./train.csv").rename(
    columns=lambda label: label.replace(' ', '_')
)
df_test = pd.read_csv("./test.csv").rename(
    columns=lambda label: label.replace(' ', '_')
)

# Columns

In [11]:
# Categorical features: missing values are filled with 'unknown'.
unknown_categories = [
    'Property_Type', 'Exercise_Frequency', 'Smoking_Status', 'Policy_Type',
    'Location', 'Occupation', 'Education_Level', 'Marital_Status', 'Gender',
    'Customer_Feedback',
]

# Numeric features: missing values are filled with the column mean.
mean_columns = ['Credit_Score', 'Vehicle_Age', 'Health_Score', 'Annual_Income', 'Age']

# Numeric features: missing values are filled with the column median.
# 'Insurance_Duration' used to be listed twice, which would route the same
# column through the median pipeline twice and duplicate the feature.
# NOTE(review): the second entry may have been meant to be a different
# column -- confirm against the columns of train.csv.
median_columns = ['Insurance_Duration', 'Previous_Claims']

# Raw policy start date column. The name matches the column the date pipeline
# converts to a day count ("Policy_Start_Date").
# NOTE(review): this previously read 'Policy_Start_Date_Int'; restore it only
# if such a column is really created before preprocessing.
policy_date = ['Policy_Start_Date']

# Row identifier and regression target; both are excluded from the features.
id_col = 'id'
target_col = 'Premium_Amount'

# Date Handling

In [8]:
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline

# Custom transformer
class DateToDaysTransformer(BaseEstimator, TransformerMixin):
    """Turn a date column into a whole-day offset from a reference date.

    The result is written to a new column named ``<date_column>_days``.

    Parameters
    ----------
    date_column : str
        Name of the column that holds the dates.
    reference_date : str or datetime-like, default "2019-08-17"
        Date the offsets are measured from; anything ``pd.Timestamp`` accepts.
    drop_original : bool, default True
        Whether to remove the original date column from the output.
    """

    def __init__(self, date_column, reference_date="2019-08-17", drop_original=True):
        # Constructor arguments are stored exactly as given. scikit-learn's
        # clone() rebuilds the estimator from get_params() and rejects
        # estimators whose __init__ alters a parameter, so the conversion to
        # pd.Timestamp is deferred to transform().
        self.date_column = date_column
        self.reference_date = reference_date
        self.drop_original = drop_original

    def fit(self, X, y=None):
        """Return ``self`` unchanged; nothing is learned from the data."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the day-offset column added.

        ``X`` may be a DataFrame or any array-like. A non-DataFrame input with
        exactly one column (for example the output of a preceding imputer) is
        wrapped in a DataFrame and that column is named ``date_column``.
        """
        if isinstance(X, pd.DataFrame):
            # Work on a copy so the caller's frame is left untouched.
            X = X.copy()
        else:
            X = pd.DataFrame(X, copy=True)
            if X.shape[1] == 1:
                X.columns = [self.date_column]

        reference = pd.Timestamp(self.reference_date)

        # Convert the selected column to datetime.
        dates = pd.to_datetime(X[self.date_column])
        X[self.date_column] = dates

        # Whole days elapsed since the reference date.
        X[self.date_column + "_days"] = (dates - reference) // pd.Timedelta('1D')

        # Optionally remove the original date column.
        if self.drop_original:
            X = X.drop(columns=[self.date_column])

        return X


# Pipelines

In [9]:
# Categorical features: fill gaps with the literal 'unknown', then one-hot
# encode. Categories not seen during fit are ignored at transform time.
unknown_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='unknown')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Numeric features imputed with the mean, then standardised.
mean_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Numeric features imputed with the median, then standardised.
median_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Policy start date -> days since the reference date -> impute -> scale.
# The date conversion has to run first: it looks its column up by name, and
# SimpleImputer returns a plain NumPy array by default, which has no column
# labels. Missing dates are therefore imputed after the conversion, using the
# day offset of 2022-01-01 (the date previously used as the fill value).
policy_reference_date = "2019-08-17"
missing_policy_days = (
    pd.Timestamp("2022-01-01") - pd.Timestamp(policy_reference_date)
).days

policy_date_pipeline = Pipeline([
    # date_column is tied to the column list handed to the ColumnTransformer
    # below, so the two can never disagree.
    ('date_to_days', DateToDaysTransformer(date_column=policy_date[0], reference_date=policy_reference_date)),
    ('impute_dates', SimpleImputer(strategy='constant', fill_value=missing_policy_days)),
    ('scaler', StandardScaler()),
])

# Route every column group through its own pipeline.
preprocessor = ColumnTransformer([
    ('unknown', unknown_pipeline, unknown_categories),
    ('mean', mean_pipeline, mean_columns),
    ('median', median_pipeline, median_columns),
    ('date', policy_date_pipeline, policy_date),
])

# Split Data

In [12]:
# Separate the target from the features (the id column is not a feature),
# then hold out 20% of the rows for validation with a fixed seed.
y = df_train[target_col]
X = df_train.drop(columns=[id_col, target_col])
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Hyperparameter Tuning

In [None]:
import optuna
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor