# Imports

In [54]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler, FunctionTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import root_mean_squared_log_error
import pandas as pd

# Read the train and test splits from the working directory.
df_train, df_test = (
    pd.read_csv(csv_path) for csv_path in ("./train.csv", "./test.csv")
)

# Swap spaces for underscores in the headers of both frames so that the
# column names can be referenced as plain identifiers below.
df_train.columns = df_train.columns.str.replace(' ', '_')
df_test.columns = df_test.columns.str.replace(' ', '_')

# Columns

In [59]:
# Column groups, organised by how each group is handled during preprocessing.

# Categorical features: missing values are filled with the literal 'unknown'.
unknown_categories = [
    'Property_Type',
    'Exercise_Frequency',
    'Smoking_Status',
    'Policy_Type',
    'Location',
    'Occupation',
    'Education_Level',
    'Marital_Status',
    'Gender',
    'Customer_Feedback',
]

# Numeric features whose missing values are filled with the column mean.
mean_columns = [
    'Credit_Score',
    'Vehicle_Age',
    'Health_Score',
    'Annual_Income',
    'Age',
]

# Numeric features whose missing values are filled with the column median.
# NOTE(review): 'Insurance_Duration' used to be listed twice here; a repeated
# name selects the same column twice, so the duplicate has been removed. It may
# have been meant to be a different column - confirm against the dataset.
median_columns = ['Insurance_Duration', 'Previous_Claims']

# Policy start date: this column is removed rather than encoded.
policy_date = ['Policy_Start_Date']

# Feature frame: every training column except the 'Premium_Amount' target.
X = df_train.drop('Premium_Amount', axis=1)

# Date Managing

In [None]:
from sklearn.preprocessing import FunctionTransformer

def drop_column(X: pd.DataFrame, column_to_drop: str) -> pd.DataFrame:
    """Return a new frame without ``column_to_drop``.

    The input frame is left untouched: ``DataFrame.drop`` is called without
    ``inplace``, so it already returns a new object and no explicit copy is
    needed beforehand.

    Args:
        X: Frame to remove the column from.
        column_to_drop: Label of the column to remove. A label that is not
            present is ignored rather than raising.

    Returns:
        A new DataFrame holding every column of ``X`` except
        ``column_to_drop``.
    """
    return X.drop(columns=[column_to_drop], errors='ignore')


# Pipelines

In [60]:
# Categorical features: fill gaps with the literal 'unknown', then one-hot
# encode. Categories not seen during fit are ignored at transform time.
unknown_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='unknown')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Numeric features imputed with the column mean, then standardised.
mean_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Numeric features imputed with the column median, then standardised.
median_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Pipeline that removes 'Policy_Start_Date' from whatever frame it receives.
# It is still defined for any code that refers to it by name, but the
# ColumnTransformer below no longer routes data through it.
policy_date_pipeline = Pipeline([
    ('drop_column', FunctionTransformer(drop_column, kw_args={'column_to_drop': 'Policy_Start_Date'})),
])

# Route each column group to its pipeline. The date column is discarded with
# the built-in 'drop' specifier instead of a transformer that returns a
# zero-width frame, so nothing empty has to be stacked into the output.
preprocessor = ColumnTransformer([
    ('unknown', unknown_pipeline, unknown_categories),
    ('mean', mean_pipeline, mean_columns),
    ('median', median_pipeline, median_columns),
    ('date', 'drop', policy_date),
])

# Apply Pipelines

In [66]:
# Fit the preprocessor on the training frame and encode that same frame in a
# single pass, instead of a separate fit() followed by transform().
df_train_transformed = preprocessor.fit_transform(df_train)

# Split Data

In [None]:
# Names of the non-feature columns; these were previously used without being
# defined anywhere in the notebook.
target_col = 'Premium_Amount'
# NOTE(review): assumes the row identifier column is called 'id' - confirm
# against train.csv.
id_col = 'id'

# Separate the features from the target, then hold out 20% of the rows for
# validation. The fixed random_state keeps the split reproducible.
X = df_train.drop([id_col, target_col], axis=1)
y = df_train[target_col]
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression

# X_train is the raw feature frame, so the regressor is chained behind the
# preprocessor: imputation, encoding and scaling are fitted on the training
# split only and re-applied automatically whenever lr.predict() is called.
lr = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression()),
])
lr.fit(X_train, y_train)
