## Import libraries

In [None]:
import os
# Switch the working directory to the project's source tree; presumably this is what
# makes the `automl` package importable in the cells below.
# NOTE(review): the path is absolute and environment-specific — confirm it matches
# the machine/container this notebook runs on.
os.chdir('/workspaces/automl/src')

In [None]:
%reload_ext autoreload
%autoreload 2

In [None]:
import pandas as pd
from sklearn.datasets import fetch_california_housing


## Preprocessing

In [None]:
from automl.feature_selection.pipe import PreprocessingPipeline, ValTestsPipeline
from automl.utils.utils import split_data

In [None]:
# --- Notebook-wide settings -------------------------------------------------
# Name of the target column that is split off from the feature frame.
target_col = 'MedHouseVal'
# Helper column added later to tag rows as train (0) or test (1).
val_split_col = 'is_test_for_val'
# Arguments forwarded to split_data for the train/test split.
test_size, random_state = 0.2, 42

In [None]:
# Load the California housing dataset as pandas objects and glue the feature
# frame and the target column together side by side into one DataFrame.
california_data = fetch_california_housing(as_frame=True)
df = pd.concat(
    [california_data.data, california_data.target.to_frame()],
    axis=1,
)

In [None]:
# Preview the first rows of the combined features + target frame.
df.head()

In [None]:
# Split the full frame into train and test parts using the notebook-wide settings.
train, test = split_data(
    df,
    target_col,
    test_size=test_size,
    random_state=random_state,
)

# Separate the features from the target for each part.
X_train = train.drop(columns=target_col)
y_train = train[target_col]
X_test = test.drop(columns=target_col)
y_test = test[target_col]

In [None]:
# Instantiate the preprocessing pipeline with its default settings.
preprocessing_pipe = PreprocessingPipeline()

In [None]:
def _strip_remainder_prefix(columns, prefix='remainder__'):
    """Return the column names with a leading ``prefix`` removed, if present.

    The exact prefix is sliced off instead of calling ``str.lstrip(prefix)``:
    ``lstrip`` treats its argument as a *set of characters*, so it would also
    eat leading letters of the real column name
    (e.g. ``'remainder__median_income'`` -> ``'come'``).
    Names that do not start with ``prefix`` are returned unchanged.
    """
    return [
        col[len(prefix):] if col.startswith(prefix) else col
        for col in columns
    ]


# Fit the preprocessing pipeline on the training split only, then apply it to
# both splits and clean up the output column names.
preprocessing_pipe.fit(X_train, y_train)
X_train_prep = preprocessing_pipe.transform(X_train)
X_train_prep.columns = _strip_remainder_prefix(X_train_prep.columns)
X_test_prep = preprocessing_pipe.transform(X_test)
X_test_prep.columns = _strip_remainder_prefix(X_test_prep.columns)
print(X_train_prep.shape, X_test_prep.shape)

In [None]:
# Tag every row with the split it came from (0 = train, 1 = test).
# Note: this adds the helper column to the preprocessed frames in place.
X_train_prep[val_split_col] = 0
X_test_prep[val_split_col] = 1

# Stack both splits row-wise into the single frame the validation-tests
# pipeline is fitted on, and create that pipeline with default settings.
X_for_val_tests = pd.concat([X_train_prep, X_test_prep])
val_tests_pipe = ValTestsPipeline()

In [None]:
# Fit the validation-tests pipeline on the combined, split-tagged frame.
val_tests_pipe.fit(X_for_val_tests)

# Transform each split separately, then discard the helper split-flag column.
X_train_after_val = (
    val_tests_pipe.transform(X_train_prep)
    .drop(columns=val_split_col)
)
X_test_after_val = (
    val_tests_pipe.transform(X_test_prep)
    .drop(columns=val_split_col)
)
print(X_train_after_val.shape, X_test_after_val.shape)

In [None]:
# Re-attach the target column to the processed features of each split.
# DataFrame.join aligns on the index (left join by default), so this relies on the
# pipelines above having preserved the original row index.
# NOTE(review): confirm the transformers do not reset the index; otherwise targets
# would be misaligned or filled with NaN.
train_before_fs = X_train_after_val.join(y_train)
test_before_fs = X_test_after_val.join(y_test)

## Feature Selection

In [None]:
from automl.feature_selection.transformers import FeatureSelectionTransformer

In [None]:
# Feature selector using the 'RFA' strategy, with MAE as the metric to minimize.
# No `model` argument is passed here (unlike the RFE/PFI selectors), so the class
# default applies. The timeout units are defined by FeatureSelectionTransformer
# (presumably seconds — confirm).
# The seed reuses the notebook-level `random_state` instead of a duplicated literal,
# so a single setting controls both the data split and feature selection.
fst_rfa = FeatureSelectionTransformer(
    task_type='reg',
    target_colname=target_col,
    metric_name='mae',
    metric_direction='minimize',
    timeout=120,
    random_state=random_state,
    strategy='RFA',
)

In [None]:
# Run 'RFA' feature selection, passing both the train and the test frame
# (each still contains the target column).
# NOTE(review): presumably the second frame is used to score candidate feature
# sets — confirm against FeatureSelectionTransformer.fit.
fst_rfa.fit(train_before_fs, test_before_fs)

In [None]:
# Feature selector using the 'RFE' strategy with the 'lama' model, with MAE as the
# metric to minimize. The timeout units are defined by FeatureSelectionTransformer
# (presumably seconds — confirm).
# The seed reuses the notebook-level `random_state` instead of a duplicated literal,
# so a single setting controls both the data split and feature selection.
fst_rfe = FeatureSelectionTransformer(
    task_type='reg',
    target_colname=target_col,
    metric_name='mae',
    metric_direction='minimize',
    timeout=120,
    random_state=random_state,
    model='lama',
    strategy='RFE',
)

In [None]:
# Run 'RFE' feature selection, passing both the train and the test frame
# (each still contains the target column).
# NOTE(review): presumably the second frame is used to score candidate feature
# sets — confirm against FeatureSelectionTransformer.fit.
fst_rfe.fit(train_before_fs, test_before_fs)

In [None]:
# Feature selector using the 'PFI' strategy with the 'lama' model, with MAE as the
# metric to minimize. The timeout units are defined by FeatureSelectionTransformer
# (presumably seconds — confirm).
# The seed reuses the notebook-level `random_state` instead of a duplicated literal,
# so a single setting controls both the data split and feature selection.
fst_pfi = FeatureSelectionTransformer(
    task_type='reg',
    target_colname=target_col,
    metric_name='mae',
    metric_direction='minimize',
    timeout=120,
    random_state=random_state,
    model='lama',
    strategy='PFI',
)

In [None]:
# Run 'PFI' feature selection, passing both the train and the test frame
# (each still contains the target column).
# NOTE(review): presumably the second frame is used to score candidate feature
# sets — confirm against FeatureSelectionTransformer.fit.
fst_pfi.fit(train_before_fs, test_before_fs)

In [None]:
# Apply the fitted 'RFA' selector to the train frame first, then to the test frame.
train_final, test_final = (
    fst_rfa.transform(frame)
    for frame in (train_before_fs, test_before_fs)
)

In [None]:
# Train a model on the selected features through the selector's train_lama_model
# helper, passing the final train and test frames.
# NOTE(review): 'lama' presumably refers to LightAutoML — confirm.
model = fst_rfa.train_lama_model(train_final, test_final)

In [None]:
# Compute the metric for both splits; the result unpacks into a (train, test) pair.
# Presumably this is the 'mae' metric the selector was configured with — confirm.
metric_train, metric_test = fst_rfa.calc_metric(train_final, test_final)

In [None]:
# Report the train and test metric values ('Метрика' is Russian for 'Metric').
print(f'Метрика train: {metric_train}\n Метрика test: {metric_test}')