In [2]:
from utils.transformations import ExtendedTransformation, SimpleTransformation
from utils.filters import SimpleFilter
import pandas as pd
import numpy as np

## Data Loading
Load and prepare the training data:
- Read preprocessed train data from CSV
- Split features (X_train) and target variable (y_train)

In [3]:
df_train = pd.read_csv("data/preprocessed/train_data.csv")
X_train, y_train = df_train.drop(columns=['Price']), df_train[['Price']]

## Initialize Preprocessing Components
Set up preprocessing components:
- ExtendedTransformation for complex feature engineering
- SimpleFilter for feature selection

In [4]:
preprocessor = ExtendedTransformation()
filter = SimpleFilter()

## Fit Preprocessor
Fit the preprocessor to learn data characteristics:
- Analyzes feature distributions and relationships
- Prepares for transformation

In [5]:
preprocessor.fit(X_train, y_train)

X shape:  (20974, 40)
bin_vars_columns shape:  (36,)
low_card_columns shape:  37


## Apply Data Transformation
Execute the preprocessing pipeline:
- Transforms raw features into engineered features
- Includes feature scaling, encoding, and feature crossing
- Outputs processed features (X_processed) and target (y_processed)

In [6]:
X_processed, y_processed = preprocessor.transform(X_train, y_train)

X shape:  (20974, 40)
X_low_card   shape:  (20974, 113)
X_high_card shape:  (20974, 50)
X_crossed_features shape:  (20974, 6670)
X_EXPANDED shape:  (20974, 6835)


## Examine Processed Data
Display the first few rows of the processed test data:
- Shows the structure and values of engineered features

In [7]:
X_processed.head()

Unnamed: 0,Area,No. of Bedrooms,city_Bangalore,city_Chennai,city_Delhi,city_Hyderabad,city_Kolkata,city_Mumbai,Resale_NO,Resale_SI,...,Stadium_NO Stadium_NO_DISPONIBLE,Stadium_NO Stadium_SI,Stadium_NO Area,Stadium_NO No. of Bedrooms,Stadium_NO_DISPONIBLE Stadium_SI,Stadium_NO_DISPONIBLE Area,Stadium_NO_DISPONIBLE No. of Bedrooms,Stadium_SI Area,Stadium_SI No. of Bedrooms,Area No. of Bedrooms
0,0.702132,-0.552619,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,-0.0,0.0,0.0,-0.0,0.702132,-0.552619,-0.388011
1,0.994486,0.896197,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.994486,0.896197,0.0,0.0,0.891256
2,-0.415081,0.896197,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,-0.0,0.0,0.0,-0.415081,0.896197,-0.0,0.0,-0.371995
3,-1.33391,-0.552619,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,-1.33391,-0.552619,0.0,-0.0,-0.0,-0.0,-0.0,0.737144
4,0.308845,0.896197,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.308845,0.896197,0.0,0.0,0.0,0.0,0.0,0.276786


## Feature Selection
Apply feature selection using SimpleFilter:
- Fits the filter to the processed data
- Identifies most relevant features for the model

In [8]:
filter.fit(X_processed, y_processed)

(20974, 6835)
(20974, 4173)
(20974, 3193)
(20974, 1635)


In [9]:
X_filtered, y_filtered = filter.transform(X_processed, y_processed)

(20974, 4173)
(20974, 3193)
(20974, 1635)


In [15]:
y_filtered.head()

AttributeError: 'numpy.ndarray' object has no attribute 'head'

In [12]:
X_filtered.head()

Unnamed: 0,Area,No. of Bedrooms,city_Bangalore,city_Chennai,city_Delhi,city_Hyderabad,city_Kolkata,city_Mumbai,Resale_NO,MaintenanceStaff_NO,...,DiningTable_SI Stadium_NO,DiningTable_SI Stadium_SI,DiningTable_SI Area,DiningTable_SI No. of Bedrooms,Sofa_NO Stadium_SI,Sofa_SI Wardrobe_SI,Sofa_SI Stadium_NO,Sofa_SI No. of Bedrooms,Wardrobe_SI Area,Area No. of Bedrooms
0,0.702132,-0.552619,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,...,0.0,1.0,0.702132,-0.552619,0.0,0.0,0.0,-0.552619,0.0,-0.388011
1,0.994486,0.896197,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.891256
2,-0.415081,0.896197,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,-0.0,0.0,0.0,0.0,0.0,0.0,-0.0,-0.371995
3,-1.33391,-0.552619,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,-0.0,-0.0,0.0,0.0,0.0,-0.0,-0.0,0.737144
4,0.308845,0.896197,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.276786


## Save to file
Save train data for posterior reuse

In [18]:
y_filtered_df = pd.DataFrame({"y":y_filtered.flatten()}, index=X_filtered.index)
DIR = "data/postproccessed/"
X_filtered.to_pickle(DIR +'X_train_pipe_1.pkl')
y_filtered_df.to_pickle(DIR +'y_train_pipe_1.pkl')

## Load Test Dataset
Prepare the test dataset:
- Read from CSV file
- Split features (X_test) and target variable (y_test)

In [22]:
df_test = pd.read_csv("data/preprocessed/test_data.csv")
X_test, y_test = df_test.drop(columns=['Price']), df_test[['Price']]

## Process Test Data
Apply the same preprocessing pipeline to test data:
- Transform features using preprocessor
- Apply feature selection using filter
- Output processed and filtered test data

In [23]:
X_test_proccesed, y_test_proccessed = preprocessor.transform(X_test, y_test)
X_test_filtered, y_test_filtered = filter.transform(X_test_proccesed, y_test_proccessed)

X shape:  (8989, 40)
X_low_card   shape:  (8989, 113)
X_high_card shape:  (8989, 50)
X_crossed_features shape:  (8989, 6670)
X_EXPANDED shape:  (8989, 6835)
(8989, 4173)
(8989, 3193)
(8989, 1635)


## Save to file
Save train data for posterior reuse

In [24]:
y_test_filtered_df = pd.DataFrame({"y":y_test_filtered.flatten()}, index=X_test_filtered.index)
DIR = "data/postproccessed/"
X_test_filtered.to_pickle(DIR +'X_test_pipe_1.pkl')
y_test_filtered_df.to_pickle(DIR +'y_test_pipe_1.pkl')