# Feature Engineering with Open-Source

In this notebook, we will reproduce the Feature Engineering Pipeline from the notebook 1 (01-data-analysis-feature-engineering), but we will replace, whenever possible, the manually created functions by open-source classes, and hopefully understand the value they bring forward.

# Reproducibility: Setting the seed

With the aim to ensure reproducibility between runs of the same notebook, but also between the research and production environment, for each step that includes some element of randomness, it is extremely important that we **set the seed**.

In [1]:
# to handle datasets
import pandas as pd
import numpy as np
from datetime import datetime
from collections import Counter
# ignore warnings
import warnings
warnings.filterwarnings("ignore")

# for plotting
import matplotlib.pyplot as plt
import seaborn as sns
import ipywidgets as widgets
from IPython.display import display

# for the yeo-johnson transformation
import scipy.stats as stats

# from Scikit-learn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, Binarizer
from sklearn.preprocessing import Binarizer # for hugely skewed variables

# from feature-engine
from feature_engine.imputation import (
    AddMissingIndicator,
    MeanMedianImputer,
    CategoricalImputer,
)

from feature_engine.encoding import (
    RareLabelEncoder,
    OrdinalEncoder,
)

from feature_engine.transformation import (
    LogTransformer,
    YeoJohnsonTransformer,
)

from feature_engine.selection import DropFeatures
from feature_engine.datetime import DatetimeFeatures
from feature_engine.wrappers import SklearnTransformerWrapper

# to save the trained scaler class
import joblib

# to build the model
from sklearn.linear_model import Lasso

# to evaluate the model
from sklearn.metrics import mean_squared_error, r2_score

# to display all the columns of the dataframe in the notebook
pd.pandas.set_option('display.max_columns', None)
#pd.pandas.set_option('display.max_rows', None)

In [2]:
# load dataset
data = pd.read_csv("../Data/data.csv", parse_dates = ['Built_in_year'])

In [3]:
data.head(2)

Unnamed: 0,Ids,Location1,Location2,Type,Bedrooms,Bathrooms,Size_in_SqYds,Price_in_millions,Built_in_year,Parking_Spaces,Floors_in_Building,Elevators,Store_Rooms,Lobby_in_Building,Double_Glazed_Windows,Central_Air_Conditioning,Central_Heating,Waste_Disposal,Furnished,Service_Elevators_in_Building,Flooring,Electricity_Backup,Servant_Quarters,Study_Room,Prayer_Room,Powder_Room,Gym,Lounge_or_Sitting_Room,Laundry_Room,Business_Center_or_Media_Room_in_Building,Satellite_or_Cable_TV_Ready,Broadband_Internet_Access,Intercom,Conference_Room_in_Building,Community_Swimming_Pool,Community_Lawn_or_Garden,Community_Gym,Community_Center,First_Aid_or_Medical_Centre,Day_Care_center,Kids_Play_Area,Mosque,Barbeque_Area,Lawn_or_Garden,Swimming_Pool,Sauna,Jacuzzi,Nearby_Schools,Nearby_Hospital,Nearby_Shopping_Malls,Nearby_Restaurants,Nearby_Public_Transport_Service,Other_Nearby_Places,Security_Staff,Maintainance_Staff,Laundry_or_Dry_Cleaning_Facility,Facilities_for_Disabled
0,0,"DHA Phase 6, DHA Defence",DHA Defence,House,6.0,6.0,600.0,220.0,NaT,3,,,,0,1,1,1,0,0,0,1,0,1,1,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,1,"Saadi Town - Block 3, Saadi Town, Scheme 33",Scheme 33,House,6.0,6.0,240.0,38.5,NaT,1,,,,0,0,0,0,0,0,0,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


## Drop Columns and Rows

In [4]:
# drop id, it is just a number given to identify each house
# drop 'Built_in_year', 'Location2', 'Store_Rooms' columns

drop_features = DropFeatures(features_to_drop = ['Ids', 'Location2', 'Store_Rooms'])
data = drop_features.fit_transform(data)

In [5]:
# drop rows in col 'Price_in_millions' that have missing values
data.drop(data.loc[data['Price_in_millions'].isnull()].index, axis = 0, inplace = True)
print("Shape: ", data.shape)

Shape:  (14201, 54)


## Datetime Features

In [6]:
date_time_features = DatetimeFeatures(variables = ['Built_in_year'], 
                                      features_to_extract = ['year'], 
                                      drop_original = True, 
                                      missing_values = 'ignore')

data = date_time_features.fit_transform(data)

## Drop Duplicates

In [7]:
# Using DataFrame.drop_duplicates() to keep first duplicate row
data = data.drop_duplicates(keep = 'first')
# rows and columns of the data
print(f'Shape: {data.shape}')

Shape: (9819, 54)


In [8]:
# rows and columns of the data
print(f'Shape: {data.shape}')

# visualise the dataset
print('DataFrame: ')
print('-' * 10)
data.head(2)

Shape: (9819, 54)
DataFrame: 
----------


Unnamed: 0,Location1,Type,Bedrooms,Bathrooms,Size_in_SqYds,Price_in_millions,Parking_Spaces,Floors_in_Building,Elevators,Lobby_in_Building,Double_Glazed_Windows,Central_Air_Conditioning,Central_Heating,Waste_Disposal,Furnished,Service_Elevators_in_Building,Flooring,Electricity_Backup,Servant_Quarters,Study_Room,Prayer_Room,Powder_Room,Gym,Lounge_or_Sitting_Room,Laundry_Room,Business_Center_or_Media_Room_in_Building,Satellite_or_Cable_TV_Ready,Broadband_Internet_Access,Intercom,Conference_Room_in_Building,Community_Swimming_Pool,Community_Lawn_or_Garden,Community_Gym,Community_Center,First_Aid_or_Medical_Centre,Day_Care_center,Kids_Play_Area,Mosque,Barbeque_Area,Lawn_or_Garden,Swimming_Pool,Sauna,Jacuzzi,Nearby_Schools,Nearby_Hospital,Nearby_Shopping_Malls,Nearby_Restaurants,Nearby_Public_Transport_Service,Other_Nearby_Places,Security_Staff,Maintainance_Staff,Laundry_or_Dry_Cleaning_Facility,Facilities_for_Disabled,Built_in_year_year
0,"DHA Phase 6, DHA Defence",House,6.0,6.0,600.0,220.0,3,,,0,1,1,1,0,0,0,1,0,1,1,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
1,"Saadi Town - Block 3, Saadi Town, Scheme 33",House,6.0,6.0,240.0,38.5,1,,,0,0,0,0,0,0,0,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,


## Split the data into train and test

In [9]:
# Let's separate into train and test set
# Remember to set the seed (random_state for this sklearn function)
to_drop = ['Price_in_millions']
targe_var = 'Price_in_millions'

X_train, X_test, y_train, y_test = train_test_split(
    data.drop(to_drop, axis = 1), # predictive variables
    data[targe_var], # targe_var, # target
    test_size = 0.1, # portion of dataset to allocate to test set
    random_state = 0, # we are setting the seed here
)

X_train.shape, X_test.shape

((8837, 53), (982, 53))

## 1. Target

Apply Log transformation to the target variable: 'Price_in_millions'.

In [10]:
y_train = np.log(y_train)
y_test = np.log(y_test)

## 2. Missing Values

In [11]:
# to display all the columns of the dataframe in the notebook
pd.pandas.set_option('display.max_rows', None)
missing_data = pd.DataFrame(data.isnull().sum().sort_values(ascending = False), columns = ['Count'])
missing_data['Percent'] = (missing_data['Count'] / len(data)) * 100
# ingnoring the count which is 0 and display the result which has missing data
missing_data = missing_data[missing_data['Count'] > 0]
missing_data

Unnamed: 0,Count,Percent
Elevators,8628,87.870455
Floors_in_Building,8510,86.668704
Built_in_year_year,7451,75.883491
Bathrooms,942,9.593645
Bedrooms,722,7.353091
Size_in_SqYds,215,2.189632


### Impute missing values for Numerical variables

To engineer missing values in numerical variables, we will:

- add a binary missing indicator variable
- and then replace the missing values in the original variable with the mean

In [12]:
# let's identify the categorical variables we will capture those of type *object*
cat_vars = [var for var in data.columns if data[var].dtype == 'O']

In [13]:
# now let's identify the numerical variables
num_vars = [var for var in data.columns if var not in cat_vars and var != 'Price_in_millions']

# number of numerical variables
print(f"Total number of numerical variables: {len(num_vars)}")

Total number of numerical variables: 51


In [14]:
# make a list with the numerical variables that contain missing values
vars_with_na = [var for var in num_vars if data[var].isnull().sum() > 0]

# print percentage of missing values per variable
print(f"Numerical variables with missing values in percent:")
print("-" * 51)
print(data[vars_with_na].isnull().mean().sort_values(ascending = False))

Numerical variables with missing values in percent:
---------------------------------------------------
Elevators             0.878705
Floors_in_Building    0.866687
Built_in_year_year    0.758835
Bathrooms             0.095936
Bedrooms              0.073531
Size_in_SqYds         0.021896
dtype: float64


In [15]:
# Numerical variables with missing values
print('Numerical variables with missing values:')
print('-' * 40)
print(vars_with_na)

Numerical variables with missing values:
----------------------------------------
['Bedrooms', 'Bathrooms', 'Size_in_SqYds', 'Floors_in_Building', 'Elevators', 'Built_in_year_year']


In [16]:
vars_with_na_concatenated = [var + '_na' for var in vars_with_na]
print(vars_with_na_concatenated)

['Bedrooms_na', 'Bathrooms_na', 'Size_in_SqYds_na', 'Floors_in_Building_na', 'Elevators_na', 'Built_in_year_year_na']


In [17]:
# add missing indicator
if len(vars_with_na) > 0:

    missing_ind = AddMissingIndicator(variables = vars_with_na)
    
    missing_ind.fit(X_train)

    X_train = missing_ind.transform(X_train)
    X_test = missing_ind.transform(X_test)
    
else:
    print('No numerical variables with missing values')

In [18]:
# check the binary missing indicator variables in the train set
X_train[vars_with_na_concatenated].head()

Unnamed: 0,Bedrooms_na,Bathrooms_na,Size_in_SqYds_na,Floors_in_Building_na,Elevators_na,Built_in_year_year_na
4093,0,0,0,0,0,0
6926,0,0,0,1,1,0
4266,0,0,0,1,1,1
10058,0,0,0,1,1,1
11640,0,0,0,1,1,1


In [19]:
# check the binary missing indicator variables in the test set
X_test[vars_with_na_concatenated].head()

Unnamed: 0,Bedrooms_na,Bathrooms_na,Size_in_SqYds_na,Floors_in_Building_na,Elevators_na,Built_in_year_year_na
13793,0,0,0,1,1,1
11589,0,0,0,1,1,1
3549,0,0,0,1,1,1
13320,1,1,0,1,1,1
7658,1,1,0,1,1,1


In [20]:
# Now replace missing data with the mean

# set the imputer
mean_imputer = MeanMedianImputer(imputation_method = 'mean', variables = vars_with_na)

# learn and store parameters from train set
mean_imputer.fit(X_train)

# the stored parameters
mean_imputer.imputer_dict_

{'Bedrooms': 3.7471559633027525,
 'Bathrooms': 3.876346780255575,
 'Size_in_SqYds': 260.6207774178621,
 'Floors_in_Building': 11.084398976982097,
 'Elevators': 2.527001862197393,
 'Built_in_year_year': 2018.1284575714956}

In [21]:
X_train = mean_imputer.transform(X_train)
X_test = mean_imputer.transform(X_test)

In [22]:
# IMPORTANT: note that we could save the imputers with joblib
# check that we have no more missing values in the engineered variables
X_train[vars_with_na].isnull().sum()

Bedrooms              0
Bathrooms             0
Size_in_SqYds         0
Floors_in_Building    0
Elevators             0
Built_in_year_year    0
dtype: int64

In [23]:
# IMPORTANT: note that we could save the imputers with joblib
# check that we have no more missing values in the engineered variables
X_test[vars_with_na].isnull().sum()

Bedrooms              0
Bathrooms             0
Size_in_SqYds         0
Floors_in_Building    0
Elevators             0
Built_in_year_year    0
dtype: int64

#### 'Year'/Time Related variables

In [24]:
# list of variables that contain year information
year_vars = [var for var in num_vars if 'Yr' in var or 'Built_in_year_year' in var]
year_vars

['Built_in_year_year']

In [25]:
current_year = datetime.now().year
print(f'Current Year: {current_year}')

Current Year: 2023


In [26]:
def elapsed_years(df, current_year):
    # capture difference between the year variable (i.e. when it was built) and the year now
    df['CurrYr'] = current_year
    df['Yrs_SinceBlt'] = df['CurrYr'] - df['Built_in_year_year']
    return df

X_train = elapsed_years(X_train, current_year)
X_test = elapsed_years(X_test, current_year)
#data.head(4)

In [27]:
#data.columns

In [28]:
# now we drop 'Year' and 'CurrYr'
drop_features = DropFeatures(features_to_drop = ['Built_in_year_year', 'CurrYr'])
X_train = drop_features.fit_transform(X_train)
X_test = drop_features.fit_transform(X_test)

### Impute missing values for Categorical variables

We will replace missing values with the string "missing" in those variables with a lot of missing data. 

Alternatively, we will replace missing data with the most frequent category in those variables that contain fewer observations without values. 

This is common practice.

Here, we don't have any categorical variable that have missing values, but for demonstration purpose we include the following code to show how to deal with if there is any

In [29]:
# let's identify the categorical variables we will capture those of type *object*
cat_vars = [var for var in data.columns if data[var].dtype == 'O']

# number of categorical variables
len(cat_vars)

# cast all variables as categorical
# data[cat_vars] = data[cat_vars].astype('O')

print('Number of categorical variables: ', len(cat_vars))
print('-' * 35)
print('')

print('List of categorical variables: ', cat_vars)
print('-' * 53)
print('')

# visualise the categorical variables
print(data[cat_vars].head())

Number of categorical variables:  2
-----------------------------------

List of categorical variables:  ['Location1', 'Type']
-----------------------------------------------------

                                           Location1   Type
0                           DHA Phase 6, DHA Defence  House
1        Saadi Town - Block 3, Saadi Town, Scheme 33  House
2     Bahria Town - Precinct 19, Bahria Town Karachi   Flat
3  Askari 5 - Sector J, Askari 5, Malir Cantonmen...  House
4  Bahria Town - Precinct 35, Bahria Sports City,...  House


In [30]:
# make a list of the categorical variables that contain missing values
cat_vars_with_na = [var for var in cat_vars if data[var].isnull().sum() > 0]

# print percentage of missing values per variable
print(f'Percentage of missing values per variable: {data[cat_vars_with_na ].isnull().mean().sort_values(ascending = False)}')

Percentage of missing values per variable: Series([], dtype: float64)


In [31]:
# variables to impute with the string missing
with_string_missing = [var for var in cat_vars_with_na if data[var].isnull().mean() > 0.1]

# variables to impute with the most frequent category
with_frequent_category = [var for var in cat_vars_with_na if data[var].isnull().mean() < 0.1]

In [32]:
print(f'Variables to impute with the string missing: {with_string_missing}')

Variables to impute with the string missing: []


In [33]:
print(f'Variables to impute with the most frequent category: {with_frequent_category}')

Variables to impute with the most frequent category: []


In [34]:
# replace missing values with new label: "Missing"

if len(with_string_missing) > 0:
    # replace missing values with new label: "Missing"

    # set up the class
    cat_imputer_missing = CategoricalImputer(
        imputation_method = 'missing', variables = with_string_missing)

    # fit the class to the train set
    cat_imputer_missing.fit(X_train)

    # the class learns and stores the parameters
    print("Cat Imputer Missing: ", cat_imputer_missing.imputer_dict_)
    
    # replace NA by missing
    # IMPORTANT: note that we could store this class with joblib
    X_train = cat_imputer_missing.transform(X_train)
    X_test = cat_imputer_missing.transform(X_test)
    
else:
    print('No categorical variables with missing values')
    

No categorical variables with missing values


In [35]:
# replace missing values with most frequent category
if len(with_frequent_category) > 0:
    # set up the class
    cat_imputer_frequent = CategoricalImputer(
        imputation_method = 'frequent', variables = with_frequent_category)

    # fit the class to the train set
    cat_imputer_frequent.fit(X_train)

    # the class learns and stores the parameters
    print("Cat Imputer Frequent: ", cat_imputer_frequent.imputer_dict_)
    
    # replace NA by missing
    # IMPORTANT: note that we could store this class with joblib
    X_train = cat_imputer_frequent.transform(X_train)
    X_test = cat_imputer_frequent.transform(X_test)
    
else:
    print('No categorical variables with missing values')

No categorical variables with missing values


In [36]:
# check that we have no missing information in the engineered variables
print('check that we have no missing information in the engineered variables of the train set')
print('-' * 69)
print(X_train[cat_vars_with_na].isnull().sum())

check that we have no missing information in the engineered variables of the train set
---------------------------------------------------------------------
Series([], dtype: float64)


In [37]:
# check that we have no missing information in the engineered variables
print('check that we have no missing information in the engineered variables of the test set')
print('-' * 69)
print(X_test[cat_vars_with_na].isnull().sum())

check that we have no missing information in the engineered variables of the test set
---------------------------------------------------------------------
Series([], dtype: float64)


In [38]:
# check absence of na in dataset
[var for var in X_train.columns if X_train[var].isnull().sum() > 0]

[]

In [39]:
# check absence of na in dataset
[var for var in X_test.columns if X_test[var].isnull().sum() > 0]

[]

## Convert Datatypes of Columns

In [40]:
#convert all float64 columns to int32/float32
# float_cols = ['Bedrooms', 'Bathrooms', 'Floors_in_Building', 'Elevators', 'Year']
float_cols = ['Bedrooms', 'Bathrooms', 'Floors_in_Building', 'Elevators', 'Yrs_SinceBlt']
X_train[float_cols] = X_train[float_cols].astype('int32')
X_test[float_cols] = X_test[float_cols].astype('int32')


int64_cols = data.select_dtypes(include = ['int64']).columns
X_train[int64_cols] = X_train[int64_cols].astype('int32')
X_test[int64_cols] = X_test[int64_cols].astype('int32')


float64_cols = ['Size_in_SqYds']
X_train[float64_cols] = X_train[float64_cols].astype('float32')
X_test[float64_cols] = X_test[float64_cols].astype('float32')


## Numerical variable transformation

### Logarithmic transformation

In [41]:
#X_train.columns

In [42]:
year_vars = [var for var in X_train.columns if 'Yrs_SinceBlt' in var]
year_vars

['Yrs_SinceBlt']

In [43]:
cat_vars

['Location1', 'Type']

In [44]:
num_vars = [
    var for var in X_train.columns if var not in cat_vars and (var != 'Price_in_millions' and var != 'log_price' and 'Yrs_SinceBlt' not in var)
]
#num_vars

In [45]:
#  let's male a list of discrete variables

discrete_vars = [var for var in num_vars if X_train[var].dtype != 'O' and 
                 X_train[var].dtype != 'float32' and 
                 X_train[var].dtype != 'float64'  and 
                 var not in year_vars
                ]
#discrete_vars

In [46]:
# make list of continuous variables
cont_vars = [
    var for var in num_vars if var not in discrete_vars + year_vars]

print('Number of continuous variables: ', len(cont_vars))
print('-' * 34)
print('Continuous variables: ', cont_vars)

Number of continuous variables:  1
----------------------------------
Continuous variables:  ['Size_in_SqYds']


In [47]:
log_transformer = LogTransformer(variables = cont_vars)

X_train = log_transformer.fit_transform(X_train)
X_test = log_transformer.transform(X_test)

## Check that dataset does not contain null values

In [48]:
# check that dataset does not contain null values in the train set
[var for var in cont_vars if X_train[var].isnull().sum() > 0]

[]

In [49]:
# check that dataset does not contain null values in the test set
[var for var in cont_vars if X_test[var].isnull().sum() > 0]

[]

## Categorical variables

### Encoding categorical variable 'Type'

In [50]:
# set up the encoder
type_vars = ['Type']

cat_encoder = OrdinalEncoder(encoding_method = 'ordered', variables = type_vars)

# create the mappings
cat_encoder.fit(X_train, y_train)

# mappings are stored and class can be saved
cat_encoder.encoder_dict_

{'Type': {'Flat': 0, 'House': 1}}

In [51]:
X_train = cat_encoder.transform(X_train)
X_test = cat_encoder.transform(X_test)

In [52]:
X_train.Type.unique(), X_test.Type.unique()

(array([0, 1], dtype=int64), array([0, 1], dtype=int64))

In [53]:
# # re-map strings to numbers

# type_mappings = {'Missing': 0, 'NA': 0, 'NaN': 0, 'Flat': 1, 'House': 2}

# type_vars = ['Type']

# for var in type_vars:
#     data[var] = data[var].map(type_mappings)

In [54]:
#data['Type'].unique()

### Removing Rare Labels

In [55]:
# let's again identify the categorical variables
# we will capture those of type *object*

cat_vars = [var for var in data.columns if data[var].dtype == 'O']

# capture the remaining categorical variables
# (those that we did not re-map)

cat_others = [
    var for var in cat_vars if var not in type_vars
]

print(len(cat_others))
print(cat_others)

1
['Location1']


In [56]:
# threshold_for_rare_labels = 0.0049
# threshold_for_rare_labels = 0.0059
# threshold_for_rare_labels = 0.0055
threshold_for_rare_labels = 0.01

rare_encoder = RareLabelEncoder(tol = threshold_for_rare_labels, n_categories = 1, variables = cat_others, replace_with = 'Rare')

# find common labels
rare_encoder.fit(X_train)

# the common labels are stored, we can save the class
# and then use it later :)
print(rare_encoder.encoder_dict_)
print(len(rare_encoder.encoder_dict_['Location1']))

{'Location1': ['DHA Phase 6, DHA Defence', 'DHA Phase 8, DHA Defence', 'Askari 5 - Sector J, Askari 5, Malir Cantonment, Cantt', 'Falcon Complex New Malir, Malir', 'DHA Phase 5, DHA Defence', 'Askari 5, Malir Cantonment, Cantt', 'DHA Phase 7, DHA Defence', 'Bahria Town Karachi', 'DHA Phase 4, DHA Defence', 'Navy Housing Scheme Karsaz', 'Bahria Apartments, Bahria Town Karachi']}
11


In [57]:
X_train = rare_encoder.transform(X_train)
X_test = rare_encoder.transform(X_test)

### Encoding of categorical variable 'Location1'

In [58]:
# set up the encoder
cat_encoder = OrdinalEncoder(encoding_method = 'ordered', variables = cat_others)

# create the mappings
cat_encoder.fit(X_train, y_train)

# mappings are stored and class can be saved
cat_encoder.encoder_dict_

{'Location1': {'Bahria Town Karachi': 0,
  'Bahria Apartments, Bahria Town Karachi': 1,
  'Rare': 2,
  'Askari 5, Malir Cantonment, Cantt': 3,
  'Askari 5 - Sector J, Askari 5, Malir Cantonment, Cantt': 4,
  'DHA Phase 5, DHA Defence': 5,
  'DHA Phase 4, DHA Defence': 6,
  'DHA Phase 7, DHA Defence': 7,
  'Falcon Complex New Malir, Malir': 8,
  'DHA Phase 6, DHA Defence': 9,
  'Navy Housing Scheme Karsaz': 10,
  'DHA Phase 8, DHA Defence': 11}}

In [59]:
X_train = cat_encoder.transform(X_train)
X_test = cat_encoder.transform(X_test)

## Feature Scaling

In [60]:
# create scaler
scaler = MinMaxScaler()

#  fit  the scaler to the train set
scaler.fit(X_train) 

# transform the train and test set

# sklearn returns numpy arrays, so we wrap the
# array with a pandas dataframe

X_train = pd.DataFrame(scaler.transform(X_train), columns = X_train.columns)

X_test = pd.DataFrame(scaler.transform(X_test), columns = X_train.columns)

In [61]:
X_train.head()

Unnamed: 0,Location1,Type,Bedrooms,Bathrooms,Size_in_SqYds,Parking_Spaces,Floors_in_Building,Elevators,Lobby_in_Building,Double_Glazed_Windows,Central_Air_Conditioning,Central_Heating,Waste_Disposal,Furnished,Service_Elevators_in_Building,Flooring,Electricity_Backup,Servant_Quarters,Study_Room,Prayer_Room,Powder_Room,Gym,Lounge_or_Sitting_Room,Laundry_Room,Business_Center_or_Media_Room_in_Building,Satellite_or_Cable_TV_Ready,Broadband_Internet_Access,Intercom,Conference_Room_in_Building,Community_Swimming_Pool,Community_Lawn_or_Garden,Community_Gym,Community_Center,First_Aid_or_Medical_Centre,Day_Care_center,Kids_Play_Area,Mosque,Barbeque_Area,Lawn_or_Garden,Swimming_Pool,Sauna,Jacuzzi,Nearby_Schools,Nearby_Hospital,Nearby_Shopping_Malls,Nearby_Restaurants,Nearby_Public_Transport_Service,Other_Nearby_Places,Security_Staff,Maintainance_Staff,Laundry_or_Dry_Cleaning_Facility,Facilities_for_Disabled,Bedrooms_na,Bathrooms_na,Size_in_SqYds_na,Floors_in_Building_na,Elevators_na,Built_in_year_year_na,Yrs_SinceBlt
0,0.363636,0.0,0.2,0.333333,0.362916,0.01,0.208333,0.045455,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.125
1,0.545455,1.0,0.5,0.555556,0.3609,0.002,0.229167,0.045455,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.583333
2,0.181818,0.0,0.2,0.222222,0.290589,0.0,0.229167,0.045455,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.1875
3,0.181818,1.0,0.4,0.444444,0.326943,0.0,0.229167,0.045455,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.1875
4,0.181818,0.0,0.1,0.111111,0.088235,0.0,0.229167,0.045455,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.1875


In [62]:
X_test.head()

Unnamed: 0,Location1,Type,Bedrooms,Bathrooms,Size_in_SqYds,Parking_Spaces,Floors_in_Building,Elevators,Lobby_in_Building,Double_Glazed_Windows,Central_Air_Conditioning,Central_Heating,Waste_Disposal,Furnished,Service_Elevators_in_Building,Flooring,Electricity_Backup,Servant_Quarters,Study_Room,Prayer_Room,Powder_Room,Gym,Lounge_or_Sitting_Room,Laundry_Room,Business_Center_or_Media_Room_in_Building,Satellite_or_Cable_TV_Ready,Broadband_Internet_Access,Intercom,Conference_Room_in_Building,Community_Swimming_Pool,Community_Lawn_or_Garden,Community_Gym,Community_Center,First_Aid_or_Medical_Centre,Day_Care_center,Kids_Play_Area,Mosque,Barbeque_Area,Lawn_or_Garden,Swimming_Pool,Sauna,Jacuzzi,Nearby_Schools,Nearby_Hospital,Nearby_Shopping_Malls,Nearby_Restaurants,Nearby_Public_Transport_Service,Other_Nearby_Places,Security_Staff,Maintainance_Staff,Laundry_or_Dry_Cleaning_Facility,Facilities_for_Disabled,Bedrooms_na,Bathrooms_na,Size_in_SqYds_na,Floors_in_Building_na,Elevators_na,Built_in_year_year_na,Yrs_SinceBlt
0,0.181818,0.0,0.1,0.111111,0.128161,0.0,0.229167,0.045455,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.1875
1,0.181818,0.0,0.3,0.333333,0.329458,0.001,0.229167,0.045455,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.1875
2,0.181818,0.0,0.3,0.333333,0.355216,0.0,0.229167,0.045455,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.1875
3,0.181818,0.0,0.2,0.222222,0.159759,0.0,0.229167,0.045455,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,0.1875
4,0.181818,1.0,0.2,0.222222,0.323739,0.0,0.229167,0.045455,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,0.1875


## Save to CSV

In [63]:
# let's now save the train and test sets for the next notebook!

X_train.to_csv('../Data/xtrain.csv', index = False)
X_test.to_csv('../Data/xtest.csv', index = False)

y_train.to_csv('../Data/ytrain.csv', index = False)
y_test.to_csv('../Data/ytest.csv', index = False)

## Save Scaler

In [64]:
# now let's save the scaler

joblib.dump(scaler, '../Models/minmax_scaler.joblib') 

['../Models/minmax_scaler.joblib']

That Concludes Data Analysis and Feature Engineering