In [1]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

<font size=6>
    Sberbank Russian Housing Market
</font>

<hr style="border: solid rgb(255,0,0) 0.0px; background-color: rgb(255,0,0);height: 2.0px;"/>
<font color='red' size=5>
    Summary: part of feature selection study (v. 0)
</font>
<hr style="border: solid rgb(255,0,0) 0.0px; background-color: rgb(255,0,0);height: 2.0px;"/>

Feature importance using the build-ins from a few different classifiers:
* RandomForestRegressor    
* GradientBoostingRegressor
* AdaBoostRegressor
* LGBMRegressor
* XGBRegressor

There are more methods from sklearn; see [feature-selection](https://scikit-learn.org/stable/modules/feature_selection.html#feature-selection)

In [2]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

In [3]:
import os

In [4]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import random
colors = [plt.cm.Spectral(each)
          for each in np.linspace(0, 1, 20)]
random.shuffle(colors)

In [5]:
import importlib
import sys
sys.path.append('./helpers/')

In [9]:
%reset_selective sklearn

Once deleted, variables cannot be recovered. Proceed (y/[n])?  y


## Load the data

In [11]:
df = pd.read_csv('../../datasets/sberbank-russian-housing-market/train.csv', \
                 infer_datetime_format=True, parse_dates=['timestamp'])

In [12]:
float_cols = df.select_dtypes('float').columns.tolist()

In [13]:
int_cols = df.select_dtypes('int').columns.tolist()

In [17]:
num_cols = df.select_dtypes(['int', 'float']).columns.tolist()

In [14]:
string_cols = df.select_dtypes('object').columns.tolist()

## Categorical encoding

### Homebrew

In [2]:
import categorical_encoding
importlib.reload(categorical_encoding)
from categorical_encoding import get_cat_encoding

ModuleNotFoundError: No module named 'categorical_encoding'

In [324]:
df_string_cols = df[['id'] + string_cols].copy()

In [326]:
df_string_cols = get_cat_encoding(df_string_cols, string_cols)

In [327]:
df = df.drop(columns=yes_no_cols).merge(df_string_cols, on='id', how='inner')

### Pipelining

**Strategy**

Create pipelines for the categorical and numerical features 

1. using `ColumnTransformer`, `FunctionTransformer` and `Pipeline`
    * allows individual feature selections

```
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

num_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy="median")),
        ('attribs_adder', CombinedAttributesAdder()),
        ('std_scaler', StandardScaler()),
    ])

housing_num_tr = num_pipeline.fit_transform(housing_num)


from sklearn.compose import ColumnTransformer

num_attribs = list(housing_num)
cat_attribs = ["ocean_proximity"]

full_pipeline = ColumnTransformer([
        ("num", num_pipeline, num_attribs),
        ("cat", OneHotEncoder(), cat_attribs),
    ])

housing_prepared = full_pipeline.fit_transform(housing)
```

or 

2. using `Pipelines`
    * selection is done inside custom methods
    
    ```
        #Categrical features to pass down the categorical pipeline 
        cat_features = ['date', 'waterfront', 'view', 'yr_renovated']

        #Numerical features to pass down the numerical pipeline 
        num_features = ['bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors',
                        'condition', 'grade', 'sqft_basement', 'yr_built']

        #Defining the steps in the categorical pipeline 
        cat_pipeline = Pipeline( 
                                steps = [ ( 'cat_selector', FeatureSelector(cat_features) ),
                                          ( 'cat_transformer', CategoricalTransformer() ), 
                                          ( 'one_hot_encoder', OneHotEncoder( sparse = False ) ) ] )

        #Defining the steps in the numerical pipeline     
        num_pipeline = Pipeline( 
                                steps = [ ( 'num_selector', FeatureSelector(num_features) ),
                                          ( 'num_transformer', NumericalTransformer() ),
                                          ('imputer', SimpleImputer(strategy = 'median') ),
                                          ( 'std_scaler', StandardScaler() ) 
                                          ] )

        #Combining numerical and categorical piepline into one full big pipeline horizontally 
        #using FeatureUnion
        full_pipeline = FeatureUnion( transformer_list = [ ( 'cat_pipeline', cat_pipeline ), 

                                                          ( 'num_pipeline', num_pipeline ) ] )        
    ```
    
    * **note**  _all_ features returned by `FeatureSelector` are passed into the transformers since pipelines can only deal with the whole array passed

In [15]:
print(string_cols)

['product_type', 'sub_area', 'culture_objects_top_25', 'thermal_power_plant_raion', 'incineration_raion', 'oil_chemistry_raion', 'radiation_raion', 'railroad_terminal_raion', 'big_market_raion', 'nuclear_reactor_raion', 'detention_facility_raion', 'water_1line', 'big_road1_1line', 'railroad_1line', 'ecology']


In [16]:
df[string_cols].head()

Unnamed: 0,product_type,sub_area,culture_objects_top_25,thermal_power_plant_raion,incineration_raion,oil_chemistry_raion,radiation_raion,railroad_terminal_raion,big_market_raion,nuclear_reactor_raion,detention_facility_raion,water_1line,big_road1_1line,railroad_1line,ecology
0,Investment,Bibirevo,no,no,no,no,no,no,no,no,no,no,no,no,good
1,Investment,Nagatinskij Zaton,yes,no,no,no,no,no,no,no,no,no,no,no,excellent
2,Investment,Tekstil'shhiki,no,no,no,no,yes,no,no,no,no,no,no,no,poor
3,Investment,Mitino,no,no,no,no,no,no,no,no,no,no,no,no,good
4,Investment,Basmannoe,no,no,no,no,yes,yes,no,no,no,no,no,yes,excellent


**Note**
* Ordinals:
    * 'ecology'
* OneHots
    * 'sub_area', 'product_type' (but only two vals)
* Binaries
    * the rest
    * _But these will also be considered ordinals_

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder

In [18]:
from sklearn.preprocessing import OrdinalEncoder

df_ecology = df[["ecology"]]
df_ecology.head(10)

Unnamed: 0,ecology
0,good
1,excellent
2,poor
3,good
4,excellent
5,poor
6,poor
7,good
8,poor
9,satisfactory


In [None]:
categ_list = [['Investment', 'OwnerOccupier']] + [df.sub_area.unique().tolist()] + 12*[['no','yes']]+[['poor', 'satisfactory', 'good', 'excellent', 'no data']]

enc = OrdinalEncoder(categ_list)

str_enc_data = enc.fit_transform(df[cols_to_encode].values)

# 5. rewrite encoded to the data
df[cols_to_encode] = str_enc_data


# **Note** refill any NANs wished to be retained
#TODO move to separate function
encoder_max = df.ecology.max()
df['ecology'] = np.where(df.ecology==df.ecology.max(), np.NaN, df.ecology)

# 6. Recast the data, if needed

df[cols_to_encode[:-1]] = df[cols_to_encode[:-1]].astype('int8')

In [20]:
ord4_cats = [['poor', 'satisfactory', 'good', 'excellent', 'no data']]
ord4_enc = OrdinalEncoder(ord4_cats)
df_ecology = df[["ecology"]]
df_ecology = ord4_enc.fit_transform(df[["ecology"]])
df_ecology[:10]

array([[2.],
       [3.],
       [0.],
       [2.],
       [3.],
       [0.],
       [0.],
       [2.],
       [0.],
       [1.]])

In [28]:
from sklearn.preprocessing import OneHotEncoder

onehot_cat_cols = ["sub_area", "product_type"]

# By default, the OneHotEncoder class returns a sparse array, but we can convert it to a dense array if needed by calling the toarray() method:
# Alternatively, you can set sparse=False when creating the OneHotEncoder:
onehots = OneHotEncoder(sparse=False)
housing_cat_1hot = onehots.fit_transform(df[onehot_cat_cols])
housing_cat_1hot[:10]

array([[0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       ...,
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 1., 0.]])

In [219]:
df_test = df['ecology'].sample(100, ).reset_index(drop=True)

df_test.head()

0    excellent
1    excellent
2         poor
3         poor
4         good
Name: ecology, dtype: object

In [235]:
def mapping_encoder(mapping):
    
    map_func = lambda x: mapping[x]
    
    def inner(data):
        
        output = np.array(list(map(map_func, data)))
        
        return output
    
    return inner
        

ecology_map = dict(zip(['good', 'excellent', 'poor', 'satisfactory', 'no data'],[2,3,0,1,4]))

ecology_nanmap = dict(zip(range(5),list(range(4))+[np.nan]))

encoder = lambda x: mapping_encoder(ecology_nanmap)( mapping_encoder(ecology_map)(x) )

In [267]:
test_arr = np.array(['excellent', 'poor',  'good', 'no data', 'satisfactory'], dtype=object)
result_arr = np.array([ 3.,  0.,  2.,   np.nan,  1.], dtype=np.float64)

if not all(result_arr == encoder(test_arr)):
    # np.Nans aren't equivalent?
    print(test_arr)
    print(result_arr)    
    print(encoder(test_arr))
    print(result_arr == encoder(test_arr))
else:
    print(True)

['excellent' 'poor' 'good' 'no data' 'satisfactory']
[ 3.  0.  2. nan  1.]
[ 3.  0.  2. nan  1.]
[ True  True  True False  True]


In [238]:
from sklearn.preprocessing import FunctionTransformer
transformer = FunctionTransformer(encoder)

In [239]:
transformer.fit_transform(df['ecology'])

array([ 2.,  3.,  0., ..., nan,  1.,  0.])

In [344]:
class OrdinalEncoderNans(BaseEstimator, TransformerMixin):
    
    #Class Constructor
    def __init__( self, _cats_to_map):
        self._cats_to_map = _cats_to_map
        
    #Return self, nothing else to do here
    def fit( self, X, y = None ):
        return self 
    
    #Custom transform method we wrote that creates aformentioned features and drops redundant ones 
    def transform(self, X, y = None):
            
        # Map all but the last the categories to ints
        X = X.replace( self._cats_to_map[:-1], np.arange(len(self._cats_to_map[:-1])))
                    
        #Converting any infinity values in the dataset to Nan
        X = X.replace( self._cats_to_map[-1], np.nan )
        
        shape_ = X.shape[0]
        #returns a numpy array
        return X.values

In [345]:
ord4_enc_nan = OrdinalEncoderNans(ord4_cats)
housing_extra_attribs = ord4_enc_nan.fit_transform(df[ord4_cat_cols])

In [346]:
test_shape = housing_extra_attribs.shape[0]
housing_extra_attribs.reshape(test_shape,)
test_shape, housing_extra_attribs.shape, housing_extra_attribs

(30471, (30471, 1), array([[ 2.],
        [ 3.],
        [ 0.],
        ...,
        [nan],
        [ 1.],
        [ 0.]]))

In [69]:
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

num_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy="median")),
        ('std_scaler', StandardScaler()),
    ])

df_num_tr = num_pipeline.fit_transform(df[num_cols])

#### putting the cat-encs together into the pipeline

In [301]:
from sklearn.compose import ColumnTransformer

from sklearn.preprocessing import FunctionTransformer
transformer = FunctionTransformer(encoder)

onehot_cat_cols = ["sub_area", "product_type"]

ord4_cat_cols = ["ecology"]
ord4_cats = ['poor', 'satisfactory', 'good', 'excellent', 'no data']

ord2_cat_cols = [e for e in string_cols if (e not in onehot_cat_cols) & (e not in ord4_cat_cols) ]
ord2_cats = 12*[["no", "yes"]]

full_pipeline = ColumnTransformer([
        ("num_pipe", num_pipeline, num_cols),
        ("onehots", OneHotEncoder(sparse=False), onehot_cat_cols),
        ("ordinals2", OrdinalEncoder(ord2_cats), ord2_cat_cols),   
        ("ordinals4", OrdinalEncoderNans(ord4_cats), ord4_cat_cols),    
    ])

df_cat_prepared = full_pipeline.fit_transform(df)

In [303]:
df_cat_prepared.shape

(30471, 437)

### From external source example

In [None]:
import numpy as np 
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import FeatureUnion, Pipeline

#Custom Transformer that extracts columns passed as argument to its constructor 
class FeatureSelector( BaseEstimator, TransformerMixin ):
    #Class Constructor 
    def __init__( self, feature_names ):
        self._feature_names = feature_names 
    
    #Return self nothing else to do here    
    def fit( self, X, y = None ):
        return self 
    
    #Method that describes what we need this transformer to do
    def transform( self, X, y = None ):
        return X[ self._feature_names ] 

#Custom transformer that breaks dates column into year, month and day into separate columns and
#converts certain features to binary 
class CategoricalTransformer( BaseEstimator, TransformerMixin ):
    #Class constructor method that takes in a list of values as its argument
    def __init__(self, use_dates = ['year', 'month', 'day'] ):
        self._use_dates = use_dates
        
    #Return self nothing else to do here
    def fit( self, X, y = None  ):
        return self

    #Helper function to extract year from column 'dates' 
    def get_year( self, obj ):
        return str(obj)[:4]
    
    #Helper function to extract month from column 'dates'
    def get_month( self, obj ):
        return str(obj)[4:6]
    
    #Helper function to extract day from column 'dates'
    def get_day(self, obj):
        return str(obj)[6:8]
    
    #Helper function that converts values to Binary depending on input 
    def create_binary(self, obj):
        if obj == 0:
            return 'No'
        else:
            return 'Yes'
    
    #Transformer method we wrote for this transformer 
    def transform(self, X , y = None ):
        #Depending on constructor argument break dates column into specified units
        #using the helper functions written above 
        for spec in self._use_dates:

        exec( "X.loc[:,'{}'] = X['date'].apply(self.get_{})".format( spec, spec ) )
        #Drop unusable column 
        X = X.drop('date', axis = 1 )

        #Convert these columns to binary for one-hot-encoding later
        X.loc[:,'waterfront'] = X['waterfront'].apply( self.create_binary )

        X.loc[:,'view'] = X['view'].apply( self.create_binary )

        X.loc[:,'yr_renovated'] = X['yr_renovated'].apply( self.create_binary )
        #returns numpy array
        return X.values 

#Custom transformer we wrote to engineer features ( bathrooms per bedroom and/or how old the house is in 2019  ) 
#passed as boolen arguements to its constructor
class NumericalTransformer(BaseEstimator, TransformerMixin):
    #Class Constructor
    def __init__( self, bath_per_bed = True, years_old = True ):
        self._bath_per_bed = bath_per_bed
        self._years_old = years_old
        
    #Return self, nothing else to do here
    def fit( self, X, y = None ):
        return self 
    
    #Custom transform method we wrote that creates aformentioned features and drops redundant ones 
    def transform(self, X, y = None):
        #Check if needed 
        if self._bath_per_bed:
            #create new column
            X.loc[:,'bath_per_bed'] = X['bathrooms'] / X['bedrooms']
            #drop redundant column
            X.drop('bathrooms', axis = 1 )
        #Check if needed     
        if self._years_old:
            #create new column
            X.loc[:,'years_old'] =  2019 - X['yr_built']
            #drop redundant column 
            X.drop('yr_built', axis = 1)
            
        #Converting any infinity values in the dataset to Nan
        X = X.replace( [ np.inf, -np.inf ], np.nan )
        #returns a numpy array
        return X.values

#Categrical features to pass down the categorical pipeline 
cateforical_features = ['date', 'waterfront', 'view', 'yr_renovated']

#Numerical features to pass down the numerical pipeline 
numerical_features = ['bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors',
                'condition', 'grade', 'sqft_basement', 'yr_built']

#Defining the steps in the categorical pipeline 
categorical_pipeline = Pipeline( steps = [ ( 'cat_selector', FeatureSelector(categorical_features) ),
                                  
                                  ( 'cat_transformer', CategoricalTransformer() ), 
                                  
                                  ( 'one_hot_encoder', OneHotEncoder( sparse = False ) ) ] )
    
#Defining the steps in the numerical pipeline     
numerical_pipeline = Pipeline( steps = [ ( 'num_selector', FeatureSelector(numerical_features) ),
                                  
                                  ( 'num_transformer', NumericalTransformer() ),
                                  
                                  ('imputer', SimpleImputer(strategy = 'median') ),
                                  
                                  ( 'std_scaler', StandardScaler() ) ] )

#Combining numerical and categorical piepline into one full big pipeline horizontally 
#using FeatureUnion
full_pipeline = FeatureUnion( transformer_list = [ ( 'categorical_pipeline', categorical_pipeline ), 
                                                  
                                                  ( 'numerical_pipeline', numerical_pipeline ) ] )

In [None]:
def get_cat_encoding(df, cols_to_encode):
    
    # 1. Copy the data
    # 2. ~~data inspection~~
    #    * deep-dive into some cols
    # 3. ~~select the cols to be transformed~~ --> given
    # 4. Get the encoder; specify the data to encode and transform
    # **Note** this list is specific to this data but it could be passed as argument
    #TODO move to separate function
    categ_list = [['Investment', 'OwnerOccupier']] + [df.sub_area.unique().tolist()] + 12*[['no','yes']]+[['poor', 'satisfactory', 'good', 'excellent', 'no data']]
    
    enc = OrdinalEncoder(categ_list)

    str_enc_data = enc.fit_transform(df[cols_to_encode].values)

    # 5. rewrite encoded to the data
    df[cols_to_encode] = str_enc_data


    # **Note** refill any NANs wished to be retained
    #TODO move to separate function
    encoder_max = df.ecology.max()
    df['ecology'] = np.where(df.ecology==df.ecology.max(), np.NaN, df.ecology)

    # 6. Recast the data, if needed

    df[cols_to_encode[:-1]] = df[cols_to_encode[:-1]].astype('int8')

    # 7. Final check on the data: does it contain all the rows as before?
    # 8. Replace the encoded data (excluded here)

    return df