In [1]:
# Widen the notebook container to the full browser width.
# `display` and `HTML` come from the public `IPython.display` module; importing
# them from `IPython.core.display` is deprecated and emits a warning.
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

<font size=6>
    Sberbank Russian Housing Market
</font>

<hr style="border: solid rgb(255,0,0) 0.0px; background-color: rgb(255,0,0);height: 2.0px;"/>
<font color='red' size=5>
    Summary: Feature Engineering with categorical data
    
</font>
<hr style="border: solid rgb(255,0,0) 0.0px; background-color: rgb(255,0,0);height: 2.0px;"/>

1. The categorical columns are inspected.
    * 15 cols
    * `product_type` contains two non-ordinal labels
        * `'Investment', 'OwnerOccupier'`
    * `sub_area` contains 146 non-ordinal labels
        * regions in the city
    * `ecology` contains 4 ordinal labels and 1 'no data'
        * 'no data' $\to$ `np.NaN` but _after_ encoding
            * `OrdinalEncoder` fails with NaNs
    * the rest are "yes/no" labels
2. All columns are converted to ordinal arrays
    * "yes/no" $\to$ "1/0"
    * `ecology = ['poor'...'excellent']` $\to$ `[0,...,3]` 
    * the rest are arbitrary

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

In [2]:
# The input files live in the local dataset directory walked below.
# Running this cell prints the path of every file found under it (recursively),
# one per line, so the available inputs can be checked before loading anything.

import os
for folder, _, files_here in os.walk('../../datasets/sberbank-russian-housing-market/'):
    for file_name in files_here:
        full_path = os.path.join(folder, file_name)
        print(full_path)

../../datasets/sberbank-russian-housing-market/sample_submission.csv
../../datasets/sberbank-russian-housing-market/macro.csv
../../datasets/sberbank-russian-housing-market/test.csv
../../datasets/sberbank-russian-housing-market/data_dictionary.txt
../../datasets/sberbank-russian-housing-market/train.csv
../../datasets/sberbank-russian-housing-market/sberbank-russian-housing-market.zip
../../datasets/sberbank-russian-housing-market/__MACOSX/._test.csv
../../datasets/sberbank-russian-housing-market/__MACOSX/._train.csv


## Load the data

In [3]:
# Load the training set, parsing the `timestamp` column as datetimes.
# `infer_datetime_format` is intentionally not passed: it is deprecated since
# pandas 2.0 (a strict version of it is now the default) and was only a parsing
# speed hint, so the loaded data is unaffected.
df = pd.read_csv(
    '../../datasets/sberbank-russian-housing-market/train.csv',
    parse_dates=['timestamp'],
)

In [4]:
# (rows, columns) of the training frame loaded above.
df.shape

(30471, 292)

In [29]:
# Names of all columns stored with the generic `object` dtype
# (these are the string-valued columns listed in the next cell).
string_cols = list(df.select_dtypes(include='object').columns)

In [30]:
# Columns carried into the encoding step: the row identifier plus every string column.
['id', *string_cols]

['id',
 'product_type',
 'sub_area',
 'culture_objects_top_25',
 'thermal_power_plant_raion',
 'incineration_raion',
 'oil_chemistry_raion',
 'radiation_raion',
 'railroad_terminal_raion',
 'big_market_raion',
 'nuclear_reactor_raion',
 'detention_facility_raion',
 'water_1line',
 'big_road1_1line',
 'railroad_1line',
 'ecology']

#### Encoding

In [13]:
import importlib
import os, sys

In [14]:
# Make the local helper modules importable.
# Guarded so that re-running this cell does not keep appending duplicate
# entries to sys.path.
if './helpers/' not in sys.path:
    sys.path.append('./helpers/')

In [42]:
# Import the local helper module, then reload it so that edits made to its
# source are picked up without restarting the kernel.
import categorical_encoding
importlib.reload(categorical_encoding)

# Bind the function only after the reload so the name refers to the
# freshly loaded implementation.
from categorical_encoding import get_cat_encoding

1. Copy the data

In [43]:
# Take an independent copy of `id` plus the string columns, so later changes
# to this subset cannot write back into `df`.
df_string_cols = df.loc[:, ['id', *string_cols]].copy()

2. data inspection

In [44]:
# Preview the first rows of the id + string-column subset.
df_string_cols.head()

Unnamed: 0,id,product_type,sub_area,culture_objects_top_25,thermal_power_plant_raion,incineration_raion,oil_chemistry_raion,radiation_raion,railroad_terminal_raion,big_market_raion,nuclear_reactor_raion,detention_facility_raion,water_1line,big_road1_1line,railroad_1line,ecology
0,1,Investment,Bibirevo,no,no,no,no,no,no,no,no,no,no,no,no,good
1,2,Investment,Nagatinskij Zaton,yes,no,no,no,no,no,no,no,no,no,no,no,excellent
2,3,Investment,Tekstil'shhiki,no,no,no,no,yes,no,no,no,no,no,no,no,poor
3,4,Investment,Mitino,no,no,no,no,no,no,no,no,no,no,no,no,good
4,5,Investment,Basmannoe,no,no,no,no,yes,yes,no,no,no,no,no,yes,excellent


In [45]:
# Per column: number of non-null entries ('count') and number of distinct
# values ('nunique'). A 'count' equal to the row count means no missing values.
df_string_cols.agg(['count', 'nunique'])

Unnamed: 0,id,product_type,sub_area,culture_objects_top_25,thermal_power_plant_raion,incineration_raion,oil_chemistry_raion,radiation_raion,railroad_terminal_raion,big_market_raion,nuclear_reactor_raion,detention_facility_raion,water_1line,big_road1_1line,railroad_1line,ecology
count,30471,30471,30471,30471,30471,30471,30471,30471,30471,30471,30471,30471,30471,30471,30471,30471
nunique,30471,2,146,2,2,2,2,2,2,2,2,2,2,2,2,5


   * deep-dive into some cols

In [46]:
# Distinct labels in `ecology`, in order of first appearance.
df_string_cols['ecology'].unique()

array(['good', 'excellent', 'poor', 'satisfactory', 'no data'],
      dtype=object)

In [47]:
# Distinct labels in `sub_area`, in order of first appearance.
df_string_cols['sub_area'].unique()

array(['Bibirevo', 'Nagatinskij Zaton', "Tekstil'shhiki", 'Mitino',
       'Basmannoe', 'Nizhegorodskoe', "Sokol'niki", 'Koptevo', 'Kuncevo',
       'Kosino-Uhtomskoe', 'Zapadnoe Degunino', 'Presnenskoe',
       'Lefortovo', "Mar'ino", "Kuz'minki", 'Nagornoe', "Gol'janovo",
       'Vnukovo', 'Juzhnoe Tushino', 'Severnoe Tushino',
       "Chertanovo Central'noe", 'Fili Davydkovo', 'Otradnoe',
       'Novo-Peredelkino', 'Bogorodskoe', 'Jaroslavskoe', 'Strogino',
       'Hovrino', "Moskvorech'e-Saburovo", 'Staroe Krjukovo', 'Ljublino',
       'Caricyno', 'Veshnjaki', 'Danilovskoe', 'Preobrazhenskoe',
       "Kon'kovo", 'Brateevo', 'Vostochnoe Izmajlovo', 'Vyhino-Zhulebino',
       'Donskoe', 'Novogireevo', 'Juzhnoe Butovo', 'Sokol', 'Kurkino',
       'Izmajlovo', 'Severnoe Medvedkovo', 'Rostokino',
       'Orehovo-Borisovo Severnoe', 'Ochakovo-Matveevskoe', 'Taganskoe',
       'Dmitrovskoe', 'Orehovo-Borisovo Juzhnoe', 'Teplyj Stan',
       'Babushkinskoe', 'Pokrovskoe Streshnevo', 'Obruc

3. select the cols to be transformed

In [48]:
# Encode every string column. To leave one out (e.g. 'sub_area'),
# filter it from this list here.
cols_to_encode = list(string_cols)

In [49]:
# Show the final list of columns that will be passed to the encoder.
cols_to_encode

['product_type',
 'sub_area',
 'culture_objects_top_25',
 'thermal_power_plant_raion',
 'incineration_raion',
 'oil_chemistry_raion',
 'radiation_raion',
 'railroad_terminal_raion',
 'big_market_raion',
 'nuclear_reactor_raion',
 'detention_facility_raion',
 'water_1line',
 'big_road1_1line',
 'railroad_1line',
 'ecology']

4. Get the encoder; specify the data to encode and transform

In [50]:
# Encode the selected string columns as numbers using the project helper
# imported from the local `categorical_encoding` module.
# NOTE(review): `sub_area` has 146 distinct labels, yet the dtypes shown below
# report it as int8, whose maximum is 127 -- confirm the helper does not
# overflow/wrap the codes above 127.
df_new = get_cat_encoding(df_string_cols, cols_to_encode)

In [51]:
# Check the resulting dtype of every column after encoding.
df_new.dtypes

id                             int64
product_type                    int8
sub_area                        int8
culture_objects_top_25          int8
thermal_power_plant_raion       int8
incineration_raion              int8
oil_chemistry_raion             int8
radiation_raion                 int8
railroad_terminal_raion         int8
big_market_raion                int8
nuclear_reactor_raion           int8
detention_facility_raion        int8
water_1line                     int8
big_road1_1line                 int8
railroad_1line                  int8
ecology                      float64
dtype: object

In [52]:
# Preview the encoded values for the first rows (compare with the string
# preview of `df_string_cols` shown before encoding).
df_new.head()

Unnamed: 0,id,product_type,sub_area,culture_objects_top_25,thermal_power_plant_raion,incineration_raion,oil_chemistry_raion,radiation_raion,railroad_terminal_raion,big_market_raion,nuclear_reactor_raion,detention_facility_raion,water_1line,big_road1_1line,railroad_1line,ecology
0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2.0
1,2,0,1,1,0,0,0,0,0,0,0,0,0,0,0,3.0
2,3,0,2,0,0,0,0,1,0,0,0,0,0,0,0,0.0
3,4,0,3,0,0,0,0,0,0,0,0,0,0,0,0,2.0
4,5,0,4,0,0,0,0,1,1,0,0,0,0,0,1,3.0


5. Final check on the data: does it contain all the rows as before?

In [53]:
# Final sanity check: encoding must not have added or dropped rows.
assert len(df_new) == len(df), "row count changed during encoding"

# Per-column dtype and non-null count. The `null_counts=` flag is not passed:
# it was deprecated in pandas 1.2 (renamed `show_counts`) and removed in 2.0.
# For a frame this small the non-null counts are shown by default, so omitting
# the flag gives the same report on old and new pandas versions.
df_new.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30471 entries, 0 to 30470
Data columns (total 16 columns):
id                           30471 non-null int64
product_type                 30471 non-null int8
sub_area                     30471 non-null int8
culture_objects_top_25       30471 non-null int8
thermal_power_plant_raion    30471 non-null int8
incineration_raion           30471 non-null int8
oil_chemistry_raion          30471 non-null int8
radiation_raion              30471 non-null int8
railroad_terminal_raion      30471 non-null int8
big_market_raion             30471 non-null int8
nuclear_reactor_raion        30471 non-null int8
detention_facility_raion     30471 non-null int8
water_1line                  30471 non-null int8
big_road1_1line              30471 non-null int8
railroad_1line               30471 non-null int8
ecology                      22815 non-null float64
dtypes: float64(1), int64(1), int8(14)
memory usage: 892.8 KB
