In [1]:
import pandas as pd
import numpy as np                     
import seaborn as sns     

import matplotlib.pyplot as plt        
%matplotlib inline

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "last_expr"
pd.options.display.max_columns = 50

import warnings                        
warnings.filterwarnings("ignore")

import data_selector

In [2]:
# Parse the loan training CSV into a pandas DataFrame and drop the ID / target columns.
loan_train = pd.read_csv('data/train_u6lujuX_CVtuZ9i.csv')
df = loan_train.drop(['Loan_ID', 'Loan_Status'], axis=1)

# NOTE(review): `df` is rebound right here, so the loan features built above are discarded
# and the cells that follow operate on this small hand-made frame instead.
# Some records omit 'rooms' or 'status'; pandas fills those missing keys with NaN.
df = pd.DataFrame([
    {'price': 8500, 'rooms': 4, 'neighborhood': 'Queen Anne', 'status':1},
    {'price': 7000, 'rooms': 3, 'neighborhood': 'Fremont', 'status':1},
    {'price': 6500, 'neighborhood': 'Wallingford', 'status':0},
    {'price': 6000, 'rooms': 2, 'neighborhood': 'Fremont', 'status':1},
    {'price': 4000, 'rooms': 2, 'neighborhood': 'Fremont', },
    {'price': 7500, 'neighborhood': 'Wallingford', 'status':0},
    {'price': 6500, 'neighborhood': 'Fremont', 'status':1},
])


# Inspect Data

In [3]:
def df_infos(df):
    """Summarise the columns of ``df`` in a small report frame.

    Prints the shape of ``df`` and returns a DataFrame that has one column per
    column of ``df`` and four rows:

    - ``'column type'``: dtype of the column
    - ``'Null values'``: number of null entries
    - ``'Null values (%)'``: null entries as a percentage of the row count
    - ``'Number unique'``: number of distinct non-null values

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to describe.

    Returns
    -------
    pandas.DataFrame
        The four-row summary described above.
    """
    print('Dataframe dimensions:', df.shape)

    null_counts = df.isnull().sum()
    summary_rows = [
        df.dtypes.rename('column type'),
        null_counts.rename('Null values'),
        (null_counts / df.shape[0] * 100).rename('Null values (%)'),
        # nunique() ignores NaN by default, matching len(col.dropna().unique())
        df.nunique().rename('Number unique'),
    ]
    # DataFrame.append was removed in pandas 2.0, so the rows are built as named
    # Series and concatenated once; transposing turns each Series into a row.
    return pd.concat(summary_rows, axis=1).T
df_infos(df)

Dataframe dimensions: (7, 4)


Unnamed: 0,neighborhood,price,rooms,status
column type,object,int64,float64,float64
Null values,0,0,3,1
Null values (%),0,0,42.8571,14.2857
Number unique,3,6,3,2


# Separate columns by type

In [4]:
# The status column has only two values, so it is better represented as a
# categorical (object) column rather than as a float.
df['status'] = df['status'].astype('object')

In [5]:
# Split the column names by kind using the project helper.
# NOTE(review): presumably returns (numeric, categorical, date) column-name lists - confirm in data_selector.
num_cols, cat_cols, date_cols = data_selector.getColumnDataTypes(df)

In [6]:
df_cat = df[cat_cols]
df_cat.head()

Unnamed: 0,neighborhood,status
0,Queen Anne,1.0
1,Fremont,1.0
2,Wallingford,0.0
3,Fremont,1.0
4,Fremont,


In [7]:
df_num = df[num_cols]
df_num.head()

Unnamed: 0,price,rooms
0,8500,4.0
1,7000,3.0
2,6500,
3,6000,2.0
4,4000,2.0


# Missing Value Imputation

In [8]:
df_cat.isnull().sum()

neighborhood    0
status          1
dtype: int64

In [9]:
# Run the categorical columns through the project's DataFrameImputer, then re-count nulls per column.
# NOTE(review): the fill strategy is defined in data_selector and is not visible here - confirm before relying on it.
df_cat_noNA = data_selector.DataFrameImputer().fit_transform(df_cat)
df_cat_noNA.isnull().sum()

neighborhood    0
status          0
dtype: int64

# One-Hot Encoding

There are a number of options to encode categorical features into numeric features suitable for machine learning algorithms. 

- ### Using sklearn OneHotEncoder  

This approach requires that we first perform **label encoding (LabelEncoder)** to map each unique category to an integer. Then, we can fit the result of the previous step (will be a numpy array) to the **OneHotEncoder** class.

**Note:** This seems to work on a single feature at a time, so to transform an entire dataframe we need to encode one feature at a time.

- ### Using sklearn DictVectorizer  

The **DictVectorizer** expects a list of dictionaries. It then performs one-hot encoding on the categorical features, leaving the numeric features unchanged. **Hence we first need to transform the dataframe into a list of dictionaries**. This can be done using  
```python
df.to_dict( orient = 'records')
```

- ### Using Pandas get_dummies  

`pandas.get_dummies()` can operate on an entire dataframe. So what we can do here is select the non-numeric columns and pass the result to **get_dummies()**  

**Note: The caveat here is that get_dummies will return a dataframe and may not work well with pipelines**

- #### Examples with DictVectorizer

In [10]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction import DictVectorizer

df

Unnamed: 0,neighborhood,price,rooms,status
0,Queen Anne,8500,4.0,1.0
1,Fremont,7000,3.0,1.0
2,Wallingford,6500,,0.0
3,Fremont,6000,2.0,1.0
4,Fremont,4000,2.0,
5,Wallingford,7500,,0.0
6,Fremont,6500,,1.0


In [11]:
# NaN is replaced with the string 'NA' before vectorizing: the missing entries then show up
# as their own 'rooms=NA' / 'status=NA' features (see the feature names listed below),
# and the output array uses the integer dtype requested here.
vec = DictVectorizer(sparse = False, sort=False, dtype=int)
dfNA = df.fillna('NA')
df_to_list_dict = dfNA.to_dict(orient = 'records')
vec.fit_transform(df_to_list_dict)

array([[   1, 8500,    4,    1,    0,    0,    0,    0],
       [   0, 7000,    3,    1,    1,    0,    0,    0],
       [   0, 6500,    0,    0,    0,    1,    1,    0],
       [   0, 6000,    2,    1,    1,    0,    0,    0],
       [   0, 4000,    2,    0,    1,    0,    0,    1],
       [   0, 7500,    0,    0,    0,    1,    1,    0],
       [   0, 6500,    0,    1,    1,    0,    1,    0]])

In [12]:
#see features
# get_feature_names() was removed in scikit-learn 1.2 in favour of get_feature_names_out();
# use the new method when it exists and fall back to the old one on older releases.
feature_names = (vec.get_feature_names_out() if hasattr(vec, 'get_feature_names_out')
                 else vec.get_feature_names())
# Plain Python strings keep the displayed result a simple list, as before.
[str(name) for name in feature_names]

['neighborhood=Queen Anne',
 'price',
 'rooms',
 'status',
 'neighborhood=Fremont',
 'neighborhood=Wallingford',
 'rooms=NA',
 'status=NA']

In [13]:
# To use dtype=int, NaN must first be encoded as something else, because NaN cannot be
# converted to int; here it is replaced with the string 'NA'.
# NOTE(review): same steps as the earlier DictVectorizer cell, minus the intermediate variable.
vec = DictVectorizer(sparse = False, sort=False, dtype=int)
dfNA = df.fillna('NA')
vec.fit_transform(dfNA.to_dict(orient = 'records'))

array([[   1, 8500,    4,    1,    0,    0,    0,    0],
       [   0, 7000,    3,    1,    1,    0,    0,    0],
       [   0, 6500,    0,    0,    0,    1,    1,    0],
       [   0, 6000,    2,    1,    1,    0,    0,    0],
       [   0, 4000,    2,    0,    1,    0,    0,    1],
       [   0, 7500,    0,    0,    0,    1,    1,    0],
       [   0, 6500,    0,    1,    1,    0,    1,    0]])

- #### Using Pandas get_dummies()

In [14]:
# One-hot encode the two categorical columns directly with pandas.
# NOTE(review): despite the `noNA` in the result's name, cat_df is taken straight from df and
# has not been imputed; with get_dummies' default dummy_na=False the row whose status is
# missing gets 0 in every status_* column (row 4 in the output).
cat_df = df[['status', 'neighborhood']]
df_cat_noNA_dummy = pd.get_dummies(cat_df)
df_cat_noNA_dummy.head()

Unnamed: 0,status_0.0,status_1.0,neighborhood_Fremont,neighborhood_Queen Anne,neighborhood_Wallingford
0,0,1,0,1,0
1,0,1,1,0,0
2,1,0,0,0,1
3,0,1,1,0,0
4,0,0,1,0,0


In [15]:
def numeric_pipeline(df):
    """Impute missing values in ``df`` and dummy-encode the result.

    The frame is first passed through ``data_selector.DataFrameImputer`` and the
    imputed frame is then handed to ``pandas.get_dummies``.

    NOTE(review): the imputation strategy is defined in ``data_selector`` and is
    not visible here. Despite the name, nothing in this function is specific to
    numeric columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to impute and encode.

    Returns
    -------
    pandas.DataFrame
        The output of ``pandas.get_dummies`` applied to the imputed frame.
    """
    imputer = data_selector.DataFrameImputer()
    imputed = imputer.fit_transform(df)
    return pd.get_dummies(imputed)
numeric_pipeline(cat_df)    

Unnamed: 0,status,neighborhood_Fremont,neighborhood_Queen Anne,neighborhood_Wallingford
0,1.0,0,1,0
1,1.0,1,0,0
2,0.0,0,0,1
3,1.0,1,0,0
4,1.0,1,0,0
5,0.0,0,0,1
6,1.0,1,0,0
