In [6]:
from sklearn.pipeline import Pipeline
from sklearn.datasets import load_boston
import numpy as np
import pandas as pd

In [7]:
# Reload & clean data
def load_clean(train, test):
    """Read the train and test CSVs, impute missing values, and split off the target.

    Both files are stacked (train rows first) so the same fill values are
    applied to each, then separated again at the original train/test boundary.

    Parameters
    ----------
    train : str or path-like
        CSV file handed to ``pd.read_csv`` for the training rows.
    test : str or path-like
        CSV file handed to ``pd.read_csv`` for the test rows.

    Returns
    -------
    X_train, y_train, X_test, y_test
        Feature frames (every column except ``SalePrice``) and the matching
        ``SalePrice`` series for the train rows and the test rows.

    Raises
    ------
    KeyError
        If neither file provides a ``SalePrice`` column.
    """
    df_train = pd.read_csv(train)
    df_test = pd.read_csv(test)

    # Train rows are stacked first, so the train row count is the split point.
    n_train = df_train.shape[0]
    df = pd.concat([df_train, df_test], axis=0)

    # Partition columns by dtype: numeric ones are mean-imputed, everything
    # else is treated as categorical and receives a sentinel label.
    continuous_columns = [
        col for col, dtype in df.dtypes.items()
        if pd.api.types.is_numeric_dtype(dtype)
    ]
    categorical_columns = [
        col for col, dtype in df.dtypes.items()
        if not pd.api.types.is_numeric_dtype(dtype)
    ]

    # Fill NaNs
    if categorical_columns:
        df[categorical_columns] = df[categorical_columns].fillna("null_value")
    if continuous_columns:
        numeric = df[continuous_columns]
        # The mean is taken over the numeric columns only; calling mean() on
        # the full mixed-dtype frame fails on recent pandas versions.
        # NOTE(review): the fill values are computed over train and test rows
        # together, and a SalePrice that is missing in the test rows is filled
        # with the column mean too, so y_test may hold no real labels.
        df[continuous_columns] = numeric.fillna(numeric.mean())

    # Positional split: the stacked frame keeps each file's own row labels,
    # so label-based slicing would be ambiguous.
    df_train = df.iloc[:n_train, :]
    df_test = df.iloc[n_train:, :]

    X_train = df_train.drop(['SalePrice'], axis=1)
    y_train = df_train['SalePrice']
    X_test = df_test.drop(['SalePrice'], axis=1)
    y_test = df_test['SalePrice']

    return X_train, y_train, X_test, y_test

In [12]:
# Load both CSVs and unpack the train features/target (X, y) and the test
# features/target (X_test, y_test) returned by load_clean.
X, y, X_test, y_test = load_clean("train.csv", "test.csv")

In [32]:
# List the names of the object-dtype columns in the training features.
X.select_dtypes(include='object').columns.tolist()

['Alley',
 'BldgType',
 'BsmtCond',
 'BsmtExposure',
 'BsmtFinType1',
 'BsmtFinType2',
 'BsmtQual',
 'CentralAir',
 'Condition1',
 'Condition2',
 'Electrical',
 'ExterCond',
 'ExterQual',
 'Exterior1st',
 'Exterior2nd',
 'Fence',
 'FireplaceQu',
 'Foundation',
 'Functional',
 'GarageCond',
 'GarageFinish',
 'GarageQual',
 'GarageType',
 'Heating',
 'HeatingQC',
 'HouseStyle',
 'KitchenQual',
 'LandContour',
 'LandSlope',
 'LotConfig',
 'LotShape',
 'MSZoning',
 'MasVnrType',
 'MiscFeature',
 'Neighborhood',
 'PavedDrive',
 'PoolQC',
 'RoofMatl',
 'RoofStyle',
 'SaleCondition',
 'SaleType',
 'Street',
 'Utilities']

In [18]:
# Show the distinct values of the 'BldgType' column.
X['BldgType'].unique()

array(['1Fam', '2fmCon', 'Duplex', 'TwnhsE', 'Twnhs'], dtype=object)

In [124]:
from sklearn.base import BaseEstimator, TransformerMixin

# The mixin is listed before BaseEstimator, the base-class order scikit-learn
# recommends for custom estimators.
class WordLengthExtractor(TransformerMixin, BaseEstimator):
    """Replace every object-dtype column with the length of its first word.

    Each cell is split on whitespace and the character count of the first
    token is kept. Columns of any other dtype are left out of the output.
    The transformer learns nothing from the data, so `fit` is a no-op.
    """

    def __init__(self):
        # No hyper-parameters to store.
        pass

    @staticmethod
    def _first_token_length(tokens):
        """Return the length of the first token, or 0 when there is none."""
        try:
            return len(tokens[0])
        except (IndexError, TypeError):
            # IndexError: an empty or whitespace-only string split into [].
            # TypeError: a missing value that `.str.split()` passed through.
            return 0

    def word_length(self, name):
        """Compute the first-word length for every row of one column.

        Parameters
        ----------
        name : pandas.Series
            A column of strings.

        Returns
        -------
        list of int
            One entry per row: the character count of the first
            whitespace-separated token, or 0 if the row has no token.
        """
        return [self._first_token_length(tokens) for tokens in name.str.split()]

    def transform(self, df, y=None):
        """Apply `word_length` to each object-dtype column of `df`."""
        df_cat = df.select_dtypes(include='object')
        return df_cat.apply(self.word_length)

    def fit(self, df, y=None):
        """Return `self` unchanged; present so the class works as a fit/transform estimator."""
        return self
    
    
    
# The mixin is listed before BaseEstimator, the base-class order scikit-learn
# recommends for custom estimators.
class ContainsLetterA(TransformerMixin, BaseEstimator):
    """Replace every object-dtype column with a 0/1 flag for a lowercase 'a'.

    Each cell is split on whitespace and only the first token is inspected:
    the flag is 1 when that token contains the character 'a' (case-sensitive)
    and 0 otherwise. Columns of any other dtype are left out of the output.
    The transformer learns nothing from the data, so `fit` is a no-op.
    """

    def __init__(self):
        # No hyper-parameters to store.
        pass

    @staticmethod
    def _first_token_has_a(tokens):
        """Return 1 if the first token contains 'a', else 0 (also 0 when there is no token)."""
        try:
            return 1 if 'a' in tokens[0] else 0
        except (IndexError, TypeError):
            # IndexError: an empty or whitespace-only string split into [].
            # TypeError: a missing value that `.str.split()` passed through.
            return 0

    def contains_a(self, name):
        """Compute the 'contains a' flag for every row of one column.

        Parameters
        ----------
        name : pandas.Series
            A column of strings.

        Returns
        -------
        list of int
            One 0/1 entry per row, based on the first whitespace-separated
            token of that row.
        """
        return [self._first_token_has_a(tokens) for tokens in name.str.split()]

    def transform(self, df, y=None):
        """Apply `contains_a` to each object-dtype column of `df`."""
        df_cat = df.select_dtypes(include='object')
        return df_cat.apply(self.contains_a)

    def fit(self, df, y=None):
        """Return `self` unchanged; present so the class works as a fit/transform estimator."""
        return self

In [137]:
from sklearn.pipeline import FeatureUnion

# Combine both categorical encoders: each transformer contributes its own
# block of columns to the stacked output.
fu = [("contains_a", ContainsLetterA()),
      ("word_length", WordLengthExtractor())]
duo_union = FeatureUnion(fu)

# Fit before transforming, as the scikit-learn estimator contract expects.
# Both transformers are stateless, so the output matches a bare transform().
duo_union.fit_transform(X).shape

(1460, 86)

In [138]:
# Run ContainsLetterA on its own and display the shape of its output.
conta = ContainsLetterA()
conta.transform(X).shape

(1460, 43)

In [121]:
# X_cat.head()

In [139]:
# Run the word-length transformer on its own and display the shape of its
# output. The class defined in this notebook is WordLengthExtractor.
awd = WordLengthExtractor()

awd.transform(X).shape

(1460, 43)