In [1]:
import os

import numpy as np
import pandas as pd

In [2]:
# Location of the USA housing CSV.
# Set the USA_HOUSING_CSV environment variable to point at your own copy of the
# file; when it is unset, the original author's local path is used unchanged.
home_prices = os.environ.get(
    'USA_HOUSING_CSV',
    'C:/Users/G331623/OneDrive - Principal Financial Group/Data Science/python git/Pandas/Data/USA_Housing.csv',
)

In [3]:
# Read the housing CSV located at `home_prices` into a DataFrame.
df = pd.read_csv(filepath_or_buffer=home_prices)

In [6]:
def size(n):
    """Classify a house by its (average) number of rooms.

    Parameters
    ----------
    n : int or float
        Room count to classify.

    Returns
    -------
    str
        'Small' when n < 4, 'Medium' when 4 <= n < 6, and 'Big' otherwise.

    Notes
    -----
    The bounds are contiguous, so a value of exactly 4 is 'Medium' rather
    than falling through to 'Big'. NaN compares False against both
    thresholds and is therefore labelled 'Big'.
    """
    if n < 4:
        return 'Small'
    if n < 6:
        return 'Medium'
    return 'Big'

# Label every row by passing its average room count through size().
room_counts = df['Avg. Area Number of Rooms']
df['House_size'] = room_counts.map(size)

# Preview the first rows, now including the House_size column.
df.head()

Unnamed: 0,Avg. Area Income,Avg. Area House Age,Avg. Area Number of Rooms,Avg. Area Number of Bedrooms,Area Population,Price,Address,House_size
0,79545.458574,5.682861,7.009188,4.09,23086.800503,1059034.0,"208 Michael Ferry Apt. 674\nLaurabury, NE 3701...",Big
1,79248.642455,6.0029,6.730821,3.09,40173.072174,1505891.0,"188 Johnson Views Suite 079\nLake Kathleen, CA...",Big
2,61287.067179,5.86589,8.512727,5.13,36882.1594,1058988.0,"9127 Elizabeth Stravenue\nDanieltown, WI 06482...",Big
3,63345.240046,7.188236,5.586729,3.26,34310.242831,1260617.0,USS Barnett\nFPO AP 44820,Medium
4,59982.197226,5.040555,7.839388,4.23,26354.109472,630943.5,USNS Raymond\nFPO AE 09386,Big


### Pipeline

In [27]:
# Data cleaning, scaling, and validation are required before applying a model.
# Chain these steps in a pipeline instead of running each one as a separate manual step, to avoid errors.

In [8]:
import pdpipe as pdp

In [30]:
# PIPELINE
    # Drop extra columns (ColDrop) --> 
        # One hot encoding (OneHotEncode) --> 
            # Drop rows with price < $250,000 (ValDrop) -->
                    # For doing above:
                        # Tag the rows to be removed (temp tagging column)
                        # Drop rows based on tag
                        # Drop the temp tag column

# Stage 1: drop the 'Avg. Area House Age' column from the DataFrame.
drop_house_age = pdp.ColDrop('Avg. Area House Age')

# Stage 2: one-hot encode the categorical 'House_size' column.
encode_house_size = pdp.OneHotEncode('House_size')

# Chain the stages in order; the statements that follow keep appending to `pipeline`.
pipeline = drop_house_age
pipeline += encode_house_size


# Drop rows based on price < $250,000
def price_tag(x, threshold=250000):
    """Tag a price as 'keep' or 'drop' for the row-filtering stage.

    Parameters
    ----------
    x : int or float
        Price to evaluate.
    threshold : int or float, optional
        Minimum price that is kept (default 250000).

    Returns
    -------
    str
        'drop' when x < threshold, otherwise 'keep'. A price exactly equal
        to the threshold is kept, matching the "drop rows with price below
        the threshold" rule. NaN compares False and is tagged 'drop'.
    """
    return 'keep' if x >= threshold else 'drop'
    

# Build the three row-filtering stages:
#   1. write price_tag(Price) into a temporary 'Price_tag' column, keeping 'Price';
#   2. drop every row whose 'Price_tag' value is 'drop';
#   3. remove the temporary 'Price_tag' column again.
tag_rows = pdp.ApplyByCols('Price', price_tag, result_columns='Price_tag', drop=False)
drop_tagged_rows = pdp.ValDrop(['drop'], 'Price_tag')
remove_tag_column = pdp.ColDrop('Price_tag')

# Append them to the pipeline in that order.
pipeline += tag_rows
pipeline += drop_tagged_rows
pipeline += remove_tag_column

# Run the whole pipeline on the raw frame and preview the result.
df2 = pipeline(df)
df2.head()

Unnamed: 0,Avg. Area Income,Avg. Area Number of Rooms,Avg. Area Number of Bedrooms,Area Population,Price,Address,House_size_Medium,House_size_Small
0,79545.458574,7.009188,4.09,23086.800503,1059034.0,"208 Michael Ferry Apt. 674\nLaurabury, NE 3701...",0,0
1,79248.642455,6.730821,3.09,40173.072174,1505891.0,"188 Johnson Views Suite 079\nLake Kathleen, CA...",0,0
2,61287.067179,8.512727,5.13,36882.1594,1058988.0,"9127 Elizabeth Stravenue\nDanieltown, WI 06482...",0,0
3,63345.240046,5.586729,3.26,34310.242831,1260617.0,USS Barnett\nFPO AP 44820,1,0
4,59982.197226,7.839388,4.23,26354.109472,630943.5,USNS Raymond\nFPO AE 09386,0,0


### Scaling in Pipelining - method of Scikit Learn

In [37]:
# The one-hot indicator columns are left out of the scaling step.
one_hot_columns = ['House_size_Medium', 'House_size_Small']
pipeline_scale = pdp.Scale('StandardScaler', exclude_columns=one_hot_columns)

# Apply the scaling stage to the filtered frame.
# Note: in the output shown, Price is scaled along with the feature columns,
# while the text Address column passes through unchanged.
df6 = pipeline_scale(df2)

df6.head()

Unnamed: 0,Avg. Area Income,Avg. Area Number of Rooms,Avg. Area Number of Bedrooms,Area Population,Price,Address,House_size_Medium,House_size_Small
0,1.028113,0.019595,0.087245,-1.32281,-0.500532,"208 Michael Ferry Apt. 674\nLaurabury, NE 3701...",0,0
1,1.000175,-0.257485,-0.722671,0.401923,0.775998,"188 Johnson Views Suite 079\nLake Kathleen, CA...",0,0
2,-0.690443,1.516179,0.929559,0.06973,-0.500662,"9127 Elizabeth Stravenue\nDanieltown, WI 06482...",0,0
3,-0.496719,-1.396285,-0.584986,-0.189886,0.075327,USS Barnett\nFPO AP 44820,1,0
4,-0.813263,0.845954,0.200634,-0.992999,-1.723449,USNS Raymond\nFPO AE 09386,0,0


### Tokenize in Pipelining - Method of NLTK

In [53]:
# Fetch the State from the Address field (the pin code is not extracted here)

def extract_state(token):
    """Return the second-to-last entry of a tokenized address as a string.

    Parameters
    ----------
    token : sequence
        The tokens of a single address. The state is assumed to sit
        immediately before the final token.

    Returns
    -------
    str
        ``str()`` of the element at index -2.

    Raises
    ------
    IndexError
        If `token` has fewer than two entries.
    """
    second_from_end = token[-2]
    return str(second_from_end)

# Stage A: tokenize the 'Address' column, so each address becomes a
# sequence of tokens instead of a single string.
pipeline_tokenize = pdp.TokenizeWords('Address')

# Stage B: pass each tokenized address to extract_state and store the result
# in a 'State' column. In the output shown, 'State' appears in place of 'Address'.
pipeline_state = pdp.ApplyByCols('Address', extract_state, result_columns = 'State')

# Combine both stages into a single pipeline; tokenization must run first
# because extract_state indexes into the token sequence.
pipeline_state_extract = pipeline_tokenize + pipeline_state


# Run the combined pipeline on the scaled frame and preview the result.
df7 = pipeline_state_extract(df6)
df7.head()


# Alternative noted by the original author (not executed here): append the
# second stage with `pipeline_tokenize += pdp.ApplyByCols(...)` and then call
# `pipeline_tokenize(df6)` directly. NOTE(review): equivalence is assumed, not verified.

Unnamed: 0,Avg. Area Income,Avg. Area Number of Rooms,Avg. Area Number of Bedrooms,Area Population,Price,State,House_size_Medium,House_size_Small
0,1.028113,0.019595,0.087245,-1.32281,-0.500532,NE,0,0
1,1.000175,-0.257485,-0.722671,0.401923,0.775998,CA,0,0
2,-0.690443,1.516179,0.929559,0.06973,-0.500662,WI,0,0
3,-0.496719,-1.396285,-0.584986,-0.189886,0.075327,AP,1,0
4,-0.813263,0.845954,0.200634,-0.992999,-1.723449,AE,0,0
