# Building the Chicago Model Pipeline

In [1]:
import os

import joblib # to save the fitted model for deployment
import numpy as np
import pandas as pd
from sklearn.compose import make_column_transformer # to facilitate the building of a pipeline
from sklearn.linear_model import LinearRegression # data model
from sklearn.model_selection import cross_val_score # to score the model
from sklearn.pipeline import make_pipeline # to build a pipeline
from sklearn.preprocessing import LabelEncoder # to encode categorical variables
from sklearn.preprocessing import OneHotEncoder # to make dummy features

In [2]:
# read the Chicago data; the first CSV column is used as the row index
chicago_df = pd.read_csv('../Datasets/chicago_home_w_region.csv', index_col=0)

In [3]:
# preview the first 5 rows to check which columns were loaded
display(chicago_df.head())

Unnamed: 0,price,sqfeet,beds,baths,cats_allowed,dogs_allowed,smoking_allowed,wheelchair_access,electric_vehicle_charge,comes_furnished,laundry_options,parking_options,lat,long,region
80565,2910.0,1500.0,3.0,2.0,1,1,0,0,0,0,w/d in unit,street parking,41.8962,-87.6685,9.0
80776,1200.0,800.0,2.0,1.0,1,0,1,0,0,0,unknown,unknown,41.9019,-87.6779,9.0
80885,2600.0,715.0,1.0,1.0,1,1,0,1,0,0,w/d in unit,attached garage,41.9039,-87.6346,1.0
80893,2100.0,565.0,0.0,1.0,1,1,0,1,0,0,w/d in unit,attached garage,41.9039,-87.6346,1.0
81255,1873.0,547.0,0.0,1.0,1,1,0,1,0,0,w/d in unit,attached garage,41.8967,-87.627,1.0


In [4]:
# (number of rows, number of columns) of the Chicago data
print(chicago_df.shape)

(729, 15)


In [5]:
# target variable: the listing price
y = chicago_df['price']

# independent variables: everything except the target and the raw coordinates
X = chicago_df.drop(columns=['price', 'lat', 'long'])

# one-hot encode the categorical columns and pass every other column through untouched;
# handle_unknown='ignore' lets the encoder accept categories it did not see while fitting
categorical_columns = ['region', 'laundry_options', 'parking_options']
column_transformer = make_column_transformer(
    (OneHotEncoder(handle_unknown='ignore'), categorical_columns),
    remainder='passthrough',
)

# preprocessing followed by a plain linear regression
lr_model = LinearRegression()
chicago_pipeline = make_pipeline(column_transformer, lr_model)

# cross-validated r2 of the whole pipeline
scores = cross_val_score(chicago_pipeline, X, y, scoring='r2')
average_score = round(scores.mean(), 4)

print('Cross-val scores: {}'.format(scores))
print('Average cross-val score: {}'.format(average_score))

# final fit on all of the Chicago data; the fitted pipeline is the cell's output
chicago_pipeline.fit(X, y)

Cross-val scores: [0.73203703 0.74253047 0.54587459 0.76248539 0.70411099]
Average cross-val score: 0.6974


Pipeline(steps=[('columntransformer',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('onehotencoder',
                                                  OneHotEncoder(handle_unknown='ignore'),
                                                  ['region', 'laundry_options',
                                                   'parking_options'])])),
                ('linearregression', LinearRegression())])

In [6]:
# start from a single all-zero row that has exactly the model's feature columns
blank_row = pd.DataFrame(data=[np.zeros(len(X.columns))], columns=X.columns)

# overwrite the features we know about; anything not listed stays at 0
apartment_info = blank_row.assign(
    sqfeet=1250,
    beds=2,
    baths=1,
    cats_allowed=1,
    dogs_allowed=1,
    comes_furnished=1,
    laundry_options='laundry in bldg',
    parking_options='street parking',
    region=9,
)

In [7]:
# show the single-row frame that will be passed to the model
apartment_info

Unnamed: 0,sqfeet,beds,baths,cats_allowed,dogs_allowed,smoking_allowed,wheelchair_access,electric_vehicle_charge,comes_furnished,laundry_options,parking_options,region
0,1250,2,1,1,1,0.0,0.0,0.0,1,laundry in bldg,street parking,9


In [8]:
# apartment_info holds one row, so take the first (only) element of the prediction
chicago_pipeline.predict(apartment_info)[0]

1923.7011984016979

In [9]:
# predict once, then report the price rounded to the nearest dollar
predicted_price = chicago_pipeline.predict(apartment_info)[0]
print('The predicted price is ${} per month'.format(round(predicted_price)))

The predicted price is $1924 per month


# Building the National Model Pipeline

In [10]:
# read both cleaned housing files, using the first CSV column as the row index
df1, df2 = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in ('../Datasets/clean_housing_1.csv', '../Datasets/clean_housing_2.csv')
)

# stack the two frames into one
national_df = pd.concat([df1, df2])

print(national_df.shape) # confirm the shape

(367357, 17)


In [11]:
# view 5 random rows; the fixed seed makes a Restart & Run All show the same rows every time
display(national_df.sample(5, random_state=42))

Unnamed: 0,region,price,type,sqfeet,beds,baths,cats_allowed,dogs_allowed,smoking_allowed,wheelchair_access,electric_vehicle_charge,comes_furnished,laundry_options,parking_options,lat,long,state
145452,ann arbor,1448.0,apartment,476.0,0.0,1.0,1,1,0,1,1,0,w/d in unit,attached garage,42.2517,-83.7232,MI
366264,little rock,635.0,apartment,958.0,1.0,1.0,0,0,1,0,0,0,w/d hookups,unknown,34.8807,-92.2303,AR
353156,mobile,630.0,apartment,700.0,1.0,1.0,1,1,1,0,0,0,laundry on site,unknown,30.6762,-88.1844,AL
13144,ventura county,2150.0,apartment,893.0,2.0,2.0,1,1,1,0,0,0,w/d in unit,carport,34.2712,-119.207,CA
169165,jackson,750.0,apartment,896.0,1.0,1.0,1,1,1,1,0,0,w/d hookups,off-street parking,32.2221,-90.2319,MS


In [12]:
def subset_by_state(state, df=None):
    """Return the rows of a listings frame that belong to one state.

    Parameters
    ----------
    state : str
        Value to match against the ``state`` column (e.g. ``'IL'``).
    df : pandas.DataFrame, optional
        Frame to filter; it must contain a ``state`` column. When omitted, the
        notebook-level ``national_df`` is used, so existing one-argument calls
        keep working.

    Returns
    -------
    pandas.DataFrame
        The matching rows with their original index labels; empty when no
        row matches.
    """
    if df is None:
        # fall back to the notebook global only when no frame is supplied
        df = national_df

    # boolean mask: keep the rows whose state equals the requested one
    return df[df['state'] == state]

In [13]:
# map '<state>_df' -> the rows of that state, one entry per distinct value in the state column
d = {
    "{}_df".format(state_code): subset_by_state(state_code)
    for state_code in national_df['state'].unique()
}

In [14]:
# key -> [fitted pipeline, independent variables, target] for every state frame in `d`
pipelines = {}

for key, state_df in d.items():

    # X and y are rebound on every pass, so after the loop they hold the last state's data
    X = state_df.drop(columns=['price', 'state', 'lat', 'long'])
    y = state_df['price']

    # one-hot encode the categorical columns; every other column passes through unchanged
    column_transformer = make_column_transformer(
        (OneHotEncoder(handle_unknown='ignore'),
         ['region', 'type', 'laundry_options', 'parking_options']),
        remainder='passthrough',
    )

    lr_model = LinearRegression()

    pipeline = make_pipeline(column_transformer, lr_model)

    # fit() returns the pipeline itself, so the fitted model is what gets stored
    pipelines[key] = [pipeline.fit(X, y), X, y]

In [15]:
# each entry of `pipelines` is [fitted model, independent variables, target]; unpack California's
ca_model, ca_features, ca_prices = pipelines['CA_df']

print('Here is our fitted model for California: \n', ca_model, '\n')
print('Here are the independent variables for California: \n', ca_features, '\n')
print('Here are the housing prices for California: \n', ca_prices, '\n')

Here is our fitted model for California: 
 Pipeline(steps=[('columntransformer',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('onehotencoder',
                                                  OneHotEncoder(handle_unknown='ignore'),
                                                  ['region', 'type',
                                                   'laundry_options',
                                                   'parking_options'])])),
                ('linearregression', LinearRegression())]) 

Here are the independent variables for California: 
               region       type  sqfeet  beds  baths  cats_allowed  \
0       reno / tahoe  apartment  1078.0   3.0    2.0             1   
1       reno / tahoe      condo  1001.0   2.0    2.0             0   
2       reno / tahoe  apartment  1683.0   2.0    2.0             1   
3       reno / tahoe  apartment   708.0   1.0    1.0             1   
5       reno / tahoe  ap

In [16]:
# We can now make predictions with the per-state models. Let's see what price the Illinois model gives for our desired Chicago unit.

In [17]:
# Use the column layout the Illinois model was fitted with. It is read from the stored
# training features rather than from `X`, which only holds whatever the training loop left behind.
il_feature_columns = pipelines['IL_df'][1].columns

# describe the unit. The data will be a little different since we are using the national model
unit_features = {
    'region': 'chicago',
    'type': 'apartment',
    'sqfeet': 1250,
    'beds': 2,
    'baths': 1,
    'cats_allowed': 1,
    'dogs_allowed': 1,
    'comes_furnished': 1,
    'laundry_options': 'laundry in bldg',
    'parking_options': 'street parking',
}

# fail loudly on a misspelled feature name instead of silently leaving the real feature at 0
unexpected = sorted(set(unit_features) - set(il_feature_columns))
if unexpected:
    raise KeyError('Unknown feature column(s): {}'.format(unexpected))

# single-row frame in training column order; features not listed above default to 0.0
national_apartment_info = pd.DataFrame([unit_features]).reindex(
    columns=il_feature_columns, fill_value=0.0
)

In [18]:
# we will use the Illinois model to get the price
il_model = pipelines['IL_df'][0]
statewide_price = il_model.predict(national_apartment_info)[0]

print('The statewide model predicts the price to be ${}'.format(round(statewide_price)))

The statewide model predicts the price to be $1862


In [19]:
# cross-validate the Illinois pipeline on the Illinois features and prices stored next to it
il_model, il_features, il_prices = pipelines['IL_df']
scores = cross_val_score(il_model, il_features, il_prices, scoring='r2')

print('All r2 scores:', scores)
print('Average r2 score:', round(scores.mean(), 4))

All r2 scores: [0.26338422 0.60235769 0.39579624 0.19720403 0.47088651]
Average r2 score: 0.3859


In [20]:
# The model performs better at an individual region level: the Chicago-only model has an average cross-val r2 of
# 0.6974, while the Illinois statewide model averages 0.3859. It is therefore more likely that the previous
# price/model is closer to the correct price. This is likely due to the differences in price across the regions
# within a state and the limitations of the linear regression model.

# Model Deployment

In [21]:
# we now have a deployable pipeline we can put into an application

In [22]:
# where the fitted Chicago pipeline is written for deployment
MODEL_PATH = '../Models/chicago_model.ml'

# create the target folder if it is missing so the dump does not fail on a fresh checkout
os.makedirs(os.path.dirname(MODEL_PATH), exist_ok=True)

joblib.dump(chicago_pipeline, MODEL_PATH) # save the Chicago model for deployment

['../Models/chicago_model.ml']