# Building the Chicago Model

In [14]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder # to make dummy features
from sklearn.compose import make_column_transformer # to make dummy features
from sklearn.linear_model import LinearRegression # data model
from sklearn.pipeline import make_pipeline # to develop a pipeline
from sklearn.model_selection import cross_val_score # to score the model

In [3]:
df = pd.read_csv('../Datasets/chicago_home_w_region.csv', index_col=0) # read Chicago data

In [5]:
display(df.head()) # view the first 5 rows

Unnamed: 0,price,sqfeet,beds,baths,cats_allowed,dogs_allowed,smoking_allowed,wheelchair_access,electric_vehicle_charge,comes_furnished,laundry_options,parking_options,lat,long,region
80565,2910.0,1500.0,3.0,2.0,1,1,0,0,0,0,w/d in unit,street parking,41.8962,-87.6685,9.0
80776,1200.0,800.0,2.0,1.0,1,0,1,0,0,0,unknown,unknown,41.9019,-87.6779,9.0
80885,2600.0,715.0,1.0,1.0,1,1,0,1,0,0,w/d in unit,attached garage,41.9039,-87.6346,1.0
80893,2100.0,565.0,0.0,1.0,1,1,0,1,0,0,w/d in unit,attached garage,41.9039,-87.6346,1.0
81255,1873.0,547.0,0.0,1.0,1,1,0,1,0,0,w/d in unit,attached garage,41.8967,-87.627,1.0


In [6]:
print(df.shape) # get the data shape

(729, 15)

In [12]:
X = df.drop(['price', 'lat', 'long'], axis=1) # extract independent variables
y = df['price'] # extract the target variable

# build the transformation encoder
column_transformer = make_column_transformer((OneHotEncoder(handle_unknown='ignore'), 
                                              ['region', 'laundry_options', 'parking_options']),
                                             remainder='passthrough')

lr_model = LinearRegression() # instantiate the model

pipeline = make_pipeline(column_transformer, lr_model) # nuild the pipeline

scores = cross_val_score(pipeline, X, y, scoring='r2') # get scores

print('Cross-val scores: {}'.format(scores))
print('Average cross-val score: {}'.format(scores.mean()))

pipeline.fit(X, y)

Cross-val scores: [0.73203703 0.74253047 0.54587459 0.76248539 0.70411099]
Average cross-val score: 0.6974076950990286


Pipeline(steps=[('columntransformer',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('onehotencoder',
                                                  OneHotEncoder(handle_unknown='ignore'),
                                                  ['region', 'laundry_options',
                                                   'parking_options'])])),
                ('linearregression', LinearRegression())])

In [15]:
# create an empy pandas dataframe we can add to
apartment_info = pd.DataFrame(data=[np.zeros(len(X_test.columns))], columns=X_test.columns)

# fill the dataframe.
apartment_info['sqfeet'] = 1250
apartment_info['beds'] = 2
apartment_info['baths'] = 1
apartment_info['cats_allowed'] = 1
apartment_info['dogs_allowed'] = 1
apartment_info['comes_furnished'] = 1
apartment_info['laundry_options_laundry in bldg'] = 1
apartment_info['parking_options_street parking'] = 1
apartment_info['region_9.0'] = 1

NameError: name 'X_test' is not defined

In [None]:
pipeline.predict

# By All States

In [3]:
def subset_by_state(state):
    """Seperate the states from the rest"""

    # seperate the state from the others
    subset = df[df['state'] == state]
    
    #return a dataframe for the desired state
    return subset

In [4]:
d = {}
for state in df.state.unique():
    d["{}_df".format(state)] = subset_by_state(state)

In [17]:
piplines = {}

for key in d.keys():
    
    X = d[str(key)].drop(['price', 'state', 'lat', 'long'], axis=1)
    y = d[str(key)]['price']

    column_transformer = make_column_transformer((OneHotEncoder(handle_unknown='ignore'), 
                                              ['region', 'type', 'laundry_options', 'parking_options']),
                                             remainder='passthrough')

    lr_model = LinearRegression()

    pipeline = make_pipeline(column_transformer, lr_model)

    piplines[str(key)] = pipeline.fit(X, y)

In [26]:
coordinates = df[['lat', 'long', 'price']].values

In [27]:
coordinates

array([[  39.5483, -119.796 , 1148.    ],
       [  39.5026, -119.789 , 1200.    ],
       [  39.6269, -119.708 , 1813.    ],
       ...,
       [  39.5358, -119.746 , 1249.    ],
       [  39.5585, -119.703 , 1429.    ],
       [  39.4477, -119.771 , 1295.    ]])

In [None]:
# Plot the data for visualization
import folium # import folium

national_map = folium.Map(location=[41.8641,-87.6298],
                         tiles="CartoDB dark_matter",
                         zoom_start=11) # build the Chicago map

# assign a color to each cluster
for x, y, price in coordinates:
    # add the colored icons to our map
    folium.Marker(location=[x,y],
                          icon=folium.Icon(icon_color='white', icon='home'),
                          popup=price).add_to(national_map)

# view the map. Here we can see the locations of all the apartments. Click on the home icon and you will see the price
national_map 

In [None]:
r_scores = pd.DataFrame([*zip([item for item in d.keys()], [item for item in scores])],
                       columns=['state', 'r2_score'])

r_scores.sort_values('r2_score', ascending=False)

In [None]:
column_transformer = make_column_transformer((OneHotEncoder(handle_unknown='ignore'), 
                                              ['region', 'type', 'laundry_options', 'parking_options']),
                                             remainder='passthrough')

In [None]:
lr_model = LinearRegression()

In [None]:
pipeline = make_pipeline(column_transformer, lr_model)

In [None]:
cross_val_score(pipeline, X, y, cv=5, scoring='r2')

In [None]:
pipeline.fit(X, y)

In [None]:
d['IL_df']['type'].value_counts()

In [None]:
y[:20]

In [None]:
pipeline.predict(X)

In [None]:
import matplotlib.pyplot as plt

In [None]:
plt.scatter(pipeline.predict(X), y)

In [None]:
X.isna().sum()

In [None]:
d['IL_df'].reset_index(inplace=True)

ohe = OneHotEncoder() # istantiate the one hot encoder

# fit and transform the data
ohe_trans = ohe.fit_transform(d['IL_df'][['region', 'type', 'laundry_options', 'parking_options']]).toarray()

In [None]:
cat_df = pd.DataFrame(ohe_trans, columns=ohe.get_feature_names())

In [None]:
encoded_df = pd.merge(cat_df, d['IL_df'].drop(['region',
                                               'type',
                                               'laundry_options',
                                               'parking_options',
                                               'state'], axis=1), left_index=True, right_index=True)

In [None]:
X = df_with_dummies.drop('price', axis=1).reset_index(drop=True) # extract the independent variables
y = df_with_dummies['price'].reset_index(drop=True) # extract the dependent variable

In [None]:
from sklearn.compose import ColumnTransformer

In [None]:
column_trans = ColumnTransformer(OneHotEncoder(), 
                                 (d['IL_df'],['region', 'type', 'laundry_options', 'parking_options']),
                                 remainder='passthrough')

In [None]:
column_trans.fit_transform(d['IL_df'])

In [None]:
from sklearn.pipeline import make_pipeline

In [None]:
ohe_trans.shape

In [None]:
d['IL_df'][['region', 'type', 'laundry_options', 'parking_options']] = ohe_trans

In [None]:
ohe.fit_transform(d['IL_df'][['region', 'type', 'laundry_options', 'parking_options']]).toarray()

In [None]:
objects = pd.DataFrame(model, columns=ohe.get_feature_names())

In [None]:
d['IL_df']

In [None]:
objects.merge

In [None]:
ohe.get_feature_names()

In [None]:
fitted_model.get_feature_names

In [None]:
d['IL_df'].select_dtypes('object')

In [None]:
ohe = OneHotEncoder()

ohe.fit(d['IL_df'])

In [None]:
ohe.transform(d['IL_df']).toarray()

In [None]:
ohe.get_feature_names().shape

In [None]:
d['IL_df']

In [None]:
ohe

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
scores = []

for key in d.keys():
        
    X = d[str(key)].drop(['price', 'state'], axis=1)
    y = d[str(key)]['price']
    
    dummy = pd.get_dummies(X)
    
    #X_train, X_test, y_train, y_test = train_test_split(dummy, y, test_size=0.2)
    
    model = LinearRegression()
    
    model.fit(X_train, y_train)
        
    scores.append(cross_val_score(model, dummy, y, cv=5, scoring='r2'))

In [None]:

        
X = d['HI_df'].drop(['price', 'state'], axis=1)
y = d['HI_df']['price']
    
dummy = pd.get_dummies(X)
    
    #X_train, X_test, y_train, y_test = train_test_split(dummy, y, test_size=0.2)
    
model = LinearRegression()
            
cross_val_score(model, dummy, y, cv=5, scoring='r2')

In [None]:
model.coef_

In [None]:
r_scores = pd.DataFrame([*zip([item for item in d.keys()], [item for item in scores])],
                       columns=['state', 'r2_score'])

r_scores.sort_values('r2_score', ascending=False).head(10)

In [None]:
d['WY_df']

In [None]:
scores

In [None]:
X = d['IL_df'].drop(['price', 'state'], axis=1)
y = d['IL_df']['price']

In [None]:
dummy = pd.get_dummies(X)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(dummy, y, test_size=0.2)

In [None]:
model = LinearRegression()

In [None]:
model.fit(X_train, y_train)

In [None]:
model.score(X_test, y_test)

In [None]:
r_scores = pd.DataFrame([*zip([item for item in d.keys()], [item for item in scores])],
                       columns=['state', 'r2_score'])

r_scores.sort_values('r2_score', ascending=False)