Objective: to predict housing prices in New York City using various regression models

Data: NYC housing data from Kaggle, originally from Zillow API as of January 2021

1. Imported data

2. Cleaned dataset to be more readible, to ensure right data types for regression, and to drop outliers

3. Ran LASSO Regression 100 times to optimize and get model with greatest R^2 value -- resulting R^2 score of around 0.45

4. Ran Random Forest Regression to get model -- resulting R^2 of around 0.76
    
5. Exported Random Forest Regression model for later use, then loaded model to run on test data

In [1]:
import os
import pandas as pd
import numpy as np
from scipy import stats
import pickle
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score,mean_squared_error,mean_absolute_error

In [2]:
df = pd.read_csv(os.path.join("..", "clean_ny_housing.csv"))

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [3]:
# cleaning the dataset

def makeAdjustments(df):
    
    # changing photo columns to a single count column -- 'photos/'
    photo_count = []
    for i in range(len(df)):
        temp = 0
        for j in range(81): #81 photo columns in df
            col = 'photos/' + str(j)
            if pd.notna(df[col][i]):
                temp += 1
        photo_count.append(temp)
    df['photo_count'] = photo_count
    df = df.drop(df.loc[:,'photos/0':'photos/80'], axis = 1)
    
    # changing appliance columns to a single count column -- 'resoFactsStats/appliances/'
    appliance_count = []
    for i in range(len(df)):
        temp = 0
        for j in range(11): #11 photo columns in df
            col = 'resoFactsStats/appliances/' + str(j)
            if pd.notna(df[col][i]):
                temp += 1
        appliance_count.append(temp)
    df['appliance_count'] = photo_count
    df = df.drop(df.loc[:, 'resoFactsStats/appliances/0':'resoFactsStats/appliances/10'].columns, axis = 1)
    
    # changing community features columns to a single count column -- 'resoFactsStats/communityFeatures/0'
    commfeature_count = []
    for i in range(len(df)):
        temp = 0
        for j in range(6): #6 photo columns in df
            col = 'resoFactsStats/communityFeatures/' + str(j)
            if pd.notna(df[col][i]):
                temp += 1
        commfeature_count.append(temp)
    df['commfeature_count'] = photo_count
    df = df.drop(df.loc[:, 'resoFactsStats/communityFeatures/0':'resoFactsStats/communityFeatures/5'].columns, axis = 1)
    
    # remove not heating from 'resoFactsStats/atAGlanceFacts/2/factLabel' 
    # remove not cooling from 'resoFactsStats/atAGlanceFacts/3/factLabel'
    # remove not parking from 'resoFactsStats/atAGlanceFacts/4/factValue'
    # remove not days on zillow from 'resoFactsStats/atAGlanceFacts/6/factLabel'
    todrop = []
    for i in range(len(df)):
        if(
            df['resoFactsStats/atAGlanceFacts/2/factLabel'][i] != 'Heating' or 
            df['resoFactsStats/atAGlanceFacts/3/factLabel'][i] != 'Cooling' or
            df['resoFactsStats/atAGlanceFacts/4/factLabel'][i] != 'Parking' or
            df['homeStatus'][i] != 'FOR_SALE'
        ):
                todrop.append(i)
    df = df.drop(todrop, axis=0)
    df = df.drop(['resoFactsStats/atAGlanceFacts/2/factLabel', 'resoFactsStats/atAGlanceFacts/3/factLabel', 'resoFactsStats/atAGlanceFacts/4/factLabel', 
                  'homeStatus'], axis=1)
    
    # renaming columns above and reassigning as booleans
    df = df.rename(columns={
        'resoFactsStats/atAGlanceFacts/2/factValue': 'Heating',
        'resoFactsStats/atAGlanceFacts/3/factValue': 'Cooling',
        'resoFactsStats/atAGlanceFacts/4/factValue': 'Parking',
    })
    
    df['Heating'] = [pd.notna(val) for val in df['Heating']]
    df['Cooling'] = [pd.notna(val) for val in df['Cooling']]
    df['Parking'] = [pd.notna(val) for val in df['Parking']]
    
    # assigning dummy to categorial variables
    df['address/city'].str.lower()
    
    cat_variables = df[['address/city', 'resoFactsStats/atAGlanceFacts/0/factValue']]
    cat_dummies = pd.get_dummies(cat_variables)
    df.drop(['address/city', 'resoFactsStats/atAGlanceFacts/0/factValue'], axis=1, inplace=True)
    df = pd.concat([df, cat_dummies], axis=1)
    
    df = df.drop(['address/zipcode', 'latitude', 'longitude', 'Heating', 'Cooling', 'Parking', 'dateposted'], axis = 1)
    # why is that comma at the end of the drop list raising R^2???
    
    # typecasting any objects to numeric
    df['resoFactsStats/atAGlanceFacts/1/factValue'] = pd.to_numeric(df['resoFactsStats/atAGlanceFacts/1/factValue'])
    
    # drop rows with na
    df = df.dropna()
    
    # columns = ['bedrooms', 'bathrooms', 'livingArea', 'price', 'propertyTaxRate', 'resoFactsStats/atAGlanceFacts/1/factValue', 'photo_count', 'appliance_count', 'commfeature_count']
    # for i in columns:
    #     df2[(np.abs(stats.zscore(df2[i])) < 2)]
    df[(np.abs(stats.zscore(df['bedrooms'])) < 2)]
    df[(np.abs(stats.zscore(df['bathrooms'])) < 2)]
    df[(np.abs(stats.zscore(df['livingArea'])) < 2)]
    df[(np.abs(stats.zscore(df['price'])) < 2)]
    df[(np.abs(stats.zscore(df['propertyTaxRate'])) < 2)]
    df[(np.abs(stats.zscore(df['resoFactsStats/atAGlanceFacts/1/factValue'])) < 2)]
    df[(np.abs(stats.zscore(df['photo_count'])) < 2)]
    df[(np.abs(stats.zscore(df['appliance_count'])) < 2)]
    df[(np.abs(stats.zscore(df['commfeature_count'])) < 2)]

    return df

LASSO Regression

In [4]:
def runRegression(df):
    # creating train and test data set
    train , test = train_test_split(df2, test_size = 0.3)

    x_train = train.drop('price', axis=1)
    y_train = train['price']

    x_test = test.drop('price', axis = 1)
    y_test = test['price']

    # define model
    model = Lasso(alpha=1.0)
    # fit model
    model.fit(x_train, y_train)
    
    # get r^2
    y_pred = model.predict(x_test)
    score = r2_score(y_test, y_pred)
    
    return model, score

In [5]:
maxscore = 0
model = ''
df2 = makeAdjustments(df)
for i in range(100):
    result = runRegression(df)
    if result[1] > maxscore:
        maxscore = result[1]
        model = result[0]

print('score: ', maxscore)

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = c

score:  0.45470483931104055


  model = cd_fast.enet_coordinate_descent(


Random Forest Linear Regression

In [6]:
df2 = makeAdjustments(df)

# creating train and test data set
train , test = train_test_split(df2, test_size = 0.3, random_state=42)

x_train = train.drop('price', axis=1)
y_train = train['price']

x_test = test.drop('price', axis = 1)
y_test = test['price']
    
# fit regressor for random forest linear regression
model = RandomForestRegressor(n_estimators = 100, random_state = 0)
model.fit(x_train, y_train) 
y_pred = model.predict(x_test)

# # scoring using rmse
# mse = mean_squared_error(y_test, y_pred)
# rmse = np.sqrt(mse)
# print('RMSE:', rmse)

# # scoring using mae
# mae = mean_absolute_error(y_test, y_pred)
# print('MAE: ', mae)

# save the model to disk
pickle.dump(model, open('nyc_housing_regression.sav', 'wb'))

In [7]:
loaded_model = pickle.load(open('nyc_housing_regression.sav', 'rb'))
print('score: ', r2_score(y_test, y_pred))

score:  0.7580404997577922
