In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Import dependencies
import pandas as pd
from pathlib import Path
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression

In [3]:
# Read the cleaned master CSV into a DataFrame and display it.
# The path is relative, so it resolves against the current working directory.
file_path = Path('./Resources/Clean_MC_Master.csv')
df = pd.read_csv(filepath_or_buffer=file_path)
df

Unnamed: 0,MLSNumber,Address,SoldPrice,CurrentPrice,ListDate,SettledDate,#ofStories,City,Zip Code,Subdivision,New Construction YN,Age,InteriorSqFt,Bedrooms,Baths,Garage YN,Structure Type
0,1002388281,9701 Fields Rd #1806,"$127,000","$129,900",11/9/2015,1/4/2016,Main,Gaithersburg,20878,WASHINGTON TOWER CODM,No,1966,446.0,0.0,1.0,No,Unit/Flat/Apartment
1,1002388133,2211 Washington Ave #W-102,"$202,000","$207,000",11/9/2015,1/4/2016,Main,Silver Spring,20910,ROCK CREEK APTS CODM 2,No,1948,671.0,1.0,1.0,No,Unit/Flat/Apartment
2,1002384775,3117 University Blvd W #B4,"$139,900","$139,900",10/28/2015,1/4/2016,Main,Kensington,20895,MONTGOMERY CENTURY,No,1973,754.0,1.0,1.0,No,Unit/Flat/Apartment
3,1002382327,10201 Grosvenor Pl #210,"$195,000","$199,900",10/15/2015,1/4/2016,Main,Rockville,20852,GROSVENOR PARK,No,1972,851.0,1.0,1.0,No,Unit/Flat/Apartment
4,1002382267,10301 Rossmore Ct,"$840,000","$850,000",10/22/2015,1/4/2016,"Lower1,Lower2,Main,Upper1",Bethesda,20814,WILDWOOD KNOLLS,No,1963,3060.0,4.0,4.0,Yes,Detached
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
61797,MDMC2005770,1 Paca Pl,"$625,000","$625,000",7/22/2021,9/16/2021,"Main,Upper1",Rockville,20852,HUNGERFORD,No,1955,2237.0,4.0,3.0,Yes,Detached
61798,MDMC753990,1108 Clagett Dr,"$499,500","$509,000",7/15/2021,9/16/2021,Main,Rockville,20851,ROCKCREST,No,1951,1457.0,3.0,3.0,Yes,Detached
61799,MDMC2003756,11307 Galt Ave,"$410,000","$445,000",7/26/2021,9/16/2021,"Lower1,Main,Upper1",Silver Spring,20902,WHEATON HILLS,No,1950,1872.0,4.0,2.0,No,Detached
61800,MDMC763464,8809 Thomas Lea Ter,"$400,000","$374,900",6/24/2021,9/16/2021,"Lower1,Main,Upper1",Montgomery Village,20886,THE REACH,No,1986,2160.0,4.0,4.0,No,Interior Row/Townhouse


In [4]:
# Remove the columns that are not used as model inputs: the MLS identifier,
# the address/location fields, both date columns, CurrentPrice and #ofStories.
unused_columns = [
    'MLSNumber',
    'Zip Code',
    'Address',
    'ListDate',
    'SettledDate',
    'Subdivision',
    'City',
    'CurrentPrice',
    '#ofStories',
]
housing_df = df.drop(columns=unused_columns)
housing_df.head()

Unnamed: 0,SoldPrice,New Construction YN,Age,InteriorSqFt,Bedrooms,Baths,Garage YN,Structure Type
0,"$127,000",No,1966,446.0,0.0,1.0,No,Unit/Flat/Apartment
1,"$202,000",No,1948,671.0,1.0,1.0,No,Unit/Flat/Apartment
2,"$139,900",No,1973,754.0,1.0,1.0,No,Unit/Flat/Apartment
3,"$195,000",No,1972,851.0,1.0,1.0,No,Unit/Flat/Apartment
4,"$840,000",No,1963,3060.0,4.0,4.0,Yes,Detached


In [5]:
# Convert SoldPrice from currency text (e.g. "$127,000") to integers.
#
# The '$' and ',' characters are removed with an explicit regex character
# class. A bare '$' pattern is the end-of-string anchor when interpreted as a
# regex, and the default value of `regex` differs between pandas versions, so
# the mode is spelled out rather than left to the default.
#
# Casting to str first keeps this cell re-runnable: once the column already
# holds integers the `.str` accessor would otherwise raise.
housing_df['SoldPrice'] = (
    housing_df['SoldPrice']
    .astype(str)
    .str.replace(r'[$,]', '', regex=True)
    .astype(int)
)
housing_df.head()

Unnamed: 0,SoldPrice,New Construction YN,Age,InteriorSqFt,Bedrooms,Baths,Garage YN,Structure Type
0,127000,No,1966,446.0,0.0,1.0,No,Unit/Flat/Apartment
1,202000,No,1948,671.0,1.0,1.0,No,Unit/Flat/Apartment
2,139900,No,1973,754.0,1.0,1.0,No,Unit/Flat/Apartment
3,195000,No,1972,851.0,1.0,1.0,No,Unit/Flat/Apartment
4,840000,No,1963,3060.0,4.0,4.0,Yes,Detached


In [10]:
# One-hot encode the text columns into indicator columns.
# drop_first=True omits the first category of each encoded column.
clean_housing_df = pd.get_dummies(data=housing_df, drop_first=True)
clean_housing_df.head()

Unnamed: 0,SoldPrice,Age,InteriorSqFt,Bedrooms,Baths,New Construction YN_Yes,Garage YN_Yes,Structure Type _End of Row/Townhouse,Structure Type _Garage/Parking Space,Structure Type _Interior Row/Townhouse,Structure Type _Other,Structure Type _Penthouse Unit/Flat/Apartment,Structure Type _Twin/Semi-Detached,Structure Type _Unit/Flat/Apartment
0,127000,1966,446.0,0.0,1.0,0,0,0,0,0,0,0,0,1
1,202000,1948,671.0,1.0,1.0,0,0,0,0,0,0,0,0,1
2,139900,1973,754.0,1.0,1.0,0,0,0,0,0,0,0,0,1
3,195000,1972,851.0,1.0,1.0,0,0,0,0,0,0,0,0,1
4,840000,1963,3060.0,4.0,4.0,0,1,0,0,0,0,0,0,0


In [12]:
# Target vector: the sale price.
y = clean_housing_df['SoldPrice']

# Feature matrix: every column except the target.
X = clean_housing_df.drop(columns=['SoldPrice'])

In [13]:
# Hold out 20% of the rows for testing; the fixed random_state makes the
# split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=100,
)

In [21]:
# Build an ordinary linear regression and fit it to the training split.
# The fit call is left as the last expression so the estimator repr is shown.
lin_model = LinearRegression()
lin_model.fit(X=X_train, y=y_train)

LinearRegression()

In [22]:
# Predict sale prices for the held-out rows with the linear model.
y_pred = lin_model.predict(X=X_test)

In [24]:
# Report the linear model's test-set mean squared error (MSE, not RMSE) and
# its score on each split. For a regressor, `score` returns the coefficient of
# determination (R^2); the printed labels call it "accuracy".
lin_test_mse = mean_squared_error(y_test, y_pred)
lin_train_score = lin_model.score(X_train, y_train)
lin_test_score = lin_model.score(X_test, y_test)

print(f'Mean squared error: {lin_test_mse:.2f}')
print(f'Training accuracy: {lin_train_score:.2f}')
print(f'Testing accuracy: {lin_test_score:.2f}')

Mean squared error: 88932876941.51
Training accuracy: 0.58
Testing accuracy: 0.55


In [25]:
# Configure a random forest regressor: 100 trees, fixed seed for repeatability.
rf_model = RandomForestRegressor(
    n_estimators=100,
    random_state=50,
)

In [26]:
# Fit the random forest to the training split.
# Left as the last expression so the fitted estimator's repr is displayed.
rf_model.fit(X=X_train, y=y_train)

RandomForestRegressor(random_state=50)

In [27]:
# Predict sale prices for the held-out rows with the random forest.
# Note: this rebinds y_pred, replacing the linear model's predictions.
y_pred = rf_model.predict(X=X_test)

In [28]:
# Report the random forest's test-set mean squared error and its score on each
# split. For a regressor, `score` returns the coefficient of determination
# (R^2); the printed labels call it "accuracy".
rf_test_mse = mean_squared_error(y_test, y_pred)
rf_train_score = rf_model.score(X_train, y_train)
rf_test_score = rf_model.score(X_test, y_test)

print(f'Mean squared error: {rf_test_mse:.2f}')
print(f'Training accuracy: {rf_train_score:.2f}')
print(f'Testing accuracy: {rf_test_score:.2f}')

Mean squared error: 57881731208.67
Training accuracy: 0.96
Testing accuracy: 0.70


In [29]:
# Pull the per-feature importance scores from the fitted random forest.
# The array has one entry per training feature, in training-column order.
importances = rf_model.feature_importances_
importances

array([1.42325888e-01, 3.26769263e-01, 3.11058039e-02, 4.52832888e-01,
       6.37162483e-03, 1.02622977e-02, 2.42804858e-03, 3.78447842e-03,
       1.24678274e-03, 2.99677397e-03, 2.25021576e-04, 1.96511294e-02])

In [30]:
# We can sort the features by their importance.
sorted(zip(rf_model.feature_importances_, X.columns), reverse=True)

[(0.45283288790706727, 'Baths'),
 (0.32676926311416893, 'InteriorSqFt'),
 (0.14232588785475234, 'Age'),
 (0.031105803908322283, 'Bedrooms'),
 (0.01965112935447664, 'Structure Type _Unit/Flat/Apartment'),
 (0.010262297739324296, 'Garage YN_Yes'),
 (0.006371624832307156, 'New Construction YN_Yes'),
 (0.003784478422969912, 'Structure Type _Interior Row/Townhouse'),
 (0.002996773973222297, 'Structure Type _Penthouse Unit/Flat/Apartment'),
 (0.002428048578741688, 'Structure Type _End of Row/Townhouse'),
 (0.0012467827391099337, 'Structure Type _Other'),
 (0.00022502157553724186, 'Structure Type _Twin/Semi-Detached')]