In [14]:
import pandas as pd
import statsmodels.api as sm
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

In [15]:
# Read the Montgomery housing CSV into a dataframe and display it.
montgomery_csv = 'house_info_school_assignments_dropna_transform.csv'
montgomery_df = pd.read_csv(montgomery_csv)
montgomery_df

Unnamed: 0,price,bathrooms,bedrooms,livingArea,homeType,lotAreaValue,zipcode,elem,middle,high
0,800000,4.0,5.0,3650.0,SINGLE_FAMILY,222156.0,20854,wayside elementary,herbert hoover middle,winston churchill high
1,800000,1.0,2.0,1400.0,SINGLE_FAMILY,10890.0,20814,kensington parkwood elementary,tilden middle,walter johnson high
2,800000,3.0,4.0,1826.0,SINGLE_FAMILY,9148.0,20817,bethesda elementary,tilden middle,walt whitman high
3,800000,3.0,4.0,2810.0,SINGLE_FAMILY,9811.0,20854,ritchie park elementary,cabin john middle,winston churchill high
4,800000,4.0,3.0,2304.0,SINGLE_FAMILY,6570.0,20910,rosemary hills elementary,tilden middle,northwood high
...,...,...,...,...,...,...,...,...,...,...
11413,358000,3.0,3.0,1673.0,TOWNHOUSE,2001.0,20879,strawberry knoll elementary,gaithersburg middle,gaithersburg high
11414,358500,4.0,5.0,1386.0,TOWNHOUSE,1650.0,20877,flower hill elementary,gaithersburg middle,gaithersburg high
11415,358888,3.0,3.0,1420.0,TOWNHOUSE,2160.0,20878,summit hall elementary,lakelands park middle,gaithersburg high
11416,359000,2.0,3.0,1188.0,TOWNHOUSE,1206.0,20878,dufief elementary,lakelands park middle,quince orchard high


In [16]:
# Read the school ratings table, retaining just the two columns needed to
# look up a rating by school name.
rating_columns = ['School', 'Rating']
mcps_school_df = pd.read_csv('mcps_school_ratings_clean.csv').loc[:, rating_columns]

In [17]:
# Replace each school-name column (elem / middle / high) with that school's
# rating. For every level the ratings table is joined on the school name,
# the joined 'Rating' column is renamed to '<level>Rating', and the two
# name columns used for the join are dropped.
#
# NOTE: merge() defaults to an inner join, so a house whose school name has
# no match in mcps_school_df is removed from montgomery_df at this step.
for school_level in ('elem', 'middle', 'high'):
    montgomery_df = (
        montgomery_df
        .merge(mcps_school_df, left_on=school_level, right_on='School')
        .rename(columns={'Rating': f'{school_level}Rating'})
        .drop(columns=[school_level, 'School'])
    )

In [18]:
# Columns retained from the zip code tabulation file: the ZCTA code plus
# four percentage columns.
zip_columns = [
    'Census 5-Digit ZCTA Code',
    'Percent Non Hispanic White',
    'Percent Non Hispanic Black',
    'Percent non Hispanic Asian',
    'Percent Hispanic',
]

# Read the zip code data and narrow it to the columns listed above.
zip_df = pd.read_csv('md_zip_tabulations.csv')[zip_columns]

zip_df

Unnamed: 0,Census 5-Digit ZCTA Code,Percent Non Hispanic White,Percent Non Hispanic Black,Percent non Hispanic Asian,Percent Hispanic
0,20601,40.5,46.1,3.6,5.3
1,20602,33.9,52.3,2.9,6.1
2,20603,33.2,51.9,5.0,5.5
3,20606,87.5,10.4,0.9,0.9
4,20607,22.1,64.5,5.7,4.9
...,...,...,...,...,...
408,21918,95.6,1.3,0.7,1.1
409,21919,95.8,1.2,0.2,1.2
410,21920,88.7,6.6,0.0,0.8
411,21921,82.5,8.9,1.5,4.5


In [19]:
# Short feature names for the percentage columns brought in from zip_df.
zip_column_names = {
    'Percent Non Hispanic White': 'percentWhite',
    'Percent Non Hispanic Black': 'percentBlack',
    'Percent non Hispanic Asian': 'percentAsian',
    'Percent Hispanic': 'percentHispanic',
}

# Join the zip-level percentages onto each house by zip code, rename them,
# then drop both key columns.
montgomery_df = (
    montgomery_df
    .merge(zip_df, left_on='zipcode', right_on='Census 5-Digit ZCTA Code')
    .rename(columns=zip_column_names)
    .drop(columns=['zipcode', 'Census 5-Digit ZCTA Code'])
)
montgomery_df

Unnamed: 0,price,bathrooms,bedrooms,livingArea,homeType,lotAreaValue,elemRating,middleRating,highRating,percentWhite,percentBlack,percentAsian,percentHispanic
0,800000,4.0,5.0,3650.0,SINGLE_FAMILY,222156.000000,8.0,9.0,8.0,69.0,4.5,18.2,5.5
1,802909,2.0,5.0,2188.0,SINGLE_FAMILY,77513.999999,8.0,9.0,8.0,69.0,4.5,18.2,5.5
2,825000,3.0,4.0,3050.0,SINGLE_FAMILY,87120.000000,8.0,9.0,8.0,69.0,4.5,18.2,5.5
3,1120000,6.0,5.0,5399.0,SINGLE_FAMILY,111949.000016,8.0,9.0,8.0,69.0,4.5,18.2,5.5
4,1150000,5.0,5.0,5168.0,SINGLE_FAMILY,145054.999984,8.0,9.0,8.0,69.0,4.5,18.2,5.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...
11413,675000,4.0,5.0,3846.0,SINGLE_FAMILY,30771.000014,6.0,4.0,5.0,30.4,42.6,9.0,14.8
11414,650000,3.0,3.0,2594.0,SINGLE_FAMILY,267894.000000,6.0,4.0,5.0,30.4,42.6,9.0,14.8
11415,1250000,7.0,7.0,8170.0,SINGLE_FAMILY,87991.000016,6.0,4.0,5.0,30.4,42.6,9.0,14.8
11416,745000,4.0,5.0,5300.0,SINGLE_FAMILY,87991.000016,6.0,4.0,5.0,30.4,42.6,9.0,14.8


In [20]:
# One-hot encode the categorical homeType column, then discard every row
# that still contains a missing value.
# NOTE(review): every homeType level gets its own dummy column, so the
# dummies always sum to 1 and are collinear with a model intercept —
# consider drop_first=True if individual coefficients are to be interpreted.
montgomery_df = pd.get_dummies(montgomery_df, columns=["homeType"]).dropna()
montgomery_df

Unnamed: 0,price,bathrooms,bedrooms,livingArea,lotAreaValue,elemRating,middleRating,highRating,percentWhite,percentBlack,percentAsian,percentHispanic,homeType_CONDO,homeType_SINGLE_FAMILY,homeType_TOWNHOUSE
0,800000,4.0,5.0,3650.0,222156.000000,8.0,9.0,8.0,69.0,4.5,18.2,5.5,0,1,0
1,802909,2.0,5.0,2188.0,77513.999999,8.0,9.0,8.0,69.0,4.5,18.2,5.5,0,1,0
2,825000,3.0,4.0,3050.0,87120.000000,8.0,9.0,8.0,69.0,4.5,18.2,5.5,0,1,0
3,1120000,6.0,5.0,5399.0,111949.000016,8.0,9.0,8.0,69.0,4.5,18.2,5.5,0,1,0
4,1150000,5.0,5.0,5168.0,145054.999984,8.0,9.0,8.0,69.0,4.5,18.2,5.5,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11413,675000,4.0,5.0,3846.0,30771.000014,6.0,4.0,5.0,30.4,42.6,9.0,14.8,0,1,0
11414,650000,3.0,3.0,2594.0,267894.000000,6.0,4.0,5.0,30.4,42.6,9.0,14.8,0,1,0
11415,1250000,7.0,7.0,8170.0,87991.000016,6.0,4.0,5.0,30.4,42.6,9.0,14.8,0,1,0
11416,745000,4.0,5.0,5300.0,87991.000016,6.0,4.0,5.0,30.4,42.6,9.0,14.8,0,1,0


In [21]:
# The target is the price column; every remaining column is a feature.
target_column = 'price'
X = montgomery_df.drop(columns=[target_column])
Y = montgomery_df[target_column]
print(f'Separating features and target: {X.shape} | {Y.shape}')

Separating features and target: (11263, 14) | (11263,)


In [22]:
# Hold out 20% of the rows for testing and train on the other 80%.
# random_state is fixed so the split is the same on every run.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=100
)
print(
    f'After splitting data: X train:{X_train.shape}, Y train: {Y_train.shape}, '
    f'X test: {X_test.shape}, Y test: {Y_test.shape}'
)

After splitting data: X train:(9010, 14), Y train: (9010,), X test: (2253, 14), Y test: (2253,)


In [23]:
# Create a linear regression estimator, then fit it to the training split.
lin_model = LinearRegression()
lin_model.fit(X=X_train, y=Y_train)

LinearRegression()

In [24]:
# Predict prices for the held-out test rows with the fitted linear model.
Y_pred = lin_model.predict(X=X_test)

In [25]:
# Show the fitted intercept and the per-feature coefficients
# (coefficients follow the column order of X_train).
intercept, coefficients = lin_model.intercept_, lin_model.coef_
print(f'Model intercept: {intercept}, Model coefficients: {coefficients}')

Model intercept: -3320702.1335540167, Model coefficients: [ 6.50782785e+04  9.11571882e+03  1.04591294e+02  6.36901574e-02
  4.60110616e+03  5.60981411e+03  5.40088253e+04  3.38381681e+04
  2.88889265e+04  2.58355945e+04  3.29739344e+04 -6.56711554e+04
  9.59734283e+04 -3.03022729e+04]


In [26]:
# Print MSE, RMSE and R^2 for the linear model.

# Predict inside this cell rather than reading the global Y_pred: that name
# is rebound by the random forest cell further down, so on a re-run it may
# no longer hold the linear model's predictions.
lin_test_pred = lin_model.predict(X_test)

# Mean squared error on the test set, and its square root (RMSE), which is
# expressed in the same units as the target.
lin_mse = mean_squared_error(Y_test, lin_test_pred)
print (f'Mean squared error: {lin_mse:.2f}')
print (f'Root mean squared error: {lin_mse ** 0.5:.2f}')

# For a regressor, score() returns the coefficient of determination (R^2);
# that is the quantity labelled "accuracy" below.
print (f'Training accuracy: {lin_model.score(X_train, Y_train):.2f}')
print (f'Testing accuracy: {lin_model.score(X_test, Y_test):.2f}')

Mean squared error: 48671898581.47
Training accuracy: 0.76
Testing accuracy: 0.68


In [27]:
# The sklearn linear model fit doesn't give p-value information. To get this info and display
# it in a table, we will use statsmodels instead.
# Source: https://stackoverflow.com/questions/27928275/find-p-value-significance-in-scikit-learn-linearregression
# (statsmodels is imported as `sm` in the import cell at the top of the notebook.)

# sm.OLS does not add an intercept on its own, so prepend a constant column.
# NOTE(review): X_train holds a dummy column for every homeType level; together
# with this constant the design matrix is rank deficient, so the individual
# dummy/constant coefficients and their p-values should be read with caution.
X2_train = sm.add_constant(X_train)
lin_mod = sm.OLS(Y_train, X2_train)

# Fit by ordinary least squares and print the summary table.
fii = lin_mod.fit()
summary = fii.summary2()
print (summary)

                              Results: Ordinary least squares
Model:                       OLS                     Adj. R-squared:            0.755      
Dependent Variable:          price                   AIC:                       245030.1977
Date:                        2021-09-16 22:27        BIC:                       245129.6829
No. Observations:            9010                    Log-Likelihood:            -1.2250e+05
Df Model:                    13                      F-statistic:               2141.      
Df Residuals:                8996                    Prob (F-statistic):        0.00       
R-squared:                   0.756                   Scale:                     3.7813e+10 
-------------------------------------------------------------------------------------------
                           Coef.       Std.Err.     t    P>|t|      [0.025        0.975]   
-------------------------------------------------------------------------------------------
const             

In [28]:
# Now, fit Random Forest models.

# Compare random forest performance across different numbers of estimators
# (n_estimators from 3 to 30 inclusive).
# NOTE(review): the models are compared on the test split, so the best
# test score found here is an optimistic estimate; a validation split or
# cross-validation would keep the test set untouched.

x_val = []  # n_estimators tried
y_val = []  # test-set R^2 for the corresponding n_estimators

for n in range(3, 31):
    # 'absolute_error' is the current spelling of the criterion formerly named
    # 'mae' (deprecated in scikit-learn 1.0 and removed in 1.2).
    rf_model = RandomForestRegressor(criterion='absolute_error', n_estimators=n, random_state=50)
    rf_model = rf_model.fit(X_train, Y_train)
    Y_pred = rf_model.predict(X_test)

    # score() on a regressor is R^2. For the test split it is computed from
    # the predictions already made above instead of predicting again.
    train_r2 = rf_model.score(X_train, Y_train)
    test_r2 = r2_score(Y_test, Y_pred)

    x_val.append(n)
    y_val.append(test_r2)

    # The mean squared error and the R^2 ("accuracy") on both splits.
    print (f'n {n} | MSE {mean_squared_error(Y_test, Y_pred):.2f} | '
           f'    Training Accuracy {train_r2:.2f} | Testing Accuracy {test_r2:.2f}')


n 3 | MSE 28660646425.82 |     Training Accuracy 0.95 | Testing Accuracy 0.81
n 4 | MSE 27610975839.04 |     Training Accuracy 0.96 | Testing Accuracy 0.82
n 5 | MSE 26467827696.33 |     Training Accuracy 0.96 | Testing Accuracy 0.83
n 6 | MSE 25358382493.50 |     Training Accuracy 0.97 | Testing Accuracy 0.83
n 7 | MSE 24611472308.32 |     Training Accuracy 0.97 | Testing Accuracy 0.84
n 8 | MSE 24002497245.14 |     Training Accuracy 0.97 | Testing Accuracy 0.84
n 9 | MSE 23854230869.81 |     Training Accuracy 0.97 | Testing Accuracy 0.84
n 10 | MSE 23732961939.58 |     Training Accuracy 0.97 | Testing Accuracy 0.84
n 11 | MSE 23784154943.17 |     Training Accuracy 0.97 | Testing Accuracy 0.84
n 12 | MSE 23411437744.25 |     Training Accuracy 0.97 | Testing Accuracy 0.85
n 13 | MSE 23284622558.53 |     Training Accuracy 0.98 | Testing Accuracy 0.85
n 14 | MSE 23136417413.40 |     Training Accuracy 0.98 | Testing Accuracy 0.85
n 15 | MSE 23006486159.23 |     Training Accuracy 0.98 | Te