In [1]:
#Nolan Winkler, Predicting Housing Prices

#import libraries and tools
import pandas as pd
import numpy as np
import csv as csv 
import statsmodels.api as sm
import matplotlib.pyplot as plt
from statsmodels.sandbox.regression.predstd import wls_prediction_std
import matplotlib as mpl
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from matplotlib import cm

#get out the data
df_full = pd.read_csv('trainHousing.csv')

df_full

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.97617,0.0,21.89,0,0.6240,5.757,98.4,2.3460,4,437,21.2,262.76,17.31,15.6
1,0.07875,45.0,3.44,0,0.4370,6.782,41.1,3.7886,5,398,15.2,393.87,6.68,32.0
2,0.04590,52.5,5.32,0,0.4050,6.315,45.6,7.3172,6,293,16.6,396.90,7.60,22.3
3,4.66883,0.0,18.10,0,0.7130,5.976,87.9,2.5806,24,666,20.2,10.48,19.01,12.7
4,0.11432,0.0,8.56,0,0.5200,6.781,71.3,2.8561,5,384,20.9,395.58,7.67,26.5
5,6.71772,0.0,18.10,0,0.7130,6.749,92.6,2.3236,24,666,20.2,0.32,17.44,13.4
6,3.56868,0.0,18.10,0,0.5800,6.437,75.0,2.8965,24,666,20.2,393.37,14.36,23.2
7,0.06724,0.0,3.24,0,0.4600,6.333,17.2,5.2146,4,430,16.9,375.21,7.34,22.6
8,0.26938,0.0,9.90,0,0.5440,6.266,82.8,3.2628,4,304,18.4,393.39,7.90,21.6
9,1.13081,0.0,8.14,0,0.5380,5.713,94.1,4.2330,4,307,21.0,360.17,22.60,12.7


In [2]:
df_full['MEDV']

0      15.6
1      32.0
2      22.3
3      12.7
4      26.5
5      13.4
6      23.2
7      22.6
8      21.6
9      12.7
10     18.6
11     22.2
12     19.0
13     34.9
14     13.1
15     17.6
16     24.1
17     21.2
18     29.6
19     13.6
20     14.9
21     23.1
22     16.2
23     18.7
24     18.0
25     20.6
26     22.9
27     32.5
28      7.2
29     50.0
       ... 
324    15.6
325    24.4
326    15.6
327    18.7
328    23.0
329    11.9
330    20.6
331    19.5
332    21.2
333    29.9
334    30.1
335    25.0
336    24.1
337    24.5
338    20.1
339    20.4
340    22.7
341    21.7
342    22.1
343    17.4
344     8.3
345    24.7
346    20.1
347    20.9
348    15.2
349     8.4
350    18.6
351    18.9
352    30.7
353    33.0
Name: MEDV, dtype: float64

In [3]:
#look at a histogram of quantity we want to predict
plt.hist(df_full.MEDV, bins=10)

(array([  16.,   38.,   53.,  116.,   53.,   30.,   20.,    6.,    8.,   14.]),
 array([  5. ,   9.5,  14. ,  18.5,  23. ,  27.5,  32. ,  36.5,  41. ,
         45.5,  50. ]),
 <a list of 10 Patch objects>)

In [4]:
#See which rows have nulls for value we're predicting
df_full[df_full['MEDV'].isnull()]

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV


In [5]:
#investigate which variables seem to be correlated with median housing value
plt.scatter(df_full.CRIM, df_full.MEDV)
#seemingly have two classes/samples if we get high or low enough: blacks, crime, tax

<matplotlib.collections.PathCollection at 0x7f0a1c11b9b0>

In [6]:
plt.scatter(df_full.RM, df_full.MEDV)
#seemingly linear: room, distance

<matplotlib.collections.PathCollection at 0x7f0a1c0db390>

In [7]:
plt.scatter(df_full.AGE, df_full.MEDV)

<matplotlib.collections.PathCollection at 0x7f0a1c0dbef0>

In [8]:
plt.scatter(df_full.LSTAT, df_full.MEDV)
#logistic

<matplotlib.collections.PathCollection at 0x7f0a1c0ca668>

In [9]:
#turn pandas df into numpy array
df_full.dtypes

CRIM       float64
ZN         float64
INDUS      float64
CHAS         int64
NOX        float64
RM         float64
AGE        float64
DIS        float64
RAD          int64
TAX          int64
PTRATIO    float64
B          float64
LSTAT      float64
MEDV       float64
dtype: object

In [10]:
train_data = df_full.values
train_data

array([[  9.76170000e-01,   0.00000000e+00,   2.18900000e+01, ...,
          2.62760000e+02,   1.73100000e+01,   1.56000000e+01],
       [  7.87500000e-02,   4.50000000e+01,   3.44000000e+00, ...,
          3.93870000e+02,   6.68000000e+00,   3.20000000e+01],
       [  4.59000000e-02,   5.25000000e+01,   5.32000000e+00, ...,
          3.96900000e+02,   7.60000000e+00,   2.23000000e+01],
       ..., 
       [  1.70040000e-01,   1.25000000e+01,   7.87000000e+00, ...,
          3.86710000e+02,   1.71000000e+01,   1.89000000e+01],
       [  7.85700000e-01,   2.00000000e+01,   3.97000000e+00, ...,
          3.84070000e+02,   1.47900000e+01,   3.07000000e+01],
       [  1.95100000e-02,   1.75000000e+01,   1.38000000e+00, ...,
          3.93240000e+02,   8.05000000e+00,   3.30000000e+01]])

In [11]:
#Get testing data too
df_test = pd.read_csv('testHousing.csv')

df_test

test_data = df_full.values
test_data

array([[  9.76170000e-01,   0.00000000e+00,   2.18900000e+01, ...,
          2.62760000e+02,   1.73100000e+01,   1.56000000e+01],
       [  7.87500000e-02,   4.50000000e+01,   3.44000000e+00, ...,
          3.93870000e+02,   6.68000000e+00,   3.20000000e+01],
       [  4.59000000e-02,   5.25000000e+01,   5.32000000e+00, ...,
          3.96900000e+02,   7.60000000e+00,   2.23000000e+01],
       ..., 
       [  1.70040000e-01,   1.25000000e+01,   7.87000000e+00, ...,
          3.86710000e+02,   1.71000000e+01,   1.89000000e+01],
       [  7.85700000e-01,   2.00000000e+01,   3.97000000e+00, ...,
          3.84070000e+02,   1.47900000e+01,   3.07000000e+01],
       [  1.95100000e-02,   1.75000000e+01,   1.38000000e+00, ...,
          3.93240000e+02,   8.05000000e+00,   3.30000000e+01]])

In [14]:
#normalize the data
from sklearn import preprocessing
X_orig = train_data[:, [5, 7, 12]]
Y_orig = train_data[:, [13]]
X = preprocessing.scale(X_orig)
Y = preprocessing.scale(Y_orig)

X_test = preprocessing.scale(test_data[:, [5, 7, 12]])
Y_test = preprocessing.scale(test_data[:, [13]])




In [17]:
#run an OLS regression on room, distance, and lower status
from sklearn import linear_model
regr = linear_model.LinearRegression()
regr.fit(X, Y)


LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [19]:
#see our results
from sklearn.metrics import mean_squared_error
RMSE1 = mean_squared_error(regr.predict(X_test), Y_test)**0.5
print(RMSE1)


0.568269552588


In [35]:
#run a decision tree regression on room, distance, lower status, crime,
#blacks, radius, lower status, and tax
X_original = train_data[:, [0, 5, 7, 8, 10, 11, 12]]
Y_original = train_data[:, [13]]
X = preprocessing.scale(X_original)
Y = preprocessing.scale(Y_original)

X_test = preprocessing.scale(test_data[:, [0, 5, 7, 8, 10, 11, 12]])
Y_test = preprocessing.scale(test_data[:, [13]])

from sklearn import tree
clf = tree.DecisionTreeRegressor()
clf = clf.fit(X, Y)


In [45]:
#see our results
predics = clf.predict(X_test)
RMSE2 = mean_squared_error(predics, Y_test)**0.5
print(RMSE2)

2.55510784169e-18
