In [1]:
%matplotlib notebook
%matplotlib inline
import numpy as np
import pandas as pd
import seaborn as sn
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import MinMaxScaler

In [2]:
# Load complete_redfin.csv into a DataFrame. The path is relative, so it is
# resolved against the kernel's current working directory.
redfin = pd.read_csv(filepath_or_buffer='complete_redfin.csv')

In [3]:
# Preview the first five rows to sanity-check the load and see the raw value formats.
redfin.head(n=5)

Unnamed: 0,SOLD DATE,PROPERTY TYPE,ADDRESS,CITY,STATE OR PROVINCE,ZIP OR POSTAL CODE,PRICE,BEDS,BATHS,LOCATION,...,patio,hasPool,taxdue,frontFaceDirection,yearRemodled,propertyHistory,schoolserving,walkscore,bikescore,transitscore
0,2018-01-12,Condo/Co-op,960 E 100 S #C3,Salt Lake City,UT,84102.0,220000,2.0,2.0,Salt Lake City; So. Salt Lake,...,,['0'],"['$1,286']",['N'],2000.0,"[('Sold (MLS)', 1515744000000), ('Contingent',...","[('Wasatch School', 'K to 6', 7), ('Bryant Mid...",70.0,76.0,58.0
1,2019-02-16,Condo/Co-op,241 N Vine St #1202W,Salt Lake City,UT,84103.0,400000,2.0,2.0,Salt Lake City: Avenues Area,...,['1'],['1'],"['$2,303']",,,"[('Delisted', 1553497200000), ('Listed', 15504...","[('Washington School', 'Preschool to 6', 5), (...",79.0,68.0,71.0
2,2019-07-05,Single Family Residential,351 E Ramona Ave S,Salt Lake City,UT,84115.0,380500,2.0,1.0,Salt Lake City; So. Salt Lake,...,,,,,,[],[],58.0,83.0,57.0
3,2018-12-17,Single Family Residential,1272 N Catherine St,Salt Lake City,UT,84116.0,260000,3.0,1.0,Salt Lake City; Rose Park,...,,,,,,[],[],45.0,70.0,36.0
4,2019-06-13,Single Family Residential,370 N Chicago St,Salt Lake City,UT,84116.0,325000,3.0,1.0,Salt Lake City; Rose Park,...,['1'],['0'],"['$1,175']",['W'],1988.0,"[('Sold (MLS)', 1560409200000), ('Pending', 15...","[('Jackson School', 'Preschool to 6', 3), ('Br...",53.0,83.0,51.0


In [4]:
# Print each column's non-null count and dtype to see where values are missing.
redfin.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 38 columns):
SOLD DATE             9994 non-null object
PROPERTY TYPE         10000 non-null object
ADDRESS               9945 non-null object
CITY                  10000 non-null object
STATE OR PROVINCE     10000 non-null object
ZIP OR POSTAL CODE    9999 non-null float64
PRICE                 10000 non-null int64
BEDS                  9902 non-null float64
BATHS                 9881 non-null float64
LOCATION              9940 non-null object
SQUARE FEET           9901 non-null float64
LOT SIZE              9627 non-null float64
YEAR BUILT            9901 non-null float64
$/SQUARE FEET         9901 non-null float64
HOA/MONTH             2331 non-null float64
STATUS                9994 non-null object
URL                   10000 non-null object
SOURCE                9994 non-null object
MLS#                  9994 non-null float64
LATITUDE              10000 non-null float64
LONGITUDE        

## What can I do to improve the data quality for future analysis?

So far we have only used 12 of the 38 columns. How can we use the other columns in our data?

Many of the columns hold binary values, so we could encode them as 0 or 1 and then use them in the analysis of each home.

Binary Columns:
- hasPool - This column has some null values, so we may have to assume those homes do not have a pool.

Categorical Columns (more than two categories):
- driveway
- patio

In [5]:
# List every column label so the columns not yet used in the analysis can be identified.
redfin.columns

Index(['SOLD DATE', 'PROPERTY TYPE', 'ADDRESS', 'CITY', 'STATE OR PROVINCE',
       'ZIP OR POSTAL CODE', 'PRICE', 'BEDS', 'BATHS', 'LOCATION',
       'SQUARE FEET', 'LOT SIZE', 'YEAR BUILT', '$/SQUARE FEET', 'HOA/MONTH',
       'STATUS', 'URL', 'SOURCE', 'MLS#', 'LATITUDE', 'LONGITUDE', 'filename',
       'marketingRemark', 'propertyStyle', 'garageCap', 'parkingCap',
       'driveway', 'roof', 'patio', 'hasPool', 'taxdue', 'frontFaceDirection',
       'yearRemodled', 'propertyHistory', 'schoolserving', 'walkscore',
       'bikescore', 'transitscore'],
      dtype='object')

In [None]:
# Labels of the twelve numeric columns used so far in the analysis.
# Every string must match the DataFrame's column labels exactly; note that
# 'yearRemodled' is spelled the way it appears in the CSV header.
continous_cols = [
    # room counts and sale price
    'BEDS',
    'BATHS',
    'PRICE',
    # size, age and recurring cost
    'SQUARE FEET',
    'LOT SIZE',
    'YEAR BUILT',
    '$/SQUARE FEET',
    'HOA/MONTH',
    'yearRemodled',
    # neighbourhood scores
    'walkscore',
    'bikescore',
    'transitscore',
]

In [13]:
# Inspect the raw garageCap column: it has object dtype, with entries that
# print like ['1'] and NaN where no value is recorded.
redfin.loc[:, 'garageCap']

0       ['1']
1       ['1']
2         NaN
3         NaN
4       ['1']
5       ['2']
6       ['1']
7         NaN
8         NaN
9         NaN
10        NaN
11        NaN
12      ['1']
13      ['2']
14        NaN
15      ['1']
16        NaN
17      ['1']
18        NaN
19      ['1']
20      ['2']
21      ['1']
22        NaN
23      ['2']
24      ['1']
25        NaN
26        NaN
27        NaN
28        NaN
29      ['2']
        ...  
9970    ['2']
9971      NaN
9972    ['2']
9973      NaN
9974      NaN
9975      NaN
9976    ['2']
9977      NaN
9978    ['1']
9979    ['1']
9980      NaN
9981      NaN
9982      NaN
9983      NaN
9984    ['2']
9985    ['1']
9986      NaN
9987    ['1']
9988      NaN
9989      NaN
9990    ['2']
9991      NaN
9992      NaN
9993      NaN
9994      NaN
9995      NaN
9996      NaN
9997      NaN
9998      NaN
9999      NaN
Name: garageCap, Length: 10000, dtype: object

In [10]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
redfin.describe()

Unnamed: 0,ZIP OR POSTAL CODE,PRICE,BEDS,BATHS,SQUARE FEET,LOT SIZE,YEAR BUILT,$/SQUARE FEET,HOA/MONTH,MLS#,LATITUDE,LONGITUDE,filename,yearRemodled,walkscore,bikescore,transitscore
count,9999.0,10000.0,9902.0,9881.0,9901.0,9627.0,9901.0,9901.0,2331.0,9994.0,10000.0,10000.0,10000.0,4091.0,9990.0,9990.0,9990.0
mean,84109.69687,388125.4,3.36528,1.988665,1999.613776,8385.466,1956.44238,202.836077,248.788503,1535191.0,40.72314,-111.883092,4999.5,1998.819115,53.069469,62.386186,41.086286
std,7.315977,238366.7,1.403236,0.93955,1090.987266,74191.85,30.813936,70.229641,153.63142,462327.7,0.044113,0.06616,2886.89568,7.110104,21.912648,21.325282,14.54922
min,84013.0,6000.0,0.0,0.5,120.0,435.0,1860.0,13.0,0.0,1249628.0,40.5949,-112.06989,0.0,1973.0,0.0,0.0,0.0
25%,84105.0,243000.0,2.0,1.5,1320.0,4356.0,1937.0,155.0,150.0,1458616.0,40.688959,-111.917247,2499.75,1994.0,40.0,49.0,35.0
50%,84108.0,329000.0,3.0,1.75,1793.0,6534.0,1955.0,194.0,205.0,1518199.0,40.730167,-111.870293,4999.5,1998.0,55.0,63.0,38.0
75%,84116.0,455000.0,4.0,2.5,2352.0,8276.0,1978.0,236.0,301.0,1572594.0,40.759169,-111.848681,7499.25,2003.0,69.0,79.0,46.75
max,84401.0,4600000.0,48.0,32.0,24108.0,6969600.0,2019.0,1242.0,1744.0,11907120.0,40.827154,-111.580563,9999.0,2019.0,96.0,105.0,80.0
