# House price regression example adapted to use pandas and scikit-learn

In [1]:
import pandas as pd

In [2]:
# Load the house sales data; index_col=0 makes the first CSV column the row index.
data_set = pd.read_csv('home_data.csv', index_col=0)

In [3]:
# Summarize the frame: column names, non-null counts, dtypes and memory usage.
data_set.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 21613 entries, 7129300520 to 1523300157
Data columns (total 20 columns):
date             21613 non-null object
price            21613 non-null int64
bedrooms         21613 non-null int64
bathrooms        21613 non-null float64
sqft_living      21613 non-null int64
sqft_lot         21613 non-null int64
floors           21613 non-null float64
waterfront       21613 non-null int64
view             21613 non-null int64
condition        21613 non-null int64
grade            21613 non-null int64
sqft_above       21613 non-null int64
sqft_basement    21613 non-null int64
yr_built         21613 non-null int64
yr_renovated     21613 non-null int64
zipcode          21613 non-null int64
lat              21613 non-null float64
long             21613 non-null float64
sqft_living15    21613 non-null int64
sqft_lot15       21613 non-null int64
dtypes: float64(4), int64(15), object(1)
memory usage: 3.5+ MB


In [4]:
# Separate the prediction target from the remaining columns, both as NumPy arrays.
target_column = 'price'
Y = data_set[target_column].values
X = data_set.drop(target_column, axis=1).values

In [5]:
# Display only the first row to see what a single record looks like.
data_set.head(1)

Unnamed: 0_level_0,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
7129300520,20141013T000000,221900,3,1.0,1180,5650,1.0,0,0,3,7,1180,0,1955,0,98178,47.5112,-122.257,1340,5650


In [6]:
# Mean sale price for the selected zip code.
dataset_sales_for_zip_code = data_set.loc[data_set['zipcode'] == 98039]
# DataFrame.info() writes its summary to stdout and returns None, so it is
# called directly; wrapping it in print() would emit a stray "None" line.
dataset_sales_for_zip_code.info()
print(dataset_sales_for_zip_code['price'].mean())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 50 entries, 3625049014 to 3262300818
Data columns (total 20 columns):
date             50 non-null object
price            50 non-null int64
bedrooms         50 non-null int64
bathrooms        50 non-null float64
sqft_living      50 non-null int64
sqft_lot         50 non-null int64
floors           50 non-null float64
waterfront       50 non-null int64
view             50 non-null int64
condition        50 non-null int64
grade            50 non-null int64
sqft_above       50 non-null int64
sqft_basement    50 non-null int64
yr_built         50 non-null int64
yr_renovated     50 non-null int64
zipcode          50 non-null int64
lat              50 non-null float64
long             50 non-null float64
sqft_living15    50 non-null int64
sqft_lot15       50 non-null int64
dtypes: float64(4), int64(15), object(1)
memory usage: 8.2+ KB
None
2160606.6


In [7]:
# Keep the houses whose living area lies in the range (2000, 4000] square feet:
# strictly greater than 2000 and at most 4000.
living_area = data_set['sqft_living']
in_sqft_range = (living_area > 2000) & (living_area <= 4000)
data_set_for_sqft_living = data_set.loc[in_sqft_range]
data_set_for_sqft_living.shape

(9118, 20)

In [8]:
# Fraction of all houses whose 'sqft_living' falls in the selected range.
# float() keeps this a true division even under Python 2.
float(len(data_set_for_sqft_living)) / len(data_set)

0.42187572294452413

In [9]:
# split data
from sklearn import model_selection
from sklearn import linear_model
from sklearn.metrics import mean_squared_error
from math import sqrt

validation_size = 0.20
seed = 0

my_features = ['bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors', 'zipcode']

# The advanced set extends the basic one, so the shared columns are listed only once.
advanced_features = my_features + [
    'condition',      # condition of house
    'grade',          # measure of quality of construction
    'waterfront',     # waterfront property
    'view',           # type of view
    'sqft_above',     # square feet above ground
    'sqft_basement',  # square feet in basement
    'yr_built',       # the year built
    'yr_renovated',   # the year renovated
    'lat', 'long',    # the lat-long of the parcel
    'sqft_living15',  # average sq.ft. of 15 nearest neighbors
    'sqft_lot15',     # average lot size of 15 nearest neighbors
]


def fit_and_score(features_ds, target, test_size=validation_size, random_state=seed):
    """Fit a linear regression on a train/test split and report the test RMSE.

    Prints the shape of ``features_ds``, the shapes of the train and test
    feature sets, and the root mean squared error on the test set.

    Parameters
    ----------
    features_ds : the feature columns to train on (a DataFrame slice here).
    target : the values to predict, aligned row-for-row with ``features_ds``.
    test_size : fraction of rows held out for testing.
    random_state : seed passed to ``train_test_split`` so the split is repeatable.

    Returns
    -------
    ((X_train, X_test, Y_train, Y_test), regr, Y_pred, rmse)
        The split, the fitted model, its test-set predictions and the RMSE.
    """
    print(features_ds.shape)

    split = model_selection.train_test_split(
        features_ds, target, test_size=test_size, random_state=random_state)
    X_train, X_test, Y_train, Y_test = split
    print(X_train.shape)
    print(X_test.shape)

    regr = linear_model.LinearRegression()

    # Train the model using the training set
    regr.fit(X_train, Y_train)

    # Make predictions using the testing set
    Y_pred = regr.predict(X_test)

    rmse = sqrt(mean_squared_error(Y_test, Y_pred))
    print("Root mean squared error: %.2f" % rmse)
    return tuple(split), regr, Y_pred, rmse


# Compute the RMSE (root mean squared error) on the test data for the model using just my_features.
my_features_ds = data_set[my_features]
(X_train, X_test, Y_train, Y_test), regr, Y_pred, rmse_my_features = fit_and_score(my_features_ds, Y)

# Compute the RMSE (root mean squared error) on the test data for the model using advanced_features.
# The names below are deliberately reassigned so that, as before, they hold the
# results of the advanced-features run once this cell has finished.
my_features_ds = data_set[advanced_features]
(X_train, X_test, Y_train, Y_test), regr, Y_pred, rmse_advanced_features = fit_and_score(my_features_ds, Y)

(21613, 6)
(17290, 6)
(4323, 6)
Root mean squared error: 244004.77
(21613, 18)
(17290, 18)
(4323, 18)
Root mean squared error: 190473.38
