In [28]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import style
style.use('ggplot')
%matplotlib inline 
import seaborn as sns 

from scipy import stats

from sklearn.linear_model import LinearRegression 
from sklearn import neighbors 
from sklearn.metrics import mean_squared_error 
from sklearn import preprocessing 

In [16]:
df = pd.read_csv("data/kc_house_data.csv", parse_dates = ['date'])
df.head()

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,7129300520,2014-10-13,221900.0,3,1.0,1180,5650,1.0,0,0,...,7,1180,0,1955,0,98178,47.5112,-122.257,1340,5650
1,6414100192,2014-12-09,538000.0,3,2.25,2570,7242,2.0,0,0,...,7,2170,400,1951,1991,98125,47.721,-122.319,1690,7639
2,5631500400,2015-02-25,180000.0,2,1.0,770,10000,1.0,0,0,...,6,770,0,1933,0,98028,47.7379,-122.233,2720,8062
3,2487200875,2014-12-09,604000.0,4,3.0,1960,5000,1.0,0,0,...,7,1050,910,1965,0,98136,47.5208,-122.393,1360,5000
4,1954400510,2015-02-18,510000.0,3,2.0,1680,8080,1.0,0,0,...,8,1680,0,1987,0,98074,47.6168,-122.045,1800,7503


## Clean and Drop Useless Columns

In [17]:
df.drop(['id', 'date'], axis = 1, inplace = True)

In [18]:
df['basement_present'] = df['sqft_basement'].apply(lambda x: 1 if x > 0 else 0)
df['renovated'] = df['yr_renovated'].apply(lambda x: 1 if x > 0 else 0)

In [19]:
# ENCODE:
categorial_cols = ['floors', 'view', 'condition', 'grade']

for cc in categorial_cols:
    dummies = pd.get_dummies(df[cc], drop_first=False)
    dummies = dummies.add_prefix("{}#".format(cc))
    df.drop(cc, axis=1, inplace=True)
    df = df.join(dummies)

In [20]:
dummies_zipcodes = pd.get_dummies(df['zipcode'], drop_first=False)
dummies_zipcodes.reset_index(inplace=True)
dummies_zipcodes = dummies_zipcodes.add_prefix("{}#".format('zipcode'))
dummies_zipcodes = dummies_zipcodes[['zipcode#98004','zipcode#98102','zipcode#98109','zipcode#98112','zipcode#98039','zipcode#98040']]
df.drop('zipcode', axis=1, inplace=True)
df = df.join(dummies_zipcodes)

In [32]:
def linear_model(train, test, input_feature):
    LR = LinearRegression()
    LR.fit(train.as_matrix(columns = [input_feature]), train.as_matrix(columns = ['price']))
    RMSE = mean_squared_error(test.as_matrix(columns = ['price']), 
                              LR.predict(test.as_matrix(columns = [input_feature])))**0.5
    return RMSE, LR.intercept_[0], LR.coef_[0][0]

In [33]:
from sklearn.cross_validation import train_test_split
train_data, test_data = train_test_split(df, train_size = 0.8, random_state = 10)

In [76]:
RMSE, b, m = linear_model(train_data, test_data, 'sqft_living')
print "Squared Error: ",RMSE
print "---------------------------"
print "Linear Equation for Price with input x (sqft_living)"
print "y =",m,"*x + ",b

Squared Error:  268279.643883
---------------------------
Linear Equation for Price with input x (sqft_living)
y = 277.36412987 *x +  -36738.1773464


## That is just sqft, we want all of the important ones

In [87]:
def multiple_linear_model(train, test, input_features):
    LR = LinearRegression() 
    LR.fit(train.as_matrix(columns = input_features), train.as_matrix(columns = ['price'])) 
    RMSE = mean_squared_error(test.as_matrix(columns = ['price']), 
                              LR.predict(test.as_matrix(columns = input_features)))**0.5 
    print "Accuracy", LR.score(train.as_matrix(columns = input_features), train.as_matrix(columns = ['price']))
    return RMSE, LR.intercept_[0], LR.coef_

In [88]:
RSME, b, m = multiple_linear_model(train_data, test_data, ['sqft_living','bathrooms','bedrooms'])
print "Squared Error: ",RMSE
print "---------------------------"
print "Linear Equation for Price"
print "y = {}*x_sqft + {}*x_bathrooms + {}*x_bedrooms + {}".format(m[0][0],m[0][1],m[0][2],b)

Accuracy 0.503279922539
Squared Error:  268279.643883
---------------------------
Linear Equation for Price
y = 306.150905622*x_sqft + 7913.53847651*x_bathrooms + -57658.9010346*x_bedrooms + 81100.9596775


## Remember in Data and Visualization we saw Log was a good idea?

In [96]:
def multiple_linear_model(train, test, input_features):
    LR = LinearRegression() 
    LR.fit(train.as_matrix(columns = input_features), train.as_matrix(columns = ['price'])) 
    RMSE = mean_squared_error(test.as_matrix(columns = ['price']), 
                              LR.predict(test.as_matrix(columns = input_features)))**0.5 
    print "Accuracy", LR.score(train.as_matrix(columns = input_features), train.as_matrix(columns = ['price']))
    return RMSE, LR.intercept_[0], LR.coef_

In [100]:
train_data['log_sqft_living'] = np.log(train_data['sqft_living'])
test_data['log_sqft_living'] = np.log(test_data['sqft_living'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


In [97]:
train_data, test_data = train_test_split(df, train_size = 0.8, random_state = 10)

In [99]:
RSME, b, m = multiple_linear_model(train_data, test_data, ['log_sqft_living','bathrooms','bedrooms'])
print "Squared Error: ",RMSE
print "---------------------------"
print "Linear Equation for Price"
print "y = {}*x_sqft + {}*x_bathrooms + {}*x_bedrooms + {}".format(m[0][0],m[0][1],m[0][2],b)

Accuracy 0.396839833449
Squared Error:  268279.643883
---------------------------
Linear Equation for Price
y = 502679.012295*x_sqft + 68751.4974534*x_bathrooms + -52184.7190415*x_bedrooms + -3224824.65507


### Looks like its less accurate. Even then, we wouldn't know what features to do. Therefore, its still no clear indication.

In [101]:
train_data_2, validation_data = train_test_split(train_data, train_size = 0.75, random_state = 50)

In [102]:
def RMSE(train, validation, features, new_input):
    features_list = list(features)
    features_list.append(new_input)
    regr = LinearRegression() 
    regr.fit(train.as_matrix(columns = features_list), train.as_matrix(columns = ['price'])) # Train the model
    RMSE_train = mean_squared_error(train.as_matrix(columns = ['price']), 
                              regr.predict(train.as_matrix(columns = features_list)))**0.5 # Calculate the RMSE on train data
    RMSE_validation = mean_squared_error(validation.as_matrix(columns = ['price']), 
                              regr.predict(validation.as_matrix(columns = features_list)))**0.5 # Calculate the RMSE on train data
    return RMSE_train, RMSE_validation 

In [103]:
input_list = train_data_2.columns.values.tolist() # list of column name
input_list.remove('price')

# list of features included in the regression model and the calculated train and validation errors (RMSE)
regression_greedy_algorithm = pd.DataFrame(columns = ['feature', 'train_error', 'validation_error'])  
i = 0
temp_list = []

# a while loop going through all the features in the dataframe
while i < len(train_data_2.columns)-1:
    
    # a temporary dataframe to select the best feature at each iteration
    temp = pd.DataFrame(columns = ['feature', 'train_error', 'validation_error'])
    
    # a for loop to test all the remaining features
    for p in input_list:
        RMSE_train, RMSE_validation = RMSE(train_data_2, validation_data, temp_list, p)
        temp = temp.append({'feature':p, 'train_error':RMSE_train, 'validation_error':RMSE_validation}, ignore_index=True)
        
    temp = temp.sort_values('train_error') # select the best feature using train error
    best = temp.iloc[0,0]
    temp_list.append(best)
    regression_greedy_algorithm = regression_greedy_algorithm.append({'feature': best, 
                                                  'train_error': temp.iloc[0,1], 'validation_error': temp.iloc[0,2]}, 
                                                 ignore_index=True) # add the feature to the dataframe
    input_list.remove(best) # remove the best feature from the list of available features
    i += 1
regression_greedy_algorithm

Unnamed: 0,feature,train_error,validation_error
0,sqft_living,262112.035351,252447.351473
1,lat,242262.177402,235286.401617
2,view#4,231244.309267,224476.602437
3,zipcode#98004,220653.468648,212180.029909
4,log_sqft_living,212796.569119,209437.967966
5,zipcode#98039,206294.269455,202159.740256
6,zipcode#98112,200228.736263,198228.988734
7,zipcode#98040,195351.195442,192215.694289
8,view#0,191236.276936,188135.685469
9,waterfront,188060.884216,185621.255616


In [105]:
greedy_algo_features_list = regression_greedy_algorithm['feature'].tolist()[:24] # select the first 30 features
test_error, _, _ = multiple_linear_model(train_data_2, test_data, greedy_algo_features_list)
print ('test error (RMSE) is: %s' %test_error)

Accuracy 0.799274244453
test error (RMSE) is: 176685.879135


## That concludes this project.