In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn.model_selection import cross_val_score
from mpl_toolkits.mplot3d import Axes3D
import folium
from folium.plugins import HeatMap
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
import math
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
from scipy.stats import pearsonr
import sklearn
from time import time
from sklearn.metrics import r2_score
import os
from sklearn.preprocessing import scale
#from scipy import stats
from statsmodels.stats import weightstats as mm


In [None]:
# Load the King County house-sales table and preview its first rows.
df = pd.read_csv(filepath_or_buffer='kc_house_data.csv')
df.head(n=5)

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) restricted to
# the numeric columns of the raw table.
df.describe(include=[np.number])

In [None]:
# Per-column count of missing values (original author's note: the data has no NaNs).
df.isna().sum()

In [None]:
# `data` is the name the exploratory cells below read from; it aliases the raw frame.
data=df
features = ['price','bedrooms','bathrooms','sqft_living','sqft_lot','floors','waterfront',
            'view','condition','grade','sqft_above','sqft_basement','yr_built','yr_renovated',
            'zipcode','lat','long','sqft_living15','sqft_lot15']

# Compute the Pearson correlation matrix once and reuse it for the mask and the plot.
corr = data[features].corr()

# Mask the upper triangle (diagonal included) so each pair is drawn only once.
# The builtin `bool` is used because the `np.bool` alias was deprecated in
# NumPy 1.20 and removed in 1.24, where it raises AttributeError.
mask = np.zeros_like(corr, dtype=bool)
mask[np.triu_indices_from(mask)] = True

f, ax = plt.subplots(figsize=(16, 12))
plt.title('Pearson Correlation Matrix',fontsize=25)

sns.heatmap(corr,linewidths=0.25,vmax=0.7,square=True,cmap="BuGn", #"BuGn_r" to reverse
            linecolor='w',annot=True,annot_kws={"size":8},mask=mask,cbar_kws={"shrink": .9});

In [None]:
# Scatter plot with a fitted regression line: living area vs. sale price.
sns.regplot(data=data, x='sqft_living', y='price')

In [None]:
# Scatter plot with a fitted regression line: above-ground area vs. sale price.
sns.regplot(data=data, x='sqft_above', y='price')

In [None]:
# Scatter plot with a fitted regression line: basement area vs. sale price.
sns.regplot(data=data, x='sqft_basement', y='price')

In [None]:
# One strip of price points per bedroom count.
sns.stripplot(data=data, x='bedrooms', y='price')

In [None]:
# One strip of price points per bathroom count.
sns.stripplot(data=data, x='bathrooms', y='price')

In [None]:
# One strip of price points per grade value, drawn with marker size 5.
sns.stripplot(data=data, x='grade', y='price', size=5)

In [None]:
# Locate the most expensive sale: first the index label of the maximum price,
# then the full row stored under that label.
top_price_label = data['price'].idxmax()
maxpr = data.loc[top_price_label]

# define a function to draw a basemap easily
def generateBaseMap(default_location=[47.5112, -122.257], default_zoom_start=9.4):
    """Create an empty folium map with a scale bar.

    Parameters
    ----------
    default_location : list of two floats
        Coordinates passed to ``folium.Map`` as ``location``.
    default_zoom_start : number
        Initial zoom level passed to ``folium.Map`` as ``zoom_start``.

    Returns
    -------
    The ``folium.Map`` instance, ready for layers to be added.
    """
    map_options = {
        'location': default_location,
        'control_scale': True,
        'zoom_start': default_zoom_start,
    }
    return folium.Map(**map_options)

# Work on a copy so the helper `count` column never leaks into `data`.
df_copy = data.copy()
# select a zipcode for the heatmap
#set(df['zipcode'])
#df_copy = df[df['zipcode']==98001].copy()
df_copy['count'] = 1

basemap = generateBaseMap()
# add carton position map
folium.TileLayer('cartodbpositron').add_to(basemap)
s = folium.FeatureGroup(name='icon')
s.add_to(basemap)

# add a marker for the house which has the highest price
top_marker = folium.Marker(
    [maxpr['lat'], maxpr['long']],
    popup='Highest Price: $' + format(maxpr['price'], '.0f'),
    icon=folium.Icon(color='green'),
)
top_marker.add_to(s)

# add heatmap: one [lat, long, weight] triple per distinct coordinate, where
# the weight is the number of sales recorded at that coordinate
heat_points = (
    df_copy[['lat', 'long', 'count']]
    .groupby(['lat', 'long'])
    .sum()
    .reset_index()
    .values
    .tolist()
)
HeatMap(data=heat_points, radius=8, max_zoom=13, name='Heat Map').add_to(basemap)

folium.LayerControl(collapsed=False).add_to(basemap)
basemap

In [None]:
# Columns kept for the reduced feature frame.
c = ['bedrooms', 'bathrooms', 'sqft_living', 'sqft_above', 'grade']
df = data.loc[:, c]

In [None]:
# One-hot encode `grade`, dropping the first level.
# Built from `data[c]` rather than from `df` itself so the cell is idempotent:
# the original `df = get_dummies(df, ...)` fails with KeyError on a second run
# because `grade` has already been replaced by its dummy columns.
df = pd.get_dummies(data[c], columns=['grade'], drop_first=True)

In [None]:
# Regression target: the sale price column.
y = data.loc[:, 'price']

In [None]:
# With a single input, train_test_split returns (train_rows, test_rows), not
# (features, target), so bind the halves to names that say what they hold.
df_train, df_test = train_test_split(data, train_size=0.8, random_state=42)
# Legacy aliases kept because the following cells still refer to `x_train`.
x_train, y_train = df_train, df_test

In [None]:
# Preview the first rows of the training split. Note that `x_train` here is a
# full-width DataFrame of training rows (first value returned by
# train_test_split above), not a feature matrix.
x_train.head()

In [None]:
var = 'floors'
# The original read from `df_train`, which is never defined; the training rows
# returned by train_test_split are bound to `x_train`. The plotting frame also
# gets its own name so the global `data` table is not overwritten with a
# two-column slice.
floors_vs_price = x_train[['price', var]]
f, ax = plt.subplots(figsize=(20, 20))
fig = sns.boxplot(x=var, y="price", data=floors_vs_price)
# Cap the y-axis at 3.5M so the boxes stay readable.
fig.axis(ymin=0, ymax=3500000);

In [None]:
# Reload the raw table: earlier cells rebind `df` (column subset, dummies) and
# `data`, so modelling starts again from a clean frame.
df = pd.read_csv('kc_house_data.csv')
data=df
train_data,test_data = train_test_split(data,train_size = 0.8,random_state=3)

# Simple linear regression: price ~ sqft_living.
lr = linear_model.LinearRegression()
# `dtype=pd.Series` is not a NumPy dtype and silently yields object arrays;
# build proper float arrays instead. Double brackets keep the (n_samples, 1)
# shape scikit-learn expects for a single feature.
x_train = train_data[['sqft_living']].to_numpy(dtype=float)
y_train = train_data['price'].to_numpy(dtype=float)
lr.fit(x_train,y_train)

In [None]:
# `y_test` is only created in the next cell, so reading it here raises
# NameError on a fresh top-to-bottom run. Take the mean straight from the test
# split, which already exists at this point.
print ("Average Price for Test Data: {:.3f}".format(test_data['price'].mean()))
print('Intercept: {}'.format(lr.intercept_))
print('Coefficient: {}'.format(lr.coef_))

In [None]:
# Held-out inputs and targets as float arrays (`dtype=pd.Series` is not a valid
# NumPy dtype and produced object arrays). x_test keeps shape (n_samples, 1).
x_test = test_data[['sqft_living']].to_numpy(dtype=float)
y_test = test_data['price'].to_numpy(dtype=float)
pred0 = lr.predict(x_test)

In [None]:
# Flattened copies of the split arrays with length-1 axes removed.
x_test1, y_test1, x_train1, y_train1 = (
    np.asarray(values).squeeze()
    for values in (x_test, y_test, x_train, y_train)
)

In [None]:
# Test-set metrics for the simple linear model.
# Fixes relative to the original cell:
#   * bare `metrics` was never imported, so use the `sklearn.metrics` namespace;
#   * `pred` does not exist at this point; the predictions are in `pred0`;
#   * explained_variance_score takes (y_true, y_pred), and it is not symmetric.
mse_test = sklearn.metrics.mean_squared_error(y_test, pred0)
print(mse_test)
exp_est0 = sklearn.metrics.explained_variance_score(y_test, pred0)
print(exp_est0)
rmsesm = float(format(np.sqrt(mse_test),'.3f'))
rtrsm = float(format(lr.score(x_train, y_train),'.3f'))
rtesm = float(format(lr.score(x_test, y_test),'.3f'))
cv = float(format(cross_val_score(lr,data[['sqft_living']],data['price'],cv=5).mean(),'.3f'))
print(rmsesm)

In [None]:
# Score of the fitted linear model on the held-out split.
lr.score(X=x_test, y=y_test)

In [None]:
# Sample size, mean and standard deviation of the living area.
sqft_living = data["sqft_living"]
n = len(sqft_living)
mean = sqft_living.mean()
sd = sqft_living.std()
(n, mean, sd)

In [None]:
# One-sample z-test on the living area.
# `value` is the mean under the null hypothesis; "larger" selects the one-sided
# alternative that the true mean is greater than `value`.
ztest, pvals = mm.ztest(
    data["sqft_living"],
    value=2060,
    alternative="larger",
)
print("p-value", pvals)

In [None]:
# Report the test decision at the 0.05 significance level.
print(
    "We reject null hypothesis."
    if pvals <= 0.05
    else "We fail to reject the null hypothesis."
)

In [None]:
# (rows, columns) of the table used for modelling.
print(data.shape)

In [None]:
from sklearn.ensemble import GradientBoostingRegressor

# Fit a gradient-boosted regressor on the same arrays as the linear model and
# time the fit.
# `loss` is left at its default (least squares): the old spelling 'ls' was
# removed in scikit-learn 1.2 and the new one, 'squared_error', does not exist
# before 1.0, so relying on the default works on both.
start = time()
est=GradientBoostingRegressor(n_estimators=400, max_depth=5, min_samples_split=2,learning_rate=0.1).fit(x_train, y_train)
end=time()
train_time_g=end-start
gradient=est.score(x_test,y_test)

pred1 = est.predict(x_test)
# explained_variance_score expects (y_true, y_pred); the original passed them
# swapped, which changes the result.
exp_est1 = sklearn.metrics.explained_variance_score(y_test, pred1)

In [None]:
# Report the gradient-boosting scores computed in the previous cell and derive
# the rounded summary metrics.
print('Score: {}'.format(gradient))
print('Variance Score: {}'.format(exp_est1))
# The bare name `metrics` is never imported in this notebook; go through the
# `sklearn.metrics` namespace instead.
rmsesm = float(format(np.sqrt(sklearn.metrics.mean_squared_error(y_test,pred1)),'.3f'))
rtrsm = float(format(est.score(x_train, y_train),'.3f'))
rtesm = float(format(est.score(x_test, y_test),'.3f'))
cv = float(format(cross_val_score(est,data[['sqft_living']],data['price'],cv=5).mean(),'.3f'))
print(rmsesm)

In [None]:
def adjustedR2(r2,n,k):
    """Return R-squared penalised for model size.

    Computes ``r2 - (k - 1) / (n - k) * (1 - r2)``, which is algebraically equal
    to ``1 - (1 - r2) * (n - 1) / (n - k)``.

    Parameters
    ----------
    r2 : float
        Plain coefficient of determination.
    n : int
        Number of observations.
    k : int
        Model-size term of the formula.
        NOTE(review): the formula matches the textbook adjusted R-squared when
        ``k`` counts the intercept as well; callers in this notebook pass
        ``len(features)`` -- confirm which is intended.

    Raises
    ------
    ZeroDivisionError
        If ``n == k`` with plain Python numbers.
    """
    shrinkage = (k-1)/(n-k)
    penalty = shrinkage*(1-r2)
    return r2 - penalty
# Independent (deep) copy of the table, used as input to cross-validation below.
df_dm = df.copy(deep=True)
df_dm.describe()

In [None]:
from sklearn.linear_model import Lasso

In [None]:
# Lasso regression on all features for three regularisation strengths.
features = ['bedrooms','bathrooms','sqft_living','sqft_lot','floors','waterfront',
            'view','condition','grade','sqft_above','sqft_basement',
            'zipcode','lat','long','sqft_living15','sqft_lot15']

# `data` still holds the full table loaded for the linear model, so it is reused
# rather than re-read from disk. random_state=3 reproduces the earlier split.
train_data_dm,test_data_dm = train_test_split(data,train_size = 0.8,random_state=3)
df_dm = data.copy()

# Results table shared by the model-comparison cells. It was never created in
# the original notebook, so the first `evaluation.shape` raised NameError.
# Creating it here (first use) also makes re-running this cell idempotent.
evaluation = pd.DataFrame(columns=['Model',
                                   'Details',
                                   'Root Mean Squared Error (RMSE)',
                                   'R-squared (training)',
                                   'Adjusted R-squared (training)',
                                   'R-squared (test)',
                                   'Adjusted R-squared (test)',
                                   '5-Fold Cross Validation'])


def _round3(value):
    """Round to three decimals the same way the other cells do."""
    return float(format(value, '.3f'))


X_train_dm, y_train_dm = train_data_dm[features], train_data_dm['price']
X_test_dm, y_test_dm = test_data_dm[features], test_data_dm['price']

# One pass per alpha replaces three copy-pasted blocks.
for alpha in (1, 100, 1000):
    complex_model_L = linear_model.Lasso(alpha=alpha)
    complex_model_L.fit(X_train_dm, y_train_dm)

    pred_L = complex_model_L.predict(X_test_dm)
    r2_train = complex_model_L.score(X_train_dm, y_train_dm)
    r2_test = complex_model_L.score(X_test_dm, y_test_dm)

    # `metrics` is not imported as a bare name; use the sklearn.metrics namespace.
    rmse_L = np.sqrt(sklearn.metrics.mean_squared_error(y_test_dm, pred_L))
    cv_L = cross_val_score(complex_model_L, df_dm[features], df_dm['price'], cv=5).mean()

    evaluation.loc[evaluation.shape[0]] = [
        'Lasso Regression',
        'alpha={}, all features'.format(alpha),
        _round3(rmse_L),
        _round3(r2_train),
        _round3(adjustedR2(r2_train, train_data_dm.shape[0], len(features))),
        _round3(r2_test),
        _round3(adjustedR2(r2_test, test_data_dm.shape[0], len(features))),
        _round3(cv_L),
    ]

evaluation.sort_values(by = '5-Fold Cross Validation', ascending=False)



In [None]:
#MLR 1
# Multiple linear regression on a small hand-picked feature set.
train_data_dm,test_data_dm = train_test_split(data,train_size = 0.8,random_state=3)

features = ['bedrooms','bathrooms','sqft_living','sqft_lot','floors','zipcode']
complex_model_1 = linear_model.LinearRegression()
complex_model_1.fit(train_data_dm[features],train_data_dm['price'])

print('Intercept: {}'.format(complex_model_1.intercept_))
print('Coefficients: {}'.format(complex_model_1.coef_))

pred31 = complex_model_1.predict(test_data_dm[features])
print(pred31)
# Every metric below is computed against this cell's own test split. The
# original mixed in `y_test` and `test_data` from the simple-regression cells,
# which only lined up because both splits use the same seed. It also passed
# explained_variance_score its arguments in (y_pred, y_true) order and used a
# bare `metrics` name that is never imported.
exp_est31 = sklearn.metrics.explained_variance_score(test_data_dm['price'], pred31)
print(exp_est31)
rmsesm = float(np.sqrt(sklearn.metrics.mean_squared_error(test_data_dm['price'],pred31)))
print(rmsesm)
rmsecm = float(format(rmsesm,'.3f'))
rtrcm = float(format(complex_model_1.score(train_data_dm[features],train_data_dm['price']),'.3f'))
artrcm = float(format(adjustedR2(complex_model_1.score(train_data_dm[features],train_data_dm['price']),train_data_dm.shape[0],len(features)),'.3f'))
rtecm = float(format(complex_model_1.score(test_data_dm[features],test_data_dm['price']),'.3f'))
artecm = float(format(adjustedR2(complex_model_1.score(test_data_dm[features],test_data_dm['price']),test_data_dm.shape[0],len(features)),'.3f'))
cv = float(format(cross_val_score(complex_model_1,df_dm[features],df_dm['price'],cv=5).mean(),'.3f'))

# Append the row at the next free integer label and show the ranked table.
r = evaluation.shape[0]
evaluation.loc[r] = ['Multiple Regression-1','selected features',rmsecm,rtrcm,artrcm,rtecm,artecm,cv]
evaluation.sort_values(by = '5-Fold Cross Validation', ascending=False)

In [None]:
#MLR 2
# Multiple linear regression on a wider hand-picked feature set.
train_data_dm,test_data_dm = train_test_split(data,train_size = 0.8,random_state=3)
features = ['bedrooms','bathrooms','sqft_living','sqft_lot','floors','view','grade','zipcode','sqft_living15','sqft_lot15']
complex_model_2 = linear_model.LinearRegression()
complex_model_2.fit(train_data_dm[features],train_data_dm['price'])

print('Intercept: {}'.format(complex_model_2.intercept_))
print('Coefficients: {}'.format(complex_model_2.coef_))

pred = complex_model_2.predict(test_data_dm[features])
# `metrics` is never imported as a bare name (NameError); use sklearn.metrics.
rmsecm = float(format(np.sqrt(sklearn.metrics.mean_squared_error(test_data_dm['price'],pred)),'.3f'))
# Each score is computed once and reused for its adjusted counterpart.
r2_train = complex_model_2.score(train_data_dm[features],train_data_dm['price'])
r2_test = complex_model_2.score(test_data_dm[features],test_data_dm['price'])
rtrcm = float(format(r2_train,'.3f'))
artrcm = float(format(adjustedR2(r2_train,train_data_dm.shape[0],len(features)),'.3f'))
rtecm = float(format(r2_test,'.3f'))
artecm = float(format(adjustedR2(r2_test,test_data_dm.shape[0],len(features)),'.3f'))
cv = float(format(cross_val_score(complex_model_2,df_dm[features],df_dm['price'],cv=5).mean(),'.3f'))

# Append the row at the next free integer label and show the ranked table.
r = evaluation.shape[0]
evaluation.loc[r] = ['Multiple Regression-2','selected features',rmsecm,rtrcm,artrcm,rtecm,artecm,cv]
evaluation.sort_values(by = '5-Fold Cross Validation', ascending=False)

In [None]:
from sklearn.ensemble import GradientBoostingRegressor

# Gradient boosting fitted on the arrays built for the simple regression
# (x_train / x_test hold the single `sqft_living` column).
gbm_model = GradientBoostingRegressor(max_depth=7,random_state=42)
gbm_model.fit(x_train, y_train)
y_pred = gbm_model.predict(x_test)
# `metrics` is never imported as a bare name (NameError); use sklearn.metrics.
print(np.sqrt(sklearn.metrics.mean_squared_error(y_test, y_pred)))
r2_test = r2_score(y_test,y_pred)
print(r2_test)
y_tr_pred=gbm_model.predict(x_train)
print(r2_score(y_train,y_tr_pred))
# Adjusted R-squared must use the size of the sample the score was computed on
# and the number of features the model was fitted with. The original passed
# the row and column counts of the whole `df_dm` table instead.
adjustedR2(r2_test,len(y_test),x_test.shape[1])

In [None]:
# Summary row for the gradient-boosting model.
# The original computed every R-squared and the cross-validation score from
# `complex_model_2` (a copy-paste of the MLR-2 cell), so the row labelled
# 'Gradient-Boosting' reported the linear model's numbers. All metrics now come
# from `gbm_model` and the arrays it was actually fitted and evaluated on.
n_gbm_features = x_train.shape[1]
r2_train_gbm = gbm_model.score(x_train, y_train)
r2_test_gbm = gbm_model.score(x_test, y_test)

# `metrics` is never imported as a bare name (NameError); use sklearn.metrics.
rmsecm = float(format(np.sqrt(sklearn.metrics.mean_squared_error(y_test,y_pred)),'.3f'))
rtrcm = float(format(r2_train_gbm,'.3f'))
artrcm = float(format(adjustedR2(r2_train_gbm,len(y_train),n_gbm_features),'.3f'))
rtecm = float(format(r2_test_gbm,'.3f'))
artecm = float(format(adjustedR2(r2_test_gbm,len(y_test),n_gbm_features),'.3f'))
cv = float(format(cross_val_score(gbm_model,df_dm[['sqft_living']],df_dm['price'],cv=5).mean(),'.3f'))

# NOTE(review): `gbm_model` is fitted on x_train, which holds only the
# `sqft_living` column; confirm whether the 'selected features' label (or the
# training input) should change.
r = evaluation.shape[0]
evaluation.loc[r] = ['Gradient-Boosting','selected features',rmsecm,rtrcm,artrcm,rtecm,artecm,cv]
evaluation.sort_values(by = '5-Fold Cross Validation', ascending=False)