In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
# Show which files are present in the dataset's input directory.
input_files = os.listdir("../input/kc-house-dataset")
print(input_files)

# Any results you write to the current directory are saved as output.

In [None]:
# Empty results table; later cells add one row per fitted model.
evaluation_columns = ["Model", "Details", "RMSE", "R2-train", "Adj-R2-train", "R2-test", "Adj-R2-test"]
evaluation = pd.DataFrame({column_name: [] for column_name in evaluation_columns})

In [None]:
# Display the results table; no cell above this one adds a model row to it.
evaluation

In [None]:
# Read the kc_house_data CSV into a DataFrame and preview its first rows.
csv_path = "../input/kc-house-dataset/kc_house_data.csv"
dataset = pd.read_csv(csv_path)
dataset.head()

In [None]:
# Count columns by dtype: non-object columns are reported as numerical and
# object columns as categorical. No column is removed before counting, so the
# target column is included in whichever count its dtype falls under.
numerical_columns = dataset.select_dtypes(exclude=["object"]).columns
categorical_columns = dataset.select_dtypes(include=["object"]).columns
print("Numerical features : " + str(len(numerical_columns)))
print("Categorical features : " + str(len(categorical_columns)))

In [None]:
def adjustedR2(r2, n, k):
    """Return R² penalised for model size.

    Computes ``r2 - (k - 1) / (n - k) * (1 - r2)``, which is algebraically
    the same as ``1 - (1 - r2) * (n - 1) / (n - k)``.

    Parameters
    ----------
    r2 : float
        Plain coefficient of determination.
    n : int
        Number of observations the score was computed on.
    k : int
        Model-size term. NOTE(review): the formula matches the textbook
        adjusted R² only when ``k`` counts the intercept as well as the
        predictors -- confirm that callers pass it that way.

    Raises
    ------
    ZeroDivisionError
        If ``n == k`` (with plain Python numbers).
    """
    penalty_ratio = (k - 1) / (n - k)
    unexplained = 1 - r2
    return r2 - penalty_ratio * unexplained

In [None]:
# Names of every column in the dataset as a plain list, used as a column
# selector by later cells. Built directly from the column index instead of an
# append loop, which also avoids leaving a stray loop variable in the kernel.
features = list(dataset.columns)

In [None]:
# Preview the first rows of the pairwise correlation matrix.
# pandas >= 2.0 raises on non-numeric columns in corr() instead of silently
# dropping them, so restrict the frame to numeric/bool columns first; this
# keeps the cell working on both old and new pandas.
dataset[features].select_dtypes(include=["number", "bool"]).corr().head()

In [None]:
# Simple linear regression: predict 'price' from 'sqft_living' alone.
# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible.
train_data, test_data = train_test_split(dataset, test_size=0.2, random_state=42)

# Selecting with a one-element column list yields 2-D arrays of shape
# (n_samples, 1) for both the feature and the target.
X_train = train_data[["sqft_living"]].values
y_train = train_data[["price"]].values
X_test = test_data[["sqft_living"]].values
y_test = test_data[["price"]].values

# Fit on the training rows, then predict the held-out rows.
linreg = LinearRegression().fit(X_train, y_train)
y_pred = linreg.predict(X_test)

In [None]:
from sklearn import metrics

# Test-set metrics for the simple linear model.
# NOTE: despite its name, `mse` holds the *root* mean squared error (the
# square root is applied here); the name is kept because a later cell reads it.
mse = np.sqrt(metrics.mean_squared_error(y_test, y_pred))
R2_test = metrics.r2_score(y_test, y_pred)

# adjustedR2 computes r2 - (k - 1)/(n - k)*(1 - r2), i.e.
# 1 - (1 - r2)*(n - 1)/(n - k), so k has to count the intercept as well as
# the predictors. Passing only the feature count (1 here) would make the
# penalty zero and return R2 unchanged, hence the "+ 1".
Adj_R2_test = adjustedR2(R2_test, X_test.shape[0], X_test.shape[1] + 1)

In [None]:
# Record the simple-linear results as a new row of the results table.
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported
# replacement and also works on older pandas.
# Re-running this cell adds the row again -- re-create `evaluation` first if
# a clean table is needed.
simple_linear_row = {"Model":"Simple Linear","Details":"","RMSE":mse, "R2-train":"","Adj-R2-train":"","R2-test":R2_test,"Adj-R2-test":Adj_R2_test}
evaluation = pd.concat([evaluation, pd.DataFrame([simple_linear_row])], ignore_index=True)

In [None]:
# Display the results table as it stands at this point in the notebook.
evaluation

In [None]:
# Plot the held-out points together with the fitted regression line.
# The explicit Axes interface is used so that axis labels and the legend can
# be attached to this figure; without a legend() call the `label=` arguments
# are never shown.
fig, ax = plt.subplots()
ax.scatter(X_test, y_test, color='darkgreen', label="Data", alpha=.1)
ax.plot(X_test, linreg.predict(X_test), color="red", label="Predicted Regression Line")
ax.set_xlabel("sqft_living")
ax.set_ylabel("price")
ax.legend();

In [None]:
# Next: multivariate linear regression

In [None]:
# Pearson correlation matrix of the numeric columns, drawn as a heatmap with
# the upper triangle (including the diagonal) masked out.
# - corr() raises on non-numeric columns in pandas >= 2.0, so select
#   numeric/bool columns explicitly first.
# - np.bool was removed in NumPy 1.24; the builtin bool is the same dtype.
correlations = dataset[features].select_dtypes(include=["number", "bool"]).corr()
mask = np.zeros_like(correlations, dtype=bool)
mask[np.triu_indices_from(mask)] = True

# Draw onto the Axes created here rather than relying on pyplot's implicit
# "current axes".
fig, ax = plt.subplots(figsize=(20, 20))
ax.set_title("Pearson Correlation Matrix", fontsize=25)
sns.heatmap(correlations, vmax=1, square=True, annot=True, mask=mask, ax=ax)

In [None]:
# Order the rows of the correlation matrix by their correlation with 'price',
# largest first.
correlations.sort_values(by="price", ascending=False)

In [None]:
# 80/20 train/test split with a fixed seed so the partition is reproducible.
train_data, test_data = train_test_split(
    dataset,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Multivariate linear regression on a hand-picked subset of columns that the
# author considers correlated with the target, 'price'.
features1 = [
    "sqft_living", "grade", "sqft_above", "sqft_living15", "bathrooms", "view",
    "sqft_basement", "bedrooms", "lat", "waterfront", "floors",
]

# Feature matrices have one column per selected feature; targets are kept
# 2-D with a single 'price' column.
X_train, y_train = train_data[features1].values, train_data[["price"]].values
X_test, y_test = test_data[features1].values, test_data[["price"]].values

linreg2 = LinearRegression()
linreg2.fit(X_train, y_train)
y_pred = linreg2.predict(X_test)

In [None]:
# Test-set metrics for the multivariate model.
# NOTE: `mse2` holds the *root* mean squared error (the square root is applied
# here); it is stored under the "RMSE" column below.
mse2 = np.sqrt(metrics.mean_squared_error(y_test, y_pred))
R2_test2 = metrics.r2_score(y_test, y_pred)

# adjustedR2 computes r2 - (k - 1)/(n - k)*(1 - r2), i.e.
# 1 - (1 - r2)*(n - 1)/(n - k), so k has to count the intercept as well as
# the predictors; X_test.shape[1] is only the number of feature columns.
Adj_R2_test2 = adjustedR2(R2_test2, X_test.shape[0], X_test.shape[1] + 1)

# DataFrame.append was removed in pandas 2.0; pd.concat is the supported
# replacement and also works on older pandas.
multivariate_row = {"Model":"Multivariate Linear","Details":"","RMSE":mse2, "R2-train":"","Adj-R2-train":"","R2-test":R2_test2,"Adj-R2-test":Adj_R2_test2}
evaluation = pd.concat([evaluation, pd.DataFrame([multivariate_row])], ignore_index=True)

In [None]:
# Display the results table as it stands at this point in the notebook.
evaluation

In [None]:
# Polynomial regression, degree 2: expand the selected features into
# polynomial terms before fitting a linear model on them.
polyreg = PolynomialFeatures(degree = 2)
X_train = train_data[features1].values
y_train = train_data[["price"]].values
X_test = test_data[features1].values
y_test = test_data[["price"]].values

# Fit the transformer on the training data only and reuse that fitted
# transformer for the test data; a preprocessing step should never be
# re-fitted on held-out data.
X_poly_train = polyreg.fit_transform(X_train)
X_poly_test = polyreg.transform(X_test)

In [None]:
# Shape of the feature matrix before polynomial expansion.
X_train.shape

In [None]:
# Shape of the feature matrix after polynomial expansion.
X_poly_train.shape

In [None]:
# Refit the existing `linreg2` estimator on the polynomial features (this
# replaces whatever it was fitted on before) and predict the held-out rows.
y_pred = linreg2.fit(X_poly_train, y_train).predict(X_poly_test)

In [None]:
# Test-set metrics for the degree-2 polynomial model.
# NOTE: `mse3` holds the *root* mean squared error (the square root is applied
# here); it is stored under the "RMSE" column below.
mse3 = np.sqrt(metrics.mean_squared_error(y_test, y_pred))
R2_test3 = metrics.r2_score(y_test, y_pred)
# k is taken as the number of columns of the expanded test matrix.
Adj_R2_test3 = adjustedR2(R2_test3, X_poly_test.shape[0], X_poly_test.shape[1])

# DataFrame.append was removed in pandas 2.0; pd.concat is the supported
# replacement and also works on older pandas.
poly2_row = {"Model":"Polynomial Linear-2","Details":"","RMSE":mse3, "R2-train":"","Adj-R2-train":"","R2-test":R2_test3,"Adj-R2-test":Adj_R2_test3}
evaluation = pd.concat([evaluation, pd.DataFrame([poly2_row])], ignore_index=True)

In [None]:
# Polynomial regression, degree 3: same features as before, expanded into
# polynomial terms up to the third degree.
polyreg = PolynomialFeatures(degree = 3)
X_train = train_data[features1].values
y_train = train_data[["price"]].values
X_test = test_data[features1].values
y_test = test_data[["price"]].values

# Fit the transformer on the training data only and reuse that fitted
# transformer for the test data; a preprocessing step should never be
# re-fitted on held-out data.
X_poly_train = polyreg.fit_transform(X_train)
X_poly_test = polyreg.transform(X_test)

# Refit the existing `linreg2` estimator on the degree-3 features (this
# replaces its previous fit) and predict the held-out rows.
linreg2.fit(X_poly_train,y_train)
y_pred = linreg2.predict(X_poly_test)

In [None]:
# Test-set metrics for the degree-3 polynomial model.
# NOTE: `mse4` holds the *root* mean squared error (the square root is applied
# here); it is stored under the "RMSE" column below.
mse4 = np.sqrt(metrics.mean_squared_error(y_test, y_pred))
R2_test4 = metrics.r2_score(y_test, y_pred)
# k is taken as the number of columns of the expanded test matrix.
Adj_R2_test4 = adjustedR2(R2_test4, X_poly_test.shape[0], X_poly_test.shape[1])

# DataFrame.append was removed in pandas 2.0; pd.concat is the supported
# replacement and also works on older pandas.
poly3_row = {"Model":"Polynomial Linear-3","Details":"","RMSE":mse4, "R2-train":"","Adj-R2-train":"","R2-test":R2_test4,"Adj-R2-test":Adj_R2_test4}
evaluation = pd.concat([evaluation, pd.DataFrame([poly3_row])], ignore_index=True)

In [None]:
# Display the results table with all rows recorded so far, for comparison.
evaluation