In [3]:
from sklearn import linear_model
from sklearn import model_selection
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import mean_squared_error, r2_score
from datetime import datetime
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
import statsmodels.formula.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant
import math
import reverse_geocoder as rgc
import pprint as pp

train_data = pd.read_csv("./data/kc_house_data.csv", sep = ",", header = 0)

#Start PreProcessing
#Changing the date format to get the year 
train_data['dateInFormat'] = train_data['date'].apply(lambda x: x[0:8])
train_data['dateInFormat'] = train_data['dateInFormat'].apply(lambda x: datetime.strptime(x, '%Y%m%d'))
#The year in which the propert was sold
train_data['YearSold'] = train_data['dateInFormat'].apply(lambda x: int(x.year))

#Since the value of Built Year and Selling Year is large, its good to normalize the value of different variables to similar range

#We derive the Time since its built
train_data['HomeAgeinYear'] = train_data['YearSold'] - train_data['yr_built']

#We derive the Time after which it was renovated
train_data['RenovatedafterYears'] = train_data['yr_renovated'] - train_data['yr_built']

#Creating WaterFront as a dummy variable
train_data['IsRenovated'] = train_data['waterfront'].apply(lambda x: 1 if x > 0 else -1)


#pd.plotting.scatter_matrix(train_data.loc[:,'price':'floors'], alpha = 0.2, figsize = (20, 20), diagonal = 'kde')
#plt.show()


#Lot of variables are not specifically normally distributed. Therefore transforming them using log(Base e) transform
train_data['Log_sqftlot'] = train_data['sqft_lot'].apply(lambda x: np.log(x))
train_data['Log_price'] = train_data['price'].apply(lambda x: np.log(x))
train_data['Log_sqftlot15'] = train_data['sqft_lot15'].apply(lambda x: np.log(x))
train_data['Log_sqftLiving'] = train_data['sqft_living'].apply(lambda x: np.log(x))

#GeoLocation to be added into the picture. Try to convert Latitude and Longitude to location names
for i in range(0, len(train_data['lat'])):
	coordiantes = (train_data.loc[i,'lat'], train_data.loc[i,'long'])
	train_data.loc[i, 'Location'] = rgc.search(coordiantes, mode = 1)[0].get('name')


mappingDictionary = {}

for k in train_data.Location.unique():
	#print(train_data[train_data['Location'] == k].price.mean())
	mappingDictionary[k] = train_data[train_data['Location'] == k].price.mean()

train_data['LocationMapping'] = train_data['Location'].apply(lambda x: mappingDictionary.get(x))

meanOfLocation = train_data['LocationMapping'].mean()
standardDeviationLocation = train_data['LocationMapping'].std()

train_data['LocationMapping'] = train_data['LocationMapping'].apply(lambda x: ((x-meanOfLocation)/standardDeviationLocation))

train_data['LivingSpaceAvailable'] = train_data['sqft_living']/train_data['sqft_lot']
train_data['NeighbourSpace'] = train_data['sqft_living15']/train_data['sqft_lot15']
train_data['Log_LivingSpaceAvailable'] = train_data['LivingSpaceAvailable'].apply(lambda x: np.log(x))
train_data['Log_NeighbourSpace'] = train_data['NeighbourSpace'].apply(lambda x: np.log(x))
#np.savetxt('File.csv', train_data['NeighbourhoodLiving'])

#The Value of VIF tells that there is a collinearity between the Living space and above space. Assuming that both will be same if basement is not there.
#Therefore removing basement values and converting it into a variable to express the presence or absence of it
train_data['IsBasementThere'] = train_data['sqft_above'].apply(lambda x: 1 if x >= 1 else 0)
#plt.hist(train_data['sqft_lot'], color = "red")
#plt.hist(train_data['grade'], color = "skyblue")
#plt.show()


#Check the correlation matrix with price (the below command only works with jupyter notebook or other software having HTML support)
train_data.corr().loc[:,['price','Log_price']].style.background_gradient(cmap='coolwarm', axis=None)


AttributeError: 'Series' object has no attribute 'style'