In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import RFE
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
import statsmodels.api as sm
from sklearn.dummy import DummyRegressor

# Let pandas display up to 100 rows in notebook output before truncating.
pd.options.display.max_rows = 100

In [None]:
# Load the housing CSV; the path is relative to the notebook's working directory.
df = pd.read_csv('data/kc_house_data.csv')
# Print column dtypes and non-null counts as a first look at the data.
df.info()

In [None]:
# Summary statistics with every value rendered in fixed-point notation.
df.describe().apply(lambda column: column.map('{:f}'.format))

In [None]:
# Scatter each candidate predictor against price on a 3x2 grid.
# A single loop replaces six copy-pasted scatter/set_title pairs, and every
# panel now carries axis labels so it can be read on its own.
scatter_cols = ['sqft_living', 'sqft_above', 'sqft_living15',
                'bathrooms', 'bedrooms', 'lat']
fig, axs = plt.subplots(3, 2, figsize=(15, 10))
# axs.flat walks the grid row by row, matching the order of scatter_cols.
for ax, col in zip(axs.flat, scatter_cols):
    ax.scatter(df[col], df['price'])
    ax.set(title=col, xlabel=col, ylabel='price')
fig.tight_layout();

In [None]:
# Pairwise correlations between the numeric columns only. The frame also holds
# text columns, which make a bare df.corr() raise on pandas >= 2.0.
df.select_dtypes(include='number').corr()

In [None]:
# Rank the numeric columns by the absolute value of their correlation with
# price, strongest first. Text columns are excluded up front because a bare
# df.corr() raises on them in pandas >= 2.0.
price_corr = (
    df.select_dtypes(include='number')
    .corr()['price']
    .abs()
    .sort_values(ascending=False)
)
price_corr

In [None]:
# Heatmap of the numeric-column correlations, with the colormap centered on 0.
# Text columns are filtered out first; a bare df.corr() raises on them in
# pandas >= 2.0.
sns.heatmap(df.select_dtypes(include='number').corr(), center=0);

In [None]:
# Draw a scatter-plot matrix of df on a 12x12-inch figure; the trailing
# semicolon keeps the returned value out of the cell output.
pd.plotting.scatter_matrix(df,figsize=(12,12));

# Data cleaning

In [None]:
def determine_dupes(series):
    """Print duplicate statistics for ``series`` and return its value counts.

    Two lines are printed: the number of distinct values that occur more than
    once, and the total number of rows those repeated values account for.

    Parameters
    ----------
    series : pandas.Series
        Values to check for repeats.

    Returns
    -------
    pandas.Series
        Occurrence count per distinct value, most frequent first.
    """
    counts = series.value_counts()
    # Keep only the values that appear at least twice.
    repeated = counts[counts > 1]
    print("Amount of unique duplicates: " + str(len(repeated)))
    print("Total amount of duplicates: " + str(repeated.sum()))

    return counts

In [None]:
# Report how many house ids appear more than once in the data.
determine_dupes(df['id'])

In [None]:
# Keep one row per id; when an id repeats, only its last occurrence is kept.
df = df.drop_duplicates(subset=['id'], keep='last')
# Re-print the frame summary to confirm the reduced row count.
df.info()

In [None]:
# Remove the listings whose bedroom counts are treated as outliers here.
# One boolean filter replaces four copy-pasted in-place drops; the explicit
# copy gives later cells an independent frame to assign into.
bedroom_outliers = [33, 11, 10, 9]
df = df.loc[~df['bedrooms'].isin(bedroom_outliers)].copy()

# Show the largest remaining bedroom counts to confirm the outliers are gone.
df.sort_values('bedrooms', ascending=False).head(10)

In [None]:
# Missing renovation years become 0 so the column can be stored as integers.
df['yr_renovated'] = df['yr_renovated'].fillna(0).astype('int64')

# Missing labels in the two text columns fall back to an explicit default.
df['view'] = df['view'].fillna('NONE')
df['waterfront'] = df['waterfront'].fillna('NO')

# '?' placeholders are replaced with 0.0 before the float -> int conversion.
basement_unknown = df['sqft_basement'] == '?'
df.loc[basement_unknown, 'sqft_basement'] = 0.0
df['sqft_basement'] = df['sqft_basement'].astype('float64').astype('int64')

# Disabled for now: numeric extraction of the leading token of `grade`.
# df.grade = pd.to_numeric(df.grade.map(lambda x: x.split()[0]))

In [None]:
# Re-check column dtypes and non-null counts at this point in the cleaning.
df.info()

# Looking at distributions

In [None]:
# Plot the distribution of price with a KDE curve overlaid; the trailing
# semicolon keeps the returned object out of the cell output.
sns.displot(df['price'],kde=True);

As we can see, the distribution of `price` is not normal.

In [None]:
# Distribution of log(price) with a KDE overlay. np.log is applied to the whole
# column at once rather than through a per-element lambda.
sns.displot(np.log(df['price']), kde=True);

Applying a log transformation to the `price` column makes its distribution approximately normal.

In [None]:
# Distribution (with KDE overlay) of each candidate feature; a loop replaces
# the copy-pasted calls.
for col in ['bedrooms', 'bathrooms', 'floors', 'sqft_living']:
    sns.displot(df[col], kde=True)
# sqft_living again on a log scale, using vectorized np.log instead of a
# per-element lambda.
sns.displot(np.log(df['sqft_living']), kde=True);

# Trying a model

In [None]:
# Predictor columns for the first model, and the column being predicted.
feature_cols = [
    'bedrooms',
    'bathrooms',
    'floors',
    'sqft_living',
]
features, target = df[feature_cols], df['price']

In [None]:
# Baseline: a regressor that always predicts the mean of the target.
# Its score on the same data is the reference point for the real model.
dummy_regr = DummyRegressor(strategy="mean").fit(features, target)
dummy_regr.score(features, target)

In [None]:
# statsmodels OLS does not add an intercept on its own, so add a constant
# column to the predictors before fitting.
features = sm.add_constant(features)
model = sm.OLS(endog=target, exog=features).fit()
model.summary()

`floors` has a large p-value, so this feature is not statistically significant in this model.

# Mapping Houses

In [None]:
import folium

Create a sample of houses to mark on the map. Since there are many houses in the original `df`, marking all of them would make the map very slow to load. This sample just makes sure the marking works.

In [None]:
# Draw 20 rows with a fixed seed so the same houses are picked on every run.
sample = df.sample(n=20, random_state=33)

In [None]:
# Center the map on the mean latitude/longitude of the sampled houses.
map_center = [sample['lat'].mean(), sample['long'].mean()]
mp = folium.Map(location=map_center, zoom_start=10, control_scale=True)
mp

In [None]:
# Add one marker per sampled house; the popup shows the price with a "$" prefix.
for lat, long, price in zip(sample["lat"], sample["long"], sample["price"]):
    marker = folium.Marker([lat, long], popup="$" + str(price))
    marker.add_to(mp)

In [None]:
# Display the map again, now with the sample markers added to it.
mp