In [13]:
import pandas as pd
import numpy as np
import seaborn as sns

pd.set_option('display.max_columns', None) 

import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('seaborn')

from scipy.stats import kurtosis, skew
import scipy.stats as stats

from math import sqrt

import statsmodels.formula.api as smf

In [14]:
df = pd.read_csv("cleaned_data_kc_house_data")

In [15]:
df.bedrooms = df.bedrooms.astype('category')
df.bathrooms = df.bathrooms.astype('category')
df.floors = df.floors.astype('category')
df.waterfront = df.waterfront.astype('category')
df.view = df.view.astype('category')
df.condition = df.condition.astype('category')
df.grade = df.grade.astype('category')
df.zipcode = df.zipcode.astype('category')

df.date = df.date.astype('datetime64')

In [16]:
df.head(1)

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,7129300520,2014-10-13,221900,3,1.0,1180,5650,1.0,0,0,3,7,1180,0,1955,0,98178,47.5112,-122.257,1340,5650


In [5]:
col_names = df.describe().drop(['id', 'price' ],axis =1)
results = [['ind_var', 'r_squared', 'intercept', 'slope', 'p-value' ]]
for idx, val in enumerate(col_names):
    print ("Kings Count: price~" + val)
    print ("------------------------------")

    f = 'price~' + val
    model = smf.ols(formula=f, data=df).fit()
    X_new = pd.DataFrame({val: [df[val].min(), df[val].max()]});
    preds = model.predict(X_new)
    results.append([val, model.rsquared, model.params[0], model.params[1], model.pvalues[1] ])
    print(results[idx+1])

Kings Count: price~sqft_living
------------------------------
['sqft_living', 0.49280911924669446, -43268.512106262235, 280.48770861771254, 0.0]
Kings Count: price~sqft_lot
------------------------------
['sqft_lot', 0.007814472143692908, 528375.1502102641, 0.787376619727918, 7.585998491379787e-38]
Kings Count: price~sqft_above
------------------------------
['sqft_above', 0.36660773487327236, 60694.197534214334, 268.0562844835508, 0.0]
Kings Count: price~sqft_basement
------------------------------
['sqft_basement', 0.1043403360061641, 462160.52256332163, 268.0065023424773, 0.0]
Kings Count: price~yr_built
------------------------------
['yr_built', 0.0030084646048746944, -811835.7701554694, 685.9800363317556, 1.5935116991992317e-15]
Kings Count: price~yr_renovated
------------------------------
['yr_renovated', 0.01365299512004814, 532177.9010089242, 118.0378169199777, 5.5496510506427115e-65]
Kings Count: price~lat
------------------------------
['lat', 0.09465928356351017, -38186648

In [6]:
model.summary()

0,1,2,3
Dep. Variable:,price,R-squared:,0.007
Model:,OLS,Adj. R-squared:,0.007
Method:,Least Squares,F-statistic:,148.1
Date:,"Tue, 05 Mar 2019",Prob (F-statistic):,5.82e-34
Time:,14:49:31,Log-Likelihood:,-299950.0
No. Observations:,21082,AIC:,599900.0
Df Residuals:,21080,BIC:,599900.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,5.259e+05,2780.074,189.160,0.000,5.2e+05,5.31e+05
sqft_lot15,1.1284,0.093,12.170,0.000,0.947,1.310

0,1,2,3
Omnibus:,18544.0,Durbin-Watson:,1.966
Prob(Omnibus):,0.0,Jarque-Bera (JB):,1085844.562
Skew:,3.99,Prob(JB):,0.0
Kurtosis:,37.241,Cond. No.,33100.0


In [7]:
pd.DataFrame(results)

Unnamed: 0,0,1,2,3,4
0,ind_var,r_squared,intercept,slope,p-value
1,sqft_living,0.492809,-43268.5,280.488,0
2,sqft_lot,0.00781447,528375,0.787377,7.586e-38
3,sqft_above,0.366608,60694.2,268.056,0
4,sqft_basement,0.10434,462161,268.007,0
5,yr_built,0.00300846,-811836,685.98,1.59351e-15
6,yr_renovated,0.013653,532178,118.038,5.54965e-65
7,lat,0.0946593,-3.81866e+07,814268,0
8,long,0.000506781,7.7161e+06,58715.6,0.00107984
9,sqft_living15,0.343976,-83139.9,313.746,0


<br>
<br>
<br>
<br>
<br>
<br>
<font color = 'green'><font size = 5> bin and label encode zip codes

In [12]:
Auburn= pd.Series([98001,98002,98092,])
Bellevue= pd.Series([98004,98005,98006,98007,98008])
Black_Diamond= pd.Series([98010])
Bothell= pd.Series([98011])
Carnation= pd.Series([98014])
Duvall= pd.Series([98019])
Enumclaw= pd.Series([98022])
Fall_City= pd.Series([98024])
Federal_Way= pd.Series([98003,98023])
Issaquah= pd.Series([98027,98029])
Kenmore= pd.Series([98028])
Kent= pd.Series([98030,98031,98032,98042])
Kirkland= pd.Series([98033,98034])
Maple_Valley= pd.Series([98038])
Medina= pd.Series([98039])
Mercer_Island= pd.Series([98040])
North_Bend= pd.Series([98045])
Redmond= pd.Series([98052,98053])
Renton= pd.Series([98055,98056,98058,98059])
Sammamish= pd.Series([98074,98075])
Seattle= pd.Series([98102,98103,98105,98106,98107,98108,98109,98112,98115,98116,98117,98118,98119,98122,98125,98126,98133,98136,98144,98146,98148,98155,98166,98168,98177,98178,98188,98198,98199])
Snoqualmie= pd.Series([98065])
Vashon= pd.Series([98070])
Woodinville= pd.Series([98072,98077])

<font color = 'green'><font size = 5> use log values in model

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21082 entries, 0 to 21081
Data columns (total 21 columns):
id               21082 non-null int64
date             21082 non-null datetime64[ns]
price            21082 non-null int64
bedrooms         21082 non-null category
bathrooms        21082 non-null category
sqft_living      21082 non-null int64
sqft_lot         21082 non-null int64
floors           21082 non-null category
waterfront       21082 non-null category
view             21082 non-null category
condition        21082 non-null category
grade            21082 non-null category
sqft_above       21082 non-null int64
sqft_basement    21082 non-null int64
yr_built         21082 non-null int64
yr_renovated     21082 non-null int64
zipcode          21082 non-null category
lat              21082 non-null float64
long             21082 non-null float64
sqft_living15    21082 non-null int64
sqft_lot15       21082 non-null int64
dtypes: category(8), datetime64[ns](1), float64(2), int6

In [9]:
data_log = pd.DataFrame([])

In [10]:
data_log["sqft_living"] = np.log(df["sqft_living"])
data_log["sqft_lot"] = np.log(df["sqft_lot"])
data_log["sqft_above"] = np.log(df["sqft_above"])
data_log["sqft_basement"] = np.log(df["sqft_basement"])
data_log["sqft_living15"] = np.log(df["sqft_living15"])
data_log["sqft_lot15"] = np.log(df["sqft_lot15"])
data_log["price"] = np.log(df["price"])


  after removing the cwd from sys.path.
