# Import

In [1]:
%matplotlib inline

import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from pathlib import Path
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

sns.set_style("whitegrid")

Change directory to get source code:

In [2]:
main_path = os.getcwd()
parent_path = Path(os.getcwd()).parent
src_path = str(parent_path) + "/src/data"
os.chdir(src_path)

More import:

In [3]:
import change_directory

Change back to notebook directory:

In [4]:
os.chdir(main_path)

# Modeling

## Load data

Change directory to get data:

In [5]:
cd = change_directory.ChangeDirectory()
cd.change_to_data_dir()

Load data:

In [6]:
properties = pd.read_csv("processed/Processed_DC_Properties.csv")

Change back to notebook directory:

In [7]:
cd.change_to_notebook_dir()

## Baseline model

In [8]:
y1 = properties.PRICE
y2 = properties["log(PRICE)"]
X = properties.drop(["PRICE", "SALEDATE", "log(PRICE)"], axis=1)

In [9]:
X_train1, X_test1, y_train1, y_test1 = train_test_split(X, y1, test_size=0.2, random_state=5864)
X_train2, X_test2, y_train2, y_test2 = train_test_split(X, y2, test_size=0.2, random_state=14865)

In [10]:
linreg1 = LinearRegression()
linreg1.fit(X_train1, y_train1)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [11]:
# # coefficients
# linreg1.coef_

In [12]:
# # intercept
# linreg1.intercept_

In [13]:
def adjusted_r_squared(r_squared, num_samples, num_regressors):
    return 1 - ((1-r_squared)*(num_samples - 1) / (num_samples - num_regressors - 1))

In [14]:
linreg1.score(X_train1, y_train1)

0.7457585918124652

In [15]:
adjusted_r_squared(linreg1.score(X_train1, y_train1), X_train1.shape[0], X_train1.shape[1])

0.7438527959109137

In [16]:
linreg1.score(X_test1, y_test1)

0.6332799676280032

In [17]:
adjusted_r_squared(linreg1.score(X_test1, y_test1), X_test1.shape[0], X_test1.shape[1])

0.6220304608595382

not generalized enough

In [18]:
linreg2 = LinearRegression()
linreg2.fit(X_train2, y_train2)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [19]:
linreg2.score(X_train2, y_train2)

0.7504533507996347

In [20]:
adjusted_r_squared(linreg2.score(X_train2, y_train2), X_train2.shape[0], X_train2.shape[1])

0.7485827468540289

In [21]:
linreg2.score(X_test2, y_test2)

0.7396248828779651

In [22]:
adjusted_r_squared(linreg2.score(X_test2, y_test2), X_test2.shape[0], X_test2.shape[1])

0.7316376136157479

much better.. in terms of adjusted R2

In [23]:
from sklearn import linear_model
clf = linear_model.Lasso(alpha=0.0001)

In [24]:
clf.fit(X_train2, y_train2)

Lasso(alpha=0.0001, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False)

In [25]:
clf.score(X_train2, y_train2)

0.7478724936969741

In [26]:
adjusted_r_squared(clf.score(X_train2, y_train2), X_train2.shape[0], X_train2.shape[1])

0.7459825436231202