# Housing regression example



In [1]:
import pandas as pd
import numpy as np
from scipy.stats import skew

import matplotlib.pyplot as plt
import seaborn as sns;

In [18]:
# plotly
from plotly.graph_objs import FigureWidget
import plotly.graph_objects as go
from plotly.callbacks import Points
import plotly.express as px

# ipywidgets
from ipywidgets import HTML
from ipywidgets import HBox, VBox, Button

## Data load and prep

Data set and feature preparation from: https://www.kaggle.com/apapiu/regularized-linear-models

In [2]:
# Load the house-prices training CSV.
# The location can be overridden with the HOUSE_PRICES_TRAIN_CSV environment
# variable so the notebook runs on machines other than the original author's;
# when the variable is unset, the original absolute path is used unchanged.
import os

TRAIN_CSV_PATH = os.environ.get(
    "HOUSE_PRICES_TRAIN_CSV",
    "/Users/mehrdadyazdani/Downloads/house-prices-advanced-regression-techniques/train.csv",
)
df = pd.read_csv(TRAIN_CSV_PATH)

In [3]:
from sklearn.model_selection import train_test_split


In [4]:
# Hold out part of the data for error analysis.
# A fixed random_state makes the split -- and therefore every fitted
# coefficient, residual and embedding computed from it -- reproducible
# across kernel restarts.
train, test = train_test_split(df, random_state=42)

# Replace the shuffled row labels with a clean 0..n-1 index on each part;
# the old labels are discarded (drop=True).
train = train.reset_index(drop=True)
test = test.reset_index(drop=True)

In [5]:
# Stack the feature columns of both splits, train rows first and test rows
# second, so the transforms below are applied to them together.
# The label slice runs from 'MSSubClass' through 'SaleCondition' inclusive
# (presumably leaving out the Id and SalePrice columns -- verify against the CSV).
all_data = pd.concat(
    [part.loc[:, 'MSSubClass':'SaleCondition'] for part in (train, test)]
)

In [6]:
# NOTE: this cell mutates `train` and `all_data` in place, so running it a
# second time applies the log transforms twice. Re-run the split and concat
# cells before re-executing it.

#log transform the target:
train["SalePrice"] = np.log10(1+train["SalePrice"])

#log transform skewed numeric features:
# Select numeric columns explicitly rather than "everything that is not
# object", which would also pick up string/categorical/datetime dtypes.
numeric_feats = all_data.select_dtypes(include="number").columns

# Skewness is measured on the training rows only, ignoring missing values.
skewed_feats = train[numeric_feats].apply(lambda x: skew(x.dropna())) #compute skewness
skewed_feats = skewed_feats[skewed_feats > 0.75].index

all_data[skewed_feats] = np.log1p(all_data[skewed_feats])

# One-hot encode the categorical columns on the combined frame so train and
# test end up with the same set of dummy columns.
all_data = pd.get_dummies(all_data)

# Impute missing values with column means computed from the TRAINING rows only
# (the first train.shape[0] rows of all_data), so no statistic from the
# held-out rows leaks into the features the model is trained on.
train_col_means = all_data.iloc[:train.shape[0]].mean()
all_data = all_data.fillna(train_col_means)

In [7]:
# all_data holds the train rows first and the test rows after them, so cutting
# at len(train) recovers the two feature matrices.
X_train, X_test = all_data.iloc[:len(train)], all_data.iloc[len(train):]

# train["SalePrice"] was already log10(1 + price)-transformed in the previous
# cell; the test target gets the same transform here.
y_train = train["SalePrice"]
y_test = np.log10(test["SalePrice"] + 1)

In [8]:
from sklearn.linear_model import RidgeCV

In [9]:
# Ridge regression with built-in cross-validation over the regularisation
# strength. The candidate alphas are written out explicitly; they are the same
# values reported in the fitted model's repr below (0.1, 1, 10).
model = RidgeCV(alphas=(0.1, 1.0, 10.0))

In [10]:
# Fit on the training split; the fitted estimator is the cell's last
# expression, so its repr is shown as the cell output.
model.fit(X=X_train, y=y_train)

RidgeCV(alphas=array([ 0.1,  1. , 10. ]), cv=None, fit_intercept=True,
        gcv_mode=None, normalize=False, scoring=None, store_cv_values=False)

## Error analysis

In regression, the difference "actual - predicted" is called the "residual".

In [11]:
# Residual = actual - predicted, on the log10(1 + price) scale, for each split.
resids_train = y_train.sub(model.predict(X_train))
resids_test = y_test.sub(model.predict(X_test))

## Over-valued and under-valued properties

## Find similar properties

In [25]:
from sklearn.manifold import TSNE

In [26]:
# 2-D t-SNE embedding used to find similar properties.
# t-SNE is stochastic, so random_state is fixed to make the embedding
# reproducible across kernel restarts.
# NOTE(review): newer scikit-learn releases rename `n_iter` to `max_iter`;
# `n_iter` is kept here to match the version this notebook was run with.
tsne = TSNE(
    n_components=2,
    perplexity=100,
    n_iter=10000,
    n_iter_without_progress=50,
    random_state=42,
)

In [27]:
%%time
# Fit t-SNE on the training feature matrix and keep the resulting embedding
# (n_components=2 was requested when `tsne` was constructed).
# %%time must stay on the first line of the cell; it reports how long this step took.
z_train = tsne.fit_transform(X_train)

CPU times: user 1min 18s, sys: 2.86 s, total: 1min 21s
Wall time: 11.3 s


In [28]:
# Display the fitted estimator's n_iter_ attribute -- presumably the number of
# optimisation iterations actually run, which can be lower than the n_iter cap
# when the n_iter_without_progress early stop triggers (confirm in sklearn docs).
tsne.n_iter_

1299