Corresponding video https://youtu.be/TH6vDphjZak

In [None]:
import pandas as pd

In [None]:
# Load the dataset; the relative path is resolved against the current working directory.
data = pd.read_csv('reg_house_data.csv')

In [None]:
# Preview the first five rows (the default for head()) as a sanity check of the load.
data.head(n=5)

In [None]:
# Columns used as numeric inputs (these are the ones rescaled later on).
num_features = [
    'bedrooms',
    'bathrooms',
    'sqft_living',
    'floors',
]
# Columns handled as categorical inputs (one-hot encoded later on).
cat_features = [
    'yr_built',
    'condition',
]

In [None]:
# Feature matrix: take an explicit copy of the selected columns so that the
# column edits made in later cells operate on an independent frame instead of
# a slice of `data` (writing into such a slice raises SettingWithCopyWarning).
X = data[[*num_features, *cat_features]].copy()
# Regression target.
y = data['price']

In [None]:
# Row/column counts of the features and of the target, shown as a pair.
(X.shape, y.shape)

In [None]:
# How many rows fall into each construction year.
X['yr_built'].value_counts()

In [None]:
# Bucket the construction year into decades counted from 1900
# (e.g. 1955 -> 5, 2014 -> 11) and replace the raw year with that bucket.
# Building a new frame with assign/drop avoids writing into a slice of `data`
# in place, which is what made the column assignment + inplace drop emit
# SettingWithCopyWarning. The arithmetic is vectorised instead of a per-row lambda.
X = X.assign(decade_built=(X['yr_built'] - 1900) // 10).drop(columns='yr_built')

In [None]:
# How many rows carry each value of `condition`.
X['condition'].value_counts()

# Linear model

1. Preprocess categorical features
2. Split data into train and test
3. Normalize the data
4. Build a baseline model
5. Optimize parameters of a linear model using grid search

### 1 Preprocess categorical features

In [None]:
from sklearn.preprocessing import OneHotEncoder

In [None]:
# Dense (non-sparse) one-hot encoder.
# scikit-learn 1.2 renamed the `sparse` argument to `sparse_output` and 1.4
# removed the old name, so try the new spelling first and fall back to the old
# one on releases that do not know `sparse_output` yet.
try:
    ohe = OneHotEncoder(sparse_output=False)
except TypeError:
    ohe = OneHotEncoder(sparse=False)

In [None]:
# One-hot encode both categorical columns.
# The encoder returns rows in the same order as X, so reuse X's index: the
# later column-wise concat aligns on index labels and would otherwise produce
# NaN rows whenever X does not carry a default 0..n-1 index.
encoded_features = pd.DataFrame(
    ohe.fit_transform(X[['condition', 'decade_built']]),
    index=X.index,
)

In [None]:
# Categories learned by the fitted encoder, one array per encoded column
# (index 0: 'condition', index 1: 'decade_built' — the order used in fit_transform).
ohe.categories_

In [None]:
# Readable names for the dummy columns: "<prefix>_<category>", with prefixes
# paired to the encoder's per-column category arrays in fit order.
condition_cols, dec_built_cols = (
    [f'{prefix}_{cat}' for cat in cats]
    for prefix, cats in zip(('condition', 'dec_built'), ohe.categories_)
)

In [None]:
# Condition dummies first, decade dummies second — the same order in which the
# encoder emitted its output columns.
dummy_cols_names = condition_cols + dec_built_cols
encoded_features.columns = dummy_cols_names

In [None]:
# Preview only the first rows of the encoded frame instead of dumping all of it.
encoded_features.head()

In [None]:
# Remove the raw categorical columns; their one-hot encoded versions are
# attached in a later cell. Rebinding X (instead of an inplace drop) avoids
# mutating a frame derived from `data`, which pandas flags with
# SettingWithCopyWarning.
X = X.drop(columns=['condition', 'decade_built'])

In [None]:
# Preview only the first rows of the encoded frame instead of dumping all of it.
encoded_features.head()

In [None]:
# Append the one-hot columns to the right of the remaining features.
X = pd.concat(objs=[X, encoded_features], axis='columns')

### 2 Split data into train and test

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 33% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=10,
)

### 3 Normalize the data

In [None]:
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Work on explicit copies: the split frames are row selections of X, and
# assigning columns into them can raise pandas' SettingWithCopyWarning.
X_train, X_test = X_train.copy(), X_test.copy()

# Fit the scaler on the training rows only, then apply the same transform to
# both parts so that no test-set statistics influence the scaling.
scaler = MinMaxScaler()
scaler.fit(X_train[num_features])

X_train[num_features] = scaler.transform(X_train[num_features])
X_test[num_features] = scaler.transform(X_test[num_features])

### 4 Build a baseline model (Linear regression without regularization)

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error as mse

In [None]:
# Baseline: plain linear regression fitted on the training split
# (fit() returns the estimator itself), then predictions for the test split.
reg = LinearRegression().fit(X_train, y_train)
y_pred = reg.predict(X_test)

In [None]:
# Mean squared error of the baseline on the test split.
mse(y_true=y_test, y_pred=y_pred)

In [None]:
# Keep the test MSE for the RMSE computation and plots in the following cells.
test_error = mse(y_true=y_test, y_pred=y_pred)

In [None]:
import numpy as np

In [None]:
# RMSE: square root of the test MSE, which puts the error back into the same
# units as the target `price`.

np.sqrt(test_error)

In [None]:
# Mean target value on the test split, shown right after the RMSE for comparison.
y_test.mean()

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Distribution of the true prices in the test split. Axis labels are added so
# the figure can be read on its own.
plt.hist(y_test, bins=50);
plt.xticks(rotation = 60);
plt.xlabel('True price');
plt.ylabel('Number of houses');

In [None]:
# Same histogram with the baseline RMSE marked on the price axis, to show the
# size of the error relative to the prices themselves.
plt.hist(y_test, bins=50);
plt.xticks(rotation = 60);
# axvline spans the full height of the axes, so the marker no longer depends on
# a hard-coded 0-300 range matching the histogram's bar heights.
plt.axvline(np.sqrt(test_error), color='r', label='RMSE');
plt.xlabel('True price');
plt.legend();

In [None]:
# True vs. predicted prices for the baseline model; axes labelled the same way
# as the final scatter plot at the end of the notebook.
plt.scatter(y_test, y_pred);
plt.xlabel('True price')
plt.ylabel('Predicted price');

In [None]:
# Compare the distribution of true prices with the distribution of predictions.
# y_pred was produced from X_test, so the matching ground truth is y_test
# (the original plotted y_train, a different and larger set of rows).
# Partial transparency keeps both histograms visible where they overlap.
plt.hist(y_test, bins=50, alpha=0.6, label='True price');
plt.hist(y_pred, bins=50, alpha=0.6, label='Predicted price');
plt.legend();

### 5 Optimize parameters of a linear model

We will use Ridge regression.

In [None]:
from sklearn.linear_model import Ridge

# Train. Test. Validation.

Split the data into 3 pieces:

1. **Training part.** Used to fit a regression model. (model explicitly sees y_train)
2. **Validation part.** Used to optimize hyperparameters of the regression model. (model implicitly sees y_val)
3. **Test part.** Used to evaluate the performance of the model. (model does not see y_test at all)
---

1. Split data into Train, Validation and Test parts.
2. Train model on train part, optimize its hyperparameters using validation part.
3. After you choose the best hyperparameters, train the model on the Train+Validation parts and predict on the Test part to evaluate its performance.

In [None]:
# Recreate the train part of the original train/test split (same seed); the
# test part is deliberately discarded here so it plays no role in tuning.
X_train, _, y_train, _ = train_test_split(X, y, test_size=0.33, random_state=10)

# Carve a validation part out of the train part.
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.33, random_state=1)

# Work on explicit copies: the split frames are row selections of X, and
# assigning columns into them can raise pandas' SettingWithCopyWarning.
X_train, X_val = X_train.copy(), X_val.copy()

# Fit the scaler on the (reduced) training rows only and apply it to both parts.
scaler = MinMaxScaler()
scaler.fit(X_train[num_features])

X_train[num_features] = scaler.transform(X_train[num_features])
X_val[num_features] = scaler.transform(X_val[num_features])

In [None]:
# Validation RMSE for each candidate regularisation strength:
# 20 values of alpha spaced logarithmically between 1e-6 and 1e1.
rmse_val = {}
for alpha in np.logspace(-6, 1, 20):
    # fit() returns the estimator itself, so construction and fitting can be chained.
    reg = Ridge(alpha=alpha).fit(X_train, y_train)
    y_val_pred = reg.predict(X_val)
    # Arguments in (y_true, y_pred) order; MSE is symmetric, so the value is unchanged.
    rmse_val[alpha] = np.sqrt(mse(y_val, y_val_pred))

# Report the (alpha, RMSE) pair with the lowest validation error.
print(min(rmse_val.items(), key=lambda pair: pair[1]))

In [None]:
# Same train-test split as for the baseline (same seed), so the train part
# here is exactly the train + validation rows used during tuning.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=10)

# Work on explicit copies: the split frames are row selections of X, and
# assigning columns into them can raise pandas' SettingWithCopyWarning.
X_train, X_test = X_train.copy(), X_test.copy()

# Scale numeric features (scaler fitted on the training rows only)
scaler = MinMaxScaler()
scaler.fit(X_train[num_features])

X_train[num_features] = scaler.transform(X_train[num_features])
X_test[num_features] = scaler.transform(X_test[num_features])

# Train model with the hyperparameter that gave the lowest validation RMSE
best_alpha = min(rmse_val.items(), key=lambda x: x[1])[0]
reg = Ridge(alpha=best_alpha)
reg.fit(X_train, y_train)

# Predict on unseen data
y_pred = reg.predict(X_test)

# Compute model performance (RMSE on the test split)
np.sqrt(mse(y_test, y_pred))

In [None]:
# True vs. predicted prices for the tuned model, with the ideal y = x line.
plt.figure(figsize=(12,8))
plt.scatter(y_test, y_pred);
# Draw the diagonal over the whole range of the plotted values (still starting
# no later than 0) instead of stopping at a hard-coded 1,000,000, so it stays a
# usable reference for every point in the scatter.
diag_min = min(0, y_test.min(), y_pred.min())
diag_max = max(y_test.max(), y_pred.max())
plt.plot([diag_min, diag_max], [diag_min, diag_max], c='r')
plt.xlabel('True price')
plt.ylabel('Predicted price');

# What to do next?

1. Generate more features
2. Use another regression model