# Data Preprocessing

## Import Libraries

In [None]:
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.preprocessing import OneHotEncoder

## Data Loading

In [None]:
# Load the Boston housing dataset through scikit-learn's dataset helpers.
# NOTE(review): `load_boston` was deprecated in scikit-learn 1.0 and removed in
# 1.2, so this cell presumably needs scikit-learn < 1.2 -- confirm against the
# environment this notebook is pinned to before relying on Restart & Run All.
boston = datasets.load_boston()
# Show the attribute names available on the returned object.
dir(boston)

In [None]:
# Print the dataset's DESCR attribute so its line breaks render as text.
print(boston.DESCR)

In [None]:
# Wrap the raw feature matrix in a DataFrame so each column carries its
# feature name; used below for tabular preview and plotting.
df = pd.DataFrame(
    data=boston.data,
    columns=boston.feature_names,
)

In [None]:
# Preview the first rows of the feature table.
df.head()

In [None]:
# One box per feature column, with x tick labels rotated 45 degrees.
# The trailing semicolon keeps the Axes repr out of the cell output so only
# the figure is shown.
df.boxplot(rot=45);

## Train/Test Split

In [None]:
# Feature matrix and regression target as plain arrays.
X = boston.data
y = boston.target

# Hold out a test split; the fixed random_state makes the partition
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=0
)

## Data Visualization

In [None]:
# Scatter every feature against the target, one panel per feature.
# The panel count is derived from X instead of a hard-coded index so the cell
# keeps working if the number of feature columns changes.
n_features = X.shape[1]
n_cols = 5
# Ceiling division; at least one row so plt.subplots always gets a valid grid.
n_rows = max(1, -(-n_features // n_cols))

fig, axes = plt.subplots(n_rows, n_cols, figsize=(20, 10))
for i, ax in enumerate(axes.flat):
    if i >= n_features:
        # Hide the unused trailing panels of the grid.
        ax.set_visible(False)
        continue
    ax.plot(X[:, i], y, 'o', alpha=.5)
    ax.set_title("{}: {}".format(i, boston.feature_names[i]))
    ax.set_ylabel("MEDV")

In [None]:
# Box plot of every raw (unscaled) feature on one shared axis.
plt.boxplot(X)
# Boxes are positioned at 1..n_features, so place the feature names there.
plt.xticks(np.arange(1, X.shape[1] + 1),
           boston.feature_names, rotation=30, ha="right")

# savefig does not create missing directories; make sure the target folder
# exists so the cell survives a fresh checkout / Restart & Run All.
unscaled_png = Path("../images/boston_unscaled_box.png")
unscaled_png.parent.mkdir(parents=True, exist_ok=True)
plt.savefig(unscaled_png)

In [None]:
# Learn the scaling statistics from the training split only, then apply the
# same fitted transform to both splits so no test information reaches the fit.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Box plot of the standardized training features on one shared axis.
plt.boxplot(X_train_scaled)
# Tick positions come from the array actually being plotted rather than from
# the unrelated full matrix X.
plt.xticks(np.arange(1, X_train_scaled.shape[1] + 1),
           boston.feature_names, rotation=30, ha="right")

# savefig does not create missing directories; make sure the target folder
# exists so the cell survives a fresh checkout / Restart & Run All.
scaled_png = Path("../images/boston_scaled_box.png")
scaled_png.parent.mkdir(parents=True, exist_ok=True)
plt.savefig(scaled_png)

## Linear Regression

### Without Scaling

In [None]:
# Ordinary least squares on the raw training features; `fit` returns the
# estimator itself, so construction and fitting can be chained.
lr = LinearRegression().fit(X_train, y_train)

# Evaluate on the held-out test split.
lr.score(X_test, y_test)

### With Scaling

In [None]:
# Same model as the unscaled case, trained on the standardized features.
lr_scaled = LinearRegression().fit(X_train_scaled, y_train)

# Evaluate on the test split transformed with the same fitted scaler.
lr_scaled.score(X_test_scaled, y_test)

## Ridge Regression

### Without Scaling

In [None]:
# Ridge regression with default settings on the raw training features.
r = Ridge().fit(X_train, y_train)

# Evaluate on the held-out test split.
r.score(X_test, y_test)

### With Scaling

In [None]:
# Ridge regression with default settings on the standardized features.
r_scaled = Ridge().fit(X_train_scaled, y_train)

# Evaluate on the test split transformed with the same fitted scaler.
r_scaled.score(X_test_scaled, y_test)

## Lasso Regression

### Without Scaling

In [None]:
# Lasso regression with default settings on the raw training features.
l = Lasso().fit(X_train, y_train)

# Evaluate on the held-out test split.
l.score(X_test, y_test)

### With Scaling

In [None]:
# Lasso regression with default settings on the standardized features.
l_scaled = Lasso().fit(X_train_scaled, y_train)

# Evaluate on the test split transformed with the same fitted scaler.
l_scaled.score(X_test_scaled, y_test)