In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.model_selection import train_test_split, KFold
from sklearn.linear_model import LinearRegression, SGDRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error

### Generating a Linear Dataset
We will create a dummy dataset with $1$ feature and $1$ target variable. There will be a linear relationship between the feature and the target, but with some noise added to it.

In [None]:
# Feature
X = 2 * np.random.rand(100, 1)
# Target. The relationship is y = 3x + 4 + some random noise
y = 4 + 3 * X + np.random.randn(100, 1)

As you can see below, the relationship between the feature and the target is not perfectly linear. However, it is evident that there is an overall linear trend among them. For example, as the value of $X$ increases, the value of $y$ also increases linearly.

In [None]:
plt.scatter(X, y)
plt.xlabel("X")
plt.ylabel("y")
plt.title("Scatter plot of X (feature) and y (target)")
plt.show()

### Linear Model Using Normal Equation
We will be using the `LinearRegression` class from `sklearn` module to develop a linear model. You can find more about the available parameters [here](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LinearRegression.html).

In [None]:
model = LinearRegression(
    fit_intercept=False,
    copy_X=True,
    n_jobs=-1,
    positive=True
)

In [None]:
model.fit(X, y)

In [None]:
print('Coefficient of feature: ', model.coef_)
print('Intercept: ', model.intercept_)

In [None]:
predictions = model.predict(X)

plt.scatter(X, y, alpha=0.75, edgecolors='black', label='Actual Value')
plt.scatter(X, predictions, alpha=0.75, edgecolors='black', label='Predictions')
plt.legend()
plt.show()

### Linear Model Using Gradient Descent

Next, we will build a linear model using stochastic gradient descent. We will use the `SGDRegressor` from `sklearn` module. You will find more about the available parameters [here](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.SGDRegressor.html).

In [None]:
model2 = SGDRegressor(
    fit_intercept=True,
    loss='squared_error',
    penalty='l1',
    alpha=0.5,
    max_iter=1000,
    early_stopping=True,
    n_iter_no_change=10,
    tol=1e-3,
    learning_rate='constant',
    eta0=0.0001,
    verbose=3
)

In [None]:
model2.fit(X, y)

In [None]:
print('Coefficient of feature: ', model2.coef_.item())
print('Intercept: ', model2.intercept_)

In [None]:
fake_data = np.array([1.24])

print(model.predict([fake_data]))
print(model2.predict([fake_data]))

In [None]:
predictions = model2.predict(X)

plt.scatter(X, y, alpha=0.75, edgecolors='black', label='Actual Value')
plt.scatter(X, predictions, alpha=0.75, edgecolors='black', label='Predictions')
plt.legend()
plt.show()

### Generating a Non-Linear Dataset
We will create a dummy dataset with $1$ feature and $1$ target variable. There will be a non-linear relationship between the feature and the target, and with some noise added to it.

In [None]:
# total data
m = 100
# feature
X = 6 * np.random.rand(m, 1) - 3
# target
y = 0.5 * X**2 + X + 2 + np.random.randn(m, 1)

In [None]:
plt.scatter(X, y)
plt.xlabel("X")
plt.ylabel("y")
plt.title("Scatter plot of X (feature) and y (target)")
plt.show()

Converting $X$ into polynomial form. For this example, we will convert it to a $3$ degree polynomial. We will use the `PolynomialFeatures` class from `sklearn` and use the `degree` parameter to set the desired degree.

### Polynomial Regression

In [None]:
poly_features = PolynomialFeatures(degree=3, include_bias=False)
X_poly = poly_features.fit_transform(X)

In [None]:
X_poly[0:3, ]

In [None]:
scaler = StandardScaler()
X_poly = scaler.fit_transform(X_poly)

In [None]:
X_poly[0:3, ]

In [None]:
model3 = LinearRegression()

In [None]:
model3.fit(X_poly, y)

In [None]:
predictions = model3.predict(X_poly)

plt.scatter(X, y, alpha=0.75, edgecolors='black', label='Actual Value')
plt.scatter(X, predictions, alpha=0.75, edgecolors='black', label='Predictions')
plt.legend()
plt.show()

# Example

In [3]:
data = pd.read_excel('./data/real_estate.xlsx')
data.head(3)

Unnamed: 0,No,X1 transaction date,X2 house age,X3 distance to the nearest MRT station,X4 number of convenience stores,X5 latitude,X6 longitude,Y house price of unit area
0,1,2012.916667,32.0,84.87882,10,24.98298,121.54024,37.9
1,2,2012.916667,19.5,306.5947,9,24.98034,121.53951,42.2
2,3,2013.583333,13.3,561.9845,5,24.98746,121.54391,47.3


In [4]:
data.tail(3)

Unnamed: 0,No,X1 transaction date,X2 house age,X3 distance to the nearest MRT station,X4 number of convenience stores,X5 latitude,X6 longitude,Y house price of unit area
411,412,2013.25,18.8,390.9696,7,24.97923,121.53986,40.6
412,413,2013.0,8.1,104.8101,5,24.96674,121.54067,52.5
413,414,2013.5,6.5,90.45606,9,24.97433,121.5431,63.9


In [5]:
X = data.iloc[:, 4:7]
X.sample(3)

Unnamed: 0,X4 number of convenience stores,X5 latitude,X6 longitude
113,6,24.96172,121.53812
292,5,24.95674,121.534
325,8,24.97015,121.54494


In [6]:
y = data.iloc[:, -1]
y.sample(3)

280    45.4
407    22.3
102    54.4
Name: Y house price of unit area, dtype: float64

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, shuffle=True, random_state=25)

In [8]:
print(X_train.shape)
print(X_test.shape)

(331, 3)
(83, 3)


## Important

In [9]:
poly = PolynomialFeatures(degree=1)
poly.fit(X_train)

X_train = poly.transform(X_train)
X_test = poly.transform(X_test)

In [10]:
scaler = StandardScaler()
scaler.fit(X_train)

X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [11]:
model = LinearRegression(
    fit_intercept=False,
    copy_X=True,
    n_jobs=8,
    positive=True
)

In [12]:
model.fit(X_train, y_train)

In [13]:
model.coef_

array([0.        , 4.0187373 , 4.86045291, 3.34707299])

In [14]:
train_preds = model.predict(X_train)
test_preds = model.predict(X_test)

In [15]:
print('Training RMSE: ', np.sqrt(mean_squared_error(y_train, train_preds)))
print('Test RMSE: ', np.sqrt(mean_squared_error(y_test, test_preds)))

Training RMSE:  39.140202348849556
Test RMSE:  39.995356131585204


In [16]:
pred_10 = model.predict([X_test[10, :]])

In [17]:
print('Actual Value: ', y_test.iloc[10])
print('Prediction: ', pred_10)

Actual Value:  19.2
Prediction:  [-24.51226851]


In [19]:
np.max(y) - np.min(y)

109.9

In [None]:
1000