## Linear regression using the scikit-learn package

#### 1. Import packages and load data

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.linear_model import SGDRegressor, LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import learning_curve

# Extend the module search path before importing the local `process` module
# (presumably located in ../src). The path is relative to the current working
# directory, so this assumes the notebook is run from its own folder.
import sys
sys.path.append("../src")
import process

# Show NumPy floats with two decimal places in the printed model parameters.
np.set_printoptions(precision=2)

In [None]:
# Column names for the raw automobile data set. They are passed explicitly,
# so pandas reads the first row of the file as data rather than as a header.
headers = [
    "symboling", "normalized-losses", "make", "fuel-type", "aspiration",
    "num-of-doors", "body-style", "drive-wheels", "engine-location",
    "wheel-base", "length", "width", "height", "curb-weight", "engine-type",
    "num-of-cylinders", "engine-size", "fuel-system", "bore", "stroke",
    "compression-ratio", "horsepower", "peak-rpm", "city-mpg", "highway-mpg",
    "price",
]
df = pd.read_csv(
    "../data/raw/automobile-price-prediction.csv",
    header=None,
    names=headers,
)
# Last expression of the cell: displays the first five rows.
df.head()


#### 2. Data pre-processing

In [None]:
# Columns needed for modelling: the seven predictors plus the target.
process_columns = [
    'engine-size',
    'stroke',
    'wheel-base',
    'bore',
    'compression-ratio',
    'horsepower',
    'highway-mpg',
    'price',
]

# The raw file marks missing values with "?"; turn them into NaN first so the
# helper calls below see real missing values.
df.replace(to_replace="?", value=np.nan, inplace=True)

# Project helpers from ../src/process (presumably they modify df in place,
# since their return values are not used here; verify against the module).
process.drop_na_data(df, process_columns)
process.change_dtypes(df, process_columns)


In [None]:
# Build the design matrix and target, then standardize every feature column
# with StandardScaler so the gradient-based solver sees comparable scales.
X_features = [
    'engine-size',
    'stroke',
    'wheel-base',
    'bore',
    'compression-ratio',
    'horsepower',
    'highway-mpg',
]
X, y = df[X_features], df["price"]

# The fitted scaler is kept so the same transformation can be reused later.
scaler = StandardScaler().fit(X)
X_norm = scaler.transform(X)

#### 3. Linear regression with SGD regressor

In [None]:
# Fit a linear model with stochastic gradient descent on the standardized
# features. max_iter is an upper bound on the number of epochs; scikit-learn's
# rule of thumb for a first guess is np.ceil(10**6 / len(y)).
sgdr = SGDRegressor(loss="squared_error", max_iter=1000)
sgdr.fit(X_norm, y)

print(f"Number of iterations completed: {sgdr.n_iter_}, number of weight updates: {sgdr.t_}")
print(f"Model parameters:                w: {sgdr.coef_}, b:{sgdr.intercept_}")

# Predict once and reuse the result for every report line below. All metrics
# are computed on the same data the model was trained on (in-sample).
y_pred_gd = sgdr.predict(X_norm)
print('Multiple Linear Regression (SGD) accuracy parameters:')
print('\tThe R-square: {:.3f}'.format(sgdr.score(X_norm, y)))
print('\tOutput of the 1st 4 predicted value:', y_pred_gd[0:4])
mse = mean_squared_error(y, y_pred_gd)
print('\tMean square error of actual & predicted order value: {:.3f}'.format(mse))

#### 4. Linear regression with LinearRegression

In [None]:
# Fit ordinary least squares with scikit-learn's LinearRegression, which
# solves for the coefficients directly instead of iterating like SGD.
linear_model = LinearRegression().fit(X_norm, y)

print(f"Model parameters:                w: {linear_model.coef_}, b:{linear_model.intercept_}")

# Compute the in-sample predictions and metrics first, then report them.
y_pred_lm = linear_model.predict(X_norm)
r_squared = linear_model.score(X_norm, y)
mse = mean_squared_error(y, y_pred_lm)

print('Multiple Linear Regression (LinearRegression) accuracy parameters:')
print(f'\tThe R-square: {r_squared:.3f}')
print('\tOutput of the 1st 4 predicted value:', y_pred_lm[:4])
print(f'\tMean square error of actual & predicted order value: {mse:.3f}')

#### 5. Learning curve

In [None]:
# Learning curve for the SGD regressor: 10-fold cross-validated MSE as a
# function of training-set size. train_sizes is left at its default, so
# learning_curve picks the sizes and returns the absolute counts it used.
#
# NOTE: X_norm was standardized using statistics from the full data set, so
# the validation folds are not completely unseen by the preprocessing step.
seed = 42  # fixed so the shuffled folds and SGD updates are repeatable
train_sizes, train_scores, validation_scores = learning_curve(
    estimator=SGDRegressor(loss="squared_error", max_iter=500000, random_state=seed),
    X=X_norm,
    y=y,
    cv=10,
    scoring='neg_mean_squared_error',
    shuffle=True,
    random_state=seed,
)

# The scorer returns negated MSE (higher is better); flip the sign back and
# average over the CV folds (axis 1) to get one value per training size.
train_scores_mean = -train_scores.mean(axis=1)
validation_scores_mean = -validation_scores.mean(axis=1)

# The bundled seaborn style was renamed to 'seaborn-v0_8' in matplotlib 3.6
# and the old name was removed in 3.8; use whichever this install provides.
seaborn_style = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'
plt.style.use(seaborn_style)

plt.plot(train_sizes, train_scores_mean, label='Training error')
plt.plot(train_sizes, validation_scores_mean, label='Validation error')
plt.ylabel('MSE', fontsize=14)
plt.xlabel('Training set size', fontsize=14)
plt.title('Learning curves for a linear regression model', fontsize=18, y=1.03)
plt.legend()
plt.show()