In [5]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import MinMaxScaler, StandardScaler

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import SGDRegressor

from sklearn.datasets import load_iris

### Iris Dataset

`scikit-learn` comes with a few small standard datasets that do not require to download any file from some external website. The dataset is usually downloaded in a `dictionary`.

In [11]:
data = load_iris()
data.keys()

dict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names', 'filename', 'data_module'])

In [12]:
print(data['DESCR'])

.. _iris_dataset:

Iris plants dataset
--------------------

**Data Set Characteristics:**

    :Number of Instances: 150 (50 in each of three classes)
    :Number of Attributes: 4 numeric, predictive attributes and the class
    :Attribute Information:
        - sepal length in cm
        - sepal width in cm
        - petal length in cm
        - petal width in cm
        - class:
                - Iris-Setosa
                - Iris-Versicolour
                - Iris-Virginica
                
    :Summary Statistics:

                    Min  Max   Mean    SD   Class Correlation
    sepal length:   4.3  7.9   5.84   0.83    0.7826
    sepal width:    2.0  4.4   3.05   0.43   -0.4194
    petal length:   1.0  6.9   3.76   1.76    0.9490  (high!)
    petal width:    0.1  2.5   1.20   0.76    0.9565  (high!)

    :Missing Attribute Values: None
    :Class Distribution: 33.3% for each of 3 classes.
    :Creator: R.A. Fisher
    :Donor: Michael Marshall (MARSHALL%PLU@io.arc.nasa.gov)
    :

Next, we will create a `DataFrame` using the downloaded data.

In [13]:
iris = pd.DataFrame(data['data'], columns=data['feature_names'])
iris.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


We will use `sepal width`, `petal length`, and `petal width` to predict `sepal length`.

In [14]:
# Features
X = iris.iloc[:, 1:]
X.head()

Unnamed: 0,sepal width (cm),petal length (cm),petal width (cm)
0,3.5,1.4,0.2
1,3.0,1.4,0.2
2,3.2,1.3,0.2
3,3.1,1.5,0.2
4,3.6,1.4,0.2


In [15]:
# Target
y = iris.iloc[:, 0]
y.head()

0    5.1
1    4.9
2    4.7
3    4.6
4    5.0
Name: sepal length (cm), dtype: float64

### Standardization / Scaling

Since we will be using `SGDRegressor`, it is necessary that all the features are scaled similarly. For this, we will use the `StandardScaler` or the `MinMaxScaler`. You can find more about feature scaling [here](https://scikit-learn.org/stable/modules/preprocessing.html).

In [16]:
mm_scaler = MinMaxScaler(feature_range=(0, 1))
std_scaler = StandardScaler()

X_new_mm = mm_scaler.fit_transform(X)
X_new_std = std_scaler.fit_transform(X)

As you can see, all the features are now between $0$ and $1$. 

In [18]:
pd.DataFrame(X_new_mm, columns=data['feature_names'][1:]).head()

Unnamed: 0,sepal width (cm),petal length (cm),petal width (cm)
0,0.625,0.067797,0.041667
1,0.416667,0.067797,0.041667
2,0.5,0.050847,0.041667
3,0.458333,0.084746,0.041667
4,0.666667,0.067797,0.041667


Similarly, all the features after standard scaling have now mean $0$ and standard deviation $1$.

In [20]:
pd.DataFrame(X_new_std, columns=data['feature_names'][1:]).head()

Unnamed: 0,sepal width (cm),petal length (cm),petal width (cm)
0,1.019004,-1.340227,-1.315444
1,-0.131979,-1.340227,-1.315444
2,0.328414,-1.397064,-1.315444
3,0.098217,-1.283389,-1.315444
4,1.249201,-1.340227,-1.315444


### Train-Test Split

In [35]:
X_train, X_test, y_train, y_test = train_test_split(
    X_new_mm,
    y,
    train_size=0.80,
    random_state=25,
    shuffle=True,
)

In [36]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(120, 3)
(30, 3)
(120,)
(30,)


### Modelling

In [37]:
model = SGDRegressor(
    fit_intercept=False,
    loss='squared_error',
    penalty='l1',
    alpha=0.5,
    max_iter=500,
    early_stopping=True,
    n_iter_no_change=10,
    tol=1e-3,
    learning_rate='constant',
    eta0=0.01,
    verbose=2
)

In [38]:
model.fit(X_train, y_train)

-- Epoch 1
Norm: 3.49, NNZs: 3, Bias: 0.000000, T: 108, Avg. loss: 10.626335
Total training time: 0.00 seconds.
-- Epoch 2
Norm: 5.42, NNZs: 3, Bias: 0.000000, T: 216, Avg. loss: 4.338255
Total training time: 0.00 seconds.
-- Epoch 3
Norm: 6.63, NNZs: 3, Bias: 0.000000, T: 324, Avg. loss: 2.634900
Total training time: 0.00 seconds.
-- Epoch 4
Norm: 7.54, NNZs: 3, Bias: 0.000000, T: 432, Avg. loss: 2.031096
Total training time: 0.00 seconds.
-- Epoch 5
Norm: 8.28, NNZs: 3, Bias: 0.000000, T: 540, Avg. loss: 1.751867
Total training time: 0.00 seconds.
-- Epoch 6
Norm: 8.94, NNZs: 3, Bias: 0.000000, T: 648, Avg. loss: 1.618806
Total training time: 0.00 seconds.
-- Epoch 7
Norm: 9.54, NNZs: 3, Bias: 0.000000, T: 756, Avg. loss: 1.519056
Total training time: 0.00 seconds.
-- Epoch 8
Norm: 10.11, NNZs: 3, Bias: 0.000000, T: 864, Avg. loss: 1.461422
Total training time: 0.00 seconds.
-- Epoch 9
Norm: 10.64, NNZs: 3, Bias: 0.000000, T: 972, Avg. loss: 1.398808
Total training time: 0.00 seconds

In [39]:
model.coef_

array([5.22152497, 3.95283961, 0.82921081])

In [40]:
model.intercept_

array([0.])

### Evaluation

In [41]:
# Training Data Evaluation
training_predictions = model.predict(X_train)

training_mae = mean_absolute_error(y_train, training_predictions)
training_mse = mean_squared_error(y_train, training_predictions)
training_rmse = np.sqrt(training_mse)

print("Training MAE: ", training_mae)
print("Training MSE: ", training_mse)
print("Training RMSE: ", training_rmse)

Training MAE:  1.3364790091582843
Training MSE:  2.25597657011425
Training RMSE:  1.5019908688518215


In [42]:
# Test Data Evaluation
test_predictions = model.predict(X_test)

test_mae = mean_absolute_error(y_test, test_predictions)
test_mse = mean_squared_error(y_test, test_predictions)
test_rmse = np.sqrt(test_mse)

print("Test MAE: ", test_mae)
print("Test MSE: ", test_mse)
print("Test RMSE: ", test_rmse)

Test MAE:  1.4687237267988595
Test MSE:  2.7527521500691408
Test RMSE:  1.659141992136038
