# Simple example script to illustrate fit/predict ML pipeline of scikit-learn

In [None]:
# Uncomment to upgrade packages
# NOTE(review): inside a notebook, `%pip install ...` is usually preferable to
# `!pip3 install ...` because it targets the running kernel's environment;
# consider pinning versions if these lines are ever enabled.
#!pip3 install pandas --upgrade --user --quiet
#!pip3 install numpy --upgrade --user --quiet
#!pip3 install scikit-learn --upgrade --user --quiet

In [None]:
import numpy as np
from sklearn.metrics import mean_squared_error
import pandas as pd

# Show floats with two decimals whenever pandas prints a DataFrame/Series
pd.options.display.precision = 2

## 1. Load `murders.txt` data from class

In [None]:
# Read the space-delimited data file into a DataFrame and print all of it
df = pd.read_csv("murders.txt", delimiter=" ")
print(df)

In [None]:
from pandas.plotting import scatter_matrix

# Pairwise scatter plots of every column against every other column, with a
# kernel density estimate of each column on the diagonal.
plot_options = dict(alpha=0.8, figsize=(7, 7), diagonal="kde")

# the trailing semicolon keeps the notebook from echoing the returned axes array
scatter_matrix(df, **plot_options);

In [None]:
# Split the DataFrame into a feature matrix X and a target column y

feature_columns = ["inhabitants", "poverty", "unemployment"]
target_columns = ["murders"]  # selecting with a list keeps y two-dimensional

X = df[feature_columns].to_numpy()
y = df[target_columns].to_numpy()

# quick look at the resulting array shapes
print(X.shape, y.shape)

## 2. Train/test split for later validation

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 33% of the rows for testing; the fixed random_state makes the
# split reproducible from run to run.
split = train_test_split(X, y, test_size=0.33, random_state=0)
X_train, X_test, y_train, y_test = split

# shapes of the training partition first, then the test partition
for features, targets in ((X_train, y_train), (X_test, y_test)):
    print(features.shape, targets.shape)

## 3. Train linear model to predict `murders` 

The `LinearRegression` model from scikit-learn uses the _least squares_ method explained in class to find the linear coefficients $\theta_0, \theta_1, \dots$. Recall:

$$\theta_{lsm} = (X^T X)^{-1} X^T y$$

### train model

In [None]:
from sklearn.linear_model import LinearRegression

# Create the estimator and fit it on the training partition in one step
# (fit returns the estimator itself); this is the least-squares solve
# described by the formula above.
model1 = LinearRegression().fit(X_train, y_train)

# The learned thetas: feature weights live in coef_, the intercept is kept
# separately in intercept_.
print(model1.coef_, model1.intercept_)

### show predictions + errors on training data points

In [None]:
# Extract the learned coefficients as a column vector. The -1 lets NumPy infer
# the number of features instead of hard-coding it, so this cell keeps working
# if the list of feature columns used to build X changes.
theta_vector = np.array(model1.coef_[0]).reshape((-1, 1))
bias = model1.intercept_[0]

# Predictions for the training points, computed two ways:
y_pred = model1.predict(X_train)  # through scikit-learn
y_pred_sanity_check = X_train @ theta_vector + bias  # through the explicit linear formula

# Fail loudly if the two disagree, rather than relying on eyeballing the table below
np.testing.assert_allclose(y_pred, y_pred_sanity_check, atol=1e-8)

# per-example absolute and squared errors
abs_error = np.abs(y_pred - y_train)
sq_error = np.square(y_pred - y_train)

# print training data with predictions (only the poverty feature is shown)
dict_data = {
    "poverty": X_train[:, 1].ravel(),
    "target": y_train.ravel(),
    "pred(sklearn)": y_pred.ravel(),
    "pred(formula)": y_pred_sanity_check.ravel(),
    "abs_error": abs_error.ravel(),
    "squared_error": sq_error.ravel(),
}
print(pd.DataFrame.from_dict(dict_data))

# show _mean squared error_; scikit-learn metrics take (y_true, y_pred) in that order
print(f"\nThe training mean squared error is: {mean_squared_error(y_train, y_pred)}")

## 4. Test trained linear model to predict `murders` on __unseen data__  (test partition)

In [None]:
# Predict on the test partition, which was not used when fitting the model.
# Basically doing: y_pred = X_test @ theta_vector + bias
# NOTE: y_pred, abs_error, sq_error and dict_data are reassigned here, so after
# this cell they hold test-partition values.
y_pred = model1.predict(X_test)

# per-example absolute and squared errors
abs_error = np.abs(y_pred - y_test)
sq_error = np.square(y_pred - y_test)

# show them in table (only the poverty feature is shown)
dict_data = {
    "poverty": X_test[:, 1].ravel(),
    "target": y_test.ravel(),
    "pred": y_pred.ravel(),
    "abs_error": abs_error.ravel(),
    "squared_error": sq_error.ravel(),
}
print(pd.DataFrame.from_dict(dict_data))

# scikit-learn metrics take (y_true, y_pred) in that order
print(f"\nThe TEST mean squared error is: {mean_squared_error(y_test, y_pred)}")

## 5. `vander` polynomial expansion for the next script

In the next script we will see how to expand the input dataset with polynomial features derived from the original data.
Here we show, in a simple way, how to do this with [numpy's vander](https://numpy.org/doc/stable/reference/generated/numpy.vander.html) function,
which computes $[x^0, x^1, x^2, \dots]$ for each value of a feature $x$.

In [None]:
# Toy feature values and the highest polynomial degree to generate
x = np.array([1, 2, 3, 5])
deg = 3

# One row per value of x, one column per power 0..deg (hence deg + 1 columns);
# increasing=True orders the columns from x^0 upwards.
n_columns = deg + 1
np.vander(x, N=n_columns, increasing=True)