In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, LogisticRegression

from sklearn import set_config
set_config(display="diagram")

# Linear Algebra 3

### Part 3: Projection Matrix

Say X and y are known, but we can't solve for c because X has more rows than columns:

### <font color='red'>$Xc = y$</font>

We can, however, usually (unless there are multiple equally good solutions) solve the following, which we get by multiplying both sides by $X^T$:

### <font color='red'>$X^TXc = X^Ty$</font>

If we can find a c to make the above true, we can multiply both sides by $(X^TX)^{-1}$ (which generally exists unless the columns of X are redundant) to get this equation:

$(X^TX)^{-1}X^TXc = (X^TX)^{-1}X^Ty$

Simplify:

$c = (X^TX)^{-1}X^Ty$

Multiply both sides by X:

### <font color='red'>$Xc = X(X^TX)^{-1}X^Ty$</font>

### Note we started with an unsolvable $Xc = y$ problem but multiplied $y$ by something to get a different $Xc = ????$ that is solvable.

Define <font color="red">$P = X(X^TX)^{-1}X^T$</font>.  This is a **projection matrix**.  If you multiply a vector by $P$, you get back a new vector of the same size, with two properties:

1. it will be in the column space of $X$
2. the new vector will be as "close as possible" to the original vector

Note: computing P is generally very expensive.

<img src="projection.png" width="80%">

**Note:** $\theta$ in the image above is the same as $c$

Cite: [CS760 Regression Slide](https://pages.cs.wisc.edu/~fredsala/cs760/fall2021/slides/lecture6-regression.pdf)

### Fruit Sales Example

In [None]:
# Design matrix: one row per observation; the third column is a constant 1.
X = np.array([[10, 0, 1],
              [2, 8, 1],
              [4, 4, 1],
              [10, 4, 1],
              [10, 4, 1]])
# Targets written directly as a 5x1 column vector (one value per row of X).
y = np.array([[7], [5], [5], [8], [8.5]])
y

In [None]:
# X is 5x3 (more rows than columns). np.linalg.solve requires a square
# coefficient matrix, so this call raises LinAlgError. The failure is the
# point of this cell: catch it and show the message so that
# "Restart Kernel -> Run All" does not stop here.
try:
    c = np.linalg.solve(X, y)
except np.linalg.LinAlgError as err:
    c = None  # no solution was obtained; a later cell computes c properly
    print(f"np.linalg.solve(X, y) failed: {err}")
c

Multiply both sides by `X.T` ---> this will usually make it solvable.

In [None]:
c = np.linalg.solve(????, ????)
c

What is special about multiplying a matrix by its transpose? The result is always a square matrix.

In [None]:
X.T.shape

In [None]:
X.shape

In [None]:
(X.T @ X).shape

Let's compute $P = X(X^TX)^{-1}X^T$.

- **IMPORTANT**: We are not going to discuss how inverse works. That is beyond the scope of CS320.

### `np.linalg.inv(a)`

- computes the (multiplicative) inverse of a matrix.
- documentation: https://numpy.org/doc/stable/reference/generated/numpy.linalg.inv.html

In [None]:
P = ????
P

In [None]:
X

In [None]:
y

The new vector will be as "close as possible" to the original vector.

In [None]:
P @ y

#### Scatter plot visualization

**IMPORTANT**: We are not going to discuss how `np.random.normal` works. You can look up the documentation if you are interested.

In [None]:
# Seed NumPy's global generator so the synthetic data, and every table and
# plot derived from it, is identical on each "Restart Kernel -> Run All".
np.random.seed(320)
# 10 x-values drawn from a normal distribution (mean 5, std. dev. 2), as a 10x1 column
x = np.random.normal(5, 2, size=(10, 1))
# y is 2*x plus standard-normal noise, with the same shape as x.
# NOTE: this rebinds y, which previously held the fruit-sales target vector.
y = 2*x + np.random.normal(size=x.shape)
# Flatten both columns to 1-D so each becomes a DataFrame column
df = pd.DataFrame({"x": x.reshape(-1), "y": y.reshape(-1)})
df

In [None]:
df.plot.scatter(x="x", y="y", figsize=(5, 5))

In [None]:
X = ????.????
X

In [None]:
# Projection matrix P = X (X^T X)^{-1} X^T, built in two steps:
# first invert the square matrix X^T X, then place it between X and X^T.
xtx_inv = np.linalg.inv(X.T @ X)
P = X @ xtx_inv @ X.T
P

In [None]:
df["p"] = P @ ????
df

In [None]:
ax = df.plot.scatter(x="x", y="y", figsize=(5, 5), color="k")
df.plot.scatter(x="x", y=????, color="r", ax=ax)

### Euclidean Distance between columns

- how close is the new vector (`P @ y`) to the original vector (`y`)?
- $dist = \sqrt{(x_2 - x_1)^2 + (y_2 - y_1)^2}$

In [None]:
# Two points stored as columns: v1 = (1, 8) and v2 = (4, 12).
# The rows hold their x and y coordinates.
coords = pd.DataFrame(
    [[1, 4],
     [8, 12]],
    index=["x", "y"],
    columns=["v1", "v2"],
)
coords

In [None]:
# Euclidean distance between v1 and v2: the coordinate differences are
# -3 and -4, so the result is sqrt(9 + 16) = 5
delta = coords["v1"] - coords["v2"]
(delta ** 2).sum() ** 0.5

In [None]:
# Euclidean distance between the original vector y and its projection p.
# This is the smallest possible distance between y and p such
# that X @ c = p is solvable.
((???? - ????) ** 2).sum() ** 0.5

### Lab review

In [None]:
# As an exception, I am providing all the relevant import statements in this cell
import numpy as np
import rasterio
from rasterio.mask import mask
from shapely.geometry import box
import geopandas as gpd

# Open the raster inside a `with` block so the file handle is closed as soon
# as the pixels needed for the plot have been read.
with rasterio.open("zip://land.zip!wi.tif") as land:
    # a = land.read()
    # Bounding box given in EPSG:4326 coordinates, re-projected into the raster's CRS
    window = gpd.GeoSeries([box(-89.5, 43, -89.2, 43.2)]).set_crs("epsg:4326").to_crs(land.crs)
    # mask() returns a tuple; element 0 is the cropped image array
    cropped = mask(land, window, crop=True)[0]
# Plot the first layer of the cropped image
plt.imshow(cropped[0])

# Classification 1

In [None]:
data = datasets.load_iris()
# Start from all measurements, then discard petal length.
measurements = pd.DataFrame(data["data"], columns=data["feature_names"])
df = measurements.drop(columns=["petal length (cm)"])
# Column of ones inserted at position 2; the models below are created with
# fit_intercept=False and use this column in its place.
df.insert(2, "const", 1)
# Translate each integer target code into its variety name.
df["variety"] = data["target_names"][data["target"]]
# Boolean label for the binary "is it setosa?" task, inserted at position 4
# (just before "variety").
df.insert(4, "setosa", df["variety"] == "setosa")

In [None]:
# Feature columns: the two sepal measurements plus the constant column
xcols = ["sepal length (cm)", "sepal width (cm)", "const"]
# Hold out exactly 10 rows as the test set; random_state=5 fixes the shuffle
# so the same rows are selected on every run
train, test = train_test_split(df, test_size=10, random_state=5)
test

### Model 1: Predict petal width

- regression problem

In [None]:
# Features: the two sepal measurements plus the constant column
xcols = ["sepal length (cm)", "sepal width (cm)", "const"]
# Target: the column this regression model predicts (petal width)
ycol = ????

# 1. initialize model
#    fit_intercept=False because xcols already contains "const" (all ones),
#    so the coefficient learned for "const" acts as the intercept
reg_model = ????(fit_intercept=False)
# 2. fit using train data (features first, then target)
reg_model.fit(????, ????)
# 3. predict for test data and add predictions as a column
test["pet_width_predictions"] = reg_model.predict(????)
test

In [None]:
reg_model.score(????, ????)

### LogisticRegression

- classification model
- predict categorical labels

### Model 2: Predict whether flower is "setosa"

- classification problem, specifically binary classification: True / False

In [None]:
# Features: the two sepal measurements plus the constant column
xcols = ["sepal length (cm)", "sepal width (cm)", "const"]
# Target: the True/False column saying whether the flower is "setosa"
ycol = ????

# 1. initialize model
#    fit_intercept=False because xcols already contains "const" (all ones),
#    so the coefficient learned for "const" acts as the intercept
cls_model = ????(fit_intercept=False)
# 2. fit using train data
cls_model.fit(train[xcols], train[ycol])
# 3. predict for test data and add predictions as a column
test["setosa_predictions"] = cls_model.predict(test[xcols])
test

What is the accuracy? That is, what percent of the time is it correct?

Review of `score` method.

In [None]:
cls_model.score(test[xcols], test[ycol])

### Model 2b: Predict probability of flower being "setosa"

- classification problem, probability between 0 and 1

#### `<model object>.predict_proba(X)`

- Probability estimates for each class.
- returns a numpy array with one row per sample and one column per class (columns ordered as in `classes_`):
    - [[False probability, True probability], ...]
- documentation: https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html#sklearn.linear_model.LogisticRegression.predict_proba

In [None]:
cls_model.????(test[xcols])  

Extract just the True probabilities.

In [None]:
cls_model.predict_proba(test[xcols])[:, ????]

In [None]:
test["setosa_prob"] = cls_model.predict_proba(test[xcols])[:, ????]
test

### Model 3: Predict variety of flower

- classification problem, specifically multi-class classification for `variety`

In [None]:
# PREDICT: which of the 3 varieties is a particular Iris?
xcols = ["sepal length (cm)", "sepal width (cm)", "const"]
ycol = ????

# 1. initialize model
mult_model = LogisticRegression(fit_intercept=False)
# 2. fit using train data
mult_model.fit(train[xcols], train[ycol])
# 3. predict for test data and add predictions as a column
test["variety_predictions"] = mult_model.predict(test[xcols])
test

What is the accuracy?

In [None]:
mult_model.score(test[xcols], test[ycol])