# Regression analysis with SVD decomposition for a real covariance with a fake model

In [None]:
import pandas as pd
import numpy as np
from numpy.random import multivariate_normal, normal
from numpy.linalg import cholesky, svd
from scipy.sparse.linalg import lsqr
from itertools import accumulate
from sklearn.linear_model import LinearRegression
from scipy.linalg import lstsq
from scipy.optimize import nnls

import pytest

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
import sandy

## Real covariance with few groups (33)

In [None]:
# Retrieve the "jeff_33" cross-section file for ZAM 922350 and request the
# cross-section covariance only (nubar, chi and mubar are switched off).
# The ERRORR options restrict the processing to reaction MT=102 on group
# structure ign=19 (33 groups according to the section title).
mt = 102
tape = sandy.get_endf6_file("jeff_33", "xs", 922350)
errorr_outputs = tape.get_errorr(
    err=1,
    xs=True,
    nubar=False,
    chi=False,
    mubar=False,
    errorr33_kws={"mt": mt, "ign": 19},
    verbose=True,
)
err = errorr_outputs["errorr33"]
cov = err.get_cov()

In [None]:
# Inflate the diagonal of the covariance matrix before decomposing it: each
# diagonal term grows by 0.5 % of its value plus a constant 1e-4.
# NOTE(review): presumably a regularization to improve conditioning - confirm.
raw = cov.data.copy()
diag_shift = np.diag(raw.values) * 0.5 / 100 + 0.0001
C = pd.DataFrame(
    raw.values + np.diag(diag_shift),
    index=raw.index,
    columns=raw.columns,
)
cov_ = sandy.CategoryCov(C)

# Singular value decomposition of the adjusted matrix; Lambda12 is the
# diagonal matrix holding the square roots of the singular values.
U, S, V = svd(C)
Lambda12 = np.diag(np.sqrt(S))

# Sanity check: (U @ Lambda12) must reproduce C when multiplied by its own
# transpose, i.e. it can be used as a square-root factor of C.
root = U @ Lambda12
np.testing.assert_almost_equal(root @ root.T, C.values)

In [None]:
# Heatmap of the correlation matrix derived from the adjusted covariance,
# drawn on a diverging colour scale fixed to the range [-1, 1].
corr = cov_.get_corr().data
fig, ax = plt.subplots(figsize=(6, 6))
sns.heatmap(corr, cmap="bwr", vmin=-1, vmax=1, ax=ax)
fig.tight_layout()

In [None]:
# Problem size.
M = cov.data.shape[0]  # number of parameters
N = 5000  # number of samples

# Share of each singular value in the total, and its running sum.
# np.cumsum performs the accumulation inside NumPy instead of looping over
# the array in Python and rebuilding it from a list.
frac = S / S.sum()
acc = np.cumsum(frac)

# Reduced number of parameters: count of leading singular values whose
# cumulative share is still below 98 %, plus the one that crosses it.
r = int(np.count_nonzero(acc < 0.98)) + 1

print(f"sample size:                  {N:>10}\nnumber of parameters:         {M:>10}\nreduced number of parameters: {r:>10}")

In [None]:
# Uncorrelated sample: N x M independent normal draws with unit standard
# deviation.
# NOTE(review): the draws are centred on 1 (loc=1.), whereas a "standardized"
# sample would normally be centred on 0 - confirm this is intended.
X_ = normal(loc=1., scale=1., size=(N, M))

# Correlated sample: every row of X_ is mapped through the square-root
# factor U @ Lambda12 obtained from the SVD of the covariance matrix.
transform = U @ Lambda12
X = X_ @ transform.T

In [None]:
# Coefficients of the synthetic linear model, in decreasing order: M, ..., 1.
f = np.arange(1, M + 1)[::-1]


def model(x):
    """Return the linear response ``f @ x``.

    ``f`` is read from the enclosing namespace when the function is called,
    so the result follows whatever ``f`` is currently defined.
    """
    return f @ x

In [None]:
# Evaluate the model on every sample (samples are the rows of X) and solve
# the least-squares problem X @ Z ~= Y for the coefficients Z.
Y = model(X.T)
solution = lstsq(X, Y)
Z = solution[0]

# Disabled alternative kept for reference: the same solution written out
# explicitly with the SVD of X.
#u, s, vh = svd(X, full_matrices=False)
#Z1 = (vh.T @ np.diag(1/s) @ u.T @ Y.reshape(-1, 1)).flatten()  # svd decomposition, same as in lstsq

In [None]:
fig, ax = plt.subplots(figsize=(15, 4))

# Least-squares coefficients obtained from truncated samples. For each
# fraction `fv`, `n` is one more than the number of entries of `acc` below
# `fv`, and only the first `n` rows of X enter the fit.
# NOTE(review): X[:n, :] selects rows, i.e. samples, not singular vectors.
data = {}
for fv in np.array([0.95, 0.98, 0.99, 0.999]):
    n = np.count_nonzero(acc < fv) + 1
    subset = X[:n, :]
    Y = model(subset.T)
    Z = lstsq(subset, Y)[0]
    data[f"N={n}, FV={fv}"] = Z

# Reference fit using the whole sample.
Y = model(X.T)
Z = lstsq(X, Y)[0]
data[f"N={N}"] = Z

# One line per fit; the y-axis is clipped to [-50, largest model coefficient].
coefficients = pd.DataFrame(data)
coefficients.plot(kind="line", ax=ax)
ax.set(ylim=[-50, f.max()])

fig.tight_layout()

## Real covariance with many groups (1968)

In [None]:
# Same request as for the coarse case, but on group structure ign=20
# (1968 groups according to the section title): cross-section covariance
# only, restricted to reaction MT=102.
mt = 102
tape = sandy.get_endf6_file("jeff_33", "xs", 922350)
errorr_outputs = tape.get_errorr(
    err=1,
    xs=True,
    nubar=False,
    chi=False,
    mubar=False,
    errorr33_kws={"mt": mt, "ign": 20},
    verbose=True,
)
err = errorr_outputs["errorr33"]
cov = err.get_cov()

In [None]:
# Add 0.5 % of each diagonal term plus a constant 1e-4 to the diagonal of
# the covariance matrix, then wrap the result for later plotting.
# NOTE(review): presumably a regularization to improve conditioning - confirm.
raw = cov.data.copy()
diag_shift = np.diag(raw.values) * 0.5 / 100 + 0.0001
C = pd.DataFrame(
    raw.values + np.diag(diag_shift),
    index=raw.index,
    columns=raw.columns,
)
cov_ = sandy.CategoryCov(C)

# SVD of the adjusted matrix and diagonal matrix of the square roots of
# its singular values.
U, S, V = svd(C)
Lambda12 = np.diag(np.sqrt(S))

# Check that (U @ Lambda12) times its transpose gives back C.
root = U @ Lambda12
np.testing.assert_almost_equal(root @ root.T, C.values)

In [None]:
# Correlation matrix of the adjusted covariance as a heatmap, with the
# colour scale pinned to [-1, 1].
corr = cov_.get_corr().data
fig, ax = plt.subplots(figsize=(6, 6))
sns.heatmap(corr, cmap="bwr", vmin=-1, vmax=1, ax=ax)
fig.tight_layout()

In [None]:
# Problem size.
M = cov.data.shape[0]  # number of parameters
N = 5000  # number of samples

# Relative weight of each singular value and its running sum, computed
# with np.cumsum rather than a Python-level accumulate over the array.
frac = S / S.sum()
acc = np.cumsum(frac)

# Reduced number of parameters: count of leading singular values whose
# cumulative share is still below 98 %, plus the one that crosses it.
r = int(np.count_nonzero(acc < 0.98)) + 1

print(f"sample size:                  {N:>10}\nnumber of parameters:         {M:>10}\nreduced number of parameters: {r:>10}")

In [None]:
# Uncorrelated sample: N x M independent normal draws with unit standard
# deviation.
# NOTE(review): the draws use loc=1., so they are not zero-mean as the word
# "standardized" would suggest - confirm this is intended.
X_ = normal(loc=1., scale=1., size=(N, M))

# Correlated sample: rows of X_ transformed by the factor U @ Lambda12
# built from the SVD of the covariance matrix.
transform = U @ Lambda12
X = X_ @ transform.T

In [None]:
# Coefficients of the synthetic linear model, in decreasing order: M, ..., 1.
f = np.arange(1, M + 1)[::-1]


def model(x):
    """Return the linear response ``f @ x``.

    ``f`` is resolved in the enclosing namespace at call time, so it must
    match the parameter count of the array passed in.
    """
    return f @ x

In [None]:
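# Disabled reference code: least-squares solution written explicitly with
# the SVD of X. It needs Y = model(X.T) to be computed before it can run.
#u, s, vh = svd(X, full_matrices=False)
#Z1 = (vh.T @ np.diag(1/s) @ u.T @ Y.reshape(-1, 1)).flatten()  # svd decomposition, same as in lstsq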

In [None]:
fig, ax = plt.subplots(figsize=(15, 4))

# Fit the model on truncated samples: for each fraction `fv`, keep the
# first `n` rows of X, where `n` is one more than the number of entries of
# `acc` that lie below `fv`.
# NOTE(review): X[:n, :] selects rows, i.e. samples, not singular vectors.
data = {}
for fv in np.array([0.95, 0.98, 0.99, 0.999]):
    n = np.count_nonzero(acc < fv) + 1
    subset = X[:n, :]
    Y = model(subset.T)
    Z = lstsq(subset, Y)[0]
    data[f"N={n}, FV={fv}"] = Z

# Reference fit using all N samples.
Y = model(X.T)
Z = lstsq(X, Y)[0]
data[f"N={N}"] = Z

# Plot every coefficient vector as a line; y-axis limited to
# [-50, largest model coefficient].
coefficients = pd.DataFrame(data)
coefficients.plot(kind="line", ax=ax)
ax.set(ylim=[-50, f.max()])

fig.tight_layout()

## Real covariance with many groups (239) and many reactions (all=8)

In [None]:
# Cross-section covariance on group structure ign=2 (239 groups according
# to the section title). No reaction filter is given to ERRORR here, so the
# covariance is not restricted to a single MT.
# NOTE(review): `mt` is assigned but not passed to get_errorr in this cell.
mt = 102
tape = sandy.get_endf6_file("jeff_33", "xs", 922350)
errorr_outputs = tape.get_errorr(
    err=1,
    xs=True,
    nubar=False,
    chi=False,
    mubar=False,
    errorr33_kws={"ign": 2},
    verbose=True,
)
err = errorr_outputs["errorr33"]
cov = err.get_cov()

In [None]:
# Inflate the diagonal of the covariance matrix (0.5 % of each diagonal
# term plus a constant 1e-4) and report the condition number before and
# after the adjustment.
raw = cov.data.copy()
print(f"condition number original matrix: {np.linalg.cond(raw):>10}")
diag_shift = np.diag(raw.values) * 0.5 / 100 + 0.0001
C = pd.DataFrame(
    raw.values + np.diag(diag_shift),
    index=raw.index,
    columns=raw.columns,
)
print(f"condition number adjusted matrix: {np.linalg.cond(C):>10}")
cov_ = sandy.CategoryCov(C)

# SVD of the adjusted matrix and diagonal matrix of the square roots of
# its singular values.
U, S, V = svd(C)
Lambda12 = np.diag(np.sqrt(S))

# Check that (U @ Lambda12) times its transpose gives back C.
root = U @ Lambda12
np.testing.assert_almost_equal(root @ root.T, C.values)

In [None]:
# Correlation heatmap of the adjusted covariance; diverging colour map with
# limits fixed at -1 and 1.
corr = cov_.get_corr().data
fig, ax = plt.subplots(figsize=(6, 6))
sns.heatmap(corr, cmap="bwr", vmin=-1, vmax=1, ax=ax)
fig.tight_layout()

In [None]:
# Problem size.
M = cov.data.shape[0]  # number of parameters
N = 5000  # number of samples

# Normalized singular values and their running sum; np.cumsum replaces the
# Python-level itertools.accumulate + list + np.array round trip.
frac = S / S.sum()
acc = np.cumsum(frac)

# Reduced number of parameters: count of leading singular values whose
# cumulative share is still below 98 %, plus the one that crosses it.
r = int(np.count_nonzero(acc < 0.98)) + 1

print(f"sample size:                  {N:>10}\nnumber of parameters:         {M:>10}\nreduced number of parameters: {r:>10}")

In [None]:
# Uncorrelated sample: N x M independent normal draws with unit standard
# deviation.
# NOTE(review): loc=1. centres the draws on 1 rather than 0, which does not
# match the usual meaning of "standardized" - confirm this is intended.
X_ = normal(loc=1., scale=1., size=(N, M))

# Correlated sample: apply the factor U @ Lambda12 (from the SVD of the
# covariance matrix) to every row of X_.
transform = U @ Lambda12
X = X_ @ transform.T

In [None]:
# Coefficients of the synthetic linear model, in decreasing order: M, ..., 1.
f = np.arange(1, M + 1)[::-1]


def model(x):
    """Return the linear response ``f @ x``.

    ``f`` is taken from the enclosing namespace each time the function is
    called; it is not captured when the function is defined.
    """
    return f @ x

In [None]:
fig, ax = plt.subplots(figsize=(15, 4))

# Coefficients fitted on truncated samples. `n` is one more than the number
# of entries of `acc` below the fraction `fv`; the fit then uses only the
# first `n` rows of X.
# NOTE(review): X[:n, :] selects rows, i.e. samples, not singular vectors.
data = {}
for fv in np.array([0.95, 0.98, 0.99, 0.999]):
    n = np.count_nonzero(acc < fv) + 1
    subset = X[:n, :]
    Y = model(subset.T)
    Z = lstsq(subset, Y)[0]
    data[f"N={n}, FV={fv}"] = Z

# Reference fit using the complete sample.
Y = model(X.T)
Z = lstsq(X, Y)[0]
data[f"N={N}"] = Z

# Draw all coefficient vectors; y-axis limited to
# [-50, largest model coefficient].
coefficients = pd.DataFrame(data)
coefficients.plot(kind="line", ax=ax)
ax.set(ylim=[-50, f.max()])

fig.tight_layout()