In [None]:
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import cross_val_score

from sklearn.model_selection import train_test_split


from sklearn.linear_model import Ridge
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures, SplineTransformer

import pandas as pd

In [None]:
def f(x):
    """Target function x * sin(x) that the regression models try to recover.

    Works element-wise, so `x` may be a scalar or a NumPy array; the result
    has the same shape as the input.
    """
    sine_of_x = np.sin(x)
    return x * sine_of_x
def foo(sigma, num, seed=787):
    """Draw a noisy sample of the target function `f` on the interval [0, 10).

    Parameters
    ----------
    sigma : float
        Standard deviation of the zero-mean Gaussian noise added to f(X).
    num : int
        Number of points to draw.
    seed : int, default 787
        Seed for the random generator. Calling with the same seed reproduces
        the same sample.

    Returns
    -------
    pandas.DataFrame
        Columns "X" (uniform draws, sorted ascending) and "Y" (f(X) + noise),
        with `num` rows.
    """
    # Honour the caller-supplied seed (a hard-coded 787 here would make the
    # `seed` parameter a no-op).
    rng = np.random.default_rng(seed=seed)

    # Draw order matters for reproducibility: x values first, then the noise.
    X = np.sort(rng.uniform(0, 10, num))
    error = rng.normal(0, sigma, num)
    Y = f(X) + error
    return pd.DataFrame({'X': X, 'Y': Y})

# Noisy sample of 1000 points, noise standard deviation 1.5.
df = foo(sigma=1.5, num=1000)

# Evenly spaced grid over the whole range we want to plot, as a column vector.
# np.linspace already returns ascending values, so no separate sort is needed.
x_population = np.linspace(0, 10, 100).reshape(-1, 1)

fig, ax = plt.subplots()

# Noise-free curve on the grid, then the noisy observations.
ax.plot(x_population, f(x_population), color="red", label="ground truth", linewidth=2)
ax.scatter(df["X"], df["Y"], color="blue", label="Data Sample", linewidth=2)
ax.legend()

plt.show()

In [None]:
# 80/20 train/test split of the sample; the fixed random_state makes the
# split reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    df["X"], df["Y"], train_size=0.8, random_state=787
)


def _sorted_xy_frame(x_values, y_values):
    """Pair x and y values in a DataFrame ordered by X with a fresh 0..n-1 index."""
    paired = pd.DataFrame({'X': x_values, 'Y': y_values})
    return paired.sort_values("X").reset_index(drop=True)


df_train = _sorted_xy_frame(X_train, Y_train)
df_test = _sorted_xy_frame(X_test, Y_test)

# Show the training and test subsets on one set of axes.
fig, ax = plt.subplots()
for subset, subset_label in ((df_train, "Training Data"), (df_test, "Test Data")):
    ax.scatter(subset["X"], subset["Y"], linewidth=2, label=subset_label)
ax.legend()
plt.show()

In [None]:
fig, ax = plt.subplots()
ax.set_prop_cycle(
    color=["black", "teal", "yellowgreen", "gold", "darkorange", "tomato"]
)

ax.plot(x_population, f(x_population), color="red", label="ground truth", linewidth=2)
ax.scatter(X_train, Y_train, label="Training Data", linewidth=2)

# The estimators are fed column vectors; the reshapes do not depend on the
# polynomial degree, so they are done once before the loop.
train_inputs = df_train["X"].values.reshape(-1, 1)
train_targets = df_train["Y"].values.reshape(-1, 1)
test_inputs = df_test["X"].values.reshape(-1, 1)

for degree in (3, 4, 5):
    # Polynomial feature expansion of the given degree followed by a ridge fit.
    model = make_pipeline(PolynomialFeatures(degree), Ridge(alpha=1e-3))
    model.fit(train_inputs, train_targets)

    # Keep the test-set predictions in a per-degree column of df_test and draw
    # them against the (sorted) test x values.
    prediction_column = f"y_hat_{degree}"
    df_test[prediction_column] = model.predict(test_inputs)
    ax.plot(df_test["X"], df_test[prediction_column], label=f"degree {degree}")

ax.legend()
plt.show()

In [None]:
# Refit the degree-5 polynomial ridge model on the training data.
# The degree is set explicitly rather than inherited from a loop variable left
# behind by another cell, so this cell gives the same model regardless of how
# that loop is edited or in which order cells are run.
degree = 5
model = make_pipeline(PolynomialFeatures(degree), Ridge(alpha=1e-3))
model.fit(df_train["X"].values.reshape(-1,1), df_train["Y"].values.reshape(-1,1))

# Preview the first rows of the full sample instead of rendering the whole frame.
df.head()

In [None]:
# Training / test data as arrays. scikit-learn requires a 2-D feature matrix,
# hence the reshape to a single column; the targets stay 1-D.
# X_train / Y_train keep the shuffled order produced by train_test_split, which
# matters below: an integer `cv` splits the rows into consecutive folds, so the
# rows must not be sorted by x.
x_train = X_train.to_numpy().reshape(-1, 1)
y_train = Y_train.to_numpy()
x_test = X_test.to_numpy().reshape(-1, 1)
y_test = Y_test.to_numpy()

fig, ax = plt.subplots()

ax.plot(x_population, f(x_population), linewidth=2, label="ground truth")
ax.scatter(x_train, y_train, linewidth=2, label="Training Data")
ax.scatter(x_test, y_test, linewidth=2, label="Test Data")

# Cubic B-spline basis with 4 knots followed by a ridge fit.
model = make_pipeline(SplineTransformer(n_knots=4, degree=3), Ridge(alpha=1e-3))

# Evaluate the model using 10-fold cross-validation on the training data.
# cross_val_score refits clones of the pipeline on each fold, so it is run on
# the training set; the test set stays untouched by any fitting.
# The printed value is the mean *negated* MSE (closer to 0 is better).
scores = cross_val_score(
        model, x_train, y_train, scoring="neg_mean_squared_error", cv=10
    )
print(scores.mean().item())

# Fit on the full training set and draw the fitted curve on the plotting grid.
model.fit(x_train, y_train)
y_plot = (model.predict(x_population)).reshape(-1,1)

ax.plot(x_population, y_plot, label="B-spline")
ax.legend()
plt.show()

In [None]:
# Training data as arrays: a 2-D (n_samples, 1) feature matrix as required by
# scikit-learn, and a 1-D target vector.
x_train = X_train.to_numpy().reshape(-1, 1)
y_train = Y_train.to_numpy()

# plot function
lw = 2
fig, ax = plt.subplots()
ax.set_prop_cycle(
    color=["black", "teal", "yellowgreen", "gold", "darkorange", "tomato"]
)
# x_population is the plotting grid defined earlier in the notebook.
ax.plot(x_population, f(x_population), linewidth=lw, label="ground truth")

# plot training points
ax.scatter(x_train, y_train, label="training points")

# polynomial features: one ridge fit per degree, each drawn on the grid
for degree in [3, 4, 5]:
    model = make_pipeline(PolynomialFeatures(degree), Ridge(alpha=1e-3))
    model.fit(x_train, y_train)
    y_plot = model.predict(x_population)
    ax.plot(x_population, y_plot, label=f"degree {degree}")

ax.legend(loc="lower center")
ax.set_ylim(-20, 10)
plt.show()

In [None]:
# 10-fold cross-validation of the most recently defined `model` on the
# training data. The arrays are built here so the cell does not depend on
# names that are never defined elsewhere; X_train / Y_train keep the shuffled
# order from train_test_split, so the consecutive folds are not sorted by x.
x_train = X_train.to_numpy().reshape(-1, 1)
y_train = Y_train.to_numpy()

scores = cross_val_score(
        model, x_train, y_train, scoring="neg_mean_squared_error", cv=10
    )

# Scores are negated MSE values; flip the sign to display the mean MSE.
-scores.mean()