# Exploring Cross-Validation, Overfitting, and Underfitting


In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score

# Build a 1-feature synthetic regression problem with a mild quadratic
# component, then hold out 20% of it as a test set.
# Seeding NumPy's global RNG keeps the run repeatable: per the scikit-learn
# docs, make_regression draws from that global RNG when no random_state is given.
np.random.seed(0)
X, y = make_regression(n_samples=100, n_features=1, noise=10, bias=10)

# Sort the samples by feature value so later line plots over X are drawn
# left-to-right. The SAME permutation is applied to y; sorting X on its own
# would detach every target from the feature value that produced it.
sort_idx = np.argsort(X[:, 0])
X = X[sort_idx]
y = y[sort_idx]

# Add the non-linear term using the 1-D feature column. X itself is (100, 1),
# and adding that to the (100,) target would broadcast into a (100, 100) matrix.
y = y + 0.5 * X[:, 0] ** 2

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

plt.scatter(X, y, color='blue', label='Data')
plt.xlabel('Feature')
plt.ylabel('Target')
plt.title('Synthetic Data')
plt.legend()
plt.show()


# Underfitting

In [None]:

# Baseline: a plain straight-line model, scored with 5-fold cross-validation
# on the training split only.
model_linear = LinearRegression()
cv_scores_linear = cross_val_score(
    model_linear,
    X_train,
    y_train,
    scoring='neg_mean_squared_error',
    cv=5,
)
# The scorer is 'neg_mean_squared_error', so flip the sign of the average
# to report a regular (positive) MSE.
mean_cv_score_linear = -cv_scores_linear.mean()

# Fit on the whole training split, then measure error on the held-out test split.
y_pred_linear = model_linear.fit(X_train, y_train).predict(X_test)
mse_linear = mean_squared_error(y_test, y_pred_linear)

print(f"Linear Regression Model - Mean CV MSE: {mean_cv_score_linear:.2f}")
print(f"Linear Regression Model - Test MSE: {mse_linear:.2f}")

# Overlay the fitted line on every data point (train and test together).
plt.scatter(X, y, label='Data', color='blue')
y_line_linear = model_linear.predict(X)
plt.plot(X, y_line_linear, label='Linear Fit', color='red')
plt.title('Linear Regression (Underfitting)')
plt.xlabel('Feature')
plt.ylabel('Target')
plt.legend()
plt.show()


# Overfitting

In [None]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline

# High-degree polynomial model: expand the single feature into polynomial
# terms up to `degree`, then fit ordinary least squares on the expansion.
degree = 15
model_poly = make_pipeline(
    PolynomialFeatures(degree=degree),
    LinearRegression(),
)
cv_scores_poly = cross_val_score(
    model_poly,
    X_train,
    y_train,
    scoring='neg_mean_squared_error',
    cv=5,
)
# The scorer is 'neg_mean_squared_error', so flip the sign of the average
# to report a regular (positive) MSE.
mean_cv_score_poly = -cv_scores_poly.mean()

# Fit on the whole training split, then measure error on the held-out test split.
y_pred_poly = model_poly.fit(X_train, y_train).predict(X_test)
mse_poly = mean_squared_error(y_test, y_pred_poly)

print(f"Polynomial Regression (Degree {degree}) - Mean CV MSE: {mean_cv_score_poly:.2f}")
print(f"Polynomial Regression (Degree {degree}) - Test MSE: {mse_poly:.2f}")

# Evaluate the fitted curve on 100 evenly spaced points spanning the observed
# feature range, shaped as a column because the pipeline expects 2-D input.
X_fit = np.linspace(X.min(), X.max(), 100)[:, np.newaxis]
y_curve_poly = model_poly.predict(X_fit)

plt.scatter(X, y, label='Data', color='blue')
plt.plot(X_fit, y_curve_poly, label='Polynomial Fit', color='green')
plt.title('Polynomial Regression (Overfitting)')
plt.xlabel('Feature')
plt.ylabel('Target')
plt.legend()
plt.show()
