This notebook fits a baseline linear regression, evaluates R²/RMSE on a train/test split, and runs residual diagnostics on the test set. To use your own CSV, edit `PATH_TO_CSV`, `TARGET_COL`, and (optionally) `FEATURE_COLS` in the config cell below.

In [None]:
import numpy as np, pandas as pd, matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error
from scipy import stats
# Single seed shared by the whole notebook so reruns are reproducible.
RANDOM_STATE = 42

# Seed NumPy's global random generator with that same value.
np.random.seed(RANDOM_STATE)

In [None]:
# --- Notebook configuration: edit these three values to use your own data ---

# Location of the input CSV (raw string, so backslashes in a path stay literal).
PATH_TO_CSV = r"/mnt/data/stage10a_project/data/synthetic_regression.csv"

# Name of the column to predict; it must be a numeric column of the CSV.
TARGET_COL = "y"

# Columns to use as predictors. Leave empty to use every numeric column
# except TARGET_COL.
FEATURE_COLS = []

In [None]:
# --- Load the data, fit the baseline model, and report train/test metrics ---
df = pd.read_csv(PATH_TO_CSV)

# Only numeric columns are considered as target / default features.
numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
if TARGET_COL not in numeric_cols:
    raise ValueError(f"TARGET_COL '{TARGET_COL}' not found among numeric columns: {numeric_cols}")

# An empty FEATURE_COLS means "every numeric column except the target".
feature_cols = FEATURE_COLS if FEATURE_COLS else [c for c in numeric_cols if c != TARGET_COL]
if not feature_cols:
    # Fail here with a clear message instead of deep inside scikit-learn.
    raise ValueError(
        f"No feature columns available: '{TARGET_COL}' is the only numeric column "
        "and FEATURE_COLS is empty."
    )

X, y = df[feature_cols].copy(), df[TARGET_COL].copy()

# 80/20 split; random_state pins the shuffle so results are reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE
)

linreg = LinearRegression().fit(X_train, y_train)
y_train_pred, y_test_pred = linreg.predict(X_train), linreg.predict(X_test)

r2_train, r2_test = r2_score(y_train, y_train_pred), r2_score(y_test, y_test_pred)

# RMSE = sqrt(MSE). Computed explicitly rather than via
# mean_squared_error(..., squared=False): that keyword was deprecated in
# scikit-learn 1.4 and removed in 1.6, so the explicit form runs on all versions.
rmse_train = float(np.sqrt(mean_squared_error(y_train, y_train_pred)))
rmse_test = float(np.sqrt(mean_squared_error(y_test, y_test_pred)))

print("Features:", feature_cols)
print("Coefficients:", dict(zip(feature_cols, linreg.coef_)))
print("Intercept:", linreg.intercept_)
print(f"R^2 train={r2_train:.4f}  test={r2_test:.4f}")
print(f"RMSE train={rmse_train:.4f}  test={rmse_test:.4f}")

In [None]:
# --- Residual diagnostics on the held-out test set ---
# Alias kept from the original cell; the QQ plot below uses it.
import scipy.stats as st

# Residual = observed - predicted. y_test is a pandas Series, so resid keeps
# the original DataFrame row labels of the test rows.
resid = y_test - y_test_pred

# 1) Residuals vs fitted values: look for curvature or a funnel shape.
plt.figure()
plt.scatter(y_test_pred, resid, alpha=0.8)
plt.axhline(0, linestyle="--")
plt.title("Residuals vs Fitted")
plt.xlabel("Fitted (test)")
plt.ylabel("Residuals")
plt.show()

# 2) Histogram of residuals: rough check of symmetry / tails.
plt.figure()
plt.hist(resid, bins=30)
plt.title("Residual Histogram (test)")
plt.xlabel("Residual")
plt.ylabel("Count")
plt.show()

# 3) Normal QQ plot of residuals.
plt.figure()
st.probplot(resid, dist="norm", plot=plt)
plt.title("QQ Plot (test)")
plt.show()

# 4) Residuals vs the first feature column.
key = X_test.columns[0]
plt.figure()
plt.scatter(X_test[key], resid, alpha=0.8)
plt.axhline(0, linestyle="--")
plt.title(f"Residuals vs {key}")
plt.xlabel(key)
plt.ylabel("Residuals")
plt.show()

# 5) Lag-1 residual correlation.
# train_test_split shuffles rows by default, so the test residuals arrive in
# random order and a lag computed on that order is meaningless. Sort by the
# original row label first so "t-1" is the nearest earlier test row.
# NOTE(review): this assumes the CSV row order is the meaningful (e.g. time)
# order — confirm for your data. Test rows are also not contiguous, so this
# is only an approximate serial-correlation check.
r = resid.sort_index().reset_index(drop=True)
r_lag = r.shift(1)
valid = r_lag.notna()  # drops the first row, which has no predecessor
corr = np.corrcoef(r[valid], r_lag[valid])[0, 1]

plt.figure()
plt.scatter(r[valid], r_lag[valid], alpha=0.8)
plt.title(f"Lag-1 Residual Scatter (corr ≈ {corr:.3f})")
plt.xlabel("Residual[t]")
plt.ylabel("Residual[t-1]")
plt.show()