# 02 · Baseline Multiple Regression

**Goal:** Fit a simple regression as a baseline MMM to compare against Meridian later.

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_absolute_percentage_error, mean_squared_error

df = pd.read_csv("../data/marketing_data_clean.csv")

In [3]:
X = df[["tv_spend","search_spend","social_spend","display_spend","price_index","promo","holiday"]]
y = df["revenue"]

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

In [5]:
reg = LinearRegression().fit(X_train, y_train)
pred = reg.predict(X_test)

In [6]:
from sklearn.metrics import r2_score, mean_absolute_percentage_error, mean_squared_error
import numpy as np

r2   = r2_score(y_test, pred)
mape = mean_absolute_percentage_error(y_test, pred)
mse  = mean_squared_error(y_test, pred)   # MSE
rmse = np.sqrt(mse)                       # RMSE

print({"R2": r2, "MAPE": mape, "RMSE": rmse})
print("Coefficients:", dict(zip(X.columns, reg.coef_)))

{'R2': -0.09908534853546036, 'MAPE': 0.12026463131319345, 'RMSE': np.float64(11578.55775706539)}
Coefficients: {'tv_spend': np.float64(0.2946906305584724), 'search_spend': np.float64(0.11901614749328226), 'social_spend': np.float64(0.44497602603705183), 'display_spend': np.float64(-2.3258563290471006), 'price_index': np.float64(3364.7367257163232), 'promo': np.float64(1878.2483151876543), 'holiday': np.float64(5326.523646092306)}


In [7]:
import numpy as np, pandas as pd
from sklearn.model_selection import TimeSeriesSplit
from sklearn.linear_model import RidgeCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import r2_score, mean_absolute_percentage_error, mean_squared_error

df = pd.read_csv("../data/marketing_data_clean.csv").copy()
# Features: log1p spends
for c in ["tv_spend","search_spend","social_spend","display_spend"]:
    df[f"log_{c}"] = np.log1p(df[c])

# Seasonality (weekly): sin/cos with period=52
t = np.arange(len(df))
df["sin52"] = np.sin(2*np.pi*t/52)
df["cos52"] = np.cos(2*np.pi*t/52)

X = df[["log_tv_spend","log_search_spend","log_social_spend","log_display_spend",
        "price_index","promo","holiday","sin52","cos52"]]
y = np.log1p(df["revenue"])  # log target

# Time-aware split (last 20% as test)
split = int(len(df)*0.8)
X_train, X_test = X.iloc[:split], X.iloc[split:]
y_train, y_test = y.iloc[:split], y.iloc[split:]

pipe = Pipeline([
    ("scaler", StandardScaler()),
    ("ridge", RidgeCV(alphas=np.logspace(-3,3,13), cv=5))
])
pipe.fit(X_train, y_train)
pred_log = pipe.predict(X_test)

# back-transform
pred = np.expm1(pred_log)
y_true = df["revenue"].iloc[split:]

r2   = r2_score(y_true, pred)
mape = mean_absolute_percentage_error(y_true, pred)
rmse = np.sqrt(mean_squared_error(y_true, pred))
print({"R2": r2, "MAPE": mape, "RMSE": rmse})

coef = pipe.named_steps["ridge"].coef_
print("Coefficients:", dict(zip(X.columns, coef)))

{'R2': 0.5158619746585984, 'MAPE': 0.0793989808488486, 'RMSE': np.float64(7684.636944477699)}
Coefficients: {'log_tv_spend': np.float64(0.0017816217327150652), 'log_search_spend': np.float64(-0.0003885028128915171), 'log_social_spend': np.float64(0.006488431021336523), 'log_display_spend': np.float64(-0.011715353320564902), 'price_index': np.float64(-0.0013453296135823955), 'promo': np.float64(0.008863619839149165), 'holiday': np.float64(0.020290360476916406), 'sin52': np.float64(0.09468879136674925), 'cos52': np.float64(-0.006249738001058412)}


Before applying Google Meridian’s Bayesian MMM, I built a baseline regression model to serve as a benchmark.

**Approach**: Multiple linear regression with log-transformed media spends, seasonality controls (sin/cos), and Ridge regularization to address multicollinearity.

**Features**: log_tv_spend, log_search_spend, log_social_spend, log_display_spend, price_index, promo, holiday, sin52, cos52.

**Evaluation**: Time-aware train/test split (last 20% for testing).

**Results**:

* R² ≈ 0.52 (model explains ~52% of variance in revenue)
* MAPE ≈ 7.9% (average prediction error)
* RMSE ≈ 7.7k (revenue units)

**Insights**:

* Media spends (TV, Search, Social) positively correlated with revenue.
* Price index negative (higher prices reduce sales).
* Promotions & holidays significantly boost sales.
* Seasonality captured via weekly sine/cosine features.