# Newsletter Content → Ad Matching System
### A Mini Ad Network using NLP Embeddings, CTR Prediction & Ranking

This notebook demonstrates an end-to-end Machine Learning system inspired by
real-world newsletter advertising platforms (e.g., beehiiv).

**Core components:**
- Transformer-based text embeddings
- Vector similarity search
- CTR prediction
- Multi-objective ranking
- Production-style ML pipeline


Install Dependencies

In [None]:
!pip install -q sentence-transformers faiss-cpu mlflow fastapi uvicorn

Imports

In [None]:
import os
import numpy as np
import pandas as pd
import joblib
import faiss

from sentence_transformers import SentenceTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, log_loss

# Create the output directory that the later cells save artifacts into;
# exist_ok=True keeps this cell safe to re-run.
os.makedirs("models", exist_ok=True)



Load Articles Dataset (AG News)

In [None]:
# AG News training split (small, clean, public), fetched straight from GitHub.
url = "https://raw.githubusercontent.com/mhjabreel/CharCnn_Keras/master/data/ag_news_csv/train.csv"

# The file is read without a header row, so the column names are supplied here.
articles = pd.read_csv(url, header=None, names=["label", "title", "description"])

# Single free-text field per article: "<title> <description>".
articles = articles.assign(text=articles["title"] + " " + articles["description"])

articles.head()


Create Ads Dataset (Synthetic but Realistic)

In [None]:
# Hand-written ad inventory: one (ad_id, ad_text, historical_ctr) row per ad.
ads = pd.DataFrame(
    [
        (1, "Boost your startup productivity with AI tools", 0.042),
        (2, "Learn Python and Machine Learning from industry experts", 0.061),
        (3, "Travel smarter with exclusive flight deals", 0.033),
        (4, "Upgrade your home gym with smart fitness equipment", 0.029),
        (5, "Secure your business with cloud security solutions", 0.054),
        (6, "Discover healthy meal plans tailored for you", 0.038),
    ],
    columns=["ad_id", "ad_text", "historical_ctr"],
)

ads


Load Embedding Model (Transformer)

In [None]:
# Load the "all-MiniLM-L6-v2" SentenceTransformer; the cells below use it to
# encode both the ad texts and the article texts.
# NOTE(review): presumably fetches the model weights on first use — confirm
# network access is available wherever this notebook runs.
embedder = SentenceTransformer("all-MiniLM-L6-v2")



Generate Embeddings

In [None]:
# Encode every ad text into a dense vector with the shared embedder.
ad_texts = ads["ad_text"].tolist()
ad_embeddings = embedder.encode(ad_texts, show_progress_bar=True)

# Scale each row to unit length in place, so that an inner product between
# two rows equals their cosine similarity.
faiss.normalize_L2(ad_embeddings)

# Persist the normalized vectors next to the other model artifacts.
np.save("models/ad_embeddings.npy", ad_embeddings)
print("Saved: models/ad_embeddings.npy")


Build FAISS Index

In [None]:
# Exact inner-product index sized to the embedding width; the ad vectors were
# L2-normalized beforehand, so inner product here acts as cosine similarity.
embedding_dim = ad_embeddings.shape[1]
index = faiss.IndexFlatIP(embedding_dim)
index.add(ad_embeddings)

# Write the populated index to disk alongside the other artifacts.
faiss.write_index(index, "models/faiss.index")
print("Saved: models/faiss.index")


Generate Training Data for CTR Model

In [None]:
# Build the synthetic (article, ad) training set for the CTR model.
#
# Each row of X is [similarity(article, ad), historical_ctr(ad)] and each entry
# of y is a simulated click. Rows are ordered article-major: all ads for
# article 0, then all ads for article 1, and so on.

# Seeded generator so the simulated clicks are identical on every
# Restart & Run All (42 matches the random_state of the train/test split).
SEED = 42
rng = np.random.default_rng(SEED)

article_embeddings = embedder.encode(
    articles["text"].tolist(),
    show_progress_bar=True
)

# Normalize in place, exactly as was done for ad_embeddings, so the dot
# product below is a cosine similarity regardless of whether the embedding
# model already returns unit-length vectors.
faiss.normalize_L2(article_embeddings)

# One matrix product replaces the per-pair Python loop:
# similarity_matrix[a, i] = dot(article a, ad i).
similarity_matrix = article_embeddings @ ad_embeddings.T
n_articles, n_ads = similarity_matrix.shape

# Historical CTR of each ad, repeated once per article so it lines up with the
# row-major flattening of similarity_matrix.
ad_ctr = ads["historical_ctr"].to_numpy(dtype=float)
ctr_per_pair = np.tile(ad_ctr, n_articles)

X = np.column_stack([similarity_matrix.ravel(), ctr_per_pair])

# Synthetic click: Bernoulli draw with probability 10x the historical CTR,
# capped at 0.5.
# NOTE(review): the click probability depends only on historical_ctr, not on
# the similarity feature, so the model cannot learn a similarity effect from
# these labels — confirm this is intended for the demo.
click_probability = np.minimum(ctr_per_pair * 10, 0.5)
y = rng.binomial(1, click_probability)

X.shape, y.mean()


Train CTR Prediction Model

In [None]:
# Hold out 20% of the (article, ad) pairs for evaluation; fixed seed for a
# repeatable split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Logistic regression with default settings over the two features in X.
ctr_model = LogisticRegression()
ctr_model.fit(X_train, y_train)

# Column 1 of predict_proba is the predicted probability of the positive
# (click) class.
preds = ctr_model.predict_proba(X_test)[:, 1]

# Report held-out ranking quality and calibration.
test_auc = roc_auc_score(y_test, preds)
test_log_loss = log_loss(y_test, preds)
print("ROC-AUC:", test_auc)
print("Log Loss:", test_log_loss)


Save CTR Model

In [None]:
# Serialize the fitted CTR model into the models/ directory with joblib.
joblib.dump(value=ctr_model, filename="models/ctr_model.pkl")
print("Saved: models/ctr_model.pkl")


In [None]:
# Offer the saved artifacts as browser downloads when running on Google Colab.
# The import is guarded so the notebook still runs end to end in other
# environments, where the files simply stay on disk under models/.
try:
    from google.colab import files
except ImportError:
    print("google.colab is not available; artifacts remain in the models/ directory")
else:
    for artifact_path in (
        "models/ad_embeddings.npy",
        "models/ctr_model.pkl",
        "models/faiss.index",
    ):
        files.download(artifact_path)
