# Netflix Recommendation System
## Feature Engineering, Modeling, and A/B Testing

This notebook reuses the cleaned dataset (`cleaned_netflix_data.csv`) and then builds:
- Feature engineering for content similarity
- A content-based recommender (TF-IDF + cosine similarity)
- A simple collaborative-filtering-style baseline (TruncatedSVD) trained on simulated implicit click events
- A/B testing utilities (two-proportion z-test, power, and sample size)


In [None]:
import os
from collections import defaultdict
from pathlib import Path

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from scipy.sparse import coo_matrix
from scipy.stats import norm
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Theme: Netflix-inspired palette, reused by the plotting cells below
NETFLIX_RED = "#E50914"
RED_2 = "#B20710"
DARK_BG = "#141414"
MID_GRAY = "#6D6D6D"
LIGHT_GRAY = "#B3B3B3"

# Apply the dark theme to every figure in one go:
# dark canvas, gray axes edges, light-gray text/ticks, near-black grid lines.
plt.rcParams.update({
    "figure.facecolor": DARK_BG,
    "axes.facecolor": DARK_BG,
    "axes.edgecolor": MID_GRAY,
    "axes.labelcolor": LIGHT_GRAY,
    "xtick.color": LIGHT_GRAY,
    "ytick.color": LIGHT_GRAY,
    "text.color": LIGHT_GRAY,
    "grid.color": "#2A2A2A",
})

# Load the cleaned dataset.
# The location is configurable so the notebook runs on any machine:
# set the NETFLIX_DATA_PATH environment variable, or place the CSV next to
# the notebook (the relative default below).
DATA_PATH = Path(os.environ.get("NETFLIX_DATA_PATH", "cleaned_netflix_data.csv"))

df = pd.read_csv(DATA_PATH)
# Force `duration` to numeric; values that cannot be parsed become NaN.
df["duration"] = pd.to_numeric(df["duration"], errors="coerce")
df.head()


In [None]:
# Feature engineering for content similarity
# keep this simple and explicit so it's easy to explain in interviews
df_fe = df.copy()

# Text fields that feed the similarity model. The order here is the order in
# which the fields appear inside `text_blob`.
TEXT_COLS = ["title", "listed_in", "director", "cast", "country", "description"]

for col in TEXT_COLS:
    if col in df_fe.columns:
        # Missing values become empty strings; astype(str) protects the
        # concatenation below from stray non-string values.
        df_fe[col] = df_fe[col].fillna("").astype(str)
    else:
        # A field absent from the CSV contributes nothing to the blob
        # instead of raising a KeyError further down.
        df_fe[col] = ""

# One space-separated document per title.
df_fe["text_blob"] = df_fe[TEXT_COLS[0]].str.cat(
    [df_fe[col] for col in TEXT_COLS[1:]], sep=" "
)

assert df_fe["text_blob"].notna().all(), "text_blob must not contain missing values"

# quick sanity check
df_fe[["title", "text_blob"]].head(3)


In [None]:
# TF-IDF vectorization of the per-title text blob:
# unigrams + bigrams, English stop words removed, at most 8000 features kept.
tfidf = TfidfVectorizer(
    ngram_range=(1, 2),
    max_features=8000,
    stop_words="english",
)
X = tfidf.fit_transform(df_fe["text_blob"])

print("TF-IDF matrix:", X.shape)


In [None]:
# Cosine similarity and a recommendation function
# NOTE: this materializes a dense (n_titles x n_titles) matrix; fine for a
# catalogue of a few thousand titles, but it grows quadratically.
sim = cosine_similarity(X, X)

# Map each title to its row position in df_fe / sim.
# When a title occurs more than once, keep the first occurrence so a lookup
# always yields a single integer (de-duplicate on the index, i.e. the title).
title_to_idx = pd.Series(np.arange(len(df_fe)), index=df_fe["title"])
title_to_idx = title_to_idx[~title_to_idx.index.duplicated(keep="first")]

def recommend_by_title(title, k=10):
    """Return the k titles most similar to `title` by TF-IDF cosine similarity.

    Parameters
    ----------
    title : str
        Exact title to look up in `title_to_idx`.
    k : int, default 10
        Maximum number of recommendations. Fewer rows are returned when the
        catalogue has fewer than k other titles.

    Returns
    -------
    pandas.DataFrame
        Columns title, type, release_year, rating, listed_in and similarity,
        ordered from most to least similar. An empty DataFrame is returned
        when `title` is unknown.
    """
    if title not in title_to_idx.index:
        return pd.DataFrame()

    idx = int(title_to_idx[title])
    row = sim[idx]

    # Stable descending sort: ties keep catalogue order.
    order = np.argsort(-row, kind="stable")
    # Drop the query title itself explicitly; it is not guaranteed to sort
    # first (ties at 1.0, or an all-zero TF-IDF row).
    order = order[order != idx][:max(int(k), 0)]

    cols = ["title", "type", "release_year", "rating", "listed_in"]
    out = df_fe.iloc[order][cols].copy()
    out["similarity"] = row[order]
    return out

recommend_by_title(df_fe["title"].iloc[0], k=8)


In [None]:
# Simulated implicit feedback (exposures and clicks) so the evaluation and
# A/B-testing cells have something to work with.
# NOTE: real projects would use watch/click/impression logs
rng = np.random.default_rng(7)

n_users = 800
n_items = len(df_fe)

# simulate exposures: every row pairs a random user with a random item
n_rows = 25000
exposed_users = rng.integers(0, n_users, size=n_rows)
exposed_items = rng.integers(0, n_items, size=n_rows)
events = pd.DataFrame({"user_id": exposed_users, "item_id": exposed_items})

# Share of titles per first-listed genre.
# NOTE: genre_rate is computed here but does not feed into p_click below.
genre_pop = df_fe["listed_in"].str.split(", ").str[0].fillna("Unknown")
genre_rate = genre_pop.value_counts(normalize=True)

# Click probability: a flat base rate plus a fixed boost for titles whose
# genre list mentions "Dramas" (case-insensitive).
base = 0.06
is_drama = df_fe["listed_in"].fillna("").str.contains("Dramas", case=False).astype(int)
drama_boost = events["item_id"].map(is_drama).fillna(0).astype(float)

events["p_click"] = (base + 0.10 * drama_boost).clip(0.01, 0.30)
events["clicked"] = rng.binomial(1, events["p_click"])

events.head()


In [None]:
# Build a user-item matrix (implicit feedback) and do a simple matrix factorization baseline
# We'll use TruncatedSVD on a sparse matrix to keep deps light.
# (coo_matrix is imported with the other dependencies in the first cell.)

rows = events["user_id"].to_numpy()
cols = events["item_id"].to_numpy()
data = events["clicked"].to_numpy().astype(float)

# Repeated (user, item) pairs are summed during the COO -> CSR conversion,
# so each entry is the number of clicks that user made on that item.
R = coo_matrix((data, (rows, cols)), shape=(n_users, n_items)).tocsr()

# 40 latent factors, capped so the decomposition stays valid on small matrices.
n_factors = min(40, min(R.shape) - 1)

svd = TruncatedSVD(n_components=n_factors, random_state=42)
U = svd.fit_transform(R)          # user factors
V = svd.components_.T             # item factors

assert U.shape == (n_users, n_factors), "unexpected user-factor shape"
assert V.shape == (n_items, n_factors), "unexpected item-factor shape"

# score matrix is U @ V.T, but we won't materialize full matrix for memory
print("User factors:", U.shape)
print("Item factors:", V.shape)


In [None]:
# Recommend items for a user (SVD scores)
def recommend_for_user(user_id, k=10):
    """Return the k highest-scoring items for one user from the SVD factors.

    Parameters
    ----------
    user_id : int
        Row of `U` to score; must satisfy 0 <= user_id < U.shape[0].
    k : int, default 10
        Maximum number of items to return; negative values are treated as 0.

    Returns
    -------
    pandas.DataFrame
        Columns title, type, release_year, rating, listed_in and score,
        ordered from highest to lowest score.

    Raises
    ------
    IndexError
        If `user_id` is outside the range of known users. Negative ids are
        rejected rather than silently wrapping around to another user.
    """
    if not 0 <= user_id < U.shape[0]:
        raise IndexError(f"user_id {user_id} is out of range [0, {U.shape[0]})")

    k = max(int(k), 0)
    scores = U[user_id] @ V.T                # one score per item
    top_idx = np.argsort(scores)[::-1][:k]

    # Item ids are row positions, so select rows positionally.
    cols = ["title", "type", "release_year", "rating", "listed_in"]
    out = df_fe.iloc[top_idx][cols].copy()
    out["score"] = scores[top_idx]
    return out

recommend_for_user(0, k=8)


In [None]:
# Simple evaluation: hit-rate@K with a leave-one-out split over click events.
# One clicked item per user is held out, and the factorization is refit on the
# remaining clicks only. Scoring with factors that were trained on the held-out
# click would leak the answer into the model and inflate the metric.
# (defaultdict / coo_matrix are imported with the other dependencies in the first cell.)

# build per-user clicked lists
clicked_items = (
    events.loc[events["clicked"] == 1]
    .groupby("user_id")["item_id"]
    .apply(list)
    .to_dict()
)

# sample one holdout click per user if possible
holdout = {}
train_clicked = defaultdict(set)

for u, items in clicked_items.items():
    if len(items) >= 2:
        it = int(rng.choice(items))
        holdout[u] = it
        for x in items:
            if x != it:
                train_clicked[u].add(int(x))

# Training clicks = every click except each user's held-out (user, item) pair.
# Users without a holdout map to NaN here, so none of their clicks are removed.
clicks = events.loc[events["clicked"] == 1, ["user_id", "item_id"]]
is_heldout = clicks["item_id"].eq(clicks["user_id"].map(holdout))
train_clicks = clicks.loc[~is_heldout]

R_train = coo_matrix(
    (
        np.ones(len(train_clicks)),
        (train_clicks["user_id"].to_numpy(), train_clicks["item_id"].to_numpy()),
    ),
    shape=(n_users, n_items),
).tocsr()

# Same model family and settings as the baseline cell, fit on training clicks only.
svd_train = TruncatedSVD(n_components=min(40, min(R_train.shape) - 1), random_state=42)
U_train = svd_train.fit_transform(R_train)   # user factors (train only)
V_train = svd_train.components_.T            # item factors (train only)

def hit_rate_at_k(k=10, n_eval=300):
    """Fraction of sampled users whose held-out item appears in their top-k.

    Parameters
    ----------
    k : int, default 10
        Length of the recommendation list.
    n_eval : int, default 300
        Maximum number of users (drawn at random from `holdout`) to evaluate.

    Returns
    -------
    float
        Hit rate in [0, 1], or NaN when no user has a held-out item.
    """
    users = list(holdout.keys())
    rng.shuffle(users)
    users = users[:min(n_eval, len(users))]
    if not users:
        return np.nan

    hits = 0
    for u in users:
        scores = U_train[u] @ V_train.T
        # Items the user already clicked in training cannot be "new" hits.
        seen = list(train_clicked.get(u, ()))
        if seen:
            scores[seen] = -np.inf
        top_idx = np.argsort(scores)[::-1][:k]
        if holdout[u] in top_idx:
            hits += 1
    return hits / len(users)

for k in [5, 10, 20]:
    print("hit_rate@", k, "=", round(hit_rate_at_k(k=k), 4))


In [None]:
# A/B testing helpers (CTR z-test + sample size)
def z_test_proportions(clicks_a, n_a, clicks_b, n_b):
    """Two-sided pooled z-test for the difference between two proportions.

    Parameters
    ----------
    clicks_a, n_a : int
        Successes and trials in group A (control).
    clicks_b, n_b : int
        Successes and trials in group B (treatment).

    Returns
    -------
    tuple
        (p1, p2, z, p_value) where p1 and p2 are the observed rates of A and B,
        z is the statistic for (p2 - p1), and p_value is two-sided.

    Raises
    ------
    ValueError
        If a group has no trials, or a click count is outside [0, n].
    """
    if n_a <= 0 or n_b <= 0:
        raise ValueError("n_a and n_b must be positive")
    if not (0 <= clicks_a <= n_a and 0 <= clicks_b <= n_b):
        raise ValueError("click counts must satisfy 0 <= clicks <= n in each group")

    p1 = clicks_a / n_a
    p2 = clicks_b / n_b
    p_pool = (clicks_a + clicks_b) / (n_a + n_b)
    se = np.sqrt(p_pool * (1 - p_pool) * (1/n_a + 1/n_b))

    # Pooled rate of exactly 0 or 1: both groups are identical, so there is
    # no evidence of a difference (and nothing to divide by).
    if se == 0:
        return p1, p2, 0.0, 1.0

    z = (p2 - p1) / se
    # Survival function keeps precision in the far tail, where 1 - cdf rounds to 0.
    p_value = 2 * norm.sf(abs(z))
    return p1, p2, z, p_value

def sample_size_for_mde(baseline_ctr, mde_abs, alpha=0.05, power=0.8):
    """Per-group sample size needed to detect an absolute CTR change.

    Uses the normal approximation for a two-sided, two-proportion test.

    Parameters
    ----------
    baseline_ctr : float
        Control conversion rate, in [0, 1].
    mde_abs : float
        Minimum detectable effect as an absolute difference (0.01 = one
        percentage point). May be negative, but not zero.
    alpha : float, default 0.05
        Two-sided significance level, in (0, 1).
    power : float, default 0.8
        Desired power (1 - beta), in (0, 1).

    Returns
    -------
    int
        Required number of observations in EACH group, rounded up.

    Raises
    ------
    ValueError
        If any argument is outside its valid range, or `mde_abs` is zero.
    """
    if not (0 < alpha < 1 and 0 < power < 1):
        raise ValueError("alpha and power must be strictly between 0 and 1")
    if mde_abs == 0:
        raise ValueError("mde_abs must be non-zero")

    p1 = baseline_ctr
    p2 = baseline_ctr + mde_abs
    if not (0 <= p1 <= 1 and 0 <= p2 <= 1):
        raise ValueError("baseline_ctr and baseline_ctr + mde_abs must lie in [0, 1]")

    # two-sided test
    z_alpha = norm.ppf(1 - alpha/2)
    z_beta = norm.ppf(power)

    p_bar = (p1 + p2) / 2

    # Pooled variance under H0 (first term), separate variances under H1 (second term).
    num = (z_alpha * np.sqrt(2*p_bar*(1-p_bar)) + z_beta * np.sqrt(p1*(1-p1) + p2*(1-p2)))**2
    den = (p2 - p1)**2
    return int(np.ceil(num / den))

# quick example
print("Sample size per group for +1% CTR lift from 8%:",
      sample_size_for_mde(0.08, 0.01))


In [None]:
# Simulated A/B experiment between two ranking strategies
# Control: popularity (proxy)
# Treatment: content similarity boost
rng = np.random.default_rng(11)

n_users_sim = 4000
n_impressions = 40000

# One row per impression, assigned to either arm with equal probability.
variants = rng.choice(["control", "treatment"], size=n_impressions, p=[0.5, 0.5])
ab = pd.DataFrame({"variant": variants})

# Control converts at the baseline CTR; treatment gets a small absolute lift.
base_ctr = 0.075
ctr_by_variant = {"control": base_ctr, "treatment": base_ctr + 0.008}

ab["p_click"] = ab["variant"].map(ctr_by_variant)
ab["clicked"] = rng.binomial(1, ab["p_click"])

# Clicks ("sum") and impressions ("count") per arm.
summary = ab.groupby("variant")["clicked"].agg(["sum", "count"])
summary


In [None]:
# Run the two-proportion z-test on the per-arm totals in `summary`
control_row = summary.loc["control"]
treatment_row = summary.loc["treatment"]

clicks_a, n_a = int(control_row["sum"]), int(control_row["count"])
clicks_b, n_b = int(treatment_row["sum"]), int(treatment_row["count"])

p1, p2, z, p_value = z_test_proportions(clicks_a, n_a, clicks_b, n_b)

# Report each statistic with its own rounding precision.
for label, value, digits in (
    ("Control CTR:", p1, 4),
    ("Treatment CTR:", p2, 4),
    ("z:", z, 3),
    ("p-value:", p_value, 4),
):
    print(label, round(value, digits))


In [None]:
# Visualize CTR results with Netflix colors
ctr = pd.DataFrame({
    "variant": ["control", "treatment"],
    "ctr": [p1, p2]
})

fig, ax = plt.subplots(figsize=(6, 4))

# `palette` must be paired with `hue`; passing it alone is deprecated in
# recent seaborn. dodge=False keeps one full-width bar per variant.
sns.barplot(
    data=ctr, x="variant", y="ctr",
    hue="variant", palette=[MID_GRAY, NETFLIX_RED],
    dodge=False, ax=ax,
)

# The hue legend only repeats the x-axis labels, so drop it if one was drawn.
legend = ax.get_legend()
if legend is not None:
    legend.remove()

ax.set_title("A/B Test CTR (Simulated)")
ax.set_xlabel("")
ax.set_ylabel("CTR")
ax.set_ylim(0, ctr["ctr"].max() * 1.35)   # headroom above the tallest bar
fig.tight_layout()
plt.show()
