# Feature Engineering

This notebook transforms cleaned Netflix Original TV metadata into analysis-ready features, informed by insights from prior exploratory analysis.


In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

df = pd.read_csv("../data/netflix_tv_with_hit.csv")
df.shape

(2020, 20)

In [7]:
needed = ["id", "hit", "year", "primary_genre", "primary_country"]
missing = [c for c in needed if c not in df.columns]
if missing:
    raise ValueError(f"Missing expected columns: {missing}")

print(df["hit"].value_counts(dropna=False))
print("Year NA:", df["year"].isna().mean())
print("Primary_genre NA:", df["primary_genre"].isna().mean())
print("Primary_country NA:", df["primary_country"].isna().mean())


hit
0    1837
1     183
Name: count, dtype: int64
Year NA: 0.060396039603960394
Primary_genre NA: 0.015841584158415842
Primary_country NA: 0.007425742574257425


### Temporal Bucketing

To capture structural shifts in content production and audience behavior, titles are grouped into pre-COVID (≤2019), COVID (2020–2021), and post-COVID (≥2022) release periods.

This feature is intended as a coarse temporal indicator rather than a causal attribution of pandemic effects.


In [None]:
AS_OF_YEAR = 2026  # fixed for reproducibility

def covid_period(year):
    if pd.isna(year):
        return "unknown"
    year = int(year)
    if year <= 2019:
        return "pre_covid"
    elif year <= 2021:
        return "covid"
    else:
        return "post_covid"

df["covid_period"] = df["year"].apply(covid_period)

df["years_since_release"] = df["year"].apply(
    lambda y: np.nan if pd.isna(y) else max(0, AS_OF_YEAR - int(y))
)

df[["year", "covid_period", "years_since_release"]].head()

Unnamed: 0,year,covid_period,years_since_release
0,2016.0,pre_covid,10.0
1,2025.0,post_covid,1.0
2,1993.0,pre_covid,33.0
3,2016.0,pre_covid,10.0
4,2019.0,pre_covid,7.0


In [13]:
df["is_us"] = (df["primary_country"] == "US").astype(int)
df["is_korea"] = (df["primary_country"] == "KR").astype(int)  # optional
df[["primary_country", "is_us", "is_korea"]].head()

Unnamed: 0,primary_country,is_us,is_korea
0,US,1,0
1,KR,0,1
2,US,1,0
3,US,1,0
4,US,1,0


In [14]:
genre_counts = df["primary_genre"].value_counts(dropna=False)
genre_counts.head(10), genre_counts.shape

(primary_genre
 Drama                 494
 Documentary           369
 Comedy                275
 Reality               242
 Animation             203
 Crime                 111
 Action & Adventure     86
 Sci-Fi & Fantasy       75
 Mystery                52
 Kids                   37
 Name: count, dtype: int64,
 (16,))

In [15]:
MIN_GENRE_COUNT = 40

common_genres = set(df["primary_genre"].value_counts().loc[lambda s: s >= MIN_GENRE_COUNT].index)

def group_genre(g):
    if pd.isna(g):
        return "unknown"
    return g if g in common_genres else "Other"

df["primary_genre_grouped"] = df["primary_genre"].apply(group_genre)
df["primary_genre_grouped"].value_counts().head(20)

primary_genre_grouped
Drama                 494
Documentary           369
Comedy                275
Reality               242
Animation             203
Crime                 111
Action & Adventure     86
Other                  81
Sci-Fi & Fantasy       75
Mystery                52
unknown                32
Name: count, dtype: int64

In [17]:
cat_cols = ["covid_period", "primary_genre_grouped", "primary_country"]
cat_cols = [c for c in cat_cols if c in df.columns]

X_cat = pd.get_dummies(df[cat_cols], dummy_na=True, drop_first=False)

num_cols = ["years_since_release"]
num_cols = [c for c in num_cols if c in df.columns]
X_num = df[num_cols].copy()

# Add safe binary flags
X_bin = df[["is_us", "is_korea"]].copy()

# Final feature table
features = pd.concat(
    [df[["id", "hit"]], X_num, X_bin, X_cat],
    axis=1
)

print(features.shape)
features.head()

(2020, 78)


Unnamed: 0,id,hit,years_since_release,is_us,is_korea,covid_period_covid,covid_period_post_covid,covid_period_pre_covid,covid_period_unknown,covid_period_nan,...,primary_country_SE,primary_country_SG,primary_country_SN,primary_country_TH,primary_country_TR,primary_country_TW,primary_country_US,primary_country_VN,primary_country_ZA,primary_country_nan
0,66732,1,10.0,1,0,False,False,True,False,False,...,False,False,False,False,False,False,True,False,False,False
1,238458,1,1.0,0,1,False,True,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,4656,1,33.0,1,0,False,False,True,False,False,...,False,False,False,False,False,False,True,False,False,False
3,63174,1,10.0,1,0,False,False,True,False,False,...,False,False,False,False,False,False,True,False,False,False
4,71912,1,7.0,1,0,False,False,True,False,False,...,False,False,False,False,False,False,True,False,False,False


In [18]:
features.to_csv("../data/netflix_tv_features_v1.csv", index=False)
print("Saved: ../data/netflix_tv_features_v1.csv")

Saved: ../data/netflix_tv_features_v1.csv
