## Feature Engineering and Modeling

### Feature Engineering

In [27]:
import pandas as pd

In [89]:
# Load the preprocessed movie table; the first CSV column is used as the row index.
movies_csv = "data/movie_processed.csv"
df = pd.read_csv(movies_csv, index_col=0)
df.head(n=5)

Unnamed: 0,movie_id,movie_title,movie_info,rating,genre,directors,in_theaters_date,on_streaming_date,runtime_in_minutes,critic_rating,critic_count,audience_rating,audience_count
0,1,Percy Jackson & the Olympians: The Lightning T...,A teenager discovers he's the descendant of a ...,PG,"Action & Adventure, Comedy, Drama, Science Fic...",Chris Columbus,2010-02-12,2010-06-29,83.0,49,144,53.0,254287.0
1,2,Please Give,Kate has a lot on her mind. There's the ethics...,R,Comedy,Nicole Holofcener,2010-04-30,2010-10-19,90.0,86,140,64.0,11567.0
2,3,10,Blake Edwards' 10 stars Dudley Moore as George...,R,"Comedy, Romance",Blake Edwards,1979-10-05,1997-08-27,118.0,68,22,53.0,14670.0
3,4,12 Angry Men (Twelve Angry Men),"A Puerto Rican youth is on trial for murder, a...",,"Classics, Drama",Sidney Lumet,1957-04-13,2001-03-06,95.0,100,51,97.0,105000.0
4,5,"20,000 Leagues Under The Sea","This 1954 Disney version of Jules Verne's 20,0...",G,"Action & Adventure, Drama, Kids & Family",Richard Fleischer,1954-01-01,2003-05-20,127.0,89,27,74.0,68860.0


In [None]:
# Train/Test Split
# Temporal split: films released before SPLIT_YEAR are used for training,
# films released in SPLIT_YEAR or later are held out for testing.
SPLIT_YEAR = 2010

# Ensure datetime format; values that cannot be parsed become NaT instead of raising.
df["in_theaters_date"] = pd.to_datetime(df["in_theaters_date"], errors="coerce")
release_year = df["in_theaters_date"].dt.year

# split into train (before SPLIT_YEAR) and test (SPLIT_YEAR onwards)
train_df = df[release_year < SPLIT_YEAR].copy()
test_df  = df[release_year >= SPLIT_YEAR].copy()

# A missing year compares False on both sides of the cutoff, so rows without a
# usable release date end up in neither split. Report how many were excluded so
# the loss is visible instead of silent.
n_undated = int(release_year.isna().sum())

print("Training set size:", len(train_df))
print("Test set size:", len(test_df))
print("Rows excluded (missing or unparseable in_theaters_date):", n_undated)

Training set size: 9764
Test set size: 6059


In [None]:
# Remove columns that could leak outcome information into the features:
# critic/audience ratings and counts, plus the streaming release date.
leak_cols = [
    "critic_rating",
    "critic_count",
    "audience_rating",
    "audience_count",
    "on_streaming_date",
]
train_df = train_df.drop(leak_cols, axis="columns")
test_df = test_df.drop(leak_cols, axis="columns")

In [None]:
# Build the first batch of engineered training features

# Carry over the title and runtime columns unchanged
train_df_new = train_df.loc[:, ["movie_title", "runtime_in_minutes"]].copy()

# kid_friendly: 1 for G- or PG-rated titles, 0 otherwise (a missing rating gives 0)
is_kid_rated = train_df["rating"].isin(["G", "PG"])
train_df_new["kid_friendly"] = is_kid_rated.astype(int)

# One-hot encode the comma-separated genre string
# NOTE(review): this rebinds `df`, which held the full dataset loaded from CSV,
# to a copy of the training split — confirm no later cell expects the full frame.
df = train_df.copy()
split_genres = df["genre"].str.split(",")
df["genre_list"] = split_genres.apply(
    # non-list results (missing genre) become an empty list
    lambda parts: [name.strip() for name in parts] if isinstance(parts, list) else []
)
df_exploded = df.explode("genre_list")  # one row per (movie, genre) pair
genre_dummies = pd.get_dummies(df_exploded["genre_list"], prefix="genre", dtype=int)
# collapse the exploded rows back to a single indicator row per movie
genre_dummies = genre_dummies.groupby(level=0).max()
train_df_new = pd.concat([train_df_new, genre_dummies], axis=1)

train_df_new.columns

Index(['movie_title', 'runtime_in_minutes', 'kid_friendly',
       'genre_Action & Adventure', 'genre_Animation', 'genre_Anime & Manga',
       'genre_Art House & International', 'genre_Classics', 'genre_Comedy',
       'genre_Cult Movies', 'genre_Documentary', 'genre_Drama',
       'genre_Faith & Spirituality', 'genre_Gay & Lesbian', 'genre_Horror',
       'genre_Kids & Family', 'genre_Musical & Performing Arts',
       'genre_Mystery & Suspense', 'genre_Romance',
       'genre_Science Fiction & Fantasy', 'genre_Special Interest',
       'genre_Sports & Fitness', 'genre_Television', 'genre_Western'],
      dtype='object')

In [None]:
# Create more new features

# release_season
def season_from_month(m):
    """Map a calendar month number to a season label.

    Parameters
    ----------
    m : int or float
        Month number in 1..12. Integral floats such as ``3.0`` are accepted.
        Missing values (``None``, ``NaN``, ``NaT``, ``pd.NA``) are allowed.

    Returns
    -------
    str or None
        ``"winter"`` for 12, 1, 2; ``"spring"`` for 3-5; ``"summer"`` for 6-8;
        ``"fall"`` for 9-11. ``None`` when ``m`` is missing, so a missing date
        is not silently labelled as a fall release.

    Raises
    ------
    ValueError
        If ``m`` is present but is not a month number in 1..12.
    """
    # Missing months must stay missing rather than fall through to a season.
    if pd.isna(m):
        return None
    if m in (12, 1, 2):
        return "winter"
    if m in (3, 4, 5):
        return "spring"
    if m in (6, 7, 8):
        return "summer"
    if m in (9, 10, 11):
        return "fall"
    raise ValueError(f"month must be in 1..12, got {m!r}")

release_season = train_df["in_theaters_date"].dt.month.apply(season_from_month)
season_dummies = pd.get_dummies(release_season, prefix="release_season", dtype=int)

# Remove any columns this cell produced on a previous run before rebuilding them.
# Without this, re-executing the cell appends a second set of release_season_*
# columns to train_df_new. On a first run nothing matches and the drop is a no-op.
engineered_cols = [*season_dummies.columns, "director_movies_count", "movie_genres_count"]
train_df_new = pd.concat(
    [train_df_new.drop(columns=engineered_cols, errors="ignore"), season_dummies],
    axis=1,
)

# director_movies_count: number of training rows sharing the same `directors` value.
# Rows with a missing `directors` value get no match from the map and are set to 1.
director_counts = train_df["directors"].value_counts()
train_df_new["director_movies_count"] = train_df["directors"].map(director_counts).fillna(1)

# movie_genres_count: number of non-empty, comma-separated genre names (0 when missing)
train_df_new["movie_genres_count"] = (
    train_df["genre"]
    .fillna("")
    .apply(lambda x: len([g.strip() for g in x.split(",") if g.strip() != ""]))
)

train_df_new

Unnamed: 0,movie_title,runtime_in_minutes,kid_friendly,genre_Action & Adventure,genre_Animation,genre_Anime & Manga,genre_Art House & International,genre_Classics,genre_Comedy,genre_Cult Movies,...,genre_Special Interest,genre_Sports & Fitness,genre_Television,genre_Western,release_season_fall,release_season_spring,release_season_summer,release_season_winter,director_movies_count,movie_genres_count
2,10,118.0,0,0,0,0,0,0,1,0,...,0,0,0,0,1,0,0,0,27.0,2
3,12 Angry Men (Twelve Angry Men),95.0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,29.0,2
4,"20,000 Leagues Under The Sea",127.0,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,16.0,3
5,"10,000 B.C.",109.0,0,1,0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,8.0,3
6,The 39 Steps,87.0,0,1,0,0,0,1,0,0,...,0,0,0,0,0,0,1,0,36.0,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16631,Zoolander,105.0,0,0,0,0,0,0,1,0,...,1,0,0,0,1,0,0,0,4.0,2
16633,Zoom,88.0,1,1,0,0,0,0,1,0,...,0,0,0,0,0,0,1,0,4.0,3
16635,Zorba the Greek,142.0,0,1,0,0,1,1,0,0,...,0,0,0,0,0,0,0,1,1.0,4
16636,Zulu,139.0,1,0,0,0,0,1,0,0,...,0,0,0,0,0,0,1,0,2.0,2


### Modeling