In [9]:
# import the stuff I might need
import pandas as pd
import numpy as np
import plotly.express as px

from pprint import pprint
from ast import literal_eval
import json
import os
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler # this is important to standardize the variables in the data
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, balanced_accuracy_score, confusion_matrix, f1_score
from sklearn.model_selection import GridSearchCV # this will look at all the different Ks and all the different accuracy metrics

In [10]:
# load in data

os.chdir("/Users/sabinesegaloff/Code/UVA_Fall_25/Intro_to_Predictive_Modeling_6021/Final_Proj/ds6021-final-project")
movies_to_model = pd.read_csv("data/movies_15_to_19.csv")
genre_df = pd.read_csv("data/movie_genre_database.csv")

In [11]:
# drop the columns I don't need
movies_to_model = movies_to_model.drop(columns=["backdrop_path", "video", "poster_path"])
movies_to_model.head(3).T

Unnamed: 0,0,1,2
Unnamed: 0,0,1,2
adult,False,False,False
genre_ids,"[10749, 18]","[16, 878, 28]","[18, 14, 53]"
id,271039,296917,312849
original_language,en,ja,tr
original_title,"Something, Anything",劇場版 PSYCHO-PASS サイコパス,Sarmaşık
overview,When a tragedy shatters her plans for domestic...,"In a futuristic Japan, the Sibyl System is cha...","After the owner's bankruptcy, the crew is stra..."
popularity,12.0941,9.3462,8.6511
release_date,2015-01-09,2015-01-09,2015-01-26
title,"Something, Anything",PSYCHO-PASS: The Movie,Ivy


### "Can we predict if a movie will be 'Highly Rated' (e.g., score > 7.0) based soley on its popularity, release year, and vote count?"

Note that at this point, I am simply dropping NaN lines.

In [12]:
# Feature Engineering
    # Create a target class: 1 if rating > 7.0, else 0
    # We'll use 7.0 as an arbitrary cutoff for a "Good" movie
movies_to_model['is_highly_rated'] = (movies_to_model['vote_average'] > 7.0).astype(int)
    # Convert release_date to datetime objects
movies_to_model['release_date'] = pd.to_datetime(movies_to_model['release_date'], errors='coerce')
    # Get the year (numeric) to use for correlation
movies_to_model['release_year'] = movies_to_model['release_date'].dt.year

# Select the features (X) and target (y)

feature_cols = ['popularity', 'vote_count', 'release_year']
X = movies_to_model[feature_cols]
y = movies_to_model['is_highly_rated']

# Split the data into train and test sets
    # stratify by y in case of class imbalance
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Create a Pipeline
    # This will scale the data first, then run KNN
    # recall that scaling is CRITICAL for KNN because it relies on distance and not all features are on the same scale
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('knn', KNeighborsClassifier(n_neighbors=5)) # Start with K=5
])

# Fit and Evaluate
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

# Check Results
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))

# Search for the best K
    # Use GridSearchCV to find the optimal number of neighbors (best K)
param_grid = {'knn__n_neighbors': range(1, 30)}
search = GridSearchCV(pipeline, param_grid, cv=5, scoring='accuracy')
search.fit(X_train, y_train)

print(f"\nBest K found: {search.best_params_['knn__n_neighbors']}")
print(f"Best Accuracy: {search.best_score_:.4f}")

Accuracy: 0.8772

Confusion Matrix:
[[26275  1001]
 [ 2752   527]]

Best K found: 28
Best Accuracy: 0.8914


In [None]:
# do it again with best k

pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('knn', KNeighborsClassifier(n_neighbors=28))
])

# Fit and Evaluate
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

# Check Results
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))

Accuracy: 0.8912

Confusion Matrix:
[[27094   182]
 [ 3141   138]]

Best K found: 28
Best Accuracy: 0.8914
