In [1]:
from dotenv import load_dotenv

# Load variables from the local ".env" file into the process environment;
# the next cell reads URL, USER_ID and API_KEY through os.getenv.
load_dotenv(".env")

True

In [2]:
import requests
import os 

# Connection settings are read from the environment.
# Each value is None when the corresponding variable is not set.
URL = os.environ.get("URL")
USER_ID = os.environ.get("USER_ID")
API_KEY = os.environ.get("API_KEY")

def get_amount() -> int:
    """
    Gets the amount of played Movie items in user access

    Issues a one-item query (Limit=1) against the user's Items endpoint and
    reads the ``TotalRecordCount`` field of the JSON response.

    Returns integer amount

    Raises requests.HTTPError on an error status, requests.Timeout when the
    server does not answer within the timeout, and ValueError when the
    response carries no ``TotalRecordCount``.
    """
    # Pass the query through `params` so the API key and other values are
    # URL-encoded instead of being spliced raw into the query string.
    res = requests.get(
        f'{URL}/emby/Users/{USER_ID}/Items',
        params={
            "StartIndex": 0,
            "Limit": 1,
            "Recursive": "true",
            "IncludeItemTypes": "Movie",
            "api_key": API_KEY,
            "Filters": "IsPlayed",
            "Fields": "Budget,Genres,Overview,People,Revenue,Studios,Taglines,ProviderIds",
        },
        # Without a timeout requests waits indefinitely on an unresponsive server.
        timeout=30,
    )
    res.raise_for_status()

    total = res.json().get("TotalRecordCount")
    if total is None:
        # Fail here with a clear message rather than handing None to the caller.
        raise ValueError("Response did not contain 'TotalRecordCount'")
    return int(total)

def get_chunk(start: int = 0, chunk: int = 20) -> list:
    """
    Gets a chunk of movie information from Jellyfin

    Optional starting point (``start``) and chunk size (``chunk``); both are
    forwarded as the ``StartIndex`` and ``Limit`` query parameters.

    Returns a List of dictionaries with movie info (empty when the response
    has no ``Items``).

    Raises requests.HTTPError on an error status and requests.Timeout when the
    server does not answer within the timeout.
    """
    # Pass the query through `params` so the API key and other values are
    # URL-encoded instead of being spliced raw into the query string.
    res = requests.get(
        f'{URL}/emby/Users/{USER_ID}/Items',
        params={
            "StartIndex": start,
            "Limit": chunk,
            "Recursive": "true",
            "IncludeItemTypes": "Movie",
            "api_key": API_KEY,
            "Filters": "IsPlayed",
            "Fields": "Budget,Genres,Overview,People,Revenue,Studios,Taglines,ProviderIds",
        },
        # Without a timeout requests waits indefinitely on an unresponsive server.
        timeout=30,
    )
    res.raise_for_status()

    # Always hand back a list so callers can extend() with the result.
    return res.json().get("Items") or []

# Accumulates every movie dictionary returned by the paged requests below.
all_movies = list()

size = get_amount()
start = 0
chunk = 20

# Walk the library page by page until the reported total has been covered.
while start < size:
    # Clamp the upper bound so the last page is not reported past the total.
    print(f'Getting movies {start} - {min(start + chunk, size)} of {size}')
    all_movies.extend(get_chunk(start=start, chunk=chunk))
    start += chunk

Getting movies 0 - 20 of 496
Getting movies 20 - 40 of 496
Getting movies 40 - 60 of 496
Getting movies 60 - 80 of 496
Getting movies 80 - 100 of 496
Getting movies 100 - 120 of 496
Getting movies 120 - 140 of 496
Getting movies 140 - 160 of 496
Getting movies 160 - 180 of 496
Getting movies 180 - 200 of 496
Getting movies 200 - 220 of 496
Getting movies 220 - 240 of 496
Getting movies 240 - 260 of 496
Getting movies 260 - 280 of 496
Getting movies 280 - 300 of 496
Getting movies 300 - 320 of 496
Getting movies 320 - 340 of 496
Getting movies 340 - 360 of 496
Getting movies 360 - 380 of 496
Getting movies 380 - 400 of 496
Getting movies 400 - 420 of 496
Getting movies 420 - 440 of 496
Getting movies 440 - 460 of 496
Getting movies 460 - 480 of 496
Getting movies 480 - 500 of 496


In [9]:
# Columns to keep
columns_to_keep = [
    "Name",
    "PremiereDate",
    "CriticRating",
    "OfficialRating",
    "Overview",
    "Taglines",
    "Genres",
    "CommunityRating",
    "RunTimeTicks",
    "ProductionYear",
    "People",
    "Studios",
    "UserData",
]

# Function to filter a single movie dictionary
def filter_movie(movie):
    """
    Reduce one movie dictionary to the fields used for modelling.

    Only keys listed in ``columns_to_keep`` are retained, and the nested
    values are flattened:

    * ``People``   -> list of ``[Id, Type]`` pairs
    * ``Studios``  -> list of studio ``Id`` values (entries without an ``Id``
      are skipped)
    * ``UserData`` -> replaced by a top-level ``IsFavorite`` value
    * ``Taglines`` -> single string, taglines joined by newlines

    Keys absent from ``movie`` are simply absent from the result.

    Returns a new dictionary; ``movie`` itself is not modified.
    """
    filtered = {key: value for key, value in movie.items() if key in columns_to_keep}

    # Filter nested People (keep only the id and the role type)
    if "People" in filtered:
        filtered["People"] = [
            [person.get("Id"), person.get("Type")] for person in filtered["People"]
        ]

    # Filter Studios (keep only 'Id'); guard on the key that is actually read
    # so a studio entry without an id is skipped instead of raising KeyError.
    if "Studios" in filtered:
        filtered["Studios"] = [studio["Id"] for studio in filtered["Studios"] if "Id" in studio]

    # Filter UserData (keep only 'IsFavorite')
    if "UserData" in filtered:
        filtered["IsFavorite"] = filtered["UserData"].get("IsFavorite")
        del filtered["UserData"]

    # Collapse the list of taglines into one newline-separated string
    if "Taglines" in filtered:
        filtered["Taglines"] = ("\n").join(filtered["Taglines"])
    return filtered

# Run every fetched movie through the filter
filtered_movies = list(map(filter_movie, all_movies))

In [28]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer, MinMaxScaler, OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer

# One row per filtered movie
df = pd.DataFrame(filtered_movies)

# --- Binary encoding: favorite flag as 0/1 ---
df['IsFavorite'] = df['IsFavorite'].astype(int)

# --- Date encoding: expand the premiere date into calendar features ---
premiere = pd.to_datetime(df['PremiereDate'])
weekday = premiere.dt.dayofweek

df = df.assign(
    year=premiere.dt.year,
    month=premiere.dt.month,
    day=premiere.dt.day,
    day_of_week=weekday,
    # dayofweek counts from Monday=0, so 5 and 6 are Saturday and Sunday
    is_weekend=(weekday >= 5).astype(int),
    quarter=premiere.dt.quarter,
    week_of_year=premiere.dt.isocalendar().week,
    day_of_year=premiere.dt.dayofyear,
).drop(columns=['PremiereDate'])

"""
String TF-IDF Vectorization
"""
def _add_tfidf_features(frame, column, prefix, max_features):
    """
    Replace a free-text column with TF-IDF feature columns.

    Fits a TfidfVectorizer (English stop words, vocabulary capped at
    ``max_features``) on ``frame[column]`` with missing values treated as
    empty strings, appends one ``{prefix}_TFIDF_{word}`` column per vocabulary
    term and drops the original text column.

    Returns the new frame together with the fitted vectorizer.
    """
    vectorizer = TfidfVectorizer(max_features=max_features, stop_words='english')
    weights = vectorizer.fit_transform(frame[column].fillna(""))

    feature_columns = [f"{prefix}_TFIDF_{word}" for word in vectorizer.get_feature_names_out()]
    # Reuse the frame's own index so the column-wise concat lines rows up
    # exactly, whatever index the frame carries.
    features = pd.DataFrame(weights.toarray(), columns=feature_columns, index=frame.index)
    return pd.concat([frame, features], axis=1).drop(columns=[column]), vectorizer


# Name
df, tfidf_name = _add_tfidf_features(df, 'Name', 'Name', max_features=50)

# Overview
df, tfidf_overview = _add_tfidf_features(df, 'Overview', 'Overview', max_features=50)

# Taglines (feature columns use the singular "Tagline" prefix)
df, tfidf_taglines = _add_tfidf_features(df, 'Taglines', 'Tagline', max_features=20)

"""
One-Hot Encoding Labels
"""
# One-Hot Encode OfficialRating
ohe = OneHotEncoder(sparse_output=False)
ratings_encoded = ohe.fit_transform(df[['OfficialRating']])
ratings_columns = [f"OfficialRating_{cat}" for cat in ohe.categories_[0]]
# Reuse df's index so the column-wise concat below lines rows up exactly.
df_ratings = pd.DataFrame(ratings_encoded, columns=ratings_columns, index=df.index)

# Concatenate to main DataFrame
df = pd.concat([df, df_ratings], axis=1)
df = df.drop(columns=['OfficialRating'])


def _add_multilabel_features(frame, column, prefix):
    """
    Replace a list-valued column with one indicator column per distinct label.

    Fits a MultiLabelBinarizer on ``frame[column]``, appends one
    ``{prefix}_{label}`` column per label and drops the original column.
    Cells that are not a list/tuple/set (for example NaN where a movie record
    had no such key) are treated as having no labels.

    Returns the new frame together with the fitted binarizer.
    """
    labels = [
        value if isinstance(value, (list, tuple, set)) else []
        for value in frame[column]
    ]
    binarizer = MultiLabelBinarizer()
    encoded = binarizer.fit_transform(labels)

    feature_columns = [f"{prefix}_{label}" for label in binarizer.classes_]
    features = pd.DataFrame(encoded, columns=feature_columns, index=frame.index)
    return pd.concat([frame, features], axis=1).drop(columns=[column]), binarizer


# Genres
df, mlb_genres = _add_multilabel_features(df, 'Genres', 'Genre')

# Studios
df, mlb_studios = _add_multilabel_features(df, 'Studios', 'Studio')

"""
People Features - Role-Based Aggregation
"""
from collections import Counter

# Function to count roles in the People column
def count_roles(people):
    """
    Count how many entries of each role a movie has.

    ``people`` is an iterable of ``[person_id, role]`` pairs; the result maps
    each role to its number of occurrences.
    """
    return dict(Counter(person[1] for person in people))

# Apply role counting
role_counts = df['People'].apply(count_roles)

df['Actor_Count'] = role_counts.apply(lambda x: x.get('Actor', 0))
df['Director_Count'] = role_counts.apply(lambda x: x.get('Director', 0))
df['Writer_Count'] = role_counts.apply(lambda x: x.get('Writer', 0))
df['Producer_Count'] = role_counts.apply(lambda x: x.get('Producer', 0))

# Long-format table: one record per (movie, person, role).
# Iterate over the People column only; iterrows() would build a full row
# Series for every movie just to read this one field.
people_expanded = [
    {'MovieIndex': movie_index, 'PersonId': person_id, 'Role': role}
    for movie_index, people in df['People'].items()
    for person_id, role in people
]

# With no people at all there is no 'Role' column to encode, so skip the
# per-role aggregation instead of failing inside get_dummies.
if people_expanded:
    people_df = pd.DataFrame(people_expanded)

    # One-hot encode the Role column
    people_df = pd.get_dummies(people_df, columns=['Role'], prefix='Role')

    # Aggregate by MovieIndex: number of entries per role for each movie
    role_columns = [col for col in people_df.columns if col.startswith('Role_')]
    people_counts = people_df.groupby('MovieIndex')[role_columns].sum()

    # Merge back to main dataframe (movies without people get NaN here)
    df = df.join(people_counts)

df = df.drop(columns=['People'])

"""
Numerical Normalization
"""
numerical_features = ['CriticRating', 'CommunityRating', 'RunTimeTicks', 'ProductionYear']

# Min-max scale the numeric columns.
# NOTE(review): the scaler is fit on every row, before the train/test split
# performed in a later cell.
scaler = MinMaxScaler().fit(df[numerical_features])
df[numerical_features] = scaler.transform(df[numerical_features])

# --- Final clean-up ---
# Replace whatever NaNs remain with 0
df = df.fillna(0)

# Preview the final DataFrame
print(df.head())

   CriticRating  CommunityRating  RunTimeTicks  ProductionYear  IsFavorite  \
0          0.89         0.592312      0.462071        0.932584           0   
1          0.13         0.261701      0.372514        0.820225           0   
2          0.31         0.381398      0.529278        0.876404           1   
3          0.97         0.701255      0.273291        0.505618           1   
4          0.00         0.654850      0.473029        0.988764           0   

   year  month  day  day_of_week  is_weekend  ...  \
0  2018     11    9            4           0  ...   
1  2008      7    1            1           0  ...   
2  2013      7    3            2           0  ...   
3  1980      7    1            1           0  ...   
4  2023      3   23            3           0  ...   

   Studio_ff8fcbfc0a663a60072cbb254ef98b83  \
0                                        0   
1                                        0   
2                                        0   
3                           

In [29]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import MinMaxScaler

# Target column, and every other column as the feature matrix (NaNs -> 0)
y = df['IsFavorite']
X = df.drop(columns=['IsFavorite']).fillna(0)

# Hold out 20% of the rows for testing; fixed seed for a repeatable split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit the scaler on the training rows only, then apply it to both splits.
# (Random Forest does not need scaling; it is kept as optional good practice.)
scaler = MinMaxScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [57]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import make_scorer, recall_score

from sklearn.metrics import accuracy_score, classification_report

# Search space sampled by the randomized search
param_dist = dict(
    n_estimators=[200, 500, 1000, 2000, 4000],  # number of trees
    max_depth=[10, 20, 50, None],               # tree depth
    min_samples_split=[2, 5, 10],               # min samples to split a node
    min_samples_leaf=[1, 2, 4],                 # min samples at a leaf
    max_features=['sqrt', 'log2', None],        # features considered per split
    bootstrap=[True, False],                    # bootstrap sampling on/off
)

# Base estimator; class weights are balanced and the seed is fixed
rf = RandomForestClassifier(class_weight="balanced", random_state=42)

# Candidates are ranked by recall
recall_scorer = make_scorer(recall_score)

# 20 sampled candidates, each evaluated with 5-fold cross-validation.
# NOTE(review): the saved output below reports 100 candidates, so it appears
# to come from a run with a different n_iter; re-run to refresh it.
random_search = RandomizedSearchCV(
    rf,
    param_dist,
    n_iter=20,
    scoring=recall_scorer,
    cv=5,
    n_jobs=-1,
    verbose=2,
    random_state=42,
)

# Run the search on the training split
random_search.fit(X_train, y_train)

# Report the winning configuration
print("Best Parameters:", random_search.best_params_)

# Score the refit best estimator on the held-out split
best_rf = random_search.best_estimator_
y_pred = best_rf.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))


Fitting 5 folds for each of 100 candidates, totalling 500 fits
[CV] END bootstrap=False, max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=2000; total time=   2.3s
[CV] END bootstrap=False, max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_estimators=2000; total time=   2.4s
[CV] END bootstrap=True, max_depth=None, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=2000; total time=   2.4s
[CV] END bootstrap=True, max_depth=None, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=2000; total time=   2.4s
[CV] END bootstrap=True, max_depth=None, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=2000; total time=   2.4s
[CV] END bootstrap=True, max_depth=None, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=2000; total time=   2.5s
[CV] END bootstrap=False, max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=5, n_est

In [67]:
from sklearn.metrics import (
    accuracy_score, 
    precision_score, 
    recall_score, 
    f1_score, 
    roc_auc_score, 
    roc_curve, 
    classification_report, 
    confusion_matrix
)

# Hard class predictions, plus positive-class probabilities for the AUC
y_pred_proba = best_rf.predict_proba(X_test)[:, 1]
y_pred = best_rf.predict(X_test)

# Label-based metrics all share the same (y_true, y_pred) arguments
accuracy, precision, recall, f1 = (
    metric(y_test, y_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)
# AUC is computed from the probabilities rather than the hard labels
auc = roc_auc_score(y_test, y_pred_proba)

# Summary of the headline numbers
print("Evaluation Metrics:")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"AUC: {auc:.4f}")

# Per-class breakdown
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Confusion matrix (rows: true class, columns: predicted class)
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

# Hyperparameters of the evaluated model
print("\nRandom Forest Parameters:")
print(best_rf.get_params())


Evaluation Metrics:
Accuracy: 0.7100
Precision: 0.6842
Recall: 0.3611
F1 Score: 0.4727
AUC: 0.7257

Classification Report:
              precision    recall  f1-score   support

           0       0.72      0.91      0.80        64
           1       0.68      0.36      0.47        36

    accuracy                           0.71       100
   macro avg       0.70      0.63      0.64       100
weighted avg       0.70      0.71      0.68       100

Confusion Matrix:
[[58  6]
 [23 13]]

Random Forest Parameters:
{'bootstrap': False, 'ccp_alpha': 0.0, 'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': 50, 'max_features': 'sqrt', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 4, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'monotonic_cst': None, 'n_estimators': 4000, 'n_jobs': None, 'oob_score': False, 'random_state': 42, 'verbose': 0, 'warm_start': False}
