In [1]:
from dotenv import load_dotenv
import os 

# Load the Jellyfin connection settings from the local .env file into the
# process environment, then read them; a missing variable yields None.
load_dotenv(".env")

URL = os.environ.get("URL")
USER_ID = os.environ.get("USER_ID")
API_KEY = os.environ.get("API_KEY")

In [2]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
import requests

# Item keys retained by MovieDataCleaner; every other key of a movie
# dictionary is dropped. "UserData" is kept only so that its "IsFavorite"
# flag can be extracted during cleaning.
columns_to_keep = [
    "Name",
    "PremiereDate",
    "CriticRating",
    "OfficialRating",
    "Overview",
    "Taglines",
    "Genres",
    "CommunityRating",
    "RunTimeTicks",
    "ProductionYear",
    "People",
    "Studios",
    "UserData",
]

class JellyfinClient(BaseEstimator, TransformerMixin):
    """Pipeline step that downloads movie items from a Jellyfin server.

    ``transform`` ignores its input and returns a list with the item
    dictionaries served by ``{url}/emby/Users/{user_id}/Items``.
    """

    # Maximum number of items requested per HTTP call.
    CHUNK_SIZE = 20
    # Seconds to wait for the server before a request is aborted.
    REQUEST_TIMEOUT = 30
    # Item fields requested for every movie when fetching a chunk.
    ITEM_FIELDS = (
        "Budget,Genres,Overview,People,Revenue,Studios,Taglines,ProviderIds,"
        "CriticRating,OfficialRating,PremiereDate,CommunityRating,RunTimeTicks,ProductionYear,UserData"
    )

    def __init__(self, url, user_id, api_key, played_status="IsPlayed", limit=None):
        """
        Initializes the Jellyfin Client.
        :param url: Base URL of the Jellyfin server.
        :param user_id: User ID for the API request.
        :param api_key: API key for authentication.
        :param played_status: "IsPlayed" or "IsNotPlayed" to filter movies.
        :param limit: Maximum number of movies to fetch; a falsy value
            (None or 0) means "fetch every movie matching the filter".
        """
        self.url = url
        self.user_id = user_id
        self.api_key = api_key
        self.played_status = played_status  # Switch for IsPlayed or IsNotPlayed
        self.limit = limit

    def _query_items(self, start, limit, fields=None):
        """Run one request against the Items endpoint and return the decoded JSON body.

        :param start: Value sent as ``StartIndex``.
        :param limit: Value sent as ``Limit``.
        :param fields: Optional comma-separated value sent as ``Fields``.
        :raises requests.HTTPError: If the server answers with an error status.
        """
        # Passing the query as ``params`` lets requests URL-encode every value
        # instead of relying on hand-built string concatenation.
        # NOTE(review): the API key still travels in the query string, so it is
        # part of the request URL; consider header-based auth if outputs are shared.
        params = {
            "StartIndex": start,
            "Limit": limit,
            "Recursive": "true",
            "IncludeItemTypes": "Movie",
            "api_key": self.api_key,
            "Filters": self.played_status,
        }
        if fields:
            params["Fields"] = fields

        # Without a timeout requests waits indefinitely for an unresponsive server.
        res = requests.get(
            f"{self.url}/emby/Users/{self.user_id}/Items",
            params=params,
            timeout=self.REQUEST_TIMEOUT,
        )
        res.raise_for_status()
        return res.json()

    def get_amount(self):
        """Fetch the total number of movies with the current filter."""
        # A single-item page is enough: only the record count is read.
        return self._query_items(0, 1).get("TotalRecordCount")

    def get_chunk(self, start, chunk):
        """Fetch a chunk of movies with the current filter."""
        return self._query_items(start, chunk, fields=self.ITEM_FIELDS).get("Items")

    def fit(self, X, y=None):
        """No-op; present so the client can be used as a pipeline step."""
        return self

    def transform(self, X=None):
        """Fetch all movies in chunks based on the current filter.

        :param X: Ignored.
        :return: List of movie dictionaries, at most ``limit`` long when a
            limit is set.
        """
        # ``or 0`` guards against a response without "TotalRecordCount".
        size = self.limit or self.get_amount() or 0

        start = 0
        all_movies = []
        while start < size:
            # Never request more than what is still missing, so that ``limit``
            # is honoured even when it is not a multiple of CHUNK_SIZE.
            chunk = min(self.CHUNK_SIZE, size - start)
            print(
                f"Fetching movies {start} - {start + chunk} of {size} ({self.played_status})")
            movies = self.get_chunk(start, chunk)
            if not movies:
                # The server has no further items (e.g. ``limit`` exceeds the
                # library size); stop instead of issuing empty requests.
                break
            all_movies.extend(movies)
            start += chunk
        return all_movies


class MovieDataCleaner(BaseEstimator, TransformerMixin):
    """Pipeline step that flattens raw movie dictionaries into a DataFrame."""

    def __init__(self, columns_to_keep):
        """
        :param columns_to_keep: Keys of a movie dictionary that are retained.
        """
        self.columns_to_keep = columns_to_keep

    def filter_movie(self, movie):
        """Filter and clean a single movie dictionary.

        Keeps only the keys listed in ``columns_to_keep`` and then reshapes
        the nested values:

        * ``People``   -> list of ``[Id, Type]`` pairs
        * ``Studios``  -> list of studio ids
        * ``UserData`` -> replaced by its ``IsFavorite`` value
        * ``Taglines`` -> single newline-joined string

        A nested value of ``None`` is treated like an empty collection.

        :param movie: Raw movie dictionary.
        :return: New dictionary with the cleaned values.
        """
        filtered = {key: value for key,
                    value in movie.items() if key in self.columns_to_keep}

        # Process nested People and Studios
        if "People" in filtered:
            filtered["People"] = [
                [person.get("Id"), person.get("Type")]
                for person in (filtered["People"] or [])
            ]
        if "Studios" in filtered:
            # Only named studios are kept; additionally require "Id" because
            # it is the value that is actually read.
            filtered["Studios"] = [
                studio["Id"]
                for studio in (filtered["Studios"] or [])
                if "Name" in studio and "Id" in studio
            ]
        if "UserData" in filtered:
            # Remove the nested dictionary first so "IsFavorite" is appended
            # as the last key.
            user_data = filtered.pop("UserData") or {}
            filtered["IsFavorite"] = user_data.get("IsFavorite")

        if "Taglines" in filtered:
            filtered["Taglines"] = "\n".join(filtered["Taglines"] or [])

        return filtered

    def fit(self, X, y=None):
        """No-op; the cleaner has nothing to learn."""
        return self

    def transform(self, X):
        """Clean every movie in ``X`` and return the result as a DataFrame.

        :param X: Iterable of raw movie dictionaries.
        """
        return pd.DataFrame([self.filter_movie(movie) for movie in X])

In [3]:
# Ingestion pipeline: download the played movies from Jellyfin, then flatten
# the raw item dictionaries into a DataFrame.
data_pipeline = Pipeline(
    steps=[
        (
            "jellyfin_client",
            JellyfinClient(URL, USER_ID, API_KEY, played_status="IsPlayed"),
        ),
        (
            "data_cleaner",
            MovieDataCleaner(columns_to_keep),
        ),
    ]
)

# The first step fetches its own data, so there is no input to pass in.
dp = data_pipeline.fit_transform(None)
dp.columns

Fetching movies 0 - 20 of 496 (IsPlayed)
Fetching movies 20 - 40 of 496 (IsPlayed)
Fetching movies 40 - 60 of 496 (IsPlayed)
Fetching movies 60 - 80 of 496 (IsPlayed)
Fetching movies 80 - 100 of 496 (IsPlayed)
Fetching movies 100 - 120 of 496 (IsPlayed)
Fetching movies 120 - 140 of 496 (IsPlayed)
Fetching movies 140 - 160 of 496 (IsPlayed)
Fetching movies 160 - 180 of 496 (IsPlayed)
Fetching movies 180 - 200 of 496 (IsPlayed)
Fetching movies 200 - 220 of 496 (IsPlayed)
Fetching movies 220 - 240 of 496 (IsPlayed)
Fetching movies 240 - 260 of 496 (IsPlayed)
Fetching movies 260 - 280 of 496 (IsPlayed)
Fetching movies 280 - 300 of 496 (IsPlayed)
Fetching movies 300 - 320 of 496 (IsPlayed)
Fetching movies 320 - 340 of 496 (IsPlayed)
Fetching movies 340 - 360 of 496 (IsPlayed)
Fetching movies 360 - 380 of 496 (IsPlayed)
Fetching movies 380 - 400 of 496 (IsPlayed)
Fetching movies 400 - 420 of 496 (IsPlayed)
Fetching movies 420 - 440 of 496 (IsPlayed)
Fetching movies 440 - 460 of 496 (IsPlayed

Index(['Name', 'PremiereDate', 'CriticRating', 'OfficialRating', 'Overview',
       'Taglines', 'Genres', 'CommunityRating', 'RunTimeTicks',
       'ProductionYear', 'People', 'Studios', 'IsFavorite'],
      dtype='object')

In [4]:
# Report, column by column, whether the cleaned data contains missing values.
for col, has_null in dp.isnull().any().items():
    print(f'Column: {col} has null: {has_null}')

Column: Name has null: False
Column: PremiereDate has null: False
Column: CriticRating has null: True
Column: OfficialRating has null: True
Column: Overview has null: False
Column: Taglines has null: False
Column: Genres has null: False
Column: CommunityRating has null: False
Column: RunTimeTicks has null: True
Column: ProductionYear has null: False
Column: People has null: False
Column: Studios has null: False
Column: IsFavorite has null: False


In [9]:
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import MultiLabelBinarizer, MinMaxScaler, OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack, csr_matrix
from collections import Counter
import numpy as np

class MovieFeatureEngineer(BaseEstimator, TransformerMixin):
    """Turn the cleaned movie DataFrame into one sparse feature matrix.

    Feature groups, in output column order: scaled numerical columns, TF-IDF
    of name / overview / taglines / people ids, per-role people counts,
    calendar parts of the premiere date, one-hot official rating, and
    multi-hot genres and studios.
    """

    # Columns that are min-max scaled.
    NUMERICAL_FEATURES = ('CriticRating', 'CommunityRating', 'RunTimeTicks', 'ProductionYear')
    # Roles counted per movie, in output column order.
    ROLES = ('Actor', 'Director', 'Writer', 'Producer')
    # Calendar features derived from PremiereDate, in output column order.
    DATE_FEATURES = ('year', 'month', 'day', 'day_of_week', 'is_weekend',
                     'quarter', 'week_of_year', 'day_of_year')
    # Sentinels substituted for missing numbers / missing text.
    # NOTE(review): the numeric sentinel is applied before min-max scaling and
    # is also used for roles that do not occur at all -- confirm this is intended.
    MISSING_NUMBER = -7
    MISSING_TEXT = "unknown"

    def __init__(self):
        # Text vectorizers for sparse data
        self.tfidf_name = TfidfVectorizer(max_features=500, stop_words='english')
        self.tfidf_overview = TfidfVectorizer(max_features=500, stop_words='english')
        self.tfidf_taglines = TfidfVectorizer(max_features=200, stop_words='english')
        self.tfidf_people = TfidfVectorizer(max_features=1000, stop_words='english')

        # Encoders and scalers
        self.ohe_rating = OneHotEncoder(sparse_output=True, handle_unknown='ignore')
        self.mlb_genres = MultiLabelBinarizer()
        self.mlb_studios = MultiLabelBinarizer()
        self.scaler = MinMaxScaler()

    @staticmethod
    def _is_missing(value):
        """Return True for None, pd.NA and float NaN (list-like cells are never missing)."""
        return value is None or value is pd.NA or (isinstance(value, float) and np.isnan(value))

    def _numeric_frame(self, X):
        """Select the numerical columns and replace missing values by the sentinel."""
        return X[list(self.NUMERICAL_FEATURES)].fillna(self.MISSING_NUMBER)

    def _people_text(self, people_column):
        """Join the person ids of each movie into one space-separated document.

        A missing cell becomes the text sentinel; entries without a usable id
        are skipped instead of breaking the join.
        """
        def join_ids(people):
            if self._is_missing(people):
                return self.MISSING_TEXT
            return ' '.join(
                str(item[0])
                for item in people
                if isinstance(item, list) and item and item[0] is not None
            )

        return people_column.apply(join_ids)

    def _label_lists(self, column):
        """Return the cells of a multi-label column, with missing cells replaced.

        The replacement has to be a one-element list: a bare string would be
        split into single characters by MultiLabelBinarizer.
        """
        return [[self.MISSING_TEXT] if self._is_missing(labels) else labels for labels in column]

    def _role_features(self, people_column):
        """Count, per movie, how many people hold each role listed in ROLES."""
        rows = []
        for people in people_column:
            counter = Counter()
            if not self._is_missing(people):
                for person in people:
                    if isinstance(person, list) and len(person) > 1:
                        counter[person[1]] += 1
            rows.append([counter.get(role, self.MISSING_NUMBER) for role in self.ROLES])
        # The reshape keeps the array two-dimensional even when there are no rows.
        return csr_matrix(np.array(rows).reshape(-1, len(self.ROLES)))

    def _date_features(self, premiere_column):
        """Split the premiere date into integer calendar features.

        Rows whose date is missing (NaT) receive the numeric sentinel in every
        calendar column and 0 for ``is_weekend``.
        """
        dates = pd.to_datetime(premiere_column)
        # Cast to float first so that NaT rows become NaN in every column,
        # including the nullable-integer ISO week.
        parts = pd.DataFrame({
            'year': dates.dt.year,
            'month': dates.dt.month,
            'day': dates.dt.day,
            'day_of_week': dates.dt.dayofweek,
            'quarter': dates.dt.quarter,
            'week_of_year': dates.dt.isocalendar().week,
            'day_of_year': dates.dt.dayofyear,
        }).astype("float64")
        parts['is_weekend'] = (parts['day_of_week'] >= 5).astype("float64")
        parts = parts.fillna(self.MISSING_NUMBER).astype(int)
        return csr_matrix(parts[list(self.DATE_FEATURES)].to_numpy())

    def fit(self, X, y=None):
        """Fit the scaler, the TF-IDF vectorizers and the encoders on ``X``.

        :param X: DataFrame with the columns produced by the cleaning step.
        :param y: Ignored.
        :return: self
        """
        # Fit numerical scaler
        self.scaler.fit(self._numeric_frame(X))

        # Fit text TF-IDF vectorizers
        self.tfidf_name.fit(X['Name'].fillna(self.MISSING_TEXT))
        self.tfidf_overview.fit(X['Overview'].fillna(self.MISSING_TEXT))
        self.tfidf_taglines.fit(X['Taglines'].fillna(self.MISSING_TEXT))
        self.tfidf_people.fit(self._people_text(X['People']))

        # Fit encoders for categorical and multilabel features
        self.ohe_rating.fit(X[['OfficialRating']].fillna(self.MISSING_TEXT))
        self.mlb_genres.fit(self._label_lists(X['Genres']))
        self.mlb_studios.fit(self._label_lists(X['Studios']))
        return self

    def transform(self, X):
        """Encode ``X`` with the fitted components.

        :param X: DataFrame with the same columns that were used in ``fit``;
            it is not modified.
        :return: Sparse CSR matrix with one row per movie.
        """
        # Transform numerical features
        scaled_numerical = csr_matrix(self.scaler.transform(self._numeric_frame(X)))

        # Transform text features to sparse matrices
        name_tfidf = self.tfidf_name.transform(X['Name'].fillna(self.MISSING_TEXT))
        overview_tfidf = self.tfidf_overview.transform(X['Overview'].fillna(self.MISSING_TEXT))
        taglines_tfidf = self.tfidf_taglines.transform(X['Taglines'].fillna(self.MISSING_TEXT))
        people_tfidf = self.tfidf_people.transform(self._people_text(X['People']))

        # Count role occurrences
        role_features = self._role_features(X['People'])

        # Date encoding
        date_features = self._date_features(X['PremiereDate'])

        # Encode categorical and multilabel features
        official_rating_encoded = self.ohe_rating.transform(
            X[['OfficialRating']].fillna(self.MISSING_TEXT))
        genres_encoded = csr_matrix(self.mlb_genres.transform(self._label_lists(X['Genres'])))
        studios_encoded = csr_matrix(self.mlb_studios.transform(self._label_lists(X['Studios'])))

        # Stack all sparse matrices together; CSR output supports row slicing.
        final_sparse_matrix = hstack([
            scaled_numerical,  # Sparse numerical data
            name_tfidf,        # Sparse TF-IDF data
            overview_tfidf,    # Sparse TF-IDF data
            taglines_tfidf,    # Sparse TF-IDF data
            people_tfidf,      # Sparse TF-IDF data
            role_features,     # Sparse role count features
            date_features,     # Sparse date encoding features
            official_rating_encoded,  # Sparse one-hot encoded data
            genres_encoded,    # Sparse multilabel data
            studios_encoded    # Sparse multilabel data
        ], format="csr")

        return final_sparse_matrix


In [17]:
# Fit the feature engineer on the complete cleaned dataset to inspect the
# size of the resulting feature matrix.
feature_pipeline = Pipeline(steps=[("feature_engineer", MovieFeatureEngineer())])

fp = feature_pipeline.fit_transform(dp)
df = fp

df.shape

(496, 2910)

In [14]:
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, roc_curve, precision_recall_curve, classification_report, confusion_matrix
)
import matplotlib.pyplot as plt
from sklearn.metrics import make_scorer, recall_score
from sklearn.pipeline import Pipeline

# Feature-engineering pipeline (the classifier is tuned separately below)
full_pipeline = Pipeline([
    ('feature_engineer', MovieFeatureEngineer()),
])

# Separate the features from the target
df = dp.drop(columns=['IsFavorite'])
y = dp['IsFavorite']

# Stratify on the target so that the train and test parts keep the same share
# of favorites; a plain random split can leave the test set with very few
# positive examples.
X_train, X_test, y_train, y_test = train_test_split(
    df, y, test_size=0.2, random_state=42, stratify=y)

# Fit the feature pipeline on the training part only, then apply it to both parts
# NOTE(review): the features are fitted once on the whole training set, so the
# cross-validation folds below share vocabulary and scaling statistics.
X_train_transformed = full_pipeline.fit_transform(X_train)
X_test_transformed = full_pipeline.transform(X_test)  # Use transform instead of fit_transform for test data

# Hyperparameter grid
param_dist = {
    'n_estimators': [200, 500, 1000, 2000, 4000],          # Number of trees in the forest
    'max_depth': [10, 20, 50, None],           # Maximum depth of each tree
    'min_samples_split': [2, 5, 10],           # Minimum samples required to split a node
    'min_samples_leaf': [1, 2, 4],             # Minimum samples required at each leaf node
    'max_features': ['sqrt', 'log2', None],    # Number of features considered for the best split
    'bootstrap': [True, False],                # Use bootstrap samples
    'criterion': ['gini', 'entropy'],          # Split quality criterion
}

# Model and scoring
rf = RandomForestClassifier(class_weight="balanced", random_state=42)
recall_scorer = make_scorer(recall_score)

# RandomizedSearchCV: 20 sampled configurations, each evaluated with 5-fold CV on recall
random_search = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_dist,
    scoring=recall_scorer,
    n_iter=20,
    cv=5,
    verbose=2,
    n_jobs=-1,
    random_state=42
)

random_search.fit(X_train_transformed, y_train)

# Best model
tuned_rf = random_search.best_estimator_
print("Best Parameters:", random_search.best_params_)

# Predict probabilities for AUC evaluation
y_pred_proba = tuned_rf.predict_proba(X_test_transformed)[:, 1]
y_pred = tuned_rf.predict(X_test_transformed)

# Metrics (zero_division=0 reports an undefined score as 0 without a warning)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, zero_division=0)
recall = recall_score(y_test, y_pred, zero_division=0)
f1 = f1_score(y_test, y_pred, zero_division=0)
roc_auc = roc_auc_score(y_test, y_pred_proba)

# Print metrics
print("Evaluation Metrics:")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"ROC-AUC: {roc_auc:.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred, zero_division=0))

# Confusion Matrix
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))



Fitting 5 folds for each of 20 candidates, totalling 100 fits
[CV] END bootstrap=True, criterion=entropy, max_depth=50, max_features=log2, min_samples_leaf=1, min_samples_split=5, n_estimators=200; total time=   0.3s
[CV] END bootstrap=True, criterion=entropy, max_depth=50, max_features=log2, min_samples_leaf=1, min_samples_split=5, n_estimators=200; total time=   0.3s
[CV] END bootstrap=True, criterion=entropy, max_depth=50, max_features=log2, min_samples_leaf=1, min_samples_split=5, n_estimators=200; total time=   0.3s
[CV] END bootstrap=True, criterion=entropy, max_depth=50, max_features=log2, min_samples_leaf=1, min_samples_split=5, n_estimators=200; total time=   0.3s
[CV] END bootstrap=True, criterion=entropy, max_depth=50, max_features=log2, min_samples_leaf=1, min_samples_split=5, n_estimators=200; total time=   0.3s
[CV] END bootstrap=False, criterion=gini, max_depth=10, max_features=log2, min_samples_leaf=1, min_samples_split=5, n_estimators=200; total time=   0.2s
[CV] END b