In [81]:
%pip install -r requirements.txt

import warnings
warnings.filterwarnings('ignore')

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip available: 22.2.2 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
from dotenv import load_dotenv
import os

load_dotenv(".env")

URL = os.getenv("URL")
USER_ID = os.getenv("USER_ID")
# USER_ID = "d7252e24a2a34cab83e147e26fcee5d8"

API_KEY = os.getenv("API_KEY")

In [40]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
import requests

columns_to_keep = [
    "Name", "PremiereDate", "CriticRating", "OfficialRating", "Overview", "Taglines",
    "Genres", "CommunityRating", "RunTimeTicks", "ProductionYear", "People", "Studios", "UserData"
]

class JellyfinClient(BaseEstimator, TransformerMixin):
    def __init__(self, url, user_id, api_key, played_status="IsPlayed", limit=None):
        """
        Initializes the Jellyfin Client.
        :param url: Base URL of the Jellyfin server.
        :param user_id: User ID for the API request.
        :param api_key: API key for authentication.
        :param played_status: "IsPlayed" or "IsNotPlayed" to filter movies.
        """
        self.url = url
        self.user_id = user_id
        self.api_key = api_key
        self.played_status = played_status  # Switch for IsPlayed or IsNotPlayed
        self.limit = limit

    def get_amount(self):
        """Fetch the total number of movies with the current filter."""
        res = requests.get(
            f"{self.url}/emby/Users/{self.user_id}/Items"
            f"?StartIndex=0&Limit=1&Recursive=true&IncludeItemTypes=Movie"
            f"&api_key={self.api_key}&Filters={self.played_status}"
        )
        res.raise_for_status()
        return res.json().get("TotalRecordCount")

    def get_chunk(self, start, chunk):
        """Fetch a chunk of movies with the current filter."""
        res = requests.get(
            f"{self.url}/emby/Users/{self.user_id}/Items"
            f"?StartIndex={start}&Limit={chunk}&Recursive=true&IncludeItemTypes=Movie"
            f"&api_key={self.api_key}&Filters={self.played_status}"
            f"&Fields=Budget,Genres,Overview,People,Revenue,Studios,Taglines,ProviderIds,"
            f"CriticRating,OfficialRating,PremiereDate,CommunityRating,RunTimeTicks,ProductionYear,UserData"
        )
        res.raise_for_status()
        return res.json().get("Items")

    def fit(self, X, y=None):
        return self

    def transform(self, X=None):
        """Fetch all movies in chunks based on the current filter."""
        size = self.limit or self.get_amount()

        start = 0
        chunk = 50
        all_movies = []
        while start < size:
            print(
                f"Fetching movies {start} - {start + chunk} of {size} ({self.played_status})")
            all_movies.extend(self.get_chunk(start, chunk))
            start += chunk
        return all_movies


class MovieDataCleaner(BaseEstimator, TransformerMixin):
    def __init__(self, columns_to_keep):
        self.columns_to_keep = columns_to_keep

    def filter_movie(self, movie):
        """Filter and clean a single movie dictionary."""
        filtered = {key: value for key,
                    value in movie.items() if key in self.columns_to_keep}

        # Process nested People and Studios
        if "People" in filtered:
            filtered["People"] = [[person.get("Name"), person.get(
                "Type")] for person in filtered["People"]]
        if "Studios" in filtered:
            filtered["Studios"] = [studio["Name"]
                                   for studio in filtered["Studios"] if "Name" in studio]
        if "UserData" in filtered:
            filtered["IsFavorite"] = filtered["UserData"].get("IsFavorite")
            del filtered["UserData"]

        if "Taglines" in filtered:
            filtered["Taglines"] = ("\n").join(filtered["Taglines"])

        # Convert from Microseconds
        if "RunTimeTicks" in filtered:
            filtered["LengthInHours"] = filtered["RunTimeTicks"] / 10000000 / 60 / 60
            del filtered["RunTimeTicks"]
            
        return filtered

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return pd.DataFrame([self.filter_movie(movie) for movie in X])

In [41]:
data_pipeline = Pipeline([
    ("jellyfin_client", JellyfinClient(URL, USER_ID, API_KEY, played_status="IsPlayed")),
    ("data_cleaner", MovieDataCleaner(columns_to_keep)),
])

dp = data_pipeline.fit_transform(None)
# dp.columns

Fetching movies 0 - 50 of 539 (IsPlayed)
Fetching movies 50 - 100 of 539 (IsPlayed)
Fetching movies 100 - 150 of 539 (IsPlayed)
Fetching movies 150 - 200 of 539 (IsPlayed)
Fetching movies 200 - 250 of 539 (IsPlayed)
Fetching movies 250 - 300 of 539 (IsPlayed)
Fetching movies 300 - 350 of 539 (IsPlayed)
Fetching movies 350 - 400 of 539 (IsPlayed)
Fetching movies 400 - 450 of 539 (IsPlayed)
Fetching movies 450 - 500 of 539 (IsPlayed)
Fetching movies 500 - 550 of 539 (IsPlayed)


In [48]:
dp.columns
for col in dp.columns:
    print(f'Column: {col} has null: {dp[col].isnull().values.any()}')

Column: Name has null: False
Column: PremiereDate has null: False
Column: CriticRating has null: True
Column: OfficialRating has null: True
Column: Overview has null: False
Column: Taglines has null: False
Column: Genres has null: False
Column: CommunityRating has null: True
Column: ProductionYear has null: False
Column: People has null: False
Column: Studios has null: False
Column: IsFavorite has null: False
Column: LengthInHours has null: True


In [None]:
dp.iloc[0]

In [131]:
import fasttext
import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import MultiLabelBinarizer, MinMaxScaler, OneHotEncoder
from scipy.sparse import hstack, csr_matrix
from collections import Counter
import os

class MovieFeatureEngineerWithFastText(BaseEstimator, TransformerMixin):
    def __init__(self, vector_size=100):
        self.vector_size = vector_size
        self.fasttext_combined_model = None
        # Encoders and scalers
        self.ohe_rating = OneHotEncoder(sparse_output=True, handle_unknown='ignore')
        self.mlb_genres = MultiLabelBinarizer()
        self.scaler = MinMaxScaler()

    def load_pretrained_fasttext_model(self):
        """
        Load a pre-trained FastText model.
        """
        import fasttext.util
        fasttext.util.download_model('en', if_exists='ignore')  # Downloads Common Crawl vectors
        model = fasttext.load_model('cc.en.300.bin')
        return model

    def fit(self, X, y=None):
        """
        Fit the feature engineering pipeline.
        """
        # Fit numerical scaler
        X_filled = X[['CriticRating', 'CommunityRating', 'LengthInHours', 'ProductionYear']].fillna(-7)
        self.scaler.fit(X_filled)

        # Load the pre-trained FastText model
        self.fasttext_combined_model = self.load_pretrained_fasttext_model()

        # Fit encoders for categorical and multilabel features
        self.ohe_rating.fit(X[['OfficialRating']].fillna("unknown"))
        self.mlb_genres.fit(X['Genres'].fillna("unknown"))
        return self

    def transform(self, X):
        """
        Transform the input DataFrame into feature matrix.
        """
        df = X.copy()

        # Transform numerical features
        numerical_features = ['CriticRating', 'CommunityRating', 'LengthInHours', 'ProductionYear']
        df[numerical_features] = df[numerical_features].fillna(-7)
        scaled_numerical = csr_matrix(self.scaler.transform(df[numerical_features]))

        # Helper function for FastText embeddings
        def get_fasttext_vector(text, model):
            if isinstance(text, str) and text.strip():
                vectors = [model.get_word_vector(word) for word in text.split() if word in model.words]
                if vectors:
                    return np.mean(vectors, axis=0)
            return np.zeros(model.get_dimension())

        # Combine text data for embedding generation
        combined_text = (
            df['Name'].fillna("").astype(str) + " " +
            df['Overview'].fillna("").astype(str) + " " +
            df['People'].apply(
                lambda p: ' '.join([str(f"{item[0]}, {item[1]}") for item in p if isinstance(item, list) and len(item) > 0])
            ).fillna("") + " " +
            df['Studios'].fillna("").astype(str)
        )

        combined_embeddings = np.vstack(combined_text.apply(lambda x: get_fasttext_vector(x, self.fasttext_combined_model)))
        combined_embeddings_sparse = csr_matrix(combined_embeddings)

        # Count role occurrences
        def count_roles(people):
            role_counter = Counter()
            for person in people:
                if isinstance(person, list) and len(person) > 1:
                    role_counter[person[1]] += 1
            return role_counter

        role_counts = df['People'].apply(count_roles).fillna(Counter())
        role_features = csr_matrix(np.array([
            [
                role_counts.get('Actor', -7),
                role_counts.get('Director', -7),
                role_counts.get('Writer', -7),
                role_counts.get('Producer', -7)
            ]
            for role_counts in role_counts
        ]))

        # Date encoding
        df["PremiereDate"] = pd.to_datetime(df['PremiereDate'], errors='coerce')
        df['year'] = df['PremiereDate'].dt.year.fillna(-7).astype(int)
        df['month'] = df['PremiereDate'].dt.month.fillna(-7).astype(int)
        df['day'] = df['PremiereDate'].dt.day.fillna(-7).astype(int)
        df['day_of_week'] = df['PremiereDate'].dt.dayofweek.fillna(-7).astype(int)
        df['is_weekend'] = (df['day_of_week'] >= 5).astype(int)
        df['week_of_year'] = df['PremiereDate'].dt.isocalendar().week.fillna(-7).astype(int)
        df['day_of_year'] = df['PremiereDate'].dt.dayofyear.fillna(-7).astype(int)
        df = df.drop(columns=['PremiereDate'])
        date_features = csr_matrix(df[['year', 'month', 'day', 'is_weekend', 'week_of_year', 'day_of_year']].values)

        # Encode categorical and multilabel features
        official_rating_encoded = self.ohe_rating.transform(df[['OfficialRating']].fillna("unknown"))
        genres_encoded = csr_matrix(self.mlb_genres.transform(df['Genres'].fillna("unknown")))

        # Stack all features together
        final_sparse_matrix = hstack([
            scaled_numerical,               # Scaled numerical data
            combined_embeddings_sparse,     # Unified FastText embeddings
            role_features,                  # Role count features
            date_features,                  # Date encoding features
            official_rating_encoded,        # One-hot encoded Official Rating
            genres_encoded                  # Multi-label binarized Genres
        ])

        return final_sparse_matrix


In [132]:
from sklearn.model_selection import train_test_split

# Full pipeline
full_pipeline = Pipeline([
    ('feature_engineer', MovieFeatureEngineerWithFastText()),
])

# Split data
df = dp.drop(columns=['IsFavorite'])
y = dp['IsFavorite']

X_train, X_test, y_train, y_test = train_test_split(
    df, y, test_size=0.2, random_state=42)

# Apply pipeline and resampling
X_train_transformed = full_pipeline.fit_transform(X_train)
X_test_transformed = full_pipeline.transform(X_test)  # Use transform instead of fit_transform for test data


In [133]:
X_train_transformed.shape

(431, 345)

In [134]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, roc_curve, precision_recall_curve, classification_report, confusion_matrix
)
import matplotlib.pyplot as plt
from sklearn.metrics import make_scorer, recall_score
from sklearn.pipeline import Pipeline


# Hyperparameter grid
param_dist = {
    'n_estimators': [200, 500, 1000],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2],
}

# Model and scoring
rf = RandomForestClassifier(class_weight="balanced", random_state=42)
recall_scorer = make_scorer(recall_score)

# RandomizedSearchCV
random_search = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_dist,
    scoring=recall_scorer,
    n_iter=20,
    cv=3,
    verbose=2,
    n_jobs=-1,
    random_state=42
)

random_search.fit(X_train_transformed, y_train)

# Best model
tuned_rf = random_search.best_estimator_
print("Best Parameters:", random_search.best_params_)

# Predict probabilities for AUC evaluation
y_pred_proba = tuned_rf.predict_proba(X_test_transformed)[:, 1]
y_pred = tuned_rf.predict(X_test_transformed)

# Metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred_proba)

# Print metrics
print("Evaluation Metrics:")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"ROC-AUC: {roc_auc:.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Confusion Matrix
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

Fitting 3 folds for each of 20 candidates, totalling 60 fits
Best Parameters: {'n_estimators': 200, 'min_samples_split': 5, 'min_samples_leaf': 2, 'max_depth': 10}
Evaluation Metrics:
Accuracy: 0.6204
Precision: 0.6000
Recall: 0.6792
F1 Score: 0.6372
ROC-AUC: 0.6758

Classification Report:
              precision    recall  f1-score   support

       False       0.65      0.56      0.60        55
        True       0.60      0.68      0.64        53

    accuracy                           0.62       108
   macro avg       0.62      0.62      0.62       108
weighted avg       0.62      0.62      0.62       108

Confusion Matrix:
[[31 24]
 [17 36]]


In [135]:
new_pipeline = Pipeline([
    ("jellyfin_client", JellyfinClient(URL, USER_ID, API_KEY, played_status="IsUnPlayed")),
    ("data_cleaner", MovieDataCleaner(columns_to_keep)),

])

new_df = new_pipeline.fit_transform(None)
X_new_transformed = full_pipeline.transform(new_df)

Fetching movies 0 - 50 of 2318 (IsUnPlayed)
Fetching movies 50 - 100 of 2318 (IsUnPlayed)
Fetching movies 100 - 150 of 2318 (IsUnPlayed)
Fetching movies 150 - 200 of 2318 (IsUnPlayed)
Fetching movies 200 - 250 of 2318 (IsUnPlayed)
Fetching movies 250 - 300 of 2318 (IsUnPlayed)
Fetching movies 300 - 350 of 2318 (IsUnPlayed)
Fetching movies 350 - 400 of 2318 (IsUnPlayed)
Fetching movies 400 - 450 of 2318 (IsUnPlayed)
Fetching movies 450 - 500 of 2318 (IsUnPlayed)
Fetching movies 500 - 550 of 2318 (IsUnPlayed)
Fetching movies 550 - 600 of 2318 (IsUnPlayed)
Fetching movies 600 - 650 of 2318 (IsUnPlayed)
Fetching movies 650 - 700 of 2318 (IsUnPlayed)
Fetching movies 700 - 750 of 2318 (IsUnPlayed)
Fetching movies 750 - 800 of 2318 (IsUnPlayed)
Fetching movies 800 - 850 of 2318 (IsUnPlayed)
Fetching movies 850 - 900 of 2318 (IsUnPlayed)
Fetching movies 900 - 950 of 2318 (IsUnPlayed)
Fetching movies 950 - 1000 of 2318 (IsUnPlayed)
Fetching movies 1000 - 1050 of 2318 (IsUnPlayed)
Fetching movie

In [136]:
predictions = tuned_rf.predict(X_new_transformed)
predicted_probabilities = tuned_rf.predict_proba(X_new_transformed)[:, 1]

results_df = new_df[['Name']].copy()  # or 'MovieID', if you have it
results_df['PredictedIsFavorite'] = predictions
results_df['ProbabilityIsFavorite'] = predicted_probabilities

# Top 5 highest
top_5 = results_df.nlargest(5, 'ProbabilityIsFavorite')

# Bottom 5 lowest
bottom_5 = results_df.nsmallest(5, 'ProbabilityIsFavorite')

# Combine the two lists and format the output
output = "\n".join(
    f"{row['Name']}: {row['ProbabilityIsFavorite']:.2f}"
    for _, row in pd.concat([top_5, bottom_5]).iterrows()
)

print(output)


Hundreds of Beavers: 0.74
Transformers: Rise of the Beasts: 0.73
The Crow: 0.71
The Creator: 0.69
Jeepers Creepers 3: 0.69
The Thing: 0.33
Species III: 0.33
Drag Me to Hell: 0.34
Flight: 0.34
For Your Eyes Only: 0.34
