In [3]:

import pandas as pd
from genre_predictor import GenrePredictor


In [2]:
"""
Example of training a new GenrePredictor and saving it.
"""
print("=== Training a new GenrePredictor ===")

# Create a new GenrePredictor
predictor = GenrePredictor(random_state=42, models_dir='saved_models')

# Load and prepare data
predictor.load_data(
    file_path='wiki_movie_plots_deduped_cleaned.csv',
    plot_col='plot_lemmatized',
    genre_col='genre_list',
    location_col='Origin/Ethnicity'
)

=== Training a new GenrePredictor ===
Loaded dataset with 34886 movies
After filtering empty genres: 26808 movies


<genre_predictor.GenrePredictor at 0x221eabc1c90>

In [3]:
# Prepare features
predictor.prepare_features()


TF-IDF features shape: (26808, 300)
Target shape: (26808, 20)
Genre classes: ['action' 'adventure' 'animation' 'biography' 'comedy' 'crime'
 'documentary' 'drama' 'family' 'fantasy' 'history' 'horror' 'musical'
 'mystery' 'romance' 'sci-fi' 'sport' 'thriller' 'war' 'western']
Location features shape: (26808, 24)
Combined features shape: (26808, 324)
Selected features shape: (26808, 100)


<genre_predictor.GenrePredictor at 0x221eabc1c90>

In [4]:
# Train models (using only basic and intermediate for speed)
predictor.train_models(model_levels=['basic'])
    

Ading basic models...
Evaluating Logistic Regression...
Evaluating Multinomial Naive Bayes...
Evaluating Decision Tree...
Evaluation complete. Best model: Decision Tree (F1-micro: 0.3028)
Model Performance Summary:
                     Model  F1-micro  F1-macro  F1-weighted  Hamming Loss
0            Decision Tree  0.302825  0.182258     0.302741      0.082395
1      Logistic Regression  0.250835  0.123547     0.238614      0.056481
2  Multinomial Naive Bayes  0.088626  0.026326     0.079432      0.058877


<genre_predictor.GenrePredictor at 0x221eabc1c90>

In [5]:
# Save models and preprocessing components
save_info = predictor.save_models(
    dataset_name='movie_genre_predictor',
    save_all=True,
    include_data=False
)

print(f"Models and preprocessing components saved to {save_info['base_directory']}")


Preprocessing components saved to saved_models\movie_genre_predictor_preprocessing_20250521_015606
Model Performance Summary:
                     Model  F1-micro  F1-macro  F1-weighted  Hamming Loss
0            Decision Tree  0.302825  0.182258     0.302741      0.082395
1      Logistic Regression  0.250835  0.123547     0.238614      0.056481
2  Multinomial Naive Bayes  0.088626  0.026326     0.079432      0.058877
All models saved successfully to saved_models\movie_genre_predictor_20250521_015606
Models and preprocessing components saved to saved_models\movie_genre_predictor_20250521_015606


# Loading and Predict


In [4]:
print("\n=== Loading a saved GenrePredictor ===")

# Load the saved predictor
predictor = GenrePredictor.load(models_dir='saved_models')

print(f"Loaded predictor with best model: {predictor.evaluator.best_model_name}")



=== Loading a saved GenrePredictor ===
Preprocessing components loaded from saved_models\movie_genre_predictor_preprocessing_20250521_015606
Successfully loaded models from saved_models\movie_genre_predictor_20250521_015606
Loaded predictor with best model: Decision Tree


In [5]:
# Create some example movie data
example_movies = [
    {
        'plot_lemmatized': "A superhero with extraordinary powers fights against an evil villain to save the world from destruction.",
        'Origin/Ethnicity': "American"
    },
    {
        'plot_lemmatized': "Two people meet and fall in love despite their different backgrounds and families who disapprove of their relationship.",
        'Origin/Ethnicity': "British"
    },
    {
        'plot_lemmatized': "A detective investigates a series of mysterious murders in a small town, uncovering dark secrets about the residents.",
        'Origin/Ethnicity': "French"
    },
    {
        'plot_lemmatized': "Astronauts travel to a distant planet where they discover an alien civilization and must find a way to communicate with them.",
        'Origin/Ethnicity': "American"
    }
]

In [6]:
# Convert to DataFrame
example_df = pd.DataFrame(example_movies)
example_df

Unnamed: 0,plot_lemmatized,Origin/Ethnicity
0,A superhero with extraordinary powers fights a...,American
1,Two people meet and fall in love despite their...,British
2,A detective investigates a series of mysteriou...,French
3,Astronauts travel to a distant planet where th...,American


In [7]:

# Make predictions
print("\nMaking predictions on example movies:")
predictions = predictor.predict(example_df)



Making predictions on example movies:


In [8]:
predictions

[['fantasy', 'romance'], ['drama', 'romance'], ['comedy', 'drama'], ['action']]

In [11]:
# Display results
for i, (movie, genres) in enumerate(zip(example_movies, predictions)):
    print(f"\nMovie {i+1}:")
    print(f"Plot: {movie['plot_lemmatized'][:100]}...")
    print(f"Origin: {movie['Origin/Ethnicity']}")
    print(f"Predicted genres: {genres}")



Movie 1:
Plot: A superhero with extraordinary powers fights against an evil villain to save the world from destruct...
Origin: American
Predicted genres: ['fantasy', 'romance']

Movie 2:
Plot: Two people meet and fall in love despite their different backgrounds and families who disapprove of ...
Origin: British
Predicted genres: ['drama', 'romance']

Movie 3:
Plot: A detective investigates a series of mysterious murders in a small town, uncovering dark secrets abo...
Origin: French
Predicted genres: ['comedy', 'drama']

Movie 4:
Plot: Astronauts travel to a distant planet where they discover an alien civilization and must find a way ...
Origin: American
Predicted genres: ['action']


In [12]:
print(predictor.df)

None


In [13]:
from sklearn.preprocessing import MultiLabelBinarizer


mlb = MultiLabelBinarizer()
df = pd.read_csv('wiki_movie_plots_deduped_cleaned.csv')

In [14]:

y = mlb.fit_transform(df['genre_list'])

In [16]:
y.shape

(34886, 27)

In [17]:
mlb.classes_

array([' ', "'", ',', '-', '[', ']', 'a', 'b', 'c', 'd', 'e', 'f', 'g',
       'h', 'i', 'l', 'm', 'n', 'o', 'p', 'r', 's', 't', 'u', 'v', 'w',
       'y'], dtype=object)

In [18]:
df['genre_list']

0                []
1                []
2                []
3                []
4                []
            ...    
34881            []
34882    ['comedy']
34883    ['comedy']
34884    ['comedy']
34885            []
Name: genre_list, Length: 34886, dtype: object

In [2]:
"""
Create a scikit-learn pipeline for preprocessing new data.

This pipeline will transform raw text and location data into the same
feature space used for training.
"""


from scipy.sparse import hstack, csr_matrix
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.pipeline import Pipeline


class LocationTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, location_categories):
        self.location_categories = location_categories

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X_loc = pd.get_dummies(X, prefix='loc')

        # Ensure all expected categories are present
        for cat in self.location_categories:
            if cat not in X_loc.columns:
                X_loc[cat] = 0

        # Ensure only the expected categories are used, in the same order
        X_loc = X_loc[self.location_categories]

        # 🛠 Convert to float32 or float64 before passing to csr_matrix
        return csr_matrix(X_loc.values.astype('float32'))
tfidf = TfidfVectorizer(
            max_features=300,
            stop_words='english',
            ngram_range=(1, 2),
            strip_accents='unicode',
            sublinear_tf=True
        )

mlb = MultiLabelBinarizer()
selector = SelectKBest(chi2, k=100)

# Use ColumnTransformer to apply transformers to specific columns
preprocessor = ColumnTransformer([
    ('plot_tfidf', tfidf, 'plot'),
    ('location', LocationTransformer([1,2,3,4]), 'origin')
])

# Final pipeline
pipeline = Pipeline([
    ('features', preprocessor),
    ('selector', selector)
])
pipeline

In [11]:
# Compare predictions from different genre prediction models on a large test sample


In [12]:
df = pd.read_csv('wiki_movie_plots_deduped_cleaned.csv')

predictor = GenrePredictor.load(models_dir='saved_models')
predictor.predict(df.head(10))

Preprocessing components loaded from saved_models\movie_genre_predictor_preprocessing_20250521_015606
Successfully loaded models from saved_models\movie_genre_predictor_20250521_015606


[['comedy'],
 ['comedy'],
 ['comedy'],
 ['comedy', 'sci-fi'],
 [],
 ['comedy'],
 ['western'],
 ['comedy'],
 [],
 []]

In [13]:
predictor.evaluator.models

{'Logistic Regression': OneVsRestClassifier(estimator=LogisticRegression(max_iter=1000, n_jobs=-1,
                                                  random_state=42)),
 'Multinomial Naive Bayes': OneVsRestClassifier(estimator=MultinomialNB()),
 'Decision Tree': OneVsRestClassifier(estimator=DecisionTreeClassifier(random_state=42))}