In [1]:
!pip install pandas

Collecting pandas
  Using cached pandas-2.2.3-cp311-cp311-win_amd64.whl (11.6 MB)
Collecting numpy>=1.23.2
  Downloading numpy-2.2.6-cp311-cp311-win_amd64.whl (12.9 MB)
     ---------------------------------------- 12.9/12.9 MB 1.1 MB/s eta 0:00:00
Collecting pytz>=2020.1
  Using cached pytz-2025.2-py2.py3-none-any.whl (509 kB)
Collecting tzdata>=2022.7
  Using cached tzdata-2025.2-py2.py3-none-any.whl (347 kB)
Installing collected packages: pytz, tzdata, numpy, pandas
Successfully installed numpy-2.2.6 pandas-2.2.3 pytz-2025.2 tzdata-2025.2



[notice] A new release of pip available: 22.3.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
!pip install matplotlib seaborn scikit-learn

Collecting matplotlib
  Downloading matplotlib-3.10.3-cp311-cp311-win_amd64.whl (8.1 MB)
     ---------------------------------------- 8.1/8.1 MB 928.9 kB/s eta 0:00:00
Collecting seaborn
  Using cached seaborn-0.13.2-py3-none-any.whl (294 kB)
Collecting scikit-learn
  Using cached scikit_learn-1.6.1-cp311-cp311-win_amd64.whl (11.1 MB)
Collecting contourpy>=1.0.1
  Downloading contourpy-1.3.2-cp311-cp311-win_amd64.whl (222 kB)
     -------------------------------------- 222.0/222.0 kB 1.0 MB/s eta 0:00:00
Collecting cycler>=0.10
  Using cached cycler-0.12.1-py3-none-any.whl (8.3 kB)
Collecting fonttools>=4.22.0
  Downloading fonttools-4.58.0-cp311-cp311-win_amd64.whl (2.2 MB)
     ---------------------------------------- 2.2/2.2 MB 531.3 kB/s eta 0:00:00
Collecting kiwisolver>=1.3.1
  Downloading kiwisolver-1.4.8-cp311-cp311-win_amd64.whl (71 kB)
     ---------------------------------------- 72.0/72.0 kB 1.3 MB/s eta 0:00:00
Collecting pillow>=8
  Using cached pillow-11.2.1-cp311-cp311


[notice] A new release of pip available: 22.3.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [3]:
# Import libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Load the dataset
df = pd.read_csv('wiki_movie_plots_deduped.csv')

# Show first rows and info
print(df.head())
print(df.info())

# Basic stats
print(f'Total movies: {len(df)}')
print(f'Columns: {df.columns.tolist()}')

   Release Year                             Title Origin/Ethnicity  \
0          1901            Kansas Saloon Smashers         American   
1          1901     Love by the Light of the Moon         American   
2          1901           The Martyred Presidents         American   
3          1901  Terrible Teddy, the Grizzly King         American   
4          1902            Jack and the Beanstalk         American   

                             Director Cast    Genre  \
0                             Unknown  NaN  unknown   
1                             Unknown  NaN  unknown   
2                             Unknown  NaN  unknown   
3                             Unknown  NaN  unknown   
4  George S. Fleming, Edwin S. Porter  NaN  unknown   

                                           Wiki Page  \
0  https://en.wikipedia.org/wiki/Kansas_Saloon_Sm...   
1  https://en.wikipedia.org/wiki/Love_by_the_Ligh...   
2  https://en.wikipedia.org/wiki/The_Martyred_Pre...   
3  https://en.wikipedia.

In [21]:

# Check missing values
print(df.isnull().sum())

Release Year           0
Title                  0
Origin/Ethnicity       0
Director               0
Cast                1422
Genre                  0
Wiki Page              0
Plot                   0
dtype: int64


In [None]:
df[df['Genre']!='unknown']['Genre']

6                           western
7                            comedy
10                            short
11       short action/crime western
12                       short film
                    ...            
34877                    drama film
34882                        comedy
34883                        comedy
34884               romantic comedy
34885                      romantic
Name: Genre, Length: 28803, dtype: object

In [7]:
# Normalize the Genre separator
df['Genre'] = df['Genre'].str.replace(r'[\/]', ',', regex=True)
df['Genre'] = df['Genre'].str.lower().str.strip()
df['Genre_list'] = df['Genre'].apply(lambda x: [g.strip() for g in x.split(',')])


In [27]:
!pip install fuzzywuzzy

Collecting fuzzywuzzy
  Downloading fuzzywuzzy-0.18.0-py2.py3-none-any.whl (18 kB)
Installing collected packages: fuzzywuzzy
Successfully installed fuzzywuzzy-0.18.0



[notice] A new release of pip available: 22.3.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [29]:
import re
from fuzzywuzzy import process
main_genres = [
    'action', 'adventure', 'animation', 'biography', 'comedy', 'crime',
    'documentary', 'drama', 'family', 'fantasy', 'history', 'horror',
    'musical', 'mystery', 'romance', 'sci-fi', 'sport', 'thriller',
    'war', 'western'
]


def clean_and_split_genres(raw_genre):
    if pd.isna(raw_genre):
        return []
    # Lowercase and remove brackets, special characters
    raw_genre = raw_genre.lower()
    raw_genre = re.sub(r'[\(\)\[\]{}]', '', raw_genre)
    raw_genre = re.sub(r'[^a-zA-Z0-9,& ]+', '', raw_genre)  # keep letters, commas, ampersand

    # Replace different separators with comma
    raw_genre = raw_genre.replace('&', ',').replace('/', ',')
    parts = [g.strip() for g in raw_genre.split(',')]
    return parts

# Optional manual mapping
custom_mapping = {
    'zombie horror': 'horror',
    'zombie comedy': 'comedy',
    'kung fu': 'action',
    'martial arts': 'action',
    'costume': 'history',
    'suspense': 'thriller',
    'sci fi': 'sci-fi',
    'science fiction': 'sci-fi',
    'youth': 'drama',
    'yuen lai-kei': 'unknown'
}

def map_to_main_genres(genres):
    mapped = set()
    for g in genres:
        g = g.strip()
        if g in custom_mapping:
            mapped.add(custom_mapping[g])
        else:
            match, score = process.extractOne(g, main_genres)
            if score > 80:
                mapped.add(match)
    return list(mapped)
df['genre_list_raw'] = df['Genre'].apply(clean_and_split_genres)
df['genre_list'] = df['genre_list_raw'].apply(map_to_main_genres)




In [32]:
df['Genre']

0                unknown
1                unknown
2                unknown
3                unknown
4                unknown
              ...       
34881            unknown
34882             comedy
34883             comedy
34884    romantic comedy
34885           romantic
Name: Genre, Length: 34886, dtype: object

In [31]:
df['genre_list']

0              []
1              []
2              []
3              []
4              []
           ...   
34881          []
34882    [comedy]
34883    [comedy]
34884    [comedy]
34885          []
Name: genre_list, Length: 34886, dtype: object

In [8]:
df['Cast_list'] = df['Cast'].apply(lambda x: [c.strip() for c in str(x).split(',')])


In [34]:
from sklearn.preprocessing import MultiLabelBinarizer

mlb_genre = MultiLabelBinarizer()
y = mlb_genre.fit_transform(df['genre_list'])


In [35]:
mlb_genre.classes_

array(['action', 'adventure', 'animation', 'biography', 'comedy', 'crime',
       'documentary', 'drama', 'family', 'fantasy', 'history', 'horror',
       'musical', 'mystery', 'romance', 'sci-fi', 'sport', 'thriller',
       'war', 'western'], dtype=object)

In [36]:
y

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], shape=(34886, 20))

In [37]:
def clean_plot(text):
    if pd.isna(text):
        return ""
    text = text.lower()
    text = re.sub(r'<[^>]+>', '', text)                     # remove HTML tags
    text = re.sub(r'[^a-z\s]', '', text)                    # remove punctuation and numbers
    text = re.sub(r'\s+', ' ', text).strip()                # normalize whitespace
    return text

df['plot_clean'] = df['Plot'].apply(clean_plot)

In [39]:
!pip install spacy

Collecting spacy
  Downloading spacy-3.8.6-cp311-cp311-win_amd64.whl (12.2 MB)
     ---------------------------------------- 12.2/12.2 MB 2.3 MB/s eta 0:00:00
Collecting spacy-legacy<3.1.0,>=3.0.11
  Using cached spacy_legacy-3.0.12-py2.py3-none-any.whl (29 kB)
Collecting spacy-loggers<2.0.0,>=1.0.0
  Using cached spacy_loggers-1.0.5-py3-none-any.whl (22 kB)
Collecting murmurhash<1.1.0,>=0.28.0
  Using cached murmurhash-1.0.12-cp311-cp311-win_amd64.whl (25 kB)
Collecting cymem<2.1.0,>=2.0.2
  Using cached cymem-2.0.11-cp311-cp311-win_amd64.whl (39 kB)
Collecting preshed<3.1.0,>=3.0.2
  Using cached preshed-3.0.9-cp311-cp311-win_amd64.whl (122 kB)
Collecting thinc<8.4.0,>=8.3.4
  Using cached thinc-8.3.6-cp311-cp311-win_amd64.whl (1.8 MB)
Collecting wasabi<1.2.0,>=0.9.1
  Using cached wasabi-1.1.3-py3-none-any.whl (27 kB)
Collecting srsly<3.0.0,>=2.4.3
  Using cached srsly-2.5.1-cp311-cp311-win_amd64.whl (632 kB)
Collecting catalogue<2.1.0,>=2.0.6
  Using cached catalogue-2.0.10-py3-non


[notice] A new release of pip available: 22.3.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [42]:
!python -m spacy download en_core_web_sm


Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 12.8/12.8 MB 2.4 MB/s eta 0:00:00
Installing collected packages: en-core-web-sm
Successfully installed en-core-web-sm-3.8.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')



[notice] A new release of pip available: 22.3.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [None]:
import spacy

nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])

def lemmatize_text(text):
    doc = nlp(text)
    tokens = [token.lemma_ for token in doc if not token.is_stop and token.is_alpha and len(token) > 2]
    return ' '.join(tokens)

df['plot_lemmatized'] = df['plot_clean'].apply(lemmatize_text)


In [44]:
df.to_csv('wiki_movie_plots_deduped_cleaned.csv')

In [46]:
from sklearn.feature_extraction.text import TfidfVectorizer


tfidf = TfidfVectorizer(
    max_features=5000,         # limit features for performance
    stop_words='english',      # remove common English stopwords
    ngram_range=(1, 2),        # unigrams and bigrams
    min_df=5,                  # ignore terms that appear in <5 documents
    max_df=0.8,                # ignore very frequent terms
    strip_accents='unicode',   # normalize accents
    sublinear_tf=True          # apply sublinear tf scaling
)

X_plot = tfidf.fit_transform(df['plot_lemmatized'])

In [47]:
y

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], shape=(34886, 20))

In [51]:
from sklearn.feature_selection import SelectKBest, chi2

# Select top 1000 features based on chi-squared scores
k = 2000
selector = SelectKBest(chi2, k=k)
X_selected = selector.fit_transform(X_plot, y)  # y must be binarized multi-labels

# Get names of top features
top_feature_indices = selector.get_support(indices=True)
top_feature_names = [tfidf.get_feature_names_out()[i] for i in top_feature_indices]


In [55]:
mlb_cast = MultiLabelBinarizer()
X_cast = mlb_cast.fit_transform(df['Cast_list'])


In [56]:
X_location = pd.get_dummies(df['Origin/Ethnicity'], prefix='loc')
X_location.head()

Unnamed: 0,loc_American,loc_Assamese,loc_Australian,loc_Bangladeshi,loc_Bengali,loc_Bollywood,loc_British,loc_Canadian,loc_Chinese,loc_Egyptian,...,loc_Malayalam,loc_Malaysian,loc_Maldivian,loc_Marathi,loc_Punjabi,loc_Russian,loc_South_Korean,loc_Tamil,loc_Telugu,loc_Turkish
0,True,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,True,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,True,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,True,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,True,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [57]:
from scipy.sparse import hstack

# Ensure all are in same format (sparse)
from scipy.sparse import csr_matrix

X_combined = hstack([X_plot, csr_matrix(X_cast), csr_matrix(X_location.values)])

X_combined.shape

(34886, 35395)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 2927542 stored elements and shape (27908, 35395)>

In [58]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_plot, y, test_size=0.2, random_state=42)

In [None]:


from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier

model = OneVsRestClassifier(LogisticRegression(max_iter=1000))

model.fit(X_train, y_train)

In [60]:
from sklearn.metrics import classification_report

y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred, target_names=mlb_genre.classes_))


              precision    recall  f1-score   support

      action       0.54      0.13      0.21       514
   adventure       0.67      0.05      0.09       168
   animation       0.50      0.02      0.03        64
   biography       0.00      0.00      0.00        59
      comedy       0.68      0.34      0.45      1382
       crime       0.44      0.06      0.11       308
 documentary       0.00      0.00      0.00        24
       drama       0.57      0.24      0.34      1725
      family       0.00      0.00      0.00       116
     fantasy       0.60      0.07      0.12        88
     history       0.00      0.00      0.00         8
      horror       0.77      0.26      0.39       280
     musical       0.80      0.02      0.04       181
     mystery       1.00      0.01      0.02       108
     romance       0.65      0.08      0.13       372
      sci-fi       0.65      0.21      0.32       173
       sport       0.00      0.00      0.00        15
    thriller       0.30    

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


=== Training a new GenrePredictor ===
Loaded dataset with 34886 movies
After filtering empty genres: 26808 movies


<genre_predictor.GenrePredictor at 0x2a10411e090>

TF-IDF features shape: (26808, 1000)
Target shape: (26808, 27)
Genre classes: [' ' "'" ',' '-' '[' ']' 'a' 'b' 'c' 'd' 'e' 'f' 'g' 'h' 'i' 'l' 'm' 'n'
 'o' 'p' 'r' 's' 't' 'u' 'v' 'w' 'y']
Location features shape: (26808, 24)
Combined features shape: (26808, 1024)
Selected features shape: (26808, 600)


ValueError: max_df corresponds to < documents than min_df

In [None]:
# Train models (using only basic and intermediate for speed)
predictor.train_models(model_levels=['basic', 'intermediate'])


In [None]:




# Save models and preprocessing components
save_info = predictor.save_models(
    dataset_name='movie_genre_predictor',
    save_all=True,
    include_data=False
)

print(f"Models and preprocessing components saved to {save_info['base_directory']}")
