In [1]:
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

In [2]:
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /Users/Meiji/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /Users/Meiji/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /Users/Meiji/nltk_data...


True

# Load Data

In [3]:
wiki_movie_plots = pd.read_csv('wiki_movie_plots_deduped.csv')
wiki_movie_plots.head()

Unnamed: 0,Release Year,Title,Origin/Ethnicity,Director,Cast,Genre,Wiki Page,Plot
0,1901,Kansas Saloon Smashers,American,Unknown,,unknown,https://en.wikipedia.org/wiki/Kansas_Saloon_Sm...,"A bartender is working at a saloon, serving dr..."
1,1901,Love by the Light of the Moon,American,Unknown,,unknown,https://en.wikipedia.org/wiki/Love_by_the_Ligh...,"The moon, painted with a smiling face hangs ov..."
2,1901,The Martyred Presidents,American,Unknown,,unknown,https://en.wikipedia.org/wiki/The_Martyred_Pre...,"The film, just over a minute long, is composed..."
3,1901,"Terrible Teddy, the Grizzly King",American,Unknown,,unknown,"https://en.wikipedia.org/wiki/Terrible_Teddy,_...",Lasting just 61 seconds and consisting of two ...
4,1902,Jack and the Beanstalk,American,"George S. Fleming, Edwin S. Porter",,unknown,https://en.wikipedia.org/wiki/Jack_and_the_Bea...,The earliest known adaptation of the classic f...


In [4]:
all_genres = list(wiki_movie_plots['Genre'].unique())
print('>> Number of raw genres:', len(all_genres))
print('>> Number of raw rows:', wiki_movie_plots.shape[0])

>> Number of raw genres: 2265
>> Number of raw rows: 34886


# Preprocessing genres

In [5]:
ALLOWED_GENRES = {'action', 'adventure', 'animated', 'biographical', 'comedy', 'crime',
                  'drama', 'fantasy', 'history', 'horror', 'music', 'mystery',
                  'romance', 'scifi', 'sport', 'thriller', 'war', 'western', 'documentary'}

In [6]:
def preprocess_genres(genre_string):
    
    """
    Processes a single genre string by first splitting it on non-hyphen non-alphanumeric characters, applying
    predefined replacements to standardize genre names, splitting again by hyphens, and then filtering genres
    against a predefined list. Any genre not in the allowed list is classified as "other".

    Parameters:
    - genre_string (str): A string containing multiple genre descriptions which may include separators like commas or slashes.

    Returns:
    - str: A comma-separated sorted list of unique genre names standardized according to a predefined list of allowed genres.
            Unrecognized genres are labeled as "other".

    This function ensures that genre names are consistent and categorized, facilitating easier analysis and usage in
    data processing tasks.

    Example:
    genre_string = "sci-fi, romantic-comedy, epic-war, unknown style, crime-drama"
    result = preprocess_genres(genre_string)
    print(result)  # Output might include 'scifi', 'romance', 'war', 'other', etc., based on the input and settings.
    """
    
    replacements = {
        'biodrama': 'biographical-drama',
        'docudrama': 'documentary-drama',
        'melodrama': 'drama',
        'sci-fi': 'scifi',
        'science-fiction': 'scifi',
        'science fiction': 'scifi',
        'rom com': 'romance-comedy',
        'romcom': 'romance-comedy',
        'rom-com': 'romance-comedy',
        'romantic comedy': 'romance-comedy',
        'romantic': 'romance',
        'rom-comedy': 'romance-comedy',
        'bio': 'biographical',
        'biographic': 'biographical',
        'biography': 'biographical',
        'anime': 'animated',
        'animation': 'animated'
    }

    # Convert to lowercase and perform initial splitting
    genres = re.split(r'[^a-zA-Z0-9\-]+', genre_string.lower())

    processed_genres = set()
    for genre in genres:
        # Apply replacements
        for old, new in replacements.items():
            genre = re.sub(r'\b{}\b'.format(re.escape(old)), new, genre)
        
        # Split by hyphens and validate genres
        subgenres = genre.split('-')
        for subgenre in subgenres:
            clean_subgenre = subgenre.strip()
            if clean_subgenre in ALLOWED_GENRES:
                processed_genres.add(clean_subgenre)
            elif clean_subgenre:
                processed_genres.add('other')

    return ','.join(sorted(processed_genres))

In [7]:
# Apply the preprocessing
genres_processed = pd.Series(wiki_movie_plots['Genre']).apply(preprocess_genres)
print(genres_processed)

0                 other
1                 other
2                 other
3                 other
4                 other
              ...      
34881             other
34882            comedy
34883            comedy
34884    comedy,romance
34885           romance
Name: Genre, Length: 34886, dtype: object


In [8]:
# Create a 0-1 binary column for each genre in the list, and append back to the dataframe
wiki_movie_plots_processed = pd.concat([wiki_movie_plots, genres_processed.str.get_dummies(sep=',')], axis=1)
wiki_movie_plots_processed.head()

Unnamed: 0,Release Year,Title,Origin/Ethnicity,Director,Cast,Genre,Wiki Page,Plot,action,adventure,...,horror,music,mystery,other,romance,scifi,sport,thriller,war,western
0,1901,Kansas Saloon Smashers,American,Unknown,,unknown,https://en.wikipedia.org/wiki/Kansas_Saloon_Sm...,"A bartender is working at a saloon, serving dr...",0,0,...,0,0,0,1,0,0,0,0,0,0
1,1901,Love by the Light of the Moon,American,Unknown,,unknown,https://en.wikipedia.org/wiki/Love_by_the_Ligh...,"The moon, painted with a smiling face hangs ov...",0,0,...,0,0,0,1,0,0,0,0,0,0
2,1901,The Martyred Presidents,American,Unknown,,unknown,https://en.wikipedia.org/wiki/The_Martyred_Pre...,"The film, just over a minute long, is composed...",0,0,...,0,0,0,1,0,0,0,0,0,0
3,1901,"Terrible Teddy, the Grizzly King",American,Unknown,,unknown,"https://en.wikipedia.org/wiki/Terrible_Teddy,_...",Lasting just 61 seconds and consisting of two ...,0,0,...,0,0,0,1,0,0,0,0,0,0
4,1902,Jack and the Beanstalk,American,"George S. Fleming, Edwin S. Porter",,unknown,https://en.wikipedia.org/wiki/Jack_and_the_Bea...,The earliest known adaptation of the classic f...,0,0,...,0,0,0,1,0,0,0,0,0,0


In [9]:
# Filter rows with at least 1 specific (non-other) genre, and remove the other column.

# Filtering logic: not(other==1 and row_sum(all genre columns)==1)

# rows_with_other_genre_only = genres[(genres['other'] == 1) & (genres.sum(axis=1)==1)]
wiki_movie_plots_processed = wiki_movie_plots_processed[~((wiki_movie_plots_processed['other'] == 1) & (wiki_movie_plots_processed.iloc[:,8:].sum(axis=1)==1))].drop('other', axis=1)

In [10]:
print('>> Number of raw rows:', wiki_movie_plots.shape[0])
print('>> Number of processed rows:', wiki_movie_plots_processed.shape[0])

print(25993/34886)

>> Number of raw rows: 34886
>> Number of processed rows: 25993
0.7450839878461274


In [11]:
# Save
wiki_movie_plots_processed.to_csv('data_processed_genres.csv', index=False)

In [22]:
wiki_movie_plots_processed.iloc[:,8:].sum(axis=1).value_counts()

1    20602
2     4694
3      536
4      111
0       31
5       17
6        1
7        1
Name: count, dtype: int64