# Predicting Movie Genres from Scripts with Naive Bayes

In [1]:
import nltk
import pandas as pd
import numpy as np
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.util import ngrams
import ast
import psycopg2
import warnings
import multiprocessing
import time
from sklearn.model_selection import train_test_split
warnings.filterwarnings("ignore")

In [2]:
# Load the joined script / Bechdel / TMDB rows and the per-film genre rows
# from the local Postgres database into two DataFrames.
conn = psycopg2.connect(dbname='bechdel_test', user='postgres', password='guest')
try:
    cur = conn.cursor()
    try:
        cur.execute('SELECT * FROM imsdb_scripts JOIN bechdel_ratings ON imsdb_scripts.imdb_id = bechdel_ratings.imdb_id JOIN tmdb_data ON tmdb_data.imdb_id = imsdb_scripts.imdb_id;')
        # fetchall() yields plain tuples, so the frame's columns are the
        # integer positions 0..N-1 of the SELECT * result.
        data = pd.DataFrame(cur.fetchall())

        cur.execute('SELECT genre.imdb_id, genre FROM genre JOIN imsdb_scripts ON imsdb_scripts.imdb_id = genre.imdb_id;')
        # Two columns: 0 = imdb_id, 1 = genre name.
        genre = pd.DataFrame(cur.fetchall())
    finally:
        cur.close()
finally:
    # Release the connection even when a query or fetch fails.
    conn.close()

# Work on a copy so the raw query result stays available in `data`.
df = data.copy()
# Positional column 0 becomes the row index.
df.set_index(0, inplace=True)

In [3]:
# One indicator column per distinct genre, starting at 0.
for genre_ in genre[1].unique():
    df[genre_] = 0

# Only flag films that are present in df: .loc with an unknown label would
# append a new, otherwise empty row instead of being ignored.
in_df = genre[0].isin(df.index)
for imdb_id, genre_ in zip(genre.loc[in_df, 0], genre.loc[in_df, 1]):
    # A single .loc write replaces df[col][row] = 1; that chained form assigns
    # to an intermediate object and is not guaranteed to reach df.
    df.loc[imdb_id, genre_] = 1

# Give the positional columns readable names.
# NOTE(review): column 0 was moved into the index by set_index(0), so the
# 0 -> 'imdb_id' entry matches no column — confirm whether the index should
# be named instead.
df.rename(
    columns={
        0: 'imdb_id',
        1: 'script_date',
        2: 'script',
        3: 'bechdel_id',
        5: 'title',
        6: 'release_year',
        7: 'bechdel_rating',
        11: 'language',
        13: 'popularity',
        14: 'vote_average',
        15: 'vote_count',
        16: 'overview',
    },
    inplace=True,
)
# Discard the positional columns that were not given a name above.
df.drop(columns=[4, 8, 9, 10, 12], inplace=True)
# Missing values become 0 first; the literal string 'none' is then turned into
# NaN, so those cells stay missing after this cell has run.
df.fillna(0, inplace=True)
df.replace('none', np.nan, inplace=True)

In [4]:
def clean_text(text: str) -> list[str]:
    """Lower-case and tokenize *text*, dropping punctuation and stopwords.

    Args:
        text: Raw script text.

    Returns:
        The tokens produced by ``word_tokenize`` on the lower-cased text, in
        their original order, without any token that is a single
        ``string.punctuation`` character, an NLTK English stopword, or one of
        the markers ``...``, ``--``, two apostrophes, or two backticks.
    """
    # A set gives constant-time membership tests. The previous list lookup
    # combined with list.remove() made filtering quadratic in script length.
    excluded = (
        frozenset(string.punctuation)
        | frozenset(stopwords.words('english'))
        | {'...', '--', '\'\'', '``'}
    )
    return [token for token in word_tokenize(text.lower()) if token not in excluded]

In [None]:
# Keep only the rows that actually have a script.
df = df[df['script'].notna()]
# Tokenize each remaining script; every cell holds one list of tokens.
df['clean_text'] = list(map(clean_text, df['script']))

In [13]:
def FeatureFunction(tokens: list[str]) -> list[tuple[str, int]]:
    """Return each distinct token paired with its number of occurrences.

    Args:
        tokens: Tokens of one document; may be empty.

    Returns:
        A list of ``(token, count)`` tuples, one per distinct token, ordered
        by the token's first appearance. An empty input gives ``[]``.
    """
    # Local import keeps this cell runnable without touching the import cell.
    from collections import Counter

    # One pass over the tokens instead of one tokens.count() scan per
    # distinct token.
    return list(Counter(tokens).items())
    

In [14]:
def Score(script: list[str], weights: dict[str, list[int]]):
    """Accumulate count-weighted per-class weights over the tokens of *script*.

    Args:
        script: Tokens of one script.
        weights: Maps a token to its per-class weights, given either as a
            sequence or as a dict whose values are in class order (the shape
            ``NaiveBayes`` builds). At least 17 entries are read per token.

    Returns:
        A list of 17 numbers. Entry ``i`` is the sum, over the distinct tokens
        of *script* that appear in *weights*, of ``weight[i] * occurrences``.
        Tokens missing from *weights* contribute nothing.
    """
    # Local import keeps this cell runnable without touching the import cell.
    from collections import Counter

    # Presumably one slot per genre column (17 genres are listed) — TODO confirm.
    score = [0] * 17

    # Counter yields the same (token, count) pairs as FeatureFunction.
    for word, count in Counter(script).items():
        row = weights.get(word)
        if row is None:
            # Token never seen when the weights were built.
            continue
        if isinstance(row, dict):
            row = list(row.values())
        # Walk the positions. Iterating over `score` itself yields its values,
        # so only slot 0 was ever updated.
        for i in range(len(score)):
            score[i] += row[i] * count
    return score
        

[('distinct', 1),
 ('porter', 7),
 ('lady', 14),
 ('paging', 1),
 ('drawer', 1),
 ('gamblers', 1),
 ('otherwise', 2),
 ('contemptuously', 1),
 ('wondering', 1),
 ('postpone', 1),
 ('cynical', 1),
 ('gambler', 1),
 ('paused', 2),
 ('barring', 1),
 ('finishes', 3),
 ('packing', 2),
 ('thinking', 6),
 ('try', 3),
 ('hasty', 1),
 ('fifty-two', 1),
 ('military', 1),
 ('went', 2),
 ('lead', 1),
 ('minutes', 6),
 ('waking', 1),
 ('violently', 1),
 ('liar', 1),
 ('assuming', 1),
 ('pray', 2),
 ('coffee', 2),
 ('bought', 2),
 ('raw', 1),
 ('wound', 1),
 ('drumming', 1),
 ('stream', 3),
 ('public', 1),
 ('mouth', 6),
 ('approaching', 3),
 ('one', 58),
 ('buss-boy', 2),
 ('shrug', 1),
 ("'m", 77),
 ('side', 9),
 ('stunned', 2),
 ('yellow', 9),
 ('pillows', 3),
 ('parties', 2),
 ('rags', 5),
 ('keeping', 2),
 ('bends', 1),
 ('agreeable', 1),
 ('puts', 19),
 ('tease', 1),
 ('dirtier', 1),
 ('listlessly', 1),
 ('stopped', 5),
 ('nervously', 5),
 ('begin', 3),
 ('specialist', 2),
 ('business', 34),
 

In [75]:
# Genre indicator columns: everything from position 11 up to, but not
# including, the last column of df.
genres = df.columns[11:-1].tolist()
# Running token total per genre; NaiveBayes adds to it on every call.
total_words_per_genre = {name: 0 for name in genres}
def NaiveBayes(row: pd.Series, weights: dict[str: dict[str, int]], genres: list[str]=genres,) -> dict[str: dict[str, int]]:
    """Fold one film's tokens into the per-genre token counts.

    Args:
        row: One film; ``row['clean_text']`` holds its tokens and ``row[g]``
            equals 1 for each genre ``g`` the film belongs to.
        weights: Maps token -> {genre: count}. Updated in place.
        genres: Genre names to consider; defaults to the module-level list.

    Returns:
        The same *weights* object, after the update.

    Side effects:
        Adds the film's token count to ``total_words_per_genre`` for each of
        its genres.
    """
    tokens = row['clean_text']
    film_genres = [name for name in genres if row[name] == 1]

    for name in film_genres:
        total_words_per_genre[name] += len(tokens)

    for token in tokens:
        counts = weights.get(token)
        if counts is None:
            # First sighting: start every genre at zero so each token has a
            # full row, including genres this film does not belong to.
            counts = weights[token] = dict.fromkeys(genres, 0)
        for name in film_genres:
            counts[name] += 1
    return weights


        

            


In [76]:
# Flag every row whose token list repeats that of an earlier row. The lists
# are compared as tuples so the check does not depend on pandas being able to
# hash list cells.
x = df['clean_text'].map(tuple).duplicated()
# Filter with a positional mask instead of dropping by label: drop() removes
# every row carrying a flagged label, which would also delete the first
# occurrence whenever index labels repeat.
df = df[~x.to_numpy()]
df

Unnamed: 0_level_0,script_date,script,bechdel_id,title,release_year,bechdel_rating,language,popularity,vote_average,vote_count,...,War,Comedy,Music,Western,Horror,Science Fiction,Action,Animation,History,clean_text
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
22958,,GRAND H...,1328,Grand Hotel,1932,3,en,85.188,6.959,294,...,0,0,0,0,0,0,0,0,0,"[grand, hotel, written, bela, balazs, based, p..."
32138,March 1939,FADE IN -- Title:\r\n\r\nFor nearly forty year...,174,"Wizard of Oz, The",1939,3,en,81.243,7.600,5346,...,0,0,0,0,0,0,0,0,0,"[fade, title, nearly, forty, years, story, giv..."
33467,,Citizen Kane \r\n\r\n ...,1266,Citizen Kane,1941,1,en,331.301,8.008,5312,...,0,0,0,0,0,0,0,0,0,"[citizen, kane, herman, j., mankiewicz, orson,..."
113101,,"""FOUR ROOMS""\r\n\r\n ...",986,Four rooms,1995,3,en,21.231,5.829,2568,...,0,1,0,0,0,0,0,0,0,"[four, rooms, screenplay, allison, anders, ale..."
42192,,FADE IN:\r\n\nINT. DINING HALL - SARAH SIDDONS...,139,All About Eve,1950,3,en,18.633,8.100,1462,...,0,0,0,0,0,0,0,0,0,"[fade, int, dining, hall, sarah, siddons, soci..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6139732,December 1992,ALADDIN: THE COMPLETE SCRIPT\r\nCOMPILED BY B...,8750,Aladdin,2019,3,en,189.589,7.107,9763,...,0,0,0,0,0,0,0,0,0,"[aladdin, complete, script, compiled, ben, scr..."
837563,January 1986,"\t\t\t""PET SEMATARY""\r\n\r\n\t\t\t by\r\n...",8824,Pet Sematary,2019,3,en,191.301,5.739,3101,...,0,0,0,0,1,0,0,0,0,"[pet, sematary, stephen, king, fade, persisten..."
4566758,December 1998,Disney's Mulan\r\nCompiled by Barry Adams dur...,9265,Mulan,2020,3,en,61.971,6.880,6479,...,0,0,0,0,0,0,1,0,0,"[disney, 's, mulan, compiled, barry, adams, th..."
11245972,July 1995,\n SCREAM\r\n ...,10221,Scream,2022,3,en,82.686,6.700,3087,...,0,0,0,0,1,0,0,0,0,"[scream, scary, movie, kevin, williamson, rewr..."


In [98]:
# Hold out 20% of the films; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df['clean_text'],                  # features: token list per film
    df.loc[:, 'Drama':'History'],      # labels: one indicator column per genre
    test_size=0.2,
    random_state=42,
)
# Put labels and tokens of the training films back into a single frame,
# matched on the index.
train_df = y_train.join(X_train, how='left')

In [100]:
# Start training from a clean state. NaiveBayes also accumulates into
# total_words_per_genre, so reset it together with weights; otherwise
# re-running this cell counts every training script twice.
weights = {}
total_words_per_genre.update(dict.fromkeys(total_words_per_genre, 0))

# iterrows() hands over exactly one row at a time; train_df.loc[label] would
# return a DataFrame for a label that occurs more than once.
for _, row in train_df.iterrows():
    NaiveBayes(row, weights)

In [101]:
# Display the learned table: token -> {genre: count}.
weights

{'ocean': {'Drama': 263,
  'Romance': 63,
  'Adventure': 178,
  'Fantasy': 55,
  'Family': 42,
  'Mystery': 53,
  'Crime': 79,
  'Thriller': 160,
  'War': 5,
  'Comedy': 58,
  'Music': 6,
  'Western': 2,
  'Horror': 99,
  'Science Fiction': 104,
  'Action': 132,
  'Animation': 33,
  'History': 14},
 "'s": {'Drama': 59255,
  'Romance': 19739,
  'Adventure': 17533,
  'Fantasy': 13193,
  'Family': 5554,
  'Mystery': 16497,
  'Crime': 28415,
  'Thriller': 43452,
  'War': 1788,
  'Comedy': 33158,
  'Music': 2637,
  'Western': 1178,
  'Horror': 16578,
  'Science Fiction': 19778,
  'Action': 27271,
  'Animation': 4203,
  'History': 5531},
 'twelve': {'Drama': 183,
  'Romance': 57,
  'Adventure': 51,
  'Fantasy': 26,
  'Family': 10,
  'Mystery': 60,
  'Crime': 105,
  'Thriller': 142,
  'War': 3,
  'Comedy': 134,
  'Music': 5,
  'Western': 3,
  'Horror': 55,
  'Science Fiction': 69,
  'Action': 84,
  'Animation': 7,
  'History': 16},
 'written': {'Drama': 307,
  'Romance': 90,
  'Adventure': 77

In [28]:
# Spot-check: the Drama indicator of the row with index label 22958.
df.loc[22958]['Drama']

1

In [23]:
# Spot-check: columns 'Drama' through 'clean_text' of the row with index label 22958.
df.loc[22958,'Drama':'clean_text']

Drama                                                              1
Romance                                                            1
Adventure                                                          0
Fantasy                                                            0
Family                                                             0
Mystery                                                            0
Crime                                                              0
Thriller                                                           0
War                                                                0
Comedy                                                             0
Music                                                              0
Western                                                            0
Horror                                                             0
Science Fiction                                                    0
Action                            

In [25]:
# Show which columns the positional slice 11:-1 selects (the slice used to build `genres`).
list(df.columns[11:-1])

['Drama',
 'Romance',
 'Adventure',
 'Fantasy',
 'Family',
 'Mystery',
 'Crime',
 'Thriller',
 'War',
 'Comedy',
 'Music',
 'Western',
 'Horror',
 'Science Fiction',
 'Action',
 'Animation',
 'History']