# Screenplay Genre Classification

## Data Wrangling I

In [None]:
#importing necessary packages
import pandas as pd
import numpy as np
import os

### This function loads the text files that were previously downloaded and converted in data_collection.ipynb, and saves each file's title and text to a Python dictionary.

In [359]:
#initializing dict
screenplays = {'title': [], 'text': []}

def screenplays_loader(dct):

    """
    Load every non-empty file found in script_texts/ into dct, in place.

    For each regular file in the directory, the raw file contents (bytes,
    because the files are opened in binary mode) are appended to
    dct['text'] and a title derived from the file name is appended to
    dct['title']. Empty files and entries that are not regular files
    (e.g. sub-directories) are skipped.

    Parameters
    ----------
    dct : dict
        Must already hold lists under the keys 'title' and 'text'.

    Returns
    -------
    None
        dct is modified in place.

    """

    directory = os.fsdecode('script_texts/')

    for file in os.listdir(directory):

        filename = os.fsdecode(file)
        path = os.path.join(directory, filename)

        # a sub-directory cannot be opened as a file, so skip it
        if not os.path.isfile(path):
            continue

        # the context manager closes the handle even if the read fails
        with open(path, 'rb') as handle:
            text = handle.read()

        if not text:
            continue

        # NOTE(review): str.strip('.txt') removes any leading/trailing
        # '.', 't' and 'x' characters, not the '.txt' suffix, so e.g.
        # 'Slept.txt' yields 'Slep' and a trailing 'script' yields 'scrip'.
        # It is left unchanged here because the title-cleaning step later
        # in this notebook removes the literal 'scrip'; changing one
        # without the other would leave a stray 't' in the titles.
        # Fix both together.
        dct['title'].append(filename.strip('.txt'))
        dct['text'].append(text)




In [360]:
#running the function: fills the screenplays dict in place with the
#title and raw text of every non-empty file in script_texts/
screenplays_loader(screenplays)

In [361]:
#checking the length: one title is appended per non-empty file,
#so this is the number of screenplays that were loaded
len(screenplays['title'])

2125

In [362]:
#building a pandas dataframe with one column per dict key (title, text)
data = pd.DataFrame.from_dict(screenplays)

### The titles were saved with a 'scrip' tag, underscores (_) and uneven spacing. The title names had to be cleaned before using the API to retrieve genre labels for the respective titles.

In [363]:
import re

#cleaning titles in one pass: drop the 'scrip' tag, turn underscores into
#spaces, then insert a space before every capital letter that directly
#follows another word character (splits run-together CamelCase words)
data['title'] = (
    data['title']
    .str.replace('scrip', '')
    .str.replace('_', ' ')
    .apply(lambda name: re.sub(r"\B([A-Z])", r" \1", name))
)


### I used the tmdbsimple Python package, a wrapper that connects to The Movie Database API: https://www.themoviedb.org/

In [365]:
#connecting to the movie database to match genre with title

import tmdbsimple as tmdb

tmdb.api_key = 'YOUR SECRET CODE' #placeholder: replace with your own key; keys are available for free when signing up on their website

#search object that looks up movie information by title
#(read as a global by genre_labeller)
search = tmdb.Search()

#genre object
genre = tmdb.Genres()

#saving genres and corresponding codes for labelling
#genre_labeller iterates genres_lst['genres'] and reads each entry's 'id' and 'name'
genres_lst = genre.movie_list()

In [366]:
def genre_labeller(row):

    """
    Look up a movie title on TMDB and return the names of its genres.

    The search can return several candidate movies. The first three
    candidates for the title are tried in order and the first one that
    yields at least one known genre wins. If none does, the search is
    repeated with 'The ' prepended to the title and only the top result
    of that second search is considered.

    The lookup is best-effort: any error raised by the API call is
    treated as "no results", so a failing title is labelled with an
    empty list instead of aborting a whole DataFrame.apply run.

    Relies on the module-level `search` object and `genres_lst` mapping.

    Parameters
    ----------
    row : str
        Movie title to search for.

    Returns
    -------
    list of str
        Genre names, in the order of the matched result's 'genre_ids'.
        Empty when no candidate produced a known genre.

    """

    #id -> name table built once, instead of rescanning the genre list
    #for every genre id
    names_by_id = {entry['id']: entry['name'] for entry in genres_lst['genres']}

    def fetch_results(query):
        #one request per query; the original re-sent the identical request
        #for every result index it wanted to inspect
        try:
            return search.movie(query=query)['results']
        except Exception:
            #best-effort labelling: network/HTTP errors or an unexpected
            #payload count as "nothing found". Unlike a bare except this
            #still lets KeyboardInterrupt/SystemExit through.
            return []

    def names_for(result):
        #ids that are missing from the genre table are ignored
        return [names_by_id[genre_id]
                for genre_id in (result.get('genre_ids') or [])
                if genre_id in names_by_id]

    #the api could have multiple results: try up to the first three
    for candidate in fetch_results(row)[:3]:
        names = names_for(candidate)
        if names:
            return names

    #some movie titles have 'the' before title name
    with_article = fetch_results(f'The {row}')
    if with_article:
        return names_for(with_article[0])

    return []

In [367]:
#labelling every title in the dataset with its list of genre names
data['genre'] = data['title'].apply(genre_labeller)

In [368]:
#counting how many genres were found for each title
data['genre_amount'] = data['genre'].apply(len)

In [369]:
#only want screenplays that have a labelled genre, so drop rows with zero genres
df = data.loc[data['genre_amount'] != 0]

28 Hours Later 
A Crowded Room 
Addy Longhair 
Alien Engineers
All The Pretty Dead Girls
All The Best Cowboys Have Daddy Issues
American Shaolin King Of Kickboxers I I
An October Wedding
Android Army
Artic Blue
Attack on Ant Hill
Avengers Worlds Collide
Baby Moon
Back To The Future 2&3
Ballad Of The Whiskey Robber
Bar Room Hero
Batman2
Bay Watch
Beverly Hills Cops
Bizzaro
Blaire Witch 2
Boodock Saints 2
Bounty Jumpers
Boy Who Never Slep
Bring Out The Dead
Californication
Candle To Water
Carrie (2013)
Cheating Season
Chewie
Christ Comple
Chronicle 2 Martyr
Cinema Paradisco
Connan The Barbarian
Dark Late At Nigh
Darwins Game
Dave Barrys Complete Guide To Guys
Dawn Of The Dead (1978)
Deep Sky
Diamond Dead
Dr Faustus
Dr Strange Love
Edward Ford
En Cryp
Fight Belle
First Blood Vendetta
Foggs Millions
Forest Gump
Frankenstein V2
Fraud Is Dead
Fright Night (2011)
Fruit Vale Station
Gideon’s Law
Glen Garry Glen Ross
God Father Part 2
God Father Part 3
Godzilla 1994
Greenbergh
Ground Hog Day
Ha

In [370]:
#saving df to a csv file (the data/ directory must already exist)
#NOTE(review): the 'text' column holds bytes objects because the files were
#read in binary mode, so the CSV presumably stores their b'...' repr -
#confirm that whatever reads this file expects that
df.to_csv('data/txt_and_genres.csv')