This notebook integrates all previous notebooks into a single pipeline, mainly to avoid uploading and downloading JSON files, which seems to introduce corruption.

# 0. Import Data and Dependencies

## 0.0 Packages and Dependencies

In [5]:
import numpy as np
import pandas as pd
import os 
import re

## 0.1 Notebook Functions

In [None]:
def print_first_lines(dict_list, n):
    """Print the leading entries of ``dict_list``, one per line.

    Parameters
    ----------
    dict_list : iterable
        Items to print.
    n : int
        Cutoff position; entries at positions ``0 .. n-1`` are printed and
        iteration stops as soon as position ``n`` is reached.
    """
    for position, entry in enumerate(dict_list):
        # keep printing until the running position reaches the cutoff
        if position != n:
            print(entry)
            continue
        break

In [None]:
def find_avg_length(series):
    """Print the mean ``len()`` of the items in ``series``.

    Parameters
    ----------
    series : iterable
        Items that support ``len()`` (e.g. lists or strings).
    """
    lengths = list(map(len, series))
    print("Average length:", np.mean(lengths))

In [14]:
def format_as_json(screenplay):
    """Parse an annotated screenplay string into a list of single-entry dicts.

    The text is split on ``"\\n"`` and every line is expected to look like
    ``label: data``.  Everything before the first colon becomes the key and
    the remainder of the line becomes the value.

    Parameters
    ----------
    screenplay : str
        Annotated screenplay, one ``label: data`` pair per line.

    Returns
    -------
    list[dict[str, str]]
        One ``{label: data}`` dict per line that contains a colon, in the
        original order.  Lines without a colon are skipped.
    """
    screenplay_data = []
    for line in screenplay.split("\n"):
        # only the first colon separates label from data; later colons belong to the data
        label, colon, rest = line.partition(":")
        if not colon:
            # no label on this line
            continue
        # Drop the single separator character after the colon only when it is
        # whitespace, so that "label:data" does not lose its first character.
        data = rest[1:] if rest[:1].isspace() else rest
        screenplay_data.append({label: data})
    return screenplay_data

## 0.2 Read Datafiles 

In [2]:
# replace paths here
# NOTE(review): absolute, machine-specific location; the metadata CSV is also read relative to it
root_path = r'C:\\Users\bened\DataScience\ANLP\AT2'

folder_path = f'{root_path}\\BERT_annotations'
screenplays_annot = {}

# C0/C1 control characters to strip.  Tab (\x09), line feed (\x0A) and carriage
# return (\x0D) are excluded on purpose: the annotations are line-based and are
# split on "\n" later, so removing newlines would collapse a whole screenplay
# into a single line.
hex_pat = re.compile(r'[\x00-\x08\x0B\x0C\x0E-\x1F\x7F-\x9F]')

# list all files in folder and iterate over them
for file_name in os.listdir(folder_path):
    # get file_path by joining folder path with file_name
    file_path = os.path.join(folder_path, file_name)
    # ensure path points to an actual file
    if os.path.isfile(file_path):
        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read()
        # remove non-printable control characters
        cleaned_content = hex_pat.sub('', content)
        # store the cleaned text under its file name so that downstream cells
        # never see the stripped control characters
        screenplays_annot[file_name] = cleaned_content

# ensure files were imported correctly by printing a sample of the first ten files
for file_name, content in list(screenplays_annot.items())[:10]:
    print(f"Example of {file_name}:\n")
    print(content[:100])
    print("-"*50)

Example of 10 Cloverfield Lane_1179933_anno.txt:

dialog: The Cellar
dialog: by
dialog: Josh Campbell & Matt Stuecken
speaker_heading: DARKNESS
dialog
--------------------------------------------------
Example of 10 Things I Hate About You_0147800_anno.txt:

dialog: 
text: TEN THINGS I HATE ABOUT YOU
dialog: 
dialog: written by Karen McCullah Lutz &amp; Kir
--------------------------------------------------
Example of 12 Angry Men_0118528_anno.txt:

scene_heading: PLEASE COPY AND RETURN |
dialog: ———_————_
dialog: 
scene_heading: TWELVE ANGRY MEN
d
--------------------------------------------------
Example of 12 Monkeys_0114746_anno.txt:

dialog: 
speaker_heading: TWELVE MONKEYS
dialog: 
dialog: An original screenplay by
dialog: David Pe
--------------------------------------------------
Example of 12 Years a Slave_2024544_anno.txt:

dialog: 
speaker_heading: 12 YEARS A SLAVE
dialog: Written by
dialog: John Ridley
speaker_heading: C
--------------------------------------------------
Ex

In [3]:
# Search the imported screenplays for leftover control characters.
# The text itself is searched: repr() turns control characters into escape
# sequences (e.g. "\\x0c"), which the pattern can never match, so a check on
# repr(val) always comes back empty.
# Tab, newline and carriage return are ordinary layout characters and are not
# reported as corruption.
corruptions = set()
for val in screenplays_annot.values():
    corruptions.update(c for c in hex_pat.findall(val) if c not in '\t\n\r')

print(corruptions)

set()


In [4]:
# Find the names of screenplays that still contain control characters.
# The raw text is searched (not repr(val), whose escape sequences the pattern
# cannot match); tab, newline and carriage return are not counted.
counter = 0
for key, val in screenplays_annot.items():
    if any(c not in '\t\n\r' for c in hex_pat.findall(val)):
        print(key)
        counter += 1

print(f"{counter} files contain hexadecimal chars")

0 files contain hexadecimal chars


## 0.3 Assemble DataFrame

In [None]:
# ! pip install pandas

Collecting pandas
  Using cached pandas-2.2.3-cp311-cp311-win_amd64.whl.metadata (19 kB)
Collecting pytz>=2020.1 (from pandas)
  Using cached pytz-2024.2-py2.py3-none-any.whl.metadata (22 kB)
Collecting tzdata>=2022.7 (from pandas)
  Using cached tzdata-2024.2-py2.py3-none-any.whl.metadata (1.4 kB)
Using cached pandas-2.2.3-cp311-cp311-win_amd64.whl (11.6 MB)
Using cached pytz-2024.2-py2.py3-none-any.whl (508 kB)
Using cached tzdata-2024.2-py2.py3-none-any.whl (346 kB)
Installing collected packages: pytz, tzdata, pandas
Successfully installed pandas-2.2.3 pytz-2024.2 tzdata-2024.2


In [8]:
# load the movie metadata table stored under root_path
meta_path = f'{root_path}\\movie_meta_data.csv'
meta_df = pd.read_csv(meta_path)
meta_df.head()

Unnamed: 0,imdbid,title,akas,year,metascore,imdb user rating,number of imdb user votes,awards,opening weekend,producers,...,casting directors,cast,countries,age restrict,plot,plot outline,keywords,genres,taglines,synopsis
0,120770,A Night at the Roxbury,"Une nuit au Roxbury (France), Movida en el Rox...",1998,26,6,56537,,United States:,"Marie Cantin, Erin Fraser, Amy Heckerling, Ste...",...,Jeff Greenberg,"Will Ferrell, Chris Kattan, Raquel Gardner, Vi...",United States,"Argentina:13, Australia:M, Brazil:14, Canada:P...",Two dim-witted brothers dream of owning their ...,"The Roxbury Guys, Steve and Doug Butabi, want ...","woman-on-top, nightclub, car-accident, 1990s, ...","Comedy, Music, Romance",Score!,
1,132512,At First Sight,"Sight Unseen (United States), Premier regard (...",1999,40,6,12922,,United States:,"Rob Cowan, Roger Paradiso, Irwin Winkler",...,"Kerry Barden, Billy Hopkins, Suzanne Smith","Val Kilmer, Mira Sorvino, Kelly McGillis, Stev...",United States,"Argentina:13, Australia:M, Canada:PG::(Alberta...",A blind man has an operation to regain his sig...,First Sight is true to the title from start to...,"visual-agnosia, brother-sister-relationship, r...","Drama, Romance","Only Love Can Bring You To Your Senses., Scien...",
2,118661,The Avengers,"Chapeau melon et bottes de cuir (France), Mit ...",1998,12,3,40784,"FMCJ Award 1998, Golden Reel Award 1999, Razzi...","United States: $10,305,957, 16 Aug 1998","Susan Ekins, Jerry Weintraub",...,Susie Figgis,"Ralph Fiennes, Uma Thurman, Sean Connery, Patr...",United States,"Argentina:13, Australia:PG, Brazil:10, Canada:...",Two British Agents team up to stop Sir August ...,"British Ministry Agent John Steed, under direc...","good-versus-evil, heroine, evil-man, villain, ...","Action, Adventure, Sci-Fi, Thriller","Mrs. Peel, we're needed., Extraordinary crimes...",
3,215545,Bamboozled,"The Very Black Show (France), It's Showtime (G...",2000,54,6,10373,"Golden Berlin Bear 2001, Black Reel 2001, Imag...",United States:,"Jon Kilik, Spike Lee, Kisha Imani Cameron",...,Aisha Coley,"Damon Wayans, Savion Glover, Jada Pinkett Smit...",United States,"Australia:MA, Finland:K-15, France:Tous public...",A frustrated African-American TV writer propos...,"Dark, biting satire of the television industry...","television-industry, african-american, referen...","Comedy, Drama, Music",Starring the great negroe actors,"In a New York City residence, Pierre Delacroix..."
4,118715,The Big Lebowski,"El gran Lebowski (Spain), O Grande Lebowski (P...",1998,71,8,724388,"Honorable Mention 1998, ACCA 1998, Golden Berl...","United States: $5,533,844, 08 Mar 1998","Tim Bevan, John Cameron, Ethan Coen, Eric Fell...",...,John S. Lyons,"Jeff Bridges, John Goodman, Julianne Moore, St...","United States, United Kingdom","Argentina:16, Argentina:18::(cable rating), Au...","Jeff ""The Dude"" Lebowski, mistaken for a milli...","When ""the dude"" Lebowski is mistaken for a mil...","rug, nihilism, pornographer, bowling-alley, de...","Comedy, Crime, Sport",Hay quienes tratan de ganarse la vida sin move...,A tumbleweed rolls up a hillside just outside ...


In [9]:
# peek at the first ten keys to see how the file names are formatted
filenames = [*screenplays_annot]
print(filenames[:10])

['10 Cloverfield Lane_1179933_anno.txt', '10 Things I Hate About You_0147800_anno.txt', '12 Angry Men_0118528_anno.txt', '12 Monkeys_0114746_anno.txt', '12 Years a Slave_2024544_anno.txt', '127 Hours_1542344_anno.txt', '13 13 13_2991516_anno.txt', '1408_0450385_anno.txt', '1492 Conquest of Paradise_0103594_anno.txt', '15 Minutes_0179626_anno.txt']


In [10]:
# filenames are formatted as <movie title>_<IMDb id>_anno.txt
filenames = list(screenplays_annot.keys())
movie_titles = []
ids = []
for f in filenames:
    # Split from the right (at most twice) so that an underscore inside the
    # movie title stays part of the title instead of being mistaken for the
    # title/id separator.
    parts = f.rsplit("_", 2)
    movie_titles.append(parts[0])
    ids.append(parts[1])

# show the first ten (title, id) pairs as a sanity check
for title, imdb_id in list(zip(movie_titles, ids))[:10]:
    print("Title:", title, " ID:", imdb_id)

Title: 10 Cloverfield Lane  ID: 1179933
Title: 10 Things I Hate About You  ID: 0147800
Title: 12 Angry Men  ID: 0118528
Title: 12 Monkeys  ID: 0114746
Title: 12 Years a Slave  ID: 2024544
Title: 127 Hours  ID: 1542344
Title: 13 13 13  ID: 2991516
Title: 1408  ID: 0450385
Title: 1492 Conquest of Paradise  ID: 0103594
Title: 15 Minutes  ID: 0179626


In [11]:
# one row per screenplay: the IMDb id parsed from the file name plus the annotated text
screenplay_columns = {
    'imdbid': ids,
    'annot_screenplay': list(screenplays_annot.values()),
}
screenplays_df = pd.DataFrame(screenplay_columns)
screenplays_df.head()

Unnamed: 0,imdbid,annot_screenplay
0,1179933,dialog: The Cellar\ndialog: by\ndialog: Josh C...
1,147800,dialog: \ntext: TEN THINGS I HATE ABOUT YOU\nd...
2,118528,scene_heading: PLEASE COPY AND RETURN |\ndialo...
3,114746,dialog: \nspeaker_heading: TWELVE MONKEYS\ndia...
4,2024544,dialog: \nspeaker_heading: 12 YEARS A SLAVE\nd...


In [12]:
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() only adds a stray "None" line.
screenplays_df.info()
meta_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1998 entries, 0 to 1997
Data columns (total 2 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   imdbid            1998 non-null   object
 1   annot_screenplay  1998 non-null   object
dtypes: object(2)
memory usage: 31.3+ KB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2858 entries, 0 to 2857
Data columns (total 25 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   imdbid                     2858 non-null   int64 
 1   title                      2858 non-null   object
 2   akas                       2652 non-null   object
 3   year                       2858 non-null   int64 
 4   metascore                  2858 non-null   int64 
 5   imdb user rating           2858 non-null   int64 
 6   number of imdb user votes  2858 non-null   int64 
 7   awards                     2243 non-null   object
 8   op

In [13]:
# the ids parsed from the file names are strings; cast them to int so they can
# be matched against the integer imdbid column of the metadata
screenplays_df = screenplays_df.astype({'imdbid': int})
df = pd.merge(meta_df, screenplays_df, on='imdbid')
df.head()

Unnamed: 0,imdbid,title,akas,year,metascore,imdb user rating,number of imdb user votes,awards,opening weekend,producers,...,cast,countries,age restrict,plot,plot outline,keywords,genres,taglines,synopsis,annot_screenplay
0,120770,A Night at the Roxbury,"Une nuit au Roxbury (France), Movida en el Rox...",1998,26,6,56537,,United States:,"Marie Cantin, Erin Fraser, Amy Heckerling, Ste...",...,"Will Ferrell, Chris Kattan, Raquel Gardner, Vi...",United States,"Argentina:13, Australia:M, Brazil:14, Canada:P...",Two dim-witted brothers dream of owning their ...,"The Roxbury Guys, Steve and Doug Butabi, want ...","woman-on-top, nightclub, car-accident, 1990s, ...","Comedy, Music, Romance",Score!,,dialog: \ntext: A NIGHT AT THE ROXBURY\ndialog...
1,132512,At First Sight,"Sight Unseen (United States), Premier regard (...",1999,40,6,12922,,United States:,"Rob Cowan, Roger Paradiso, Irwin Winkler",...,"Val Kilmer, Mira Sorvino, Kelly McGillis, Stev...",United States,"Argentina:13, Australia:M, Canada:PG::(Alberta...",A blind man has an operation to regain his sig...,First Sight is true to the title from start to...,"visual-agnosia, brother-sister-relationship, r...","Drama, Romance","Only Love Can Bring You To Your Senses., Scien...",,scene_heading: AT FIRST SIGHT\nscene_heading: ...
2,118661,The Avengers,"Chapeau melon et bottes de cuir (France), Mit ...",1998,12,3,40784,"FMCJ Award 1998, Golden Reel Award 1999, Razzi...","United States: $10,305,957, 16 Aug 1998","Susan Ekins, Jerry Weintraub",...,"Ralph Fiennes, Uma Thurman, Sean Connery, Patr...",United States,"Argentina:13, Australia:PG, Brazil:10, Canada:...",Two British Agents team up to stop Sir August ...,"British Ministry Agent John Steed, under direc...","good-versus-evil, heroine, evil-man, villain, ...","Action, Adventure, Sci-Fi, Thriller","Mrs. Peel, we're needed., Extraordinary crimes...",,dialog: \nspeaker_heading: THE AVENGERS\ndialo...
3,215545,Bamboozled,"The Very Black Show (France), It's Showtime (G...",2000,54,6,10373,"Golden Berlin Bear 2001, Black Reel 2001, Imag...",United States:,"Jon Kilik, Spike Lee, Kisha Imani Cameron",...,"Damon Wayans, Savion Glover, Jada Pinkett Smit...",United States,"Australia:MA, Finland:K-15, France:Tous public...",A frustrated African-American TV writer propos...,"Dark, biting satire of the television industry...","television-industry, african-american, referen...","Comedy, Drama, Music",Starring the great negroe actors,"In a New York City residence, Pierre Delacroix...",dialog: Bamboozled\ndialog: by\ndialog: Spike ...
4,118715,The Big Lebowski,"El gran Lebowski (Spain), O Grande Lebowski (P...",1998,71,8,724388,"Honorable Mention 1998, ACCA 1998, Golden Berl...","United States: $5,533,844, 08 Mar 1998","Tim Bevan, John Cameron, Ethan Coen, Eric Fell...",...,"Jeff Bridges, John Goodman, Julianne Moore, St...","United States, United Kingdom","Argentina:16, Argentina:18::(cable rating), Au...","Jeff ""The Dude"" Lebowski, mistaken for a milli...","When ""the dude"" Lebowski is mistaken for a mil...","rug, nihilism, pornographer, bowling-alley, de...","Comedy, Crime, Sport",Hay quienes tratan de ganarse la vida sin move...,A tumbleweed rolls up a hillside just outside ...,dialog: \nscene_heading: THE BIG LEBOWSKI\ntex...


In [16]:
# free the intermediates of the import/merge stage; only the merged df is used from here on
del screenplays_df, meta_df
del content, cleaned_content, filenames

# 1. Preprocessing

## 1.0 Format Text Data as JSONs

In [18]:
# parse every annotated screenplay into a list of {label: data} dicts so the
# labels can be compared across the whole corpus
annotated_texts = df['annot_screenplay']
screenplay_jsons = annotated_texts.apply(format_as_json)

df.head()

In [20]:
# display the first rows of the merged frame
df.head()

Unnamed: 0,imdbid,title,akas,year,metascore,imdb user rating,number of imdb user votes,awards,opening weekend,producers,...,cast,countries,age restrict,plot,plot outline,keywords,genres,taglines,synopsis,annot_screenplay
0,120770,A Night at the Roxbury,"Une nuit au Roxbury (France), Movida en el Rox...",1998,26,6,56537,,United States:,"Marie Cantin, Erin Fraser, Amy Heckerling, Ste...",...,"Will Ferrell, Chris Kattan, Raquel Gardner, Vi...",United States,"Argentina:13, Australia:M, Brazil:14, Canada:P...",Two dim-witted brothers dream of owning their ...,"The Roxbury Guys, Steve and Doug Butabi, want ...","woman-on-top, nightclub, car-accident, 1990s, ...","Comedy, Music, Romance",Score!,,dialog: \ntext: A NIGHT AT THE ROXBURY\ndialog...
1,132512,At First Sight,"Sight Unseen (United States), Premier regard (...",1999,40,6,12922,,United States:,"Rob Cowan, Roger Paradiso, Irwin Winkler",...,"Val Kilmer, Mira Sorvino, Kelly McGillis, Stev...",United States,"Argentina:13, Australia:M, Canada:PG::(Alberta...",A blind man has an operation to regain his sig...,First Sight is true to the title from start to...,"visual-agnosia, brother-sister-relationship, r...","Drama, Romance","Only Love Can Bring You To Your Senses., Scien...",,scene_heading: AT FIRST SIGHT\nscene_heading: ...
2,118661,The Avengers,"Chapeau melon et bottes de cuir (France), Mit ...",1998,12,3,40784,"FMCJ Award 1998, Golden Reel Award 1999, Razzi...","United States: $10,305,957, 16 Aug 1998","Susan Ekins, Jerry Weintraub",...,"Ralph Fiennes, Uma Thurman, Sean Connery, Patr...",United States,"Argentina:13, Australia:PG, Brazil:10, Canada:...",Two British Agents team up to stop Sir August ...,"British Ministry Agent John Steed, under direc...","good-versus-evil, heroine, evil-man, villain, ...","Action, Adventure, Sci-Fi, Thriller","Mrs. Peel, we're needed., Extraordinary crimes...",,dialog: \nspeaker_heading: THE AVENGERS\ndialo...
3,215545,Bamboozled,"The Very Black Show (France), It's Showtime (G...",2000,54,6,10373,"Golden Berlin Bear 2001, Black Reel 2001, Imag...",United States:,"Jon Kilik, Spike Lee, Kisha Imani Cameron",...,"Damon Wayans, Savion Glover, Jada Pinkett Smit...",United States,"Australia:MA, Finland:K-15, France:Tous public...",A frustrated African-American TV writer propos...,"Dark, biting satire of the television industry...","television-industry, african-american, referen...","Comedy, Drama, Music",Starring the great negroe actors,"In a New York City residence, Pierre Delacroix...",dialog: Bamboozled\ndialog: by\ndialog: Spike ...
4,118715,The Big Lebowski,"El gran Lebowski (Spain), O Grande Lebowski (P...",1998,71,8,724388,"Honorable Mention 1998, ACCA 1998, Golden Berl...","United States: $5,533,844, 08 Mar 1998","Tim Bevan, John Cameron, Ethan Coen, Eric Fell...",...,"Jeff Bridges, John Goodman, Julianne Moore, St...","United States, United Kingdom","Argentina:16, Argentina:18::(cable rating), Au...","Jeff ""The Dude"" Lebowski, mistaken for a milli...","When ""the dude"" Lebowski is mistaken for a mil...","rug, nihilism, pornographer, bowling-alley, de...","Comedy, Crime, Sport",Hay quienes tratan de ganarse la vida sin move...,A tumbleweed rolls up a hillside just outside ...,dialog: \nscene_heading: THE BIG LEBOWSKI\ntex...


## 1.1 Analyze labels

In [19]:
def find_unique_labels(json):
    """Return the set of all keys used by the dicts in ``json``.

    Parameters
    ----------
    json : iterable of dict
        Parsed screenplay, one dict per annotated line.

    Returns
    -------
    set
        Every distinct key found; empty when ``json`` is empty.
    """
    labels = set()
    for entry in json:
        labels.update(entry.keys())
    return labels

# one set of labels per screenplay
unique_labels_series = screenplay_jsons.apply(find_unique_labels)

### 1.1.1 Find and drop rows where data is empty

In [21]:
# collect the label sets that do not contain exactly four distinct labels
unequal_length = [labels for labels in unique_labels_series if len(labels) != 4]

print(unequal_length)

[{'speaker_heading', 'dialog', 'text'}, {'dialog', 'scene_heading', 'text'}, set(), set(), set()]


In [22]:
# some annotations have only three labels, which is fine, but others appear to be empty, which we should investigate
# Collect the *index labels* (rather than enumerate() positions) of the rows
# whose label set is empty: the result is used with df.loc[...] and .drop(...),
# both of which look rows up by label.
has_no_labels = unique_labels_series.map(len) == 0
empty_series = unique_labels_series[has_no_labels].index.tolist()

In [23]:
# look at the rows whose screenplay produced no labels and keep their ids
empty_rows = df.loc[empty_series]
missing_imdbid = empty_rows['imdbid']
empty_rows

Unnamed: 0,imdbid,title,akas,year,metascore,imdb user rating,number of imdb user votes,awards,opening weekend,producers,...,cast,countries,age restrict,plot,plot outline,keywords,genres,taglines,synopsis,annot_screenplay
1034,1837703,The Fifth Estate,"The 5ifth Estate (United States), The Man Who ...",2013,49,6,38595,"Britannia Award 2013, COFCA Award 2014, Audien...",United States:,"Leifur B. Dagfinnsson, Hilde De Laere, Steve G...",...,"Peter Capaldi, David Thewlis, Anatole Taubman,...","United States, India, Belgium","Argentina:13, Australia:M, Canada:PG::(British...",A dramatic thriller based on real events that ...,The story begins as WikiLeaks founder Julian A...,"internet, pantyhose, red-pantyhose, female-sto...","Biography, Crime, Drama, Thriller",You can't expose the world's secrets without e...,,
1477,99892,Joe Versus the Volcano,"Joe contre le volcan (France), Joe gegen den V...",1990,45,5,34277,Felix 2011,United States:,"Kathleen Kennedy, Frank Marshall, Roxanne Roge...",...,"Tom Hanks, Meg Ryan, Lloyd Bridges, Robert Sta...",United States,"Argentina:13, Australia:PG, Canada:PG, Canada:...","When a hypochondriac learns that he is dying, ...",Joe versus the Volcano is a fable which opens ...,"tom-hanks, surrealism, terminal-illness, suici...","Comedy, Romance","An Average Joe. An Adventurous Comedy., A stor...",,
1639,4364194,The Peanut Butter Falcon,"Le Cri du faucon (France), La familia que tú e...",2019,70,7,61007,"AFCA Award 2020, Audience Award 2019, Special ...",United States:,"Albert Berger, Carmella Casinelli, Manu Gargi,...",...,"Zack Gottsagen, Ann Owens, Dakota Johnson, Bru...",United States,"Australia:M, Austria:10, Belgium:KT/EA, Canada...",Zak runs away from his care home to make his d...,The Peanut Butter Falcon is an adventure story...,"down-syndrome, wrestling, friendship, bare-che...","Adventure, Comedy, Drama",,,


If you look at the source data, you'll find the .txt files for these screenplays are simply empty.  We'll drop them.

In [26]:
# remove the rows whose screenplay text is empty
df_clean = df.drop(index=empty_series)
df_clean.head()

Unnamed: 0,imdbid,title,akas,year,metascore,imdb user rating,number of imdb user votes,awards,opening weekend,producers,...,cast,countries,age restrict,plot,plot outline,keywords,genres,taglines,synopsis,annot_screenplay
0,120770,A Night at the Roxbury,"Une nuit au Roxbury (France), Movida en el Rox...",1998,26,6,56537,,United States:,"Marie Cantin, Erin Fraser, Amy Heckerling, Ste...",...,"Will Ferrell, Chris Kattan, Raquel Gardner, Vi...",United States,"Argentina:13, Australia:M, Brazil:14, Canada:P...",Two dim-witted brothers dream of owning their ...,"The Roxbury Guys, Steve and Doug Butabi, want ...","woman-on-top, nightclub, car-accident, 1990s, ...","Comedy, Music, Romance",Score!,,dialog: \ntext: A NIGHT AT THE ROXBURY\ndialog...
1,132512,At First Sight,"Sight Unseen (United States), Premier regard (...",1999,40,6,12922,,United States:,"Rob Cowan, Roger Paradiso, Irwin Winkler",...,"Val Kilmer, Mira Sorvino, Kelly McGillis, Stev...",United States,"Argentina:13, Australia:M, Canada:PG::(Alberta...",A blind man has an operation to regain his sig...,First Sight is true to the title from start to...,"visual-agnosia, brother-sister-relationship, r...","Drama, Romance","Only Love Can Bring You To Your Senses., Scien...",,scene_heading: AT FIRST SIGHT\nscene_heading: ...
2,118661,The Avengers,"Chapeau melon et bottes de cuir (France), Mit ...",1998,12,3,40784,"FMCJ Award 1998, Golden Reel Award 1999, Razzi...","United States: $10,305,957, 16 Aug 1998","Susan Ekins, Jerry Weintraub",...,"Ralph Fiennes, Uma Thurman, Sean Connery, Patr...",United States,"Argentina:13, Australia:PG, Brazil:10, Canada:...",Two British Agents team up to stop Sir August ...,"British Ministry Agent John Steed, under direc...","good-versus-evil, heroine, evil-man, villain, ...","Action, Adventure, Sci-Fi, Thriller","Mrs. Peel, we're needed., Extraordinary crimes...",,dialog: \nspeaker_heading: THE AVENGERS\ndialo...
3,215545,Bamboozled,"The Very Black Show (France), It's Showtime (G...",2000,54,6,10373,"Golden Berlin Bear 2001, Black Reel 2001, Imag...",United States:,"Jon Kilik, Spike Lee, Kisha Imani Cameron",...,"Damon Wayans, Savion Glover, Jada Pinkett Smit...",United States,"Australia:MA, Finland:K-15, France:Tous public...",A frustrated African-American TV writer propos...,"Dark, biting satire of the television industry...","television-industry, african-american, referen...","Comedy, Drama, Music",Starring the great negroe actors,"In a New York City residence, Pierre Delacroix...",dialog: Bamboozled\ndialog: by\ndialog: Spike ...
4,118715,The Big Lebowski,"El gran Lebowski (Spain), O Grande Lebowski (P...",1998,71,8,724388,"Honorable Mention 1998, ACCA 1998, Golden Berl...","United States: $5,533,844, 08 Mar 1998","Tim Bevan, John Cameron, Ethan Coen, Eric Fell...",...,"Jeff Bridges, John Goodman, Julianne Moore, St...","United States, United Kingdom","Argentina:16, Argentina:18::(cable rating), Au...","Jeff ""The Dude"" Lebowski, mistaken for a milli...","When ""the dude"" Lebowski is mistaken for a mil...","rug, nihilism, pornographer, bowling-alley, de...","Comedy, Crime, Sport",Hay quienes tratan de ganarse la vida sin move...,A tumbleweed rolls up a hillside just outside ...,dialog: \nscene_heading: THE BIG LEBOWSKI\ntex...


In [27]:
# drop the same empty entries from the parsed screenplays
screenplay_jsons = screenplay_jsons.drop(index=empty_series)

So we have 'scene_heading', 'speaker_heading', 'text' and 'dialog' labels.  Let's look at an example screenplay to see what might be worth removing. 

## 1.2 Flattening Contiguous Data

In some cases a single sentence is spread over several consecutive entries that share the same key.  The function below finds these contiguous values and flattens them into one value.  This will make sentence tokenization more meaningful later on.

In [28]:
# we'll define a more general function this time that takes a key input
def flatten_data(dict_list, key):
    """Merge runs of consecutive ``key`` entries into a single entry.

    Values of neighbouring dicts that contain ``key`` are joined with a single
    space.  Dicts without ``key`` end the current run and are passed through
    unchanged.

    Parameters
    ----------
    dict_list : iterable of dict
        Parsed screenplay, one dict per annotated line.
    key : str
        Label whose contiguous values should be concatenated.

    Returns
    -------
    list of dict
        New list in which each run of ``key`` entries is replaced by one
        ``{key: joined_text}`` dict.  A run whose joined text is empty
        produces no entry.
    """
    merged = []
    buffer = ''
    for entry in dict_list:
        if key not in entry:
            # a different label ends the current run: emit what was collected
            if buffer:
                merged.append({key: buffer})
                buffer = ''
            # entries with other labels pass through untouched
            merged.append(entry)
            continue
        piece = entry[key]
        # a separating space is only added once something has been collected
        if buffer:
            buffer = buffer + ' ' + piece
        else:
            buffer = buffer + piece
    # emit the trailing run, if any
    if buffer:
        merged.append({key: buffer})
    return merged

In [29]:
# release the frames and helper collections of the import/label-analysis stage;
# the pipeline continues on screenplay_jsons only
del df, df_clean
del ids, movie_titles, screenplays_annot
del unique_labels_series, val

In [31]:
# flatten contiguous 'text' entries first, then contiguous 'dialog' entries
screenplays_flat_txt = screenplay_jsons.apply(flatten_data, args=('text',))
screenplays_flat = screenplays_flat_txt.apply(flatten_data, args=('dialog',))

In [32]:
# drop this intermediate to free memory
del screenplays_flat_txt

In [33]:
# drop this intermediate to free memory
del screenplay_jsons

## 1.3 Removing 'speaker_heading'

In [30]:
def decapitate_speakers(json_list):
    """Return the entries of ``json_list`` that have no ``'speaker_heading'`` key.

    Parameters
    ----------
    json_list : iterable of dict
        Parsed screenplay, one dict per annotated entry.

    Returns
    -------
    list of dict
        The remaining entries, in their original order.
    """
    kept = []
    for entry in json_list:
        if 'speaker_heading' in entry:
            continue
        kept.append(entry)
    return kept

In [34]:
# strip the 'speaker_heading' entries from every flattened screenplay
decapitated_screenplays = screenplays_flat.apply(decapitate_speakers)

In [35]:
# drop this intermediate to free memory
del screenplays_flat

## 1.4 Remove empty strings and strings that are only punctuations 

In [37]:
# remove empty strings

def remove_nulls(json_list):
    """Return the dicts of ``json_list`` that hold no empty-string value.

    Parameters
    ----------
    json_list : iterable of dict
        Parsed screenplay, one dict per annotated entry.

    Returns
    -------
    list of dict
        Every dict none of whose values equals ``''``, in the original order.
    """
    return [
        entry
        for entry in json_list
        if not any(value == '' for value in entry.values())
    ]

In [38]:
# remove the empty-string entries from every screenplay, then preview one of them
screenplays_nonna = decapitated_screenplays.apply(remove_nulls)
preview = screenplays_nonna[10]
print(preview[:100])

[{'scene_heading': 'THE LAST OF THE MOHICANS'}, {'dialog': 'Written by Michael Mann &amp; Christopher Crowe'}, {'text': "The screen is a microcosm of leaf, crystal drops of precipitation, a stone, emerald green moss. It's a landscape in miniature. We HEAR the forest. Some distant birds. Their sound seems to reverberate as if in a cavern. A piece of sunlight refracts within the drops of water, paints a patch of moss yellow. The whisper of wind is joined by another sound that mixes with it. A distant rustling. It gets closer and louder. It's shallow breathing. It gets ominous. We're interlopers on the floor of the forest and something is coming."}, {'scene_heading': 'SUDDENLY: A MOCCASINED FOOT'}, {'text': 'rockets through the frame scaring us and ...'}, {'scene_heading': 'EXTREMELY CLOSE: PART OF AN INDIAN FACE'}, {'text': "running hard. His head shaved bald except for a scalp-lock. Tattoos. He's twenty-five. He seems tall and muscled. Heavy, even breathing. We'll learn later"}, {'dialo

In [39]:
# drop this intermediate to free memory
del decapitated_screenplays

## 1.5 Delete 'CUT' 

In [41]:
def delete_cuts(dict_list, pattern=r'CUT', flags=re.IGNORECASE):
    """Drop every dict that has a value matching ``pattern``.

    With the defaults this removes any entry whose text contains the letters
    "cut" in any casing.  Note that this is a plain substring match, so words
    such as "execute" or "cute" also cause an entry to be removed; pass a
    stricter ``pattern`` (e.g. ``r'^\\s*CUT\\b'``) to target transitions only.

    Parameters
    ----------
    dict_list : iterable of dict
        Parsed screenplay, one dict per annotated entry.
    pattern : str, optional
        Regular expression searched for in ``str(value)``.  Defaults to
        ``r'CUT'``.
    flags : int, optional
        ``re`` flags used when compiling ``pattern``.  Defaults to
        ``re.IGNORECASE``.

    Returns
    -------
    list of dict
        The entries none of whose values match, in the original order.
    """
    # compile once instead of re-parsing the pattern for every value
    matcher = re.compile(pattern, flags)
    return [
        d
        for d in dict_list
        if not any(matcher.search(str(val)) for val in d.values())
    ]

In [42]:
# remove, in every screenplay, the entries whose text matches 'CUT' (case-insensitive)
screenplays_uncut = screenplays_nonna.apply(delete_cuts)

avg length before: 1853.7062656641604
avg_length_after: 1835.842105263158


In [43]:
# drop this intermediate to free memory
del screenplays_nonna

## 1.6 Sentence Tokenization

We'll sentence tokenize the values first before removing punctuation marks etc. 

In [44]:
import nltk
from nltk.tokenize import sent_tokenize

In [45]:
# define as a general function
def sent_tokenize_dicts(dict_list):
    """Sentence-tokenize the values of every dict in ``dict_list``.

    Parameters
    ----------
    dict_list : iterable of dict
        Parsed screenplay; each value is a string of text.

    Returns
    -------
    list of dict
        One dict per non-empty input dict, with the same keys and each value
        replaced by the list returned by ``sent_tokenize``.  Empty input
        dicts produce no output entry.
    """
    sentence_dicts = []

    for d in dict_list:
        # tokenize all values of this entry first ...
        sents_dict = {key: sent_tokenize(value) for key, value in d.items()}
        # ... and append the result exactly once per entry, so that a dict
        # with several keys is not added to the output once per key
        if sents_dict:
            sentence_dicts.append(sents_dict)

    return sentence_dicts

In [46]:
# sentence-tokenize the values of every screenplay
screenplay_sents = screenplays_uncut.apply(sent_tokenize_dicts)

In [47]:
# spot-check the tokenization on one screenplay (index label 50), first ten entries
print_first_lines(screenplay_sents[50], 10)

{'text': ['TEN THINGS I HATE ABOUT YOU']}
{'dialog': ['written by Karen McCullah Lutz &amp; Kirsten Smith  based on \'Taming of the Shrew" by William Shakespeare  Revision November 12, 1997']}
{'scene_heading': ['PADUA HIGH SCHOOL - DAY']}
{'dialog': ['Welcome to Padua High School,, your typical urban-suburban high school in Portland, Oregon.', 'Smarties, Skids, Preppies,']}
{'text': ['Granolas.', 'Loners, Lovers, the In and the Out Crowd rub sleep out of their eyes and head for the main building.']}
{'scene_heading': ['PADUA HIGH PARKING LOT - DAY']}
{'text': ["KAT STRATFORD, eighteen, pretty -- but trying hard not to be -- in a baggy granny dress and glasses, balances a cup of coffee and a backpack as she climbs out of her battered, baby blue '75 Dodge Dart."]}
{'text': ['A stray SKATEBOARD clips her, causing her to stumble and spill her coffee, as well as the contents of her backpack.']}
{'text': ['The young RIDER dashes over to help, trembling when he sees who his board has hit.']}

In [50]:
# drop this intermediate to free memory
del screenplays_uncut

## 1.7 Label Encoding

At this point we're going to encode our labels just to save on memory. 

In [49]:
# label -> small integer code; the position in the tuple is the code
label_map = {
    label: np.int8(code)
    for code, label in enumerate(('scene_heading', 'text', 'dialog'))
}

In [51]:
# define as function and apply all 

def encode_labels(dict_list):
    """Replace the string label keys of every dict with their ``label_map`` code.

    Parameters
    ----------
    dict_list : iterable of dict
        Parsed screenplay whose keys are all present in ``label_map``.

    Returns
    -------
    list of dict
        New dicts with the same values, keyed by the encoded label.

    Raises
    ------
    KeyError
        If a key is not present in ``label_map``.
    """
    return [
        {label_map[key]: value for key, value in d.items()}
        for d in dict_list
    ]

# encode the labels of every screenplay and preview the first ten entries of the first one
screenplays_encoded = screenplay_sents.apply(encode_labels)
print(screenplays_encoded[0][:10])

[{1: ['A NIGHT AT THE ROXBURY']}, {2: ['written by Steve Koren Will Ferrell & Chris Kattan June 2, 1997']}, {0: ['EXT.', 'PANORAMIC VIEW OF LOS ANGELES - SUNSET']}, {1: ['As we hear "What is Love" by HADDAWAY -- night falls and partytime begins.']}, {0: ['SUPERIMPOSE: SUNSET BLVD., 11:03 PM']}, {0: ['EXT.', 'DANCE CLUBS - NIGHT']}, {2: ['Coconut Teaser, The Palace, The Roxbury, Tatou, etc.']}, {0: ['INT.', 'DANCE CLUBS- QUICK SHOTS - NIGHT']}, {1: ['Of random dancers -- gyrating, flirting, making out, drinking.']}, {0: ['INT.', 'PALACE - NIGHT']}]


In [53]:
# drop this intermediate to free memory
del screenplay_sents

## 1.8 Sentence Cleaning

We can remove all sentences which contain only EXT/INT

In [52]:
def remove_location(dict_list):
    """Drop sentences that are nothing but an interior/exterior marker.

    The dicts inside ``dict_list`` are modified in place: each value list is
    replaced by a copy without the exact strings 'EXT.', 'INT.', 'ext.' and
    'int.'. The same ``dict_list`` object is returned.
    """
    location_markers = ['EXT.', 'INT.', 'ext.', 'int.']
    for entry in dict_list:
        # Only existing keys are reassigned, so iterating the dict is safe.
        for label in entry:
            entry[label] = [
                sentence for sentence in entry[label]
                if sentence not in location_markers
            ]
    return dict_list

In [54]:
# apply all
screenplays_unlocated = screenplays_encoded.apply(remove_location)
print(screenplays_unlocated[10][:10])

[{0: ['THE LAST OF THE MOHICANS']}, {2: ['Written by Michael Mann &amp; Christopher Crowe']}, {1: ['The screen is a microcosm of leaf, crystal drops of precipitation, a stone, emerald green moss.', "It's a landscape in miniature.", 'We HEAR the forest.', 'Some distant birds.', 'Their sound seems to reverberate as if in a cavern.', 'A piece of sunlight refracts within the drops of water, paints a patch of moss yellow.', 'The whisper of wind is joined by another sound that mixes with it.', 'A distant rustling.', 'It gets closer and louder.', "It's shallow breathing.", 'It gets ominous.', "We're interlopers on the floor of the forest and something is coming."]}, {0: ['SUDDENLY: A MOCCASINED FOOT']}, {1: ['rockets through the frame scaring us and ...']}, {0: ['EXTREMELY CLOSE: PART OF AN INDIAN FACE']}, {1: ['running hard.', 'His head shaved bald except for a scalp-lock.', 'Tattoos.', "He's twenty-five.", 'He seems tall and muscled.', 'Heavy, even breathing.', "We'll learn later"]}, {2: ['

In [56]:
del screenplays_encoded

In [55]:
print_first_lines(screenplays_unlocated[200], 10)

{2: ['Written by Jon Lucas &amp; Scott Moore July 31, 2009']}
{0: ['OPEN ON: PEACEFUL BLACK STILLNESS']}
{1: ['Then we hear a baby SCREAM BLOODY MURDER.', 'Then a second baby joins in, even more shrill than the first.', 'Finally, we hear']}
{2: ['the worst two words a parent can ever hear:']}
{2: ['Your turn.']}
{2: ['Fuck.']}
{0: ['SUBURBAN HOUSE -- NIGHT']}
{1: ['DAVE LOCKWOOD, 30, bleary-eyed father of three, shuffles through his well-appointed suburban home, passing a grandfather clock reading 3:45.', 'He stumbles over a TOY GIRAFFE -- it SQUEAKS, and Dave sleepily mumbles:']}
{2: ['Sorry Hank.']}
{0: ['NURSERY-- NIGHT']}


## 1.9 Word Token Processing

In [57]:
from nltk.tokenize import word_tokenize

def word_tokenize_dicts(dict_list):
    """Replace every sentence string with its list of word tokens.

    The dicts inside ``dict_list`` are modified in place: each value (a list
    of sentences) becomes a list of token lists produced by ``word_tokenize``.
    The same ``dict_list`` object is returned.
    """
    for entry in dict_list:
        # Only existing keys are reassigned, so iterating the dict is safe.
        for label in entry:
            entry[label] = [word_tokenize(sentence) for sentence in entry[label]]
    return dict_list

Unfortunately, we are now dealing with lists of dicts of lists of lists, but it is not clear how to flatten this structure without losing the sentence boundaries.

In [58]:
# apply all 
screenplays_tokenized = screenplays_unlocated.apply(word_tokenize_dicts)
print_first_lines(screenplays_tokenized[0], 10)

{1: [['A', 'NIGHT', 'AT', 'THE', 'ROXBURY']]}
{2: [['written', 'by', 'Steve', 'Koren', 'Will', 'Ferrell', '&', 'Chris', 'Kattan', 'June', '2', ',', '1997']]}
{0: [['PANORAMIC', 'VIEW', 'OF', 'LOS', 'ANGELES', '-', 'SUNSET']]}
{1: [['As', 'we', 'hear', '``', 'What', 'is', 'Love', "''", 'by', 'HADDAWAY', '--', 'night', 'falls', 'and', 'partytime', 'begins', '.']]}
{0: [['SUPERIMPOSE', ':', 'SUNSET', 'BLVD.', ',', '11:03', 'PM']]}
{0: [['DANCE', 'CLUBS', '-', 'NIGHT']]}
{2: [['Coconut', 'Teaser', ',', 'The', 'Palace', ',', 'The', 'Roxbury', ',', 'Tatou', ',', 'etc', '.']]}
{0: [['DANCE', 'CLUBS-', 'QUICK', 'SHOTS', '-', 'NIGHT']]}
{1: [['Of', 'random', 'dancers', '--', 'gyrating', ',', 'flirting', ',', 'making', 'out', ',', 'drinking', '.']]}
{0: [['PALACE', '-', 'NIGHT']]}


In [59]:
del screenplays_unlocated

### 1.9.1  Remove tokens with no letters

In [60]:
import string
puncts = list(string.punctuation)
print(puncts)

['!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', ':', ';', '<', '=', '>', '?', '@', '[', '\\', ']', '^', '_', '`', '{', '|', '}', '~']


In [61]:
import re 

def contains_letters(token):
    """Tell whether ``token`` holds at least one ASCII letter (a-z or A-Z)."""
    return re.search(r'[a-zA-Z]', token) is not None

def remove_non_letters(dict_list):
    """Discard tokens for which ``contains_letters`` is false.

    The dicts inside ``dict_list`` are modified in place: every sentence
    (a list of tokens) is replaced by a filtered copy. Sentences are kept
    even when they end up empty. The same ``dict_list`` object is returned.
    """
    for entry in dict_list:
        # Only existing keys are reassigned, so iterating the dict is safe.
        for label in entry:
            filtered_sentences = []
            for sentence in entry[label]:
                filtered_sentences.append(
                    [token for token in sentence if contains_letters(token)]
                )
            entry[label] = filtered_sentences
    return dict_list

In [62]:
# seems to work so apply all 
screenplays_alpha = screenplays_tokenized.apply(remove_non_letters)
print_first_lines(screenplays_alpha[0], 10)

{1: [['A', 'NIGHT', 'AT', 'THE', 'ROXBURY']]}
{2: [['written', 'by', 'Steve', 'Koren', 'Will', 'Ferrell', 'Chris', 'Kattan', 'June']]}
{0: [['PANORAMIC', 'VIEW', 'OF', 'LOS', 'ANGELES', 'SUNSET']]}
{1: [['As', 'we', 'hear', 'What', 'is', 'Love', 'by', 'HADDAWAY', 'night', 'falls', 'and', 'partytime', 'begins']]}
{0: [['SUPERIMPOSE', 'SUNSET', 'BLVD.', 'PM']]}
{0: [['DANCE', 'CLUBS', 'NIGHT']]}
{2: [['Coconut', 'Teaser', 'The', 'Palace', 'The', 'Roxbury', 'Tatou', 'etc']]}
{0: [['DANCE', 'CLUBS-', 'QUICK', 'SHOTS', 'NIGHT']]}
{1: [['Of', 'random', 'dancers', 'gyrating', 'flirting', 'making', 'out', 'drinking']]}
{0: [['PALACE', 'NIGHT']]}


In [63]:
del screenplays_tokenized

### 1.9.2 Cut micro-tokens

In [64]:
def cut_single_chars(dict_list):
    """Remove tokens shorter than two characters from every sentence.

    The dicts inside ``dict_list`` are modified in place: every sentence
    (a list of tokens) is replaced by a filtered copy. Sentences are kept
    even when they end up empty. The same ``dict_list`` object is returned.
    """
    for entry in dict_list:
        # Only existing keys are reassigned, so iterating the dict is safe.
        for label in entry:
            entry[label] = [
                [token for token in sentence if len(token) >= 2]
                for sentence in entry[label]
            ]
    return dict_list

In [65]:
# apply all 
screenplays_poly = screenplays_alpha.apply(cut_single_chars)
print_first_lines(screenplays_poly[250], 10)

{2: [['Written', 'by', 'Rhett', 'Reese', 'amp', 'Paul', 'Wernick', 'Final', 'Shooting', 'Script', 'November']]}
{1: [['OVER', 'BLACK'], ['Low', 'volume', 'through', 'tinny', 'speaker', 'JUICE', 'NEWTON', "'S", 'ANGEL', 'OF', 'THE', 'MORNING']]}
{0: [['EXT./INT'], ['TAXI', 'CAB', 'MORNING']]}
{1: [['DEADPOOL', 'in', 'full', 'DRESS', 'REDS', 'and', 'MASK', 'quietly', 'FIDGETS', 'in', 'the', 'BACK', 'SEAT', 'of', 'TAXI', 'CAB', 'as', 'it', 'proceeds', 'along', 'CITY', 'FREEWAY'], ['Deadpool', 'adjusts', 'the', 'two', 'KATANAS', 'strapped', 'to', 'his', 'back'], ['Rolls', 'the', 'WINDOWS', 'up', 'down', 'up'], ['Tries', 'futilely', 'to', 'untwist', 'the', 'seatbelt', 'then', 'LUNGES', 'forward', 'locking', 'it', 'up'], ['Rifles', 'through', 'tourist', 'booklet', 'and', 'tears', 'out', 'HAUNTED', 'SEGWAY', 'TOUR', 'coupon'], ['The', 'CABBIE', 'young', 'thin', 'brown', 'glances', 'back', 'and', 'forth', 'from', 'the', 'rear', 'view', 'to', 'the', 'road', 'to', 'the', 'rear', 'view']]}
{2: [[

In [66]:
del screenplays_alpha

### 1.9.3 Stopword Removal

In [67]:
from nltk.corpus import stopwords

# Start from NLTK's English stopword list; it is extended in place below.
stops = stopwords.words('english')
# Extra corpus-specific terms to drop. They are written in lower case because
# remove_stops compares the lower-cased token against this list.
# NOTE(review): most entries look like words from studio copyright/legal
# boilerplate, plus leftovers such as 'amp' and 'ext./int' -- confirm.
extra_stops = [
'fox', 'searchlight', 'pictures', 'inc', 'los', 'angeles', 'ca',
'all', 'rights', 'reserved', 'copyright', 'willow', 'and', 'oak', 'inc.', 'no',
'portion', 'of', 'this', 'script', 'may', 'be', 'performed', 'published', 'reproduced',
'sold', 'or', 'distributed', 'by', 'any', 'means', 'or', 'quoted', 'or', 'published', 'in', 'any',
r'ext./int', 'amp', "'ll", 'ext', 'int'
]

# Append each extra term at most once: the membership test skips terms that
# are already in `stops`, including the repeats inside extra_stops itself.
for s in extra_stops:
    if s not in stops:
        stops.append(s)

def remove_stops(dict_list):
    """Drop stopwords from every tokenised sentence, modifying the dicts in place.

    A token is removed when its lower-cased form is found in the module-level
    ``stops`` collection; tokens that survive keep their original casing.
    Sentences that end up empty are kept as empty lists.

    Args:
        dict_list: list of dicts, each mapping a label to a list of sentences,
            where a sentence is a list of token strings.

    Returns:
        The same ``dict_list`` object, after modification.
    """
    # Snapshot the stopwords into a set once per call: each membership test
    # is then a hash lookup instead of a linear scan of the list per token.
    stop_set = set(stops)
    for d in dict_list:
        for key, value in d.items():
            d[key] = [
                [w for w in sentence if w.lower() not in stop_set]
                for sentence in value]
    return dict_list

screenplays_nonstop = screenplays_poly.apply(remove_stops)

In [68]:
print_first_lines(screenplays_nonstop[60], 10)

{2: [['Written', 'Brian', 'Helgeland', 'White', 'March', 'Blue', 'Revised', 'April', 'Pink', 'Revised', 'April', 'Yellow', 'Revised', 'April', 'Green', 'Revised', 'April', 'Goldenrod', 'Revised', 'Buff', 'Revised', 'Salmon', 'Revised', 'June', 'Cherry', 'Revised', 'June', 'Tan', 'Revised', 'June', 'Double', 'White', 'Revised', 'June', 'Double', 'Blue', 'Revised', 'July']]}
{0: [['PINK', 'REV']]}
{1: [['Fills', 'screen'], ['Falling', 'top', 'frame', 'bottom'], ['Pluming', 'dust'], ['White', 'white', 'white'], ['move', 'toward', 'even', 'recedes', 'always', 'reach'], ['Finally', 'pop', 'wide', 'high', 'reveal'], ['white', 'chalk'], ['old', 'BLACK', 'GROUNDSKEEPER', 'lays', 'right', 'field', 'line', 'baseball', 'diamond']]}
{0: [[], ['BRANCH', 'RICKEY', "'S", 'OFFICE', 'MONTAGUE', 'ST', 'BROOKLYN', 'DAY']]}
{1: [['Blinds', 'closed'], ['Dust', 'motes', 'air'], ['large', 'GOLDFISH', 'TANK', 'bubbles'], ['BRANCH', 'RICKEY', 'desk'], ['Two', 'photos', 'wall', 'Abe', 'Lincoln', 'Leo', 'Duroche

In [69]:
del screenplays_poly

### 1.9.4 Remove Empties

In [70]:
def remove_empties(dict_list):
    """Strip falsy (e.g. empty) sentences out of every value list.

    The dicts inside ``dict_list`` are modified in place; a dict whose
    sentences are all empty keeps its key with an empty list as value.
    The same ``dict_list`` object is returned.
    """
    for entry in dict_list:
        # Only existing keys are reassigned, so iterating the dict is safe.
        for label in entry:
            # filter(None, ...) keeps exactly the truthy items.
            entry[label] = list(filter(None, entry[label]))
    return dict_list

cleaned_screenplays = screenplays_nonstop.apply(remove_empties)
print_first_lines(cleaned_screenplays[44], 10)

{0: [['HEAR', 'SLOW', 'RHYTHMIC', 'CLACK-CLACK-CLACK', 'HARD', 'INSISTENT']]}
{1: [['BEAT', 'PETRIFIED', 'HEART'], ['SUDDENLY', 'FLUORESCENT', 'LIGHT', 'INVADES', 'SPACE', 'HARSH', 'UGLY', 'FIND', 'FEATURELESS', 'WHITE', 'CORRIDOR', 'SEEMS', 'STRETCH', 'INFINITY'], ['SOFT', 'MUFFLED', 'COMING', 'HIDDEN', 'DIMENSION', 'HEAR', 'CRAZED', 'LAUGHTER', 'PATHETIC', 'WHIMPERING']]}
{0: [['WILD', 'CANINE', 'YELPING', 'SOUNDS', 'MADNESS', 'EMANATING', 'WALLS']]}
{2: [['Sometimes', "'m", 'late', 'night', 'think', 'tunnel', 'know', 'one', "'re", 'supposed', 'see']]}
{1: [['die'], ["'s", 'door', 'far', 'end']]}
{2: [['side', 'either', 'heaven', 'hell'], ['beat', 'mind', "'s", 'door']]}
{1: [['FAR', 'END', 'CORRIDOR'], ["'S", 'WINDOW']]}
{2: [['Um'], ['Dr.', 'Howard', 'said', 'five', 'milligrams', 'Zyprexa', 'twice', 'day'], ['Dr.', 'Fein']]}
{2: [['Howard'], ['Fein', "'s", 'one', 'curly', 'hair']]}
{2: [['Right'], ["'s"]]}


In [71]:
del screenplays_nonstop

## 1.10 Convert to Dict structure 

In [72]:
screenplays_dict = cleaned_screenplays.to_dict()
print(screenplays_dict[0][:10])

[{1: [['NIGHT', 'ROXBURY']]}, {2: [['written', 'Steve', 'Koren', 'Ferrell', 'Chris', 'Kattan', 'June']]}, {0: [['PANORAMIC', 'VIEW', 'SUNSET']]}, {1: [['hear', 'Love', 'HADDAWAY', 'night', 'falls', 'partytime', 'begins']]}, {0: [['SUPERIMPOSE', 'SUNSET', 'BLVD.', 'PM']]}, {0: [['DANCE', 'CLUBS', 'NIGHT']]}, {2: [['Coconut', 'Teaser', 'Palace', 'Roxbury', 'Tatou', 'etc']]}, {0: [['DANCE', 'CLUBS-', 'QUICK', 'SHOTS', 'NIGHT']]}, {1: [['random', 'dancers', 'gyrating', 'flirting', 'making', 'drinking']]}, {0: [['PALACE', 'NIGHT']]}]


In [77]:
print(type(screenplays_dict[0]))
print(type(screenplays_dict[0][0]))
print(type(list(screenplays_dict[0][0].values())[0]))


<class 'list'>
<class 'dict'>
<class 'list'>


In [79]:
del cleaned_screenplays

NameError: name 'screenplay_sents' is not defined

# 2. Lemmatization and NER with Spacy

In [78]:
import spacy 
nlp = spacy.load("en_core_web_sm")

## 2.1 Named Entity Recognition

### 2.1.1 Allowed Entities

In [82]:
# Entity labels of interest; this list is the default value of the
# `allowed_entities` parameter of preprocess_json.
# NOTE(review): these look like spaCy's OntoNotes NER label names -- confirm
# against the labels of the loaded model.
allowed_entities = [
    "DATE",
    "TIME",
    "QUANTITY",
    "MONEY",
    "GPE",
    "WORK_OF_ART",
    "FAC",
    "LOC",
    "EVENT"
]

### 2.1.2 Create Corpus for Spacy

In [81]:
# define a function to join json data into a corpus 
def join_json(data):
    """Flatten one screenplay (a list of label -> sentences dicts) into text.

    Every (key, value) pair contributes two lines: a label line '@<key>:'
    and a content line in which each sentence's tokens are joined with a
    space and the sentences are joined with '. '. All lines are finally
    joined with ' \\n '. An empty ``data`` yields an empty string.
    """
    lines = []
    for entry in data:
        for label, sentences in entry.items():
            # label line first, marked with '@' so it stands out in the text
            lines.append('@' + str(label) + ':')
            # then the content line for this label
            lines.append(". ".join(" ".join(tokens) for tokens in sentences))
    return " \n ".join(lines)


In [83]:
# Coarse part-of-speech tags to keep; compared against `token.pos_` in
# lemmatize and preprocess_json, where this list is the default argument.
allowed_postags = [
    "NOUN",
    "ADJ",
    "VERB",
    "INTJ",
    "ADV"
]

In [84]:
def lemmatize(doc, allowed_postags=allowed_postags):
    """Collect the lemmas of the tokens whose POS tag is allowed.

    Args:
        doc: iterable of tokens exposing ``pos_`` and ``lemma_``
            (presumably a spaCy ``Doc``).
        allowed_postags: collection of POS tags to keep.

    Returns:
        A list of ``lemma_`` values, in document order.
    """
    return [token.lemma_ for token in doc if token.pos_ in allowed_postags]

In [88]:
def preprocess_json(json_data, allowed_entities=allowed_entities, allowed_postags=allowed_postags):
    """Turn one screenplay into a flat list of lower-cased tokens.

    The screenplay is flattened to text with ``join_json`` and run through
    the ``nlp`` pipeline. Each token is then handled as follows:

    * if its entity label (``token.ent_type_``) is in ``allowed_entities``,
      its surface text is kept, lower-cased;
    * otherwise, if its POS tag (``token.pos_``) is in ``allowed_postags``,
      its lemma is kept, lower-cased;
    * otherwise it is dropped.

    Args:
        json_data: list of dicts mapping a label to a list of tokenised
            sentences (the structure accepted by ``join_json``).
        allowed_entities: entity labels whose tokens are kept verbatim.
        allowed_postags: POS tags whose tokens are kept as lemmas.

    Returns:
        A list of lower-cased strings in document order.
    """
    text = join_json(json_data)
    doc = nlp(text)
    # Sets give constant-time membership tests inside the token loop.
    entity_labels = set(allowed_entities)
    pos_tags = set(allowed_postags)
    filtered_output = []
    for token in doc:
        # ent_type_ is '' for non-entity tokens, so those fall through to
        # the POS filter, as do entity tokens with a non-allowed label.
        # The earlier `if token.ent_type_:` kept every entity token and
        # never consulted `allowed_entities`.
        if token.ent_type_ in entity_labels:
            filtered_output.append(token.text.lower())
        elif token.pos_ in pos_tags:
            filtered_output.append(token.lemma_.lower())
    return filtered_output

In [89]:
from tqdm.notebook import tqdm 

# Maps each key of screenplays_dict to the token list returned by
# preprocess_json for that screenplay.
processed_screenplays_dict = {}

# Filled one entry at a time (rather than with a comprehension) so that the
# screenplays already processed stay available if the run is interrupted.
for key, value in tqdm(screenplays_dict.items(), desc="Processing Sreenplays"):
    processed_screenplays_dict[key] = preprocess_json(value)

Processing Sreenplays:   0%|          | 0/1995 [00:00<?, ?it/s]

In [90]:
import pickle 

# Build the output location from root_path (set in section 0.2) so the
# project folder only has to be changed in one place.
path = os.path.join(root_path, "preprocessed_data", "processed_screenplays_final.pkl")

# Stream the pickle straight into the file instead of first building the
# whole serialised byte string in memory with pickle.dumps.
with open(path, 'wb') as f:
    pickle.dump(processed_screenplays_dict, f)