# 0. Import and Structure Data

Starting out from JSON data this time. 

In [1]:
import json
import numpy as np

path = r"C:\\Users\bened\DataScience\ANLP\AT2\\preprocessed_data\\cleaned_screenplays.json"
with open(path, 'r', encoding='utf-8') as f:
    screenplay_data = json.load(f)

screenplay_data = {int(k): v for k, v in screenplay_data.items()}

In [4]:
# check for data corruption 
import re 

allowed = {'@', '.', ':', '\n', '-'}
filtered_tokens = set()

for value in screenplay_data.values():
    for d in value:
        for sentences in d.values():
            for sent in sentences:
                for token in sent:
                    if any(char for char in token if re.search(r'\W', char) and char not in allowed):
                        filtered_tokens.add(token)

print(filtered_tokens)



In [5]:
print(len(filtered_tokens))

83470


In [6]:
print(filtered_tokens[:10])

TypeError: 'set' object is not subscriptable

In [7]:
print(screenplay_data[0])

[{'1': [['night', 'roxbury']]}, {'2': [['written', 'steve', 'koren', 'ferrell', 'chris', 'kattan', 'june']]}, {'0': [['panoramic', 'view', 'sunset']]}, {'1': [['hear', 'love', 'haddaway', 'night', 'falls', 'partytime', 'begins']]}, {'0': [['superimpose', 'sunset', 'blvd.', 'pm']]}, {'0': [['dance', 'clubs', 'night']]}, {'2': [['coconut', 'teaser', 'palace', 'roxbury', 'tatou', 'etc']]}, {'0': [['dance', 'clubs-', 'quick', 'shots', 'night']]}, {'1': [['random', 'dancers', 'gyrating', 'flirting', 'making', 'drinking']]}, {'0': [['palace', 'night']]}, {'1': [['camera', 'moves', 'crowded', 'dance', 'floor', 'settles', 'rhythmically', 'swaying', 'backs']]}, {'1': [['heroes'], ['minds', 'steve', 'tall', 'dark', 'handsome', 'doug', 'little', 'genius'], ['neither', 'correct']]}, {'2': [['except', 'tall', 'little', 'part']]}, {'1': [['simultaneously', 'turn', 'scope', 'room'], ['unison', 'heads', 'bop', 'music'], ['doug', 'steps']]}, {'2': [['bar']]}, {'2': [['o.s'], ['female', 'hey'], ['want',

In [8]:
def convert_nested_ints(data, keys):
    # iterating throug top level keys and lists (each series row)
    for top_key, list in data.items():
        # iterating through each dict in list
        for d in list:
            # check if each input key is in the dict
            for key in keys:
                key_str = str(key)
                # check if the key in the passed key list
                if key_str in d:
                    d[key] = d.pop(key_str)

convert_nested_ints(screenplay_data, np.array([0, 1, 2], dtype=np.int8))

In [9]:
print(type(screenplay_data))
print(type(screenplay_data[0]))
print(type(screenplay_data[0][0]))

<class 'dict'>
<class 'list'>
<class 'dict'>


In [10]:
top_key = list(screenplay_data.keys())[0]
first_dict = screenplay_data[top_key][0]
first_sub_key = list(first_dict.keys())[0]
print(type(first_sub_key))

<class 'numpy.int8'>


## Lemmatization and NER with Spacy 

In [11]:
import spacy 

nlp = spacy.load("en_core_web_sm", disable=["parser"])

OSError: [E050] Can't find model 'en_core_web_sm'. It doesn't seem to be a Python package or a valid path to a data directory.

## Named Entity Recognition

First we want to get a sense of what Named Entities will be recognized in a screnplay.  So we'll take 'Night at the Roxbury' and perform NER on that. 

In [18]:
import copy

rox = copy.deepcopy(screenplay_data[0])
print(rox[:10])

[{np.int8(1): [['night', 'roxbury']]}, {np.int8(2): [['written', 'steve', 'koren', 'ferrell', 'chris', 'kattan', 'june']]}, {np.int8(0): [['panoramic', 'view', 'sunset']]}, {np.int8(1): [['hear', 'love', 'haddaway', 'night', 'falls', 'partytime', 'begins']]}, {np.int8(0): [['superimpose', 'sunset', 'blvd.', 'pm']]}, {np.int8(0): [['dance', 'clubs', 'night']]}, {np.int8(2): [['coconut', 'teaser', 'palace', 'roxbury', 'tatou', 'etc']]}, {np.int8(0): [['dance', 'clubs-', 'quick', 'shots', 'night']]}, {np.int8(1): [['random', 'dancers', 'gyrating', 'flirting', 'making', 'drinking']]}, {np.int8(0): [['palace', 'night']]}]


In [13]:
# define a function to join json data into a corpus 
def join_json(json_data):
    # empty list for storing joined lines (one line per dict)
    joined_lines = []
    # iterate through dicts
    for d in json_data:
        # unpack keys and values
        for key, value in d.items():
            # convert key to string label with an escape char
            label = '@' + str(key) + ':'
            # append label to corpus
            joined_lines.append(label)
            # create an empty list for joined sentences 
            joined_sentences = []
            # iterate through sentences in value
            for sentence in value:
                # join the sentences with " " 
                joined_sentence = " ".join(sentence)
                # append joined_sentence to joined_sentences
                joined_sentences.append(joined_sentence)
            # now join the sentences in joined_sentences with ". "
            sentences_in_line = ". ".join(joined_sentences)
            # append this line to the joined_lines list
            joined_lines.append(sentences_in_line)
    # now join all the lines in joined_lines with "\n"
    screenplay_text = " \n ".join(joined_lines)
    # and return the text
    return screenplay_text


In [24]:
# beta test the function
rox_text = join_json(rox)
print(rox_text[:100])

@1:
night roxbury
@2:
written steve koren ferrell chris kattan june
@0:
panoramic view sunset
@1:
he


In [25]:
# now we'll try NER on rox_text
rox_doc = nlp(rox_text)

In [27]:
rox_ents = {}
for entity in rox_doc.ents:
    if entity.label_ in rox_ents:
        rox_ents[entity.label_].append(entity.text)
    else:
        rox_ents[entity.label_] = [entity.text]

dict_keys(['PERSON', 'DATE', 'TIME', 'CARDINAL', 'ORDINAL', 'ORG', 'QUANTITY', 'MONEY', 'GPE', 'NORP', 'FAC', 'LOC', 'LAW', 'PRODUCT', 'PERCENT'])


In [28]:
for key in rox_ents.keys():
    print(key)

PERSON
DATE
TIME
CARDINAL
ORDINAL
ORG
QUANTITY
MONEY
GPE
NORP
FAC
LOC
LAW
PRODUCT
PERCENT


In [33]:
# print first ten entries for each label 
for label, entity in rox_ents.items():
    print(f"Entity: {label}")
    print(f"First 10 entities: {entity[:10]}")
    print("-" * 40)

Entity: PERSON
First 10 entities: ['steve koren', 'chris kattan', 'steve tall dark', 'unison', 'steve', 'steve', 'doug', 'steve', 'doug steve', 'jerkoff\n@2:']
----------------------------------------
Entity: DATE
First 10 entities: ['june', 'next two', 'plus month', 'two years', 'one days', 'days', 'today', '80s', 'fifties', 'saturday']
----------------------------------------
Entity: TIME
First 10 entities: ['night', 'night', 'night', 'tonight', 'later night', 'tonight', 'half hour', 'night', 'last night', 'afternoon']
----------------------------------------
Entity: CARDINAL
First 10 entities: ['two', 'two', 'five', 'one', 'five', 'fifty', 'five', 'one', 'two', 'one']
----------------------------------------
Entity: ORDINAL
First 10 entities: ['second', 'second', 'second', 'second', 'first', 'first', 'first', 'first', 'first', 'second']
----------------------------------------
Entity: ORG
First 10 entities: ['bmw', 'min-mart', 'red licorice reaches frame', 'tenille', 'troll', 'butab

Now we'll make a corpus out of all the data, and do NER for this 

In [46]:
keys = list(screenplay_data.keys())
print(keys[:10])

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]


In [47]:
print(type(keys[0]))

<class 'int'>


In [14]:
# empty list for whole corpus 
corpora_1 = []
# iterate through json and apply join_json
for idx, data in screenplay_data.items():
    # break when idx==100
    if idx == 10:
        break
    else:
        # append idx to corpora
        index_str = "#" + str(idx)
        corpora_1.append(index_str)
        text = join_json(data)
        corpora_1.append(text)

# join corpora together with "\n\n\n"
corpus_1 = "\n\n\n".join(corpora_1)

In [23]:
# search corpora_1 for corrupted data 
corruption_pattern = re.compile(r'\\x[0-9a-fA-F]{2}')
corrupted = re.findall(corruption_pattern, repr(corpus_1))

In [24]:
print(corrupted)

['\\x92', '\\x92', '\\x92', '\\x93', '\\x93', '\\x93', '\\x92', '\\x94', '\\x93', '\\x92', '\\x92']


In [22]:
print(len(corrupted))

0


In [51]:
# wrap whole corpus in spacy doc
corpus_doc1 = nlp(corpus_1)

In [52]:
def find_entities(doc):
    ents = {}
    for entity in doc.ents:
        if entity.label_ in ents:
            ents[entity.label_].append(entity.text)
        else:
            ents[entity.label_] = [entity.text]
    return ents

In [54]:
ents10 = find_entities(corpus_doc1)

import random 

def show_ent_sample(ents_dict, n=10):
    for label, entity in ents_dict.items():
        print(f"Entity: {label}")
        if len(entity) >= n:
            ents_sample = random.sample(entity, n)
        else:
            ents_sample = entity
        print(f"Sample of {n}: {ents_sample}")
        print("-" * 50)

show_ent_sample(ents10)

Entity: PERSON
Sample of 10: ['elliot', 'kramer', 'carl', 'wallace', 'knocks head rock', 'anderson', 'footsteps', 'zadir', 'william looks', "jim. 's"]
--------------------------------------------------
Entity: DATE
Sample of 10: ['macaroon', 'couple days', 'next wednesday', 'april', 'june', 'tomorrow', 'first day', 'august', 'yesterday', 'four months ago']
--------------------------------------------------
Entity: TIME
Sample of 10: ['night', 'evening', "almost twenty minutes 's", 'morning', 'night', 'night', 'night', 'room morning', 'tonight', 'night']
--------------------------------------------------
Entity: CARDINAL
Sample of 10: ['four', 'one', 'two', 'one', 'two', 'one', '15int', 'one', 'two', 'two']
--------------------------------------------------
Entity: ORDINAL
Sample of 10: ['first', 'fifth', 'first', 'second', 'fifth', 'first', 'first', 'second', '5th', 'second']
--------------------------------------------------
Entity: ORG
Sample of 10: ['blacksmith', 'virgil misses hits

In [103]:
show_ent_sample(ents10, 20)

Entity: PERSON
Sample of 20: ['elliot', 'wallace kneels', 'lee frederick', 'william.', 'larry', 'amy', 'peel', 'steve', 'steve doug', 'delacroix', 'shot.', "duncan roswell 's", 'kick mantan sings', 'jim plus', 'boodles lobby', 'carl john kapelos\n@2:', 'steve doug', 'norma', 'kramer apt', 'jeffrey\n@2:']
--------------------------------------------------
Entity: DATE
Sample of 20: ['three months', 'tuesday', 'today', 'nineteen-year-old', 'next months', 'april', 'january sixth', 'late day', 'weeks later', 'saturday', 'quarter', 'six months', 'two-year-old', 'late day', 'eight years', '200int. jag day', 'weekend', 'saturday shabbas', 'sunday', 'fifteen-year- old']
--------------------------------------------------
Entity: TIME
Sample of 20: ['twenty minutes', 'every fifteen seconds', 'last night', 'every morning', 'next morning', 'tonight', 'night', 'late tonight', 'morning', 'fifteen minutes', 'night', 'five minutes', 'morning', 'night', 'twenty-four hours', 'afternoon', 'seven eight ev

At a glance: 

- PERSON entities can probably be excluded. 
- MONEY contains swear words so obviously stays in 
- QUANTITY doesn't from this sample but easily could e.g. "two hundred fucking pounds" 
- CARDINAL can be excluded 
- ORDINAL can probably be excluded 
- TIME could possibly be relevant to context, e.g. are adult films more likely to be set at night? 

In [101]:
allowed_entities = set(allowed_entities)
allowed_entities

{'DATE', 'GPE', 'MONEY', 'QUANTITY', 'TIME', 'WORK_OF_ART'}

In [104]:
allowed_entities = list(allowed_entities)
more_ents = [
    "FAC",
    "LOC",
    "EVENT"
]
for ent in more_ents:
    allowed_entities.append(ent)

allowed_entities = set(allowed_entities)
allowed_entities

{'DATE',
 'EVENT',
 'FAC',
 'GPE',
 'LOC',
 'MONEY',
 'QUANTITY',
 'TIME',
 'WORK_OF_ART'}

In [94]:
def filter_entities_v3(doc, allowed_entities=allowed_entities):

    filtered_spans = []

    for ent in doc.ents:
        if ent.label_ in allowed_entities:
            filtered_spans.append(ent)

    filtered_tokens = [
        token for token in doc if not token.ent_type_ or token.ent_type_ in allowed_entities
    ]

    return filtered_tokens

In [97]:
mohicans_filtered = filter_entities_v3(mohicans_doc)
print([token.text for token in mohicans_filtered])
filtered_ents = [
    ent for ent in mohicans_doc.ents if ent.label_ in allowed_entities
]
print([(ent.text, ent.label_) for ent in filtered_ents])


['@0', ':', '\n ', '\n ', '@2', ':', '\n ', 'written', '@1', ':', '\n ', 'screen', 'microcosm', 'leaf', 'crystal', 'drops', 'precipitation', 'stone', 'emerald', 'green', 'moss', '.', "'s", 'landscape', 'miniature', '.', 'hear', 'forest', '.', 'distant', 'birds', '.', 'sound', 'seems', 'reverberate', 'cavern', '.', 'piece', 'sunlight', 'refracts', 'within', 'drops', 'water', 'paints', 'patch', 'moss', 'yellow', '.', 'whisper', 'wind', 'joined', 'another', 'sound', 'mixes', '.', 'distant', 'rustling', '.', 'gets', 'closer', 'louder', '.', "'s", 'shallow', 'breathing', '.', 'gets', 'ominous', '.', "'re", 'interlopers', 'floor', 'forest', 'something', 'coming', '\n ', '@0', ':', '\n ', 'suddenly', 'moccasined', 'foot', '\n ', '@1', ':', '\n ', 'rockets', 'frame', 'scaring', 'us', '\n ', '@0', ':', '\n ', 'extremely', 'close', 'part', 'face', '\n ', '@1', ':', '\n ', 'running', 'hard', '.', 'head', 'shaved', 'bald', 'except', 'scalp', '-', 'lock', '.', '.', "'s", '.', 'seems', 'tall', 'musc

In [84]:
# filter_entities v.2
from spacy.tokens import Span

def filter_entities_v2(doc, allowed_entities):

    spans = []

    for ent in doc.ents:
        if ent.label_ in allowed_entities:
            spans.append(ent)
    
    for token in doc:
        if not token.ent_type_:
            spans.append(Span(doc, token.i, token.i+1))
    
    # Sort spans by token index
    spans = sorted(spans, key=lambda span: span.start)

    filtered_doc = Span(doc, spans[0].start, spans[-1].end).as_doc()

    return filtered_doc 

In [85]:
allowed_entities = set(allowed_entities)
allowed_entities

{'DATE', 'GPE', 'MONEY', 'QUANTITY', 'TIME', 'WORK_OF_ART'}

In [86]:
mohicans_filtered = filter_entities_v2(mohicans_doc, allowed_entities=allowed_entities)
mohicans_ents = find_entities(mohicans_filtered)
show_ent_sample(mohicans_ents, 20)

Entity: PERSON
Sample of 20: ['poe', 'munro', 'hawkeye cora wall', 'jack winthrop', 'cora', 'jeffrey beams', 'duncan', "hudson bay 'd", 'duncan', 'cora', 'unison slam', 'munro', 'ian', 'james. hawkeye goes', 'cora reaches alice grabs', 'phelps', 'cora munro', 'hawkeye little apart', 'hawkeye strips buckskin', 'magua']
--------------------------------------------------
Entity: NORP
Sample of 20: ['americans', 'french', 'british', 'french', 'french', 'mohican', 'french', 'mohican', 'french', 'english', 'french', 'french', 'half-indian', 'french', 'british', 'english', 'indian', 'french', 'english', 'yengeese']
--------------------------------------------------
Entity: CARDINAL
Sample of 20: ['one', 'two', 'two', 'two', 'one', 'thirteen', 'two hundred fifty', 'ten', 'two', 'two', 'half', 'one', 'twenty-five', 'three', 'two', 'two', 'two', 'two-thirds', 'twelve', 'four']
--------------------------------------------------
Entity: DATE
Sample of 20: ['another year', 'two three years', 'seven

- PRODUCT probably useless
- GPE likely useless
- 

In [87]:
print(type(mohicans_doc))

<class 'spacy.tokens.doc.Doc'>


In [88]:
# define a function to filter out entities from a spacy doc 
from spacy.tokens import Span

def filter_entities(doc, allowed_entities):

    spans = []

    for token in doc:
        # if token has an entity type
        if token.ent_type_:
            # and if its type is in allowed_entities 
            if token.ent_type_ in allowed_entities:
                # append to the filtered doc 
                filtered_tokens.append(token)
        else:
            # if token is not an entity, append it to filtered doc
            filtered_tokens.append(token)
    # recreate spacy Doc from the filtered tokens
    filtered_doc = Doc(
        doc.vocab, 
        words=[t.text for t in filtered_tokens],
        spaces=[t.whitespace_ for t in filtered_tokens])
    
    return filtered_doc


allowed entities: DATE could make sense for determining context 

In [89]:
allowed_entities = [
    "DATE",
    "TIME",
    "QUANTITY",
    "MONEY",
    "GPE",
    "WORK_OF_ART"
]

In [90]:
# test the function on mohicans 
## BEFORE filtration

mohicans = copy.deepcopy(screenplay_data[10])
mohicans_text = join_json(mohicans)
# to doc 
mohicans_doc = nlp(mohicans_text)
# find entities 
mohicans_ents = find_entities(mohicans_doc)
# print random sample
show_ent_sample(mohicans_ents, 20)


Entity: PERSON
Sample of 20: ['frozen moment', 'chingachgook', 'george ii', 'grey hair \n ', 'hawkeye', 'hawkeye', 'magua \n ', 'knocks', 'hawkeye', 'grey wolf', 'sun', "magua jams montcalm 's", 'hawkeye', 'john \n ', "john cameron 's", 'gambio', 'alice \n ', 'huron', 'munro \n ', 'cameron jack']
--------------------------------------------------
Entity: NORP
Sample of 20: ['english \n ', 'english', 'french', 'french', 'french', 'english', 'french', 'french', 'french', 'yengeese', 'european', 'french', 'french', 'mohican', 'french', 'french', 'mohican', 'fort french', 'european', 'english']
--------------------------------------------------
Entity: CARDINAL
Sample of 20: ['six', 'one', 'five', 'one', 'four', 'eighteen', 'two', 'two-thirds', 'one', 'one', 'nine', 'half', 'two', 'quarter', 'three', 'two', 'three', 'twenty-five', 'three', 'one']
--------------------------------------------------
Entity: PRODUCT
Sample of 20: ['munro', 'munro', 'munro', 'munro', 'munro', 'munro', 'munro', 

"QUANTITY" could be relevant e.g. "thirteen inch mortars" 
"DATE" could even be relevant e.g. "DOOMSDAY" 

If I had time I would do this before converting chars to lower 

In [91]:
allowed_entities.append("QUANTITY")
allowed_entities.append("DATE")
allowed_entities

['DATE', 'TIME', 'QUANTITY', 'MONEY', 'GPE', 'WORK_OF_ART', 'QUANTITY', 'DATE']

## Comprehensive Preprocessing function 

We want a function that will: 

1. Take an item of screenplay data as input

2. Create a corpus by calling the join_json function 

3. Wrap corpus in a doc 

4. filter out entities not in allowed_entities 

5. lemmatize where pos in allowed_postags 

6. return lemmatized and filtered data 

In [70]:
allowed_postags = [
    "NOUN",
    "ADJ",
    "VERB",
    "INTJ",
    "ADV"
]

In [71]:
def lemmatize(doc, allowed_postags=allowed_postags):

    lemmas = [] 

    for token in doc:
        if token.pos_ in allowed_postags:
            lemmas.append(t.lemma_)
    
    return lemmas 

In [99]:
allowed_entities

['DATE', 'TIME', 'QUANTITY', 'MONEY', 'GPE', 'WORK_OF_ART', 'QUANTITY', 'DATE']

In [98]:
def preprocess_json(json_data, allowed_entities=allowed_entities, allowed_postags=allowed_postags):

    text = join_json(json_data)

    doc = nlp(text)

    filtered_output = []

    for token in doc: 
        if token.ent_type_:
            filtered_output.append(token.text)
        else:
            if token.pos_ in allowed_postags:
                filtered_output.append(token.lemma_)
    
    return filtered_output

In [106]:
del corpora_1, corpus_1, corpus_doc1, data, ents10, filtered_ents, keys, mohicans, mohicans_doc, mohicans_ents, mohicans_filtered, mohicans_text

In [108]:
# ! pip install tqdm



In [None]:
# check for corrupted data 
import re 

allowed = {'@', '.', ':', '\n', '-'}
filtered_tokens = set()
for val in screenplay_data.values():
    

In [109]:
from tqdm.notebook import tqdm


processed_screenplays_dict = {}

for key, value in tqdm(screenplay_data.items(), desc="Preprocessing Screenplays"):
    
    processed_screenplays_dict[key] = preprocess_json(value)

Preprocessing Screenplays:   0%|          | 0/1995 [00:00<?, ?it/s]

In [111]:
print(len(processed_screenplays_dict))

1995


In [112]:
print(processed_screenplays_dict[0][:10])

['@1', 'night', 'write', 'steve', 'koren', 'chris', 'kattan', 'june', 'panoramic', 'view']


In [113]:
path = r"C:\\Users\bened\DataScience\ANLP\AT2\screenplays_preprocessed.json"
with open(path, 'w', encoding='utf-8') as o:
    json.dump(processed_screenplays_dict, o, ensure_ascii=False)