# 0. Import and Structure Data

Starting out from JSON data this time. 

In [1]:
import json
import gzip
import numpy as np

path = r"C:\\Users\bened\DataScience\ANLP\AT2\\preprocessed_data\\cleaned_screenplays.json"
with gzip.open(path, 'rt', encoding='utf-8') as f:
    screenplay_data = json.load(f)

print(type(screenplay_data))

screenplay_data = {int(k): v for k, v in screenplay_data.items()}

<class 'dict'>


In [2]:
# check for data corruption 
import re 

allowed = {'@', '.', ':', '\n', '-'}
filtered_tokens = set()

for value in screenplay_data.values():
    for d in value:
        for sentences in d.values():
            for sent in sentences:
                for token in sent:
                    if any(char for char in token if re.search(r'\W', char) and char not in allowed):
                        filtered_tokens.add(token)

print(filtered_tokens)



TODO: 
- remove ' 
- convert '\w/\w' to '_'
- remove hexadecimals 
- sub '\w(.)\w' with '\w \w'
- remove ~ 
- sub '-' with '_' (recognizable bigram)


In [3]:
hex_pat = re.compile(r'[\x00-\x1F\x7F-\x9F]')

def clean_corruptions(string):
    string = re.sub("'", ' ', string)
    string = re.sub(r'(\w)/(\w)', r'\1_\2', string)
    string = re.sub(r'(\w)[^\w\s](\w)', r'\1 \2', string)
    string = re.sub("/", "_", string)
    string = re.sub("-—", '', string)
    string = re.sub(hex_pat, '', string)
    string = re.sub("~", '', string)
    string = re.sub("-", "_", string)
    if string:
        return string

In [4]:
test_string = "he'll a/b~\x13c-c-d"
print(clean_corruptions(test_string))

hell a_bc c_d


In [5]:
for value in screenplay_data.values():
    for d in value:
        for sentences in d.values():
            for sent in sentences:
                for token in sent:
                    token = clean_corruptions(token)

In [6]:
print(len(filtered_tokens))

35874


In [7]:
print(screenplay_data[0])

[{'1': [['night', 'roxbury']]}, {'2': [['written', 'steve', 'koren', 'ferrell', 'chris', 'kattan', 'june']]}, {'0': [['panoramic', 'view', 'sunset']]}, {'1': [['hear', 'love', 'haddaway', 'night', 'falls', 'partytime', 'begins']]}, {'0': [['superimpose', 'sunset', 'blvd.', 'pm']]}, {'0': [['dance', 'clubs', 'night']]}, {'2': [['coconut', 'teaser', 'palace', 'roxbury', 'tatou', 'etc']]}, {'0': [['dance', 'clubs-', 'quick', 'shots', 'night']]}, {'1': [['random', 'dancers', 'gyrating', 'flirting', 'making', 'drinking']]}, {'0': [['palace', 'night']]}, {'1': [['camera', 'moves', 'crowded', 'dance', 'floor', 'settles', 'rhythmically', 'swaying', 'backs']]}, {'1': [['heroes'], ['minds', 'steve', 'tall', 'dark', 'handsome', 'doug', 'little', 'genius'], ['neither', 'correct']]}, {'2': [['except', 'tall', 'little', 'part']]}, {'1': [['simultaneously', 'turn', 'scope', 'room'], ['unison', 'heads', 'bop', 'music'], ['doug', 'steps']]}, {'2': [['bar']]}, {'2': [['o.s'], ['female', 'hey'], ['want',

In [9]:
def convert_nested_ints(data, keys):
    # iterating throug top level keys and lists (each series row)
    for top_key, list in data.items():
        # iterating through each dict in list
        for d in list:
            # check if each input key is in the dict
            for key in keys:
                key_str = str(key)
                # check if the key in the passed key list
                if key_str in d:
                    d[key] = d.pop(key_str)

convert_nested_ints(screenplay_data, np.array([0, 1, 2], dtype=np.int8))

In [10]:
print(type(screenplay_data))
print(type(screenplay_data[0]))
print(type(screenplay_data[0][0]))

<class 'dict'>
<class 'list'>
<class 'dict'>


In [11]:
top_key = list(screenplay_data.keys())[0]
first_dict = screenplay_data[top_key][0]
first_sub_key = list(first_dict.keys())[0]
print(type(first_sub_key))

<class 'numpy.int8'>


## Lemmatization and NER with Spacy 

In [12]:
# ! python -m spacy download en_core_web_sm

In [13]:
import spacy 

nlp = spacy.load("en_core_web_sm", disable=["parser"])

## Named Entity Recognition

### Allowed Entities

In [14]:
allowed_entities = [
    "DATE",
    "TIME",
    "QUANTITY",
    "MONEY",
    "GPE",
    "WORK_OF_ART"
]

### Entity Analysis

First we want to get a sense of what Named Entities will be recognized in a screnplay.  So we'll take 'Night at the Roxbury' and perform NER on that. 

In [15]:
# import copy

# rox = copy.deepcopy(screenplay_data[0])
# print(rox[:10])

In [16]:
# define a function to join json data into a corpus 
def join_json(json_data):
    # empty list for storing joined lines (one line per dict)
    joined_lines = []
    # iterate through dicts
    for d in json_data:
        # unpack keys and values
        for key, value in d.items():
            # convert key to string label with an escape char
            label = '@' + str(key) + ':'
            # append label to corpus
            joined_lines.append(label)
            # create an empty list for joined sentences 
            joined_sentences = []
            # iterate through sentences in value
            for sentence in value:
                # join the sentences with " " 
                joined_sentence = " ".join(sentence)
                # append joined_sentence to joined_sentences
                joined_sentences.append(joined_sentence)
            # now join the sentences in joined_sentences with ". "
            sentences_in_line = ". ".join(joined_sentences)
            # append this line to the joined_lines list
            joined_lines.append(sentences_in_line)
    # now join all the lines in joined_lines with "\n"
    screenplay_text = " \n ".join(joined_lines)
    # and return the text
    return screenplay_text


In [17]:
# # beta test the function
# rox_text = join_json(rox)
# print(rox_text[:100])

In [18]:
# # now we'll try NER on rox_text
# rox_doc = nlp(rox_text)

In [19]:
# rox_ents = {}
# for entity in rox_doc.ents:
#     if entity.label_ in rox_ents:
#         rox_ents[entity.label_].append(entity.text)
#     else:
#         rox_ents[entity.label_] = [entity.text]

In [20]:
# for key in rox_ents.keys():
#     print(key)

In [21]:
# # print first ten entries for each label 
# for label, entity in rox_ents.items():
#     print(f"Entity: {label}")
#     print(f"First 10 entities: {entity[:10]}")
#     print("-" * 40)

Now we'll make a corpus out of all the data, and do NER for this 

In [22]:
keys = list(screenplay_data.keys())
print(keys[:10])

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]


In [23]:
print(type(keys[0]))

<class 'int'>


In [24]:
# # empty list for whole corpus 
# corpora_1 = []
# # iterate through json and apply join_json
# for idx, data in screenplay_data.items():
#     # break when idx==100
#     if idx == 10:
#         break
#     else:
#         # append idx to corpora
#         index_str = "#" + str(idx)
#         corpora_1.append(index_str)
#         text = join_json(data)
#         corpora_1.append(text)

# # join corpora together with "\n\n\n"
# corpus_1 = "\n\n\n".join(corpora_1)

In [25]:
# # search corpora_1 for corrupted data 
# corruption_pattern = re.compile(r'\\x[0-9a-fA-F]{2}')
# corrupted = re.findall(corruption_pattern, repr(corpus_1))

In [26]:
# print(corrupted)

In [27]:
# print(len(corrupted))

In [28]:
# wrap whole corpus in spacy doc
# corpus_doc1 = nlp(corpus_1)

In [29]:
def find_entities(doc):
    ents = {}
    for entity in doc.ents:
        if entity.label_ in ents:
            ents[entity.label_].append(entity.text)
        else:
            ents[entity.label_] = [entity.text]
    return ents

In [30]:
# ents10 = find_entities(corpus_doc1)

In [31]:

import random 

def show_ent_sample(ents_dict, n=10):
    for label, entity in ents_dict.items():
        print(f"Entity: {label}")
        if len(entity) >= n:
            ents_sample = random.sample(entity, n)
        else:
            ents_sample = entity
        print(f"Sample of {n}: {ents_sample}")
        print("-" * 50)

At a glance: 

- PERSON entities can probably be excluded. 
- MONEY contains swear words so obviously stays in 
- QUANTITY doesn't from this sample but easily could e.g. "two hundred fucking pounds" 
- CARDINAL can be excluded 
- ORDINAL can probably be excluded 
- TIME could possibly be relevant to context, e.g. are adult films more likely to be set at night? 

In [32]:
allowed_entities = list(allowed_entities)
more_ents = [
    "FAC",
    "LOC",
    "EVENT"
]
for ent in more_ents:
    allowed_entities.append(ent)

allowed_entities = set(allowed_entities)
allowed_entities

{'DATE',
 'EVENT',
 'FAC',
 'GPE',
 'LOC',
 'MONEY',
 'QUANTITY',
 'TIME',
 'WORK_OF_ART'}

In [33]:
def filter_entities_v3(doc, allowed_entities=allowed_entities):

    filtered_spans = []

    for ent in doc.ents:
        if ent.label_ in allowed_entities:
            filtered_spans.append(ent)

    filtered_tokens = [
        token for token in doc if not token.ent_type_ or token.ent_type_ in allowed_entities
    ]

    return filtered_tokens

In [34]:
# mohicans_filtered = filter_entities_v3(mohicans_doc)
# print([token.text for token in mohicans_filtered])
# filtered_ents = [
#     ent for ent in mohicans_doc.ents if ent.label_ in allowed_entities
# ]
# print([(ent.text, ent.label_) for ent in filtered_ents])


In [35]:
# filter_entities v.2
from spacy.tokens import Span

def filter_entities_v2(doc, allowed_entities):

    spans = []

    for ent in doc.ents:
        if ent.label_ in allowed_entities:
            spans.append(ent)
    
    for token in doc:
        if not token.ent_type_:
            spans.append(Span(doc, token.i, token.i+1))
    
    # Sort spans by token index
    spans = sorted(spans, key=lambda span: span.start)

    filtered_doc = Span(doc, spans[0].start, spans[-1].end).as_doc()

    return filtered_doc 

In [36]:
allowed_entities = set(allowed_entities)
allowed_entities

{'DATE',
 'EVENT',
 'FAC',
 'GPE',
 'LOC',
 'MONEY',
 'QUANTITY',
 'TIME',
 'WORK_OF_ART'}

In [37]:
# mohicans_filtered = filter_entities_v2(mohicans_doc, allowed_entities=allowed_entities)
# mohicans_ents = find_entities(mohicans_filtered)
# show_ent_sample(mohicans_ents, 20)

- PRODUCT probably useless
- GPE likely useless
- 

In [38]:
# print(type(mohicans_doc))

In [39]:
# define a function to filter out entities from a spacy doc 
from spacy.tokens import Span

def filter_entities(doc, allowed_entities):

    spans = []

    for token in doc:
        # if token has an entity type
        if token.ent_type_:
            # and if its type is in allowed_entities 
            if token.ent_type_ in allowed_entities:
                # append to the filtered doc 
                filtered_tokens.append(token)
        else:
            # if token is not an entity, append it to filtered doc
            filtered_tokens.append(token)
    # recreate spacy Doc from the filtered tokens
    filtered_doc = Doc(
        doc.vocab, 
        words=[t.text for t in filtered_tokens],
        spaces=[t.whitespace_ for t in filtered_tokens])
    
    return filtered_doc


allowed entities: DATE could make sense for determining context 

In [40]:
# # test the function on mohicans 
# ## BEFORE filtration

# mohicans = copy.deepcopy(screenplay_data[10])
# mohicans_text = join_json(mohicans)
# # to doc 
# mohicans_doc = nlp(mohicans_text)
# # find entities 
# mohicans_ents = find_entities(mohicans_doc)
# # print random sample
# show_ent_sample(mohicans_ents, 20)


"QUANTITY" could be relevant e.g. "thirteen inch mortars" 
"DATE" could even be relevant e.g. "DOOMSDAY" 

If I had time I would do this before converting chars to lower 

In [41]:
allowed_entities.add("QUANTITY")
allowed_entities.add("DATE")
allowed_entities

{'DATE',
 'EVENT',
 'FAC',
 'GPE',
 'LOC',
 'MONEY',
 'QUANTITY',
 'TIME',
 'WORK_OF_ART'}

## Comprehensive Preprocessing function 

We want a function that will: 

1. Take an item of screenplay data as input

2. Create a corpus by calling the join_json function 

3. Wrap corpus in a doc 

4. filter out entities not in allowed_entities 

5. lemmatize where pos in allowed_postags 

6. return lemmatized and filtered data 

In [42]:
allowed_postags = [
    "NOUN",
    "ADJ",
    "VERB",
    "INTJ",
    "ADV"
]

In [43]:
def lemmatize(doc, allowed_postags=allowed_postags):

    lemmas = [] 

    for token in doc:
        if token.pos_ in allowed_postags:
            lemmas.append(t.lemma_)
    
    return lemmas 

In [44]:
allowed_entities

{'DATE',
 'EVENT',
 'FAC',
 'GPE',
 'LOC',
 'MONEY',
 'QUANTITY',
 'TIME',
 'WORK_OF_ART'}

In [45]:
def preprocess_json(json_data, allowed_entities=allowed_entities, allowed_postags=allowed_postags):

    text = join_json(json_data)

    doc = nlp(text)

    filtered_output = []

    for token in doc: 
        if token.ent_type_:
            filtered_output.append(token.text)
        else:
            if token.pos_ in allowed_postags:
                filtered_output.append(token.lemma_)
    
    return filtered_output

In [46]:
# del corpora_1, corpus_1, corpus_doc1, data, ents10, filtered_ents, keys, mohicans, mohicans_doc, mohicans_ents, mohicans_filtered, mohicans_text

NameError: name 'corpora_1' is not defined

In [51]:
# ! pip install tqdm



In [53]:
# ! pip install ipywidgets

Collecting ipywidgets
  Downloading ipywidgets-8.1.5-py3-none-any.whl.metadata (2.3 kB)
Collecting widgetsnbextension~=4.0.12 (from ipywidgets)
  Downloading widgetsnbextension-4.0.13-py3-none-any.whl.metadata (1.6 kB)
Collecting jupyterlab-widgets~=3.0.12 (from ipywidgets)
  Downloading jupyterlab_widgets-3.0.13-py3-none-any.whl.metadata (4.1 kB)
Downloading ipywidgets-8.1.5-py3-none-any.whl (139 kB)
Downloading jupyterlab_widgets-3.0.13-py3-none-any.whl (214 kB)
Downloading widgetsnbextension-4.0.13-py3-none-any.whl (2.3 MB)
   ---------------------------------------- 0.0/2.3 MB ? eta -:--:--
   ---------------------- ----------------- 1.3/2.3 MB 8.4 MB/s eta 0:00:01
   ---------------------------------------- 2.3/2.3 MB 7.0 MB/s eta 0:00:00
Installing collected packages: widgetsnbextension, jupyterlab-widgets, ipywidgets
Successfully installed ipywidgets-8.1.5 jupyterlab-widgets-3.0.13 widgetsnbextension-4.0.13


In [47]:
processed_screenplays_dict = {}

for key, value in screenplay_data.items():
    processed_screenplays_dict[key] = preprocess_json(value)

In [48]:
# from tqdm.notebook import tqdm


# processed_screenplays_dict = {}

# for key, value in tqdm(screenplay_data.items(), desc="Preprocessing Screenplays"):
    
#     processed_screenplays_dict[key] = preprocess_json(value)

In [49]:
print(len(processed_screenplays_dict))

1995


In [50]:
print(processed_screenplays_dict[0][:10])

['@1', 'night', 'write', 'steve', 'koren', 'chris', 'kattan', 'june', 'panoramic', 'view']


# Save Results 

In [53]:
import json
import gzip
import os 

def json_safe_save(dict, file_path):
    try:
        with gzip.open(file_path, 'wt', encoding='utf=8') as f:
            json.dump(dict, f, ensure_ascii=False, indent=2)
        print(f"File saved successfully at {file_path}")
    except (OSError, IOError) as e:
        print(f"Error saving JSON: {e}")

path = r"C:\\Users\bened\DataScience\ANLP\AT2\\preprocessed_data\screenplays_preprocessed2.json.gz"

json_safe_save(processed_screenplays_dict, path)

File saved successfully at C:\\Users\bened\DataScience\ANLP\AT2\\preprocessed_data\screenplays_preprocessed2.json.gz
