In [1]:
import PyPDF2
import regex as re

In [2]:
# Hand PdfReader the path itself instead of a manually opened file object:
# the original left the handle from open() unclosed and reused the name
# `pdf` for two different kinds of object (file handle, then reader).
pdf = PyPDF2.PdfReader('movie_scripts/american_psycho.pdf')

In [3]:
def _extract_dialogue_lines(text: str):
    """Reduce the raw text of one script page to its '<NAME> dialogue' lines.

    Args:
        text: Text of a single page, with '\n' line breaks.

    Returns:
        A list of strings, one per speaker turn, each starting with the
        speaker tag (e.g. '<BOND> I miss you. '). Empty when the page holds
        no recognisable dialogue.
    """
    # A character cue is one or more upper-case words separated by single
    # spaces, so multi-word speakers such as "OLD MAN" are recognised too.
    name = r'[A-Z]+(?: [A-Z]+)*'
    tag = r'<' + name + r'>'

    # Step 1: Remove text within parentheses (may span a line break).
    cleaned = re.sub(r"\([^)]*\)", "", text)

    # Step 2: Replace character names surrounded by newlines with <NAME>.
    cleaned = re.sub(r'(?<=\n)(' + name + r')(?=\n)', r'<\1>', cleaned)

    # Step 3: Remove mid sentence new lines. A newline directly in front of a
    # <NAME> tag is kept: joining the tag onto a preceding non-dialogue line
    # would make Step 6 throw that whole line, and the dialogue, away.
    cleaned = re.sub(r'(?<![.?!])\n(?!\s|<)', ' ', cleaned)

    # Step 4: Remove newlines following a <NAME> pattern, so the tag and the
    # dialogue share one line even when a blank line separated them.
    cleaned = re.sub(r'(' + tag + r')\n', r'\1', cleaned)

    # (The former Step 5 replaced every match with itself and has been dropped.)

    # Step 6: Blank out every line that does not start with a <...> tag.
    cleaned = re.sub(r'^(?!<.*?>).*$', '', cleaned, flags=re.M)

    # Step 7: Collapse every run of whitespace (newlines included) to one space.
    cleaned = re.sub(r'\s+', ' ', cleaned)

    # Step 8: Add a newline before each <NAME> so each turn is its own line.
    cleaned = re.sub(r'(' + tag + r')', r'\n\1', cleaned)

    # Step 9: Remove a trailing one-to-three digit number followed by a dot.
    cleaned = re.sub(r'(\d{1,3})\.$', '', cleaned)

    # Drop lines that contain nothing but whitespace.
    return [line for line in cleaned.splitlines() if line.strip()]


def read_pdf(path_to_pdf: str):
    """Extract the dialogue of a movie-script PDF, grouped by page.

    Prints the path and the page count, then runs every page's text through
    the regex clean-up in `_extract_dialogue_lines`.

    Args:
        path_to_pdf: Path of the PDF file to read.

    Returns:
        A list with one entry per page that contains dialogue; each entry is
        a list of '<NAME> dialogue' strings. Pages without dialogue are
        omitted.

    Raises:
        OSError: If the file cannot be opened.
    """
    conversations_by_page = []

    # The context manager closes the file once all pages are processed; the
    # pages are read inside the block, while the handle is still open.
    with open(path_to_pdf, 'rb') as pdf_file:
        pdf = PyPDF2.PdfReader(pdf_file)

        print('Path to PDF:', path_to_pdf)
        print('Number of pages of pdf:', len(pdf.pages))

        for page in pdf.pages:
            # Guard against a page that yields no text at all.
            page_list = _extract_dialogue_lines(page.extract_text() or '')

            # Skip pages with no dialogue.
            if page_list:
                conversations_by_page.append(page_list)

    return conversations_by_page

In [4]:
# Run the full extraction on the American Psycho script. read_pdf returns a
# list of pages, each page being a list of '<NAME> dialogue' strings.
test_list = read_pdf('movie_scripts/american_psycho.pdf')

Path to CSV: movie_scripts/american_psycho.pdf
Number of pages of pdf: 121


In [5]:
# Display the extracted dialogue, grouped by page.
test_list

[['<WAITER> ...with goat cheese profiteroles and I also have-an.arugula Caesar salad. For entrees tonight I have a swordfish meatloaf with onion marmalade, a rare-roasted partridge breast in raspberry coulis with a sorrel timbale... '],
 ['<MCDERMOTT> What do you mean? ',
  '<PRICE> Yes. Clarify. ',
  '<MCDERMOTT> Well, is it strictly informal- ',
  '<BATEMAN> Or can it be worn with a suit? ',
  '<MCDERMOTT> Exactly. ',
  '<BATEMAN> With discreet pinstripes you should wear a subdued blue or charcoal gray vest. A plaid suit would call for a bolder vest. ',
  "<MCDERMOTT> But avoid matching the vest's pattern with your socks or tie. Wearing argyle socks with an argyle vest will look too studied. "],
 ['<BATEMAN> Van Patten looks puffy. Has he stopped working out? ',
  "<PRICE> It looks that way, doesn't it? ",
  '<MCDERMOTT> Did he just take our plates away? ',
  "<PRICE> He took them away because the portions are so small he probably thought we were finished. God I hate this place. This

In [3]:
# Report how many pages the loaded reader `pdf` has.
# NOTE(review): the recorded output (146 pages) differs from the 121 pages
# reported for american_psycho.pdf, so `pdf` was presumably bound to a
# different script when this cell ran - confirm which file is loaded.
print('Number of pages of pdf:', len(pdf.pages))

Number of pages of pdf: 146


In [4]:
# Pick one page (index 11) to develop the cleaning steps on.
single_page = pdf.pages[11]

In [5]:
# Extract the raw text of that page as a single string.
text = single_page.extract_text()

In [6]:
# Show the raw string repr so the '\n' line breaks are visible.
text

'                                                     11.MADELEINE\nHome.\nEXT. GRAVINA DI PUGLIA BRIDGE - EARLY MORNING 25 25\nAs Bond crosses the ancient bridge spanning a deep ravine, a \nsmall group of WIDOWED WOMEN veiled in black pass by.\nBond looks up to an acropolis silhouetted in the morning sun.\nEXT. ACROPOLIS, KIOSK - MORNING 26 26\nBond approaches the kiosk.\nBOND\nBuongiorno. La tombe del Lynd (Good \nmorning. Lynd plot, please).\nAn OLD MAN, the caretaker, steps out to meet Bond. He barks \nat a YOUNG BOY.\nOLD MAN\n(in dialect)\nTake him to the Lynd grave.\nThe boy waves to Bond to follow him.\nBOY\n(in dialect)\nCome.\nEXT. ACROPOLIS - MORNING 27 27\nThe boy guides Bond to Vesper’s grave then hustles off \nleaving Bond alone to face:\nCU on a portrait of a beautiful WOMAN, dark hair, pale skin, \nlight eyes: VESPER LYND 1983-2006.\nThe image stings his eyes.\nBOND\nI miss you.\nHe takes a piece of paper out of his pocket and lights it on \nfire with a lighter. We see 

In [7]:
# Step 1: Remove text within parentheses, e.g. parentheticals such as
# '(in dialect)'. [^)]* also matches newlines, so a parenthesised span that
# wraps onto the next line is removed as a whole.
text_no_parentheses = re.sub(r"\([^)]*\)", "", text)

In [8]:
# Inspect the result of Step 1.
text_no_parentheses

'                                                     11.MADELEINE\nHome.\nEXT. GRAVINA DI PUGLIA BRIDGE - EARLY MORNING 25 25\nAs Bond crosses the ancient bridge spanning a deep ravine, a \nsmall group of WIDOWED WOMEN veiled in black pass by.\nBond looks up to an acropolis silhouetted in the morning sun.\nEXT. ACROPOLIS, KIOSK - MORNING 26 26\nBond approaches the kiosk.\nBOND\nBuongiorno. La tombe del Lynd .\nAn OLD MAN, the caretaker, steps out to meet Bond. He barks \nat a YOUNG BOY.\nOLD MAN\n\nTake him to the Lynd grave.\nThe boy waves to Bond to follow him.\nBOY\n\nCome.\nEXT. ACROPOLIS - MORNING 27 27\nThe boy guides Bond to Vesper’s grave then hustles off \nleaving Bond alone to face:\nCU on a portrait of a beautiful WOMAN, dark hair, pale skin, \nlight eyes: VESPER LYND 1983-2006.\nThe image stings his eyes.\nBOND\nI miss you.\nHe takes a piece of paper out of his pocket and lights it on \nfire with a lighter. We see what he has written: Forgive Me. \nThe embers fall to the g

In [9]:
# Step 2: Replace character names surrounded by newlines with <NAME>.
# Only a line made of a single all-caps word matches [A-Z]+, so a multi-word
# cue such as 'OLD MAN' is left untagged by this pattern.
text_with_names = re.sub(r'(?<=\n)([A-Z]+)(?=\n)', r'<\1>', text_no_parentheses)

In [10]:
# Inspect the result of Step 2 as a raw string.
text_with_names

'                                                     11.MADELEINE\nHome.\nEXT. GRAVINA DI PUGLIA BRIDGE - EARLY MORNING 25 25\nAs Bond crosses the ancient bridge spanning a deep ravine, a \nsmall group of WIDOWED WOMEN veiled in black pass by.\nBond looks up to an acropolis silhouetted in the morning sun.\nEXT. ACROPOLIS, KIOSK - MORNING 26 26\nBond approaches the kiosk.\n<BOND>\nBuongiorno. La tombe del Lynd .\nAn OLD MAN, the caretaker, steps out to meet Bond. He barks \nat a YOUNG BOY.\nOLD MAN\n\nTake him to the Lynd grave.\nThe boy waves to Bond to follow him.\n<BOY>\n\nCome.\nEXT. ACROPOLIS - MORNING 27 27\nThe boy guides Bond to Vesper’s grave then hustles off \nleaving Bond alone to face:\nCU on a portrait of a beautiful WOMAN, dark hair, pale skin, \nlight eyes: VESPER LYND 1983-2006.\nThe image stings his eyes.\n<BOND>\nI miss you.\nHe takes a piece of paper out of his pocket and lights it on \nfire with a lighter. We see what he has written: Forgive Me. \nThe embers fall to

In [11]:
# Print the same text with its line breaks rendered, to see the tagged cues.
print(text_with_names)

                                                     11.MADELEINE
Home.
EXT. GRAVINA DI PUGLIA BRIDGE - EARLY MORNING 25 25
As Bond crosses the ancient bridge spanning a deep ravine, a 
small group of WIDOWED WOMEN veiled in black pass by.
Bond looks up to an acropolis silhouetted in the morning sun.
EXT. ACROPOLIS, KIOSK - MORNING 26 26
Bond approaches the kiosk.
<BOND>
Buongiorno. La tombe del Lynd .
An OLD MAN, the caretaker, steps out to meet Bond. He barks 
at a YOUNG BOY.
OLD MAN

Take him to the Lynd grave.
The boy waves to Bond to follow him.
<BOY>

Come.
EXT. ACROPOLIS - MORNING 27 27
The boy guides Bond to Vesper’s grave then hustles off 
leaving Bond alone to face:
CU on a portrait of a beautiful WOMAN, dark hair, pale skin, 
light eyes: VESPER LYND 1983-2006.
The image stings his eyes.
<BOND>
I miss you.
He takes a piece of paper out of his pocket and lights it on 
fire with a lighter. We see what he has written: Forgive Me. 
The embers fall to the ground.
Then...          

In [12]:
# Step 3: Remove mid sentence new lines - a newline becomes a space unless it
# directly follows '.', '?' or '!' or is itself followed by whitespace.
text_no_mid_sentence_newlines = re.sub(r'(?<![.?!])\n(?!\s)', ' ', text_with_names)

In [13]:
# Print the result of Step 3.
print(text_no_mid_sentence_newlines)

                                                     11.MADELEINE Home.
EXT. GRAVINA DI PUGLIA BRIDGE - EARLY MORNING 25 25 As Bond crosses the ancient bridge spanning a deep ravine, a  small group of WIDOWED WOMEN veiled in black pass by.
Bond looks up to an acropolis silhouetted in the morning sun.
EXT. ACROPOLIS, KIOSK - MORNING 26 26 Bond approaches the kiosk.
<BOND> Buongiorno. La tombe del Lynd .
An OLD MAN, the caretaker, steps out to meet Bond. He barks  at a YOUNG BOY.
OLD MAN
 Take him to the Lynd grave.
The boy waves to Bond to follow him.
<BOY>
 Come.
EXT. ACROPOLIS - MORNING 27 27 The boy guides Bond to Vesper’s grave then hustles off  leaving Bond alone to face: CU on a portrait of a beautiful WOMAN, dark hair, pale skin,  light eyes: VESPER LYND 1983-2006.
The image stings his eyes.
<BOND> I miss you.
He takes a piece of paper out of his pocket and lights it on  fire with a lighter. We see what he has written: Forgive Me.  The embers fall to the ground.
Then...          

In [14]:
# Step 4: Remove newlines following a <NAME> pattern, so a tag that is still
# alone on its line is joined with the dialogue on the next line.
text_no_newlines_after_name = re.sub(r'(<[A-Z]+>)\n', r'\1', text_no_mid_sentence_newlines)

In [15]:
# Print the result of Step 4.
print(text_no_newlines_after_name)

                                                     11.MADELEINE Home.
EXT. GRAVINA DI PUGLIA BRIDGE - EARLY MORNING 25 25 As Bond crosses the ancient bridge spanning a deep ravine, a  small group of WIDOWED WOMEN veiled in black pass by.
Bond looks up to an acropolis silhouetted in the morning sun.
EXT. ACROPOLIS, KIOSK - MORNING 26 26 Bond approaches the kiosk.
<BOND> Buongiorno. La tombe del Lynd .
An OLD MAN, the caretaker, steps out to meet Bond. He barks  at a YOUNG BOY.
OLD MAN
 Take him to the Lynd grave.
The boy waves to Bond to follow him.
<BOY> Come.
EXT. ACROPOLIS - MORNING 27 27 The boy guides Bond to Vesper’s grave then hustles off  leaving Bond alone to face: CU on a portrait of a beautiful WOMAN, dark hair, pale skin,  light eyes: VESPER LYND 1983-2006.
The image stings his eyes.
<BOND> I miss you.
He takes a piece of paper out of his pocket and lights it on  fire with a lighter. We see what he has written: Forgive Me.  The embers fall to the ground.
Then...           

In [16]:
# Step 5: Keep only the <NAME> followed by the dialogue up until the next newline
# NOTE(review): the replacement r'\1\n' re-inserts exactly what was matched
# (group 1 plus the newline), so this substitution leaves the text unchanged.
text_only_name_dialouge = re.sub(r'(<[A-Z]+>.*?)\n', r'\1\n', text_no_newlines_after_name)

In [17]:
# Step 6: Remove everything except lines with <NAME> followed by text until newline
# With re.M, every line that does not start with a <...> tag is replaced by an
# empty string; the line breaks themselves stay in place.
text_spaces = re.sub(r'^(?!<.*?>).*$', '', text_only_name_dialouge, flags=re.M)

In [18]:
# Step 7: Replace every run of whitespace with a single space. \s also matches
# newlines, so the remaining dialogue ends up on one line.
text_final = re.sub(r'\s+', ' ', text_spaces)

In [19]:
# Output the cleaned text: the '<NAME> dialogue' segments of the page on a
# single line.
print(text_final)

 <BOND> Buongiorno. La tombe del Lynd . <BOY> Come. <BOND> I miss you. 
