In [2]:
import re
import pandas as pd

In [1]:
# Read the whole play file into a single string.
with open('texts/richard-iii_TXT_FolgerShakespeare.txt') as source_file:
    rich = source_file.read()

In [3]:
# Extract just the play text: drop everything that precedes the first
# "ACT 1" heading.
act_1_index = rich.find("ACT 1")
if act_1_index == -1:
    # str.find returns -1 on a miss; slicing with -1 would silently keep only
    # the final character of the file, so fail loudly instead.
    raise ValueError('"ACT 1" heading not found in the source text')
rich = rich[act_1_index:]

# Remove all stage directions (enclosed in []). DOTALL lets "." match
# newlines, so a direction that spans several lines is removed as a whole.
rich = re.sub(r'\[.*?\]', '', rich, flags=re.DOTALL)

In [4]:
# Find all character names, which mark the start of a speech.
# In this text a speaker's name is printed in full caps, so collecting every
# all-caps string (optionally followed by a second all-caps word) gives the
# list of candidate character names.
names = re.findall(r'\b[A-Z]{2,}(?: [A-Z]+)?\b', rich)

# De-duplicate while keeping first-seen order (dicts preserve insertion order).
rich_char_names = list(dict.fromkeys(names))
print(rich_char_names)

['ACT', 'RICHARD', 'CLARENCE', 'BRAKENBURY', 'HASTINGS', 'ANNE', 'GENTLEMAN', 'RIVERS', 'GREY', 'QUEEN ELIZABETH', 'BUCKINGHAM', 'STANLEY', 'QUEEN MARGARET', 'DORSET', 'CATESBY', 'MURDERER', 'MURDERERS', 'KEEPER', 'FIRST MURDERER', 'SECOND MURDERER', 'BOTH', 'KING EDWARD', 'BOY', 'DUCHESS', 'DAUGHTER', 'CHILDREN', 'FIRST CITIZEN', 'SECOND CITIZEN', 'THIRD CITIZEN', 'ARCHBISHOP', 'YORK', 'MESSENGER', 'PRINCE', 'MAYOR', 'CARDINAL', 'PURSUIVANT', 'PRIEST', 'VAUGHAN', 'RATCLIFFE', 'ELY', 'LOVELL', 'SCRIVENER', 'ALL', 'PAGE', 'TYRREL', 'FIRST MESSENGER', 'SECOND MESSENGER', 'THIRD MESSENGER', 'FOURTH MESSENGER', 'CHRISTOPHER', 'SHERIFF', 'RICHMOND', 'OXFORD', 'HERBERT', 'BLUNT', 'SURREY', 'NORFOLK', 'GHOST OF', 'EDWARD', 'HENRY', 'GHOSTS OF', 'PRINCES', 'LORDS', 'LORD']


In [5]:
# Split on double newlines (blank lines); each chunk is expected to be one
# speaker's turn or an act/scene heading.
# NOTE: this rebinds `rich` from a str to a list of str, so the cell cannot be
# re-run on its own without re-running the cells above it first.
rich = rich.split("\n\n")

In [6]:
# Normalise every chunk:
#   - fold internal newlines into single spaces
#   - drop the "=" characters that underline act/scene headings
#   - trim leading and trailing whitespace
rich = [chunk.replace("\n", " ").replace("=", "").strip() for chunk in rich]

# Remove any chunks left empty by the cleaning above.
rich = [chunk for chunk in rich if chunk != ""]

In [11]:
# Running position in the play and speaker bookkeeping for the sentence pass.
act = scene = sentance_number = 0
speaker = last_speaker = ""


# The text is split by sentence; a sentence ends at "." "!" or "?".
# Each sentence becomes one row of this frame.
df = pd.DataFrame(columns=['act', 'scene', 'sentence_number', 'speaker', 'text'])


# the character name is always the first word(s) in the chunk
def character_speaking(text, last_speaker):
    """Split a chunk into ``(speaker, spoken_text)``.

    A chunk that opens with one to three all-caps words is treated as the
    start of a new speech: those words (with commas removed) form the speaker
    name and the remaining words are the spoken text. Any other chunk is
    treated as a continuation and attributed to ``last_speaker``.

    Args:
        text: One cleaned chunk of the play.
        last_speaker: Speaker to fall back on when the chunk names none.

    Returns:
        A ``(speaker, spoken)`` tuple of strings, with ``spoken`` re-joined
        using single spaces. An empty or whitespace-only chunk yields
        ``(last_speaker, "")``.
    """
    # all-caps tokens that are ordinary one-letter words, not part of a name
    not_names = ("A", "I", "O", "A,", "O,", "I,", "I'")

    # splits the text into individual words
    words = text.split()

    # Count the leading all-caps words; a name is at most three words long
    # (e.g. GHOST OF ANNE). Slicing keeps short chunks from raising IndexError.
    name_length = 0
    for word in words[:3]:
        if not word.isupper() or word in not_names:
            break
        name_length += 1

    if name_length == 0:
        # no speaker named: the chunk continues the previous speaker's turn
        return last_speaker, " ".join(words)

    # remove the comma from the name if present
    # (leftover from removing stage directions)
    speaker = " ".join(word.replace(",", "") for word in words[:name_length])

    # everything after the name is the spoken text
    return speaker, " ".join(words[name_length:])

# For each chunk of Richard III: track the current act and scene, and emit one
# row per sentence. Rows are collected in a list and turned into a DataFrame
# once at the end, instead of enlarging the frame one row at a time.
sentence_rows = []
for text in rich:
    # trailing number of a heading such as "ACT 1" or "Scene 12"
    heading_number = re.search(r"\d+$", text)

    # if the chunk is of the form "ACT n" we record that it is the new act
    if text.startswith("ACT") and heading_number:
        act = int(heading_number.group())
    # if the chunk is "Scene n" we record that it is a new scene
    # and reset the sentence counter to 0
    elif text.startswith("Scene") and heading_number:
        scene = int(heading_number.group())
        sentance_number = 0
    else:
        # get the speaker from the chunk
        speaker, spoken = character_speaking(text, last_speaker)
        last_speaker = speaker

        # split the speech into sentences: break on whitespace that follows
        # . ? or !, but not right after a capitalised two-letter
        # abbreviation such as "Mr."
        lines = re.split(r"(?<!\w\.\w)(?<![A-Z][a-z]\.)(?<=\.|\?|!)\s", spoken)

        # for each sentence in the speech
        for line in lines:
            if not line:
                # nothing was spoken (e.g. a chunk holding only a name)
                continue
            # increment the sentence counter
            sentance_number += 1
            # record it with act, scene, sentence number, and speaker attached
            sentence_rows.append([act, scene, sentance_number, speaker, line])

# build the frame in one go, reusing the column layout declared above
df = pd.DataFrame(sentence_rows, columns=df.columns)

df.to_csv("richardiii_by_sentence.csv")

In [12]:
# sanity check: show the distinct speaker labels produced by the sentence pass
df["speaker"].unique()

array(['RICHARD', 'CLARENCE', 'BRAKENBURY', 'HASTINGS', 'ANNE',
       'GENTLEMAN', 'RIVERS', 'GREY', 'QUEEN ELIZABETH', 'BUCKINGHAM',
       'STANLEY', 'QUEEN MARGARET', 'DORSET', 'CATESBY', 'MURDERER',
       'MURDERERS', 'KEEPER', 'FIRST MURDERER', 'SECOND MURDERER', 'BOTH',
       'KING EDWARD', 'BOY', 'DUCHESS', 'DAUGHTER', 'CHILDREN',
       'FIRST CITIZEN', 'SECOND CITIZEN', 'THIRD CITIZEN', 'ARCHBISHOP',
       'YORK', 'MESSENGER', 'PRINCE', 'MAYOR', 'CARDINAL', 'PURSUIVANT',
       'PRIEST', 'VAUGHAN', 'RATCLIFFE', 'ELY', 'LOVELL', 'SCRIVENER',
       'ALL', 'PAGE', 'TYRREL', 'FIRST MESSENGER', 'SECOND MESSENGER',
       'THIRD MESSENGER', 'FOURTH MESSENGER', 'CHRISTOPHER', 'SHERIFF',
       'RICHMOND', 'OXFORD', 'HERBERT', 'BLUNT', 'SURREY', 'NORFOLK',
       'GHOST OF EDWARD', 'GHOST OF HENRY', 'GHOST OF CLARENCE',
       'GHOST OF RIVERS', 'GHOST OF GREY', 'GHOST OF VAUGHAN',
       'GHOSTS OF PRINCES', 'GHOST OF HASTINGS', 'GHOST OF ANNE',
       'GHOST OF BUCKINGHAM', 'LO

In [13]:
# Fresh frame for the second pass; each row holds one speaker's text.
df = pd.DataFrame(columns=['act', 'scene', 'sentence_number', 'speaker', 'text'])

# Reset the running position in the play.
act = scene = sentance_number = 0

# Reset the speaker bookkeeping.
last_speaker = line = ""


# the character name is always the first word(s) in the chunk
def character_speaking_2(text, last_speaker):
    """Split a chunk into ``(speaker, spoken_text)``.

    A chunk that opens with one to three all-caps words is treated as the
    start of a speech: those words (with commas removed) form the speaker
    name and the remaining words are the spoken text. A chunk with no leading
    name (some speeches are broken up by a blank line) is attributed to
    ``last_speaker``.

    Args:
        text: One cleaned chunk of the play.
        last_speaker: Speaker to fall back on when the chunk names none.

    Returns:
        A ``(speaker, spoken)`` tuple of strings, with ``spoken`` re-joined
        using single spaces. An empty or whitespace-only chunk yields
        ``(last_speaker, "")``.
    """
    # all-caps tokens that are ordinary one-letter words, not part of a name
    not_names = ("A", "I", "O", "A,", "O,", "I,", "I'")

    # splits the text into individual words
    words = text.split()

    # Count the leading all-caps words; a name is at most three words long
    # (e.g. GHOST OF ANNE). Slicing keeps short chunks from raising IndexError.
    name_length = 0
    for word in words[:3]:
        if not word.isupper() or word in not_names:
            break
        name_length += 1

    if name_length == 0:
        # no speaker named: the chunk continues the previous speaker's turn
        return last_speaker, " ".join(words)

    # remove the comma from the name if present
    # (leftover from removing stage directions)
    speaker = " ".join(word.replace(",", "") for word in words[:name_length])

    # returns the speaker and the rest of the chunk rejoined
    return speaker, " ".join(words[name_length:])


# Walk the chunks in order, tracking act and scene, and emit one row per
# uninterrupted speech. Rows are collected in a list and turned into a
# DataFrame once at the end, instead of editing the frame cell by cell.
speech_rows = []
for text in rich:
    # trailing number of a heading such as "ACT 1" or "Scene 12"
    heading_number = re.search(r"\d+$", text)

    # if the chunk is of the form "ACT n" we record that it is the new act
    if text.startswith("ACT") and heading_number:
        act = int(heading_number.group())
    # if the chunk is "Scene n" we record that it is a new scene
    # and reset the counter to 0
    elif text.startswith("Scene") and heading_number:
        scene = int(heading_number.group())
        sentance_number = 0
    else:
        speaker, spoken = character_speaking_2(text, last_speaker)

        # A chunk extends the previous row only when the same speaker is
        # still talking within the same act and scene.
        continues_previous = (
            bool(speech_rows)
            and speaker == last_speaker
            and speech_rows[-1][0] == act
            and speech_rows[-1][1] == scene
        )

        if continues_previous:
            # join with a space so the last word of the previous chunk and
            # the first word of this one do not run together
            speech_rows[-1][4] = (speech_rows[-1][4] + " " + spoken).strip()
        else:
            # increment the counter
            sentance_number += 1
            # record the speech with act, scene, number, and speaker attached
            speech_rows.append([act, scene, sentance_number, speaker, spoken.strip()])
            last_speaker = speaker

# build the frame in one go, reusing the column layout declared above
df = pd.DataFrame(speech_rows, columns=df.columns)

df.to_csv("richardiii_by_uninterupted_speaker.csv")

In [14]:
# sanity check: show the distinct speaker labels produced by the speaker pass
df["speaker"].unique()

array(['RICHARD', 'CLARENCE', 'BRAKENBURY', 'HASTINGS', 'ANNE',
       'GENTLEMAN', 'RIVERS', 'GREY', 'QUEEN ELIZABETH', 'BUCKINGHAM',
       'STANLEY', 'QUEEN MARGARET', 'DORSET', 'CATESBY', 'MURDERER',
       'MURDERERS', 'KEEPER', 'FIRST MURDERER', 'SECOND MURDERER', 'BOTH',
       'KING EDWARD', 'BOY', 'DUCHESS', 'DAUGHTER', 'CHILDREN',
       'FIRST CITIZEN', 'SECOND CITIZEN', 'THIRD CITIZEN', 'ARCHBISHOP',
       'YORK', 'MESSENGER', 'PRINCE', 'MAYOR', 'CARDINAL', 'PURSUIVANT',
       'PRIEST', 'VAUGHAN', 'RATCLIFFE', 'ELY', 'LOVELL', 'SCRIVENER',
       'ALL', 'PAGE', 'TYRREL', 'FIRST MESSENGER', 'SECOND MESSENGER',
       'THIRD MESSENGER', 'FOURTH MESSENGER', 'CHRISTOPHER', 'SHERIFF',
       'RICHMOND', 'OXFORD', 'HERBERT', 'BLUNT', 'SURREY', 'NORFOLK',
       'GHOST OF EDWARD', 'GHOST OF HENRY', 'GHOST OF CLARENCE',
       'GHOST OF RIVERS', 'GHOST OF GREY', 'GHOST OF VAUGHAN',
       'GHOSTS OF PRINCES', 'GHOST OF HASTINGS', 'GHOST OF ANNE',
       'GHOST OF BUCKINGHAM', 'LO