In [52]:
import re


def read_markdown_file(file_path):
    """Load the full text of a markdown file, decoded as UTF-8.

    Failures are reported through the return value rather than raised: a
    missing file, or any other exception while opening/reading, produces a
    human-readable error message string instead of the file content.
    """
    try:
        with open(file_path, mode='r', encoding='utf-8') as handle:
            return handle.read()
    except FileNotFoundError:
        outcome = f"Error: File not found at path: {file_path}"
    except Exception as err:
        outcome = f"An error occurred: {err}"
    return outcome

# Example usage: load the episode transcript markdown into memory.
# Note: read_markdown_file returns an error-message string instead of raising,
# so markdown_text holds that message if the file is missing or unreadable.
file_path = 'season_1_episode_1.md'
markdown_text = read_markdown_file(file_path)

In [66]:
def remove_audio_lines(split_text, start_phrase="Audio Player", max_phrase_gap=4):
    """Drop the embedded audio-player widget lines from a list of text lines.

    Every line exactly equal to ``start_phrase`` opens a removal window that
    covers that line plus the following ``max_phrase_gap - 1`` lines. If
    ``start_phrase`` shows up again inside an open window, the window restarts
    from that line.

    Args:
        split_text: list of text lines (strings).
        start_phrase: exact line content that marks the start of the widget.
        max_phrase_gap: total number of lines removed per window, counting
            the start line itself. Values <= 0 remove nothing.

    Returns:
        A new list with the widget lines removed; ``split_text`` is not
        modified.
    """
    kept_lines = []
    lines_left_to_skip = 0
    for phrase in split_text:
        # (re)open the removal window whenever the start phrase is seen
        if phrase == start_phrase:
            lines_left_to_skip = max_phrase_gap

        # inside a window: drop this line and count it against the window
        if lines_left_to_skip > 0:
            lines_left_to_skip -= 1
            continue

        kept_lines.append(phrase)

    return kept_lines

def remove_nontranscript_phrases(text_split, non_transcript_phrases):
    """Filter out lines that exactly match a known non-transcript phrase.

    A line is dropped only when it is a member of ``non_transcript_phrases``
    (for example 'Deadwood Lovers T-Shirt'); all other lines are returned in
    their original order in a new list.
    """
    return [entry for entry in text_split if entry not in non_transcript_phrases]


def remove_parens(text_split):
    """Strip parenthesised asides, e.g. stage directions, from each line.

    For every phrase, the text from the first '(' through the next ')' that
    follows it is removed, repeatedly, until no such pair remains. The text
    after each removed span has its surrounding whitespace stripped before
    being re-joined. Phrases that end up empty are dropped.

    Unbalanced parentheses are left in place: a ')' that appears before any
    '(' is ignored when pairing, and a '(' with no later ')' stops the scan
    for that phrase. Nested parentheses are not treated specially; the
    removal ends at the first ')' after the first '('.

    Args:
        text_split: list of text lines (strings).

    Returns:
        A new list of the non-empty phrases with parenthesised spans removed.
    """
    text_split_no_parens = []
    for phrase in text_split:
        while True:
            open_parens_ind = phrase.find('(')
            if open_parens_ind < 0:
                break

            # Search for the closing paren only AFTER the opening one, so a
            # stray earlier ')' cannot stall the loop on an unchanged phrase.
            closed_parens_ind = phrase.find(')', open_parens_ind + 1)
            if closed_parens_ind < 0:
                break

            # cut "(...)" out; strip the tail so no doubled space is left
            phrase = phrase[:open_parens_ind] + phrase[closed_parens_ind + 1:].strip()

        # keep only phrases that still have content
        if len(phrase) > 0:
            text_split_no_parens.append(phrase)

    return text_split_no_parens



def starts_with_non_space_substring(s):
    """Tell whether `s` opens with one or more non-whitespace characters followed by ':'.

    This is True for lines such as 'Seth: ...' and False when whitespace
    occurs before the first colon that could close the prefix.
    """
    prefix_matcher = re.compile(r"^\S+:")
    return prefix_matcher.match(s) is not None

def adjust_colon_spacing(s):
    """Insert a space after the leading 'name:' prefix when one is missing.

    'Clell:God? Well' becomes 'Clell: God? Well'. Strings that already have
    whitespace after the colon, or that do not start with a whitespace-free
    name followed by ':', are returned unchanged.

    Args:
        s: a single line of text.

    Returns:
        The line with a space guaranteed after the first colon of its
        leading name prefix.
    """
    # The name is matched with [^\s:]+ rather than \S+ so it ends at the FIRST
    # colon. A greedy \S+ backtracks to the last colon of the first token and
    # would turn "Seth:12:30 ..." into "Seth:12: 30 ...".
    pattern = r"^([^\s:]+):(\S)"
    return re.sub(pattern, r"\1: \2", s)

def remove_noname_lines(split_text):
    """Keep only the lines that begin with a speaker-name prefix.

    Each line is stripped of surrounding whitespace, kept when
    starts_with_non_space_substring accepts it, and passed through
    adjust_colon_spacing before being returned. All other lines are dropped.
    """
    trimmed_lines = (raw.strip() for raw in split_text)
    return [
        adjust_colon_spacing(candidate)
        for candidate in trimmed_lines
        if starts_with_non_space_substring(candidate)
    ]


def separate_into_named_lines(split_text):
    """Turn 'name: spoken text' strings into a list of record dictionaries.

    Each phrase is split at its first ':'; the stripped left part becomes
    'character', the stripped right part becomes 'line', and 'line_count' is
    the 1-based position of the phrase in the input. A phrase without ':'
    yields the whole phrase as 'character' and an empty 'line'.
    """
    records = []
    for position, raw in enumerate(split_text, start=1):
        speaker, _, spoken = raw.partition(":")
        records.append(
            {
                "character": speaker.strip(),
                "line": spoken.strip(),
                "line_count": position,
            }
        )
    return records

import json

def save_as_jsonl(data, filename):
    """Write each item of `data` to `filename` as one JSON document per line.

    The file is opened for writing with UTF-8 encoding (replacing any existing
    content) and every item is followed by a newline.
    """
    with open(filename, "w", encoding="utf-8") as sink:
        for record in data:
            sink.write(json.dumps(record) + "\n")

# Cleaning pipeline: each step rebinds `split_text`, so the order matters.
# split the raw markdown into individual lines
split_text = markdown_text.split("\n")

# drop empty lines (note: len(v) > 1 also drops lines of a single character)
split_text = [v for v in split_text if len(v) > 1]

# remove the audio player widget lines
split_text = remove_audio_lines(split_text)

# remove lines that exactly match a known non-transcript phrase
non_transcript_phrases = ["Deadwood Lovers T-Shirt"]
split_text = remove_nontranscript_phrases(split_text, non_transcript_phrases)

# remove parenthesised substrings "(...)"; lines left empty are dropped
split_text = remove_parens(split_text)

# keep only lines that start with a "name:" prefix, normalising the colon spacing
split_text = remove_noname_lines(split_text)

# convert to a list of dictionaries (character / line / line_count);
# from here on `split_text` holds dicts rather than strings.
# note: str.partition never raises - a line without ':' would not fail here but
# would become an entry whose 'line' is empty.
split_text = separate_into_named_lines(split_text)

# save final result as jsonl (one JSON object per line)
save_as_jsonl(split_text, "season_1_episode_1_cleaned_transcript.jsonl")


In [65]:
# Preview entries 20-29 of the parsed transcript.
# NOTE(review): the saved output below has no 'line_count' key even though
# separate_into_named_lines adds one - it looks stale; re-run to refresh.
split_text[20:30]

[{'character': 'Seth', 'line': "Does it involve lettin' you go?"},
 {'character': 'Clell',
  'line': "I know two scores, Mr. Bullock, that we could make in transit without movin' 20 feet off our path. People with cash on hand. And if once we hit Deadwood and you didn't want to have anything to do with me, we'd never speak again. We would meet as strangers the rest of our fuckin' lives. Now, you tell me what you think of that, sir."},
 {'character': 'Seth', 'line': "It don't appeal to me."},
 {'character': 'Clell',
  'line': 'Get the fuck out of here for a moment would you, sir?'},
 {'character': 'Sol', 'line': "Byron Samson's comin' for him."},
 {'character': 'Clell',
  'line': "Sir, would you please get the fuck out of here 'til we have finished our previous conversation?"},
 {'character': 'Seth', 'line': 'How many in his play?'},
 {'character': 'Sol',
  'line': "A dozen, shit faced. Samson just caved in Tommy Raymond's head over at the no-name frog. He went against it."},
 {'characte

In [40]:
# Scratch check: strip() only trims the ends of the string, so the missing
# space after "Clell:" is still missing in the printed result.
e = "Clell:God? Well if he ain't".strip()
print(e)

Clell:God? Well if he ain't


(Seth nods, and takes his cup over to the wood stove to pour himself some coffee. He's wearing a sling to support his right arm.)
0 128


In [24]:
# Scratch check: str.find returns -1 when the substring is not present.
"hi there".find('(')

-1