In [1]:
import requests
import xml.etree.ElementTree as ET
import re


In [2]:
def find_character_name(speaker_name):
    """Split a speaker label into the individual character names it contains.

    A single trailing full stop is removed first.  The label is then split
    on the word "and" surrounded by whitespace, so a shared speech such as
    "Jack and Algernon." yields ``['Jack', 'Algernon']``.  Every name in the
    label is returned, not just the first two.

    Parameters
    ----------
    speaker_name : str or None
        Text of a ``<speaker>`` element.  ``None`` (an element without
        direct text) is accepted.

    Returns
    -------
    list of str
        The names in the order they appear; a one-element list when the
        label contains no "and"; an empty list when ``speaker_name`` is
        ``None``.
    """
    if speaker_name is None:
        return []
    speaker_name = re.sub(r'\.$', '', speaker_name)
    # re.split returns the whole string as a single item when the separator
    # does not occur, so no separate "contains 'and'" check is needed.
    return re.split(r'\s+and\s+', speaker_name)

In [3]:
def download_tei(url):
    """Download an XML document and save a cleaned copy as ``tei.xml``.

    Several named character entities are rewritten before the text is
    saved: dashes, a-umlaut and e-acute become numeric character
    references, ``&aelig;`` becomes the plain letters "ae", and a few others
    are removed altogether.

    Parameters
    ----------
    url : str
        Address of the XML file to fetch.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.  Nothing is written to
        disk in that case.
    """
    # (pattern, replacement) pairs, applied in this order.
    entity_fixes = (
        (r'&[mn]dash;', '&#x2014;'),
        (r'&auml;', '&#xE4;'),    # U+00E4, lower-case a-umlaut
        (r'&eacute;', '&#xE9;'),
        (r'&aelig;', 'ae'),
        # NOTE(review): the entities below are dropped, which removes the
        # character from the surrounding text -- confirm this is intended.
        (r'&hellip;', ''),
        (r'&pound;', ''),
        (r'&agrave;', ''),
        (r'&egrave;', ''),
    )

    # A timeout keeps the notebook from hanging on an unresponsive server.
    response = requests.get(url, timeout=30)
    # Fail loudly on 4xx/5xx instead of continuing without any content.
    response.raise_for_status()

    xml_file = response.text
    for pattern, replacement in entity_fixes:
        xml_file = re.sub(pattern, replacement, xml_file)

    with open('tei.xml','w',encoding='utf-8') as f:
        f.write(xml_file)

def dramatis_personae(url):
    """Return the distinct speaker names found in the document at ``url``.

    The document is fetched with ``download_tei`` (which writes
    ``tei.xml``), parsed, and every ``<speaker>`` element is passed through
    ``find_character_name``.

    Parameters
    ----------
    url : str
        Address of the XML file to analyse.

    Returns
    -------
    list of str
        Each distinct name once, in order of first appearance.
    """
    download_tei(url)
    root = ET.parse('tei.xml').getroot()

    # A dict is used as an insertion-ordered set of names.
    seen = dict()
    for speaker in root.findall('.//speaker'):
        # A <speaker> without direct text carries no label to record.
        if speaker.text is None:
            continue
        for name in find_character_name(speaker.text):
            seen.setdefault(name, None)
    return list(seen)


In [4]:
def get_texts(title,url,male,female):
    """Write the spoken lines of a play to one text file per gender.

    The document at ``url`` is fetched with ``download_tei`` and parsed from
    ``tei.xml``.  For every ``<sp>`` element the speaker is taken from its
    ``<speaker>`` child (the first name when the label holds several), and
    the text of each following ``<p>`` child is added to the male or female
    collection according to the two name lists.  Whitespace runs are
    collapsed to one space and em dashes are replaced by `` - ``.

    Paragraphs whose speaker is in neither list, or that have no speaker
    label in the same ``<sp>``, are skipped.

    Parameters
    ----------
    title : str
        Prefix of the output files ``{title}_male.txt`` and
        ``{title}_female.txt``.
    url : str
        Address of the XML file to process.
    male : collection of str
        Speaker names to treat as male.
    female : collection of str
        Speaker names to treat as female.

    Both files are always written, even when one of them ends up empty.
    """
    download_tei(url)
    root = ET.parse('tei.xml').getroot()

    # Collect the pieces in lists and join once at the end.
    lines = {'Male': [], 'Female': []}

    for sp in root.findall('.//sp'):
        # Reset per speech so a name never carries over from an earlier <sp>.
        name = None
        for child in sp:
            if child.tag == 'speaker':
                if child.text is None:
                    name = None
                else:
                    found = find_character_name(child.text)
                    name = found[0] if found else None

            elif child.tag == 'p':
                # NOTE(review): only the text before the first nested
                # element of <p> is read; text after inline markup is not
                # included -- confirm this is intended.
                if name is None or child.text is None:
                    continue
                text = re.sub( r'\s+' , ' ' , child.text )
                text = re.sub( r'—' , ' - ' , text )
                entry = f'{text} \n'

                if name in male:
                    lines['Male'].append(entry)
                elif name in female:
                    lines['Female'].append(entry)

    for gender, suffix in (('Male', 'male'), ('Female', 'female')):
        with open( f'{title}_{suffix}.txt','w',encoding='utf-8') as out:
            out.write(''.join(lines[gender]))


## The Importance of Being Earnest

In [5]:
# XML edition of the play that the next cells work on.
url = 'http://www.ucc.ie/celt/texts/E850003-002.xml'

# Show every distinct speaker label so each can be assigned a gender below.
names = dramatis_personae(url)
print(names)

['Algernon', 'Lane', 'Jack', 'Lady Bracknell', 'Gwendolen', 'Miss Prism', 'Cecily', 'Chasuble', 'Merriman']


In [6]:
# Prefix of the two output files written by get_texts.
title = 'TheImportanceOfBeingEarnest'

# Speaker labels grouped by gender.
male = [
    'Algernon',
    'Lane',
    'Jack',
    'Chasuble',
    'Merriman',
]
female = [
    'Lady Bracknell',
    'Gwendolen',
    'Miss Prism',
    'Cecily',
]

get_texts(title, url, male, female)

## Lady Windermere's Fan

In [7]:
# XML edition of the play that the next cells work on.
url = 'https://celt.ucc.ie/texts/E850003-105.xml'

# Show every distinct speaker label so each can be assigned a gender below.
names = dramatis_personae(url)
print(names)

['PARKER:', 'LADY WINDERMERE:', 'LORD DARLINGTON:', 'LADY  WINDERMERE:', 'DUCHESS OF BERWICK:', 'LORD  DARLINGTON:', 'LADY AGATHA:', 'LORD WINDERMERE:', 'DUMBY:', 'LADY STUTFIELD:', 'MRS COWPER-COWPER:', 'HOPPER:', 'LORD AUGUSTUS:', 'CECIL GRAHAM:', 'LADY PLYMDALE:', 'MRS ERLYNNE:', 'LADY JEDBURGH:', 'MRS. ERLYNNE:', 'MRS.  ERLYNNE:', 'ROSALIE:']


In [8]:
# Prefix of the two output files written by get_texts.
title = "LadyWindermeresFan"

# Speaker labels grouped by gender. Some labels are listed twice because
# they occur with both one and two spaces between the words.
male = [
    'PARKER:',
    'LORD DARLINGTON:',
    'LORD  DARLINGTON:',
    'LORD WINDERMERE:',
    'DUMBY:',
    'HOPPER:',
    'LORD AUGUSTUS:',
    'CECIL GRAHAM:',
]
female = [
    'LADY WINDERMERE:',
    'LADY  WINDERMERE:',
    'DUCHESS OF BERWICK:',
    'LADY AGATHA:',
    'LADY STUTFIELD:',
    'MRS COWPER-COWPER:',
    'LADY PLYMDALE:',
    'MRS ERLYNNE:',
    'LADY JEDBURGH:',
    'MRS. ERLYNNE:',
    'MRS.  ERLYNNE:',
    'ROSALIE:',
]

get_texts(title, url, male, female)

## An Ideal Husband

In [9]:
# XML edition of the play that the next cells work on.
url = 'https://celt.ucc.ie/texts/E850003-108.xml'

# Show every distinct speaker label so each can be assigned a gender below.
names = dramatis_personae(url)
print(names)

['MRS. MARCHMONT:', 'LADY BASILDON:', 'LADY  BASILDON:', 'MRS.  MARCHMONT:', 'MASON:', 'LORD CAVERSHAM:', 'LADY CHILTERN:', 'MABEL CHILTERN:', 'LADY MARKBY:', 'MRS. CHEVELEY:', 'VICOMTE DE NANJAC:', 'SIR ROBERT CHILTERN:', 'LORD GORING:', 'MR. MONTFORD:', 'PHIPPS:', 'LORD  GORING:', 'HAROLD:', 'JAMES:']


In [10]:
# Prefix of the two output files written by get_texts.
title = "AnIdealHusband"

# Speaker labels grouped by gender. Some labels are listed twice because
# they occur with both one and two spaces between the words.
male = [
    'MASON:',
    'LORD CAVERSHAM:',
    'VICOMTE DE NANJAC:',
    'SIR ROBERT CHILTERN:',
    'LORD GORING:',
    'MR. MONTFORD:',
    'PHIPPS:',
    'LORD  GORING:',
    'HAROLD:',
    'JAMES:',
]
female = [
    'MRS. MARCHMONT:',
    'LADY BASILDON:',
    'LADY  BASILDON:',
    'MRS. CHEVELEY:',
    'MRS.  MARCHMONT:',
    'LADY CHILTERN:',
    'MABEL CHILTERN:',
    'LADY MARKBY:',
]

get_texts(title, url, male, female)