## Explore Pipeline for Extracting Sentences by Character from BookNLP

In [1]:
import csv
import re

import pandas as pd
from nltk import pos_tag

### Functions

In [2]:
def read_story_txt_file(story_txt_file):
    with open(story_txt_file) as txt_file:
        story_text = txt_file.read()
        
    return story_text

In [3]:
from nltk.tokenize import sent_tokenize

def split_sentences(story_txt: str):
    '''Split sentences by period and semi-colon, accounting for dialogue.'''
    sentences = sent_tokenize(story_txt)
    new_sentences = []

    for sentence in sentences:
        if ';' in sentence:
            clauses = sentence.split(';')
            new_clauses = []
            for clause in clauses:
                # Do we take out the space after the semi-colon?
                if clause[-1].isalpha():
                    new_clause = clause + ';' 
                    new_clauses.append(new_clause)
                else:
                    new_clause = clause
                    new_clauses.append(new_clause)
            new_sentences += new_clauses
        else:
            new_sentences.append(sentence)
            
    return new_sentences

### Read in Data

In [4]:
story_path = '/Users/pti/challenges/4761_fair_fairytale/fair-fairytale-nlp/data/testing_subset/story_txts/bamboo-cutter-moon-child.txt'

In [5]:
story_text_txt = read_story_txt_file(story_path)

In [6]:
tokens_path = '/Users/pti/challenges/4761_fair_fairytale/fair-fairytale-nlp/data/testing_subset/booknlp_GX/bamboo-cutter-moon-child/bamboo-cutter-moon-child.tokens'
entities_path = '/Users/pti/challenges/4761_fair_fairytale/fair-fairytale-nlp/data/testing_subset/booknlp_GX/bamboo-cutter-moon-child/bamboo-cutter-moon-child.entities'

In [7]:
entities_df = pd.read_csv(entities_path, sep='\t')
tokens_df = pd.read_csv(tokens_path, delimiter = '\t')

In [8]:
entities_df.head()

Unnamed: 0,COREF,start_token,end_token,prop,cat,text
0,17,7,12,NOM,PER,an old bamboo wood - cutter
1,17,14,14,PRON,PER,He
2,18,23,24,NOM,PER,no child
3,10,26,26,PROP,PER,Heaven
4,17,30,30,PRON,PER,his


In [9]:
len(story_text_txt)

30219

In [10]:
def get_coref_unique_character_name(corefs):
    coref_character_dicts = []
    
    for coref in corefs:
        coref_character_dict = {}

        coref_df = characters_df[characters_df['COREF'] == coref]

        proper_names = coref_df[coref_df['prop'] == 'PROP']['text'].unique().tolist()
        final_name = ""

        # Get characters with no proper name
        if len(proper_names) == 0:
            common_names = coref_df[coref_df['prop'] == 'NOM']['text'].unique().tolist()
            if len(common_names) > 1:
                # Check if just different cases
                common_name = common_names[0].lower()
                same_counts = 0
                for name in common_names:
                    if name.lower() == common_names[0].lower():
                        same_counts += 1
                if same_counts == len(common_names):
                    final_name = common_names[0].lower()
            if len(common_names) == 1:
                final_name = common_names[0]
        # Reconcile characters with multiple proper names
        elif len(proper_names) > 1:
            proper_name_elements = []
            for name in proper_names:
                name_elements = name.split(" ")
                upper_count = 0
                for name_element in name_elements:
                    if name_element[0].isupper():
                        upper_count += 1
                if upper_count == len(name_elements):
                    proper_name_elements.append(name_elements)
            if len(proper_name_elements) > 1:
                # If a name is a subset of a full name, use full name
                sorted_by_len = list(sorted(proper_name_elements, key = len, reverse = True))
                longest_element = " ".join(sorted_by_len[0])
                for name_elements in proper_name_elements[1:]:
                    if " ".join(name_elements) in longest_element:
                        final_name = longest_element
            else:
                final_name = proper_name_elements[0][0]
        elif len(proper_names) == 1:
            final_name = proper_names[0]
            
        coref_character_dict['coref'] = coref
        coref_character_dict['name'] = final_name
        
        coref_character_dicts.append(coref_character_dict)
        
    return coref_character_dicts

In [11]:
def get_coref_character_names(corefs, characters_df):
    coref_character_dict = {}
    
    for coref in corefs:        
        all_names = ""

        coref_df = characters_df[characters_df['COREF'] == coref]

        proper_names = coref_df[coref_df['prop'] == 'PROP']['text'].unique().tolist()

        # Get characters with no proper name
        if len(proper_names) == 0:
            common_names = coref_df[coref_df['prop'] == 'NOM']['text'].unique().tolist()
            if len(common_names) > 1:
                # Check if just different cases
                common_name = common_names[0].lower()
                same_counts = 0
                for name in common_names:
                    if name.lower() == common_names[0].lower():
                        same_counts += 1
                if same_counts == len(common_names):
                    all_names = common_names[0].lower()
                else:
                    all_names = "/".join(common_names)
            if len(common_names) == 1:
                all_names = common_names[0]
        else:
            all_names = "/".join(proper_names)
                    
        if all_names not in ['', 'anyone', 'everyone', 'no one']:
            coref_character_dict[coref] = all_names
        
    return coref_character_dict

In [12]:
def get_start_end_bytes_of_sentences(text):
    sentences = split_sentences(text)
    
    while ('' in sentences):
        sentences.remove('')
                
    start_end_sentences = []
    for sentence in sentences:
        sentence_start = text.find(sentence)
        sentence_end = sentence_start + len(sentence) + 1
        start_end_sentences.append([text[sentence_start:sentence_end], sentence_start, sentence_end])
        
    return start_end_sentences

In [13]:
def get_start_end_bytes_of_coref(coref, corefs_df, tokens_df):
    start_end_bytes_of_corefs = []
    char_df = corefs_df[corefs_df['COREF'] == coref]
    display(char_df)
    for i, row in char_df.iterrows():
        token_range = list(range(row['start_token'], row['end_token'] + 1))
        char_tokens_df = tokens_df[tokens_df['token_ID_within_document'].isin(token_range)]
        start_byte = char_tokens_df['byte_onset'].tolist()[0]
        end_byte = char_tokens_df['byte_offset'].tolist()[-1]
        start_end_bytes_of_corefs.append([int(start_byte), int(end_byte), row['text']])
    return start_end_bytes_of_corefs

In [14]:
def get_sentences_of_character(coref, corefs_df, tokens_df, ses_df):
    coref_sentences = []
    start_end_coref = get_start_end_bytes_of_coref(coref, corefs_df, tokens_df)
    
    for (coref_start, coref_end) in start_end_coref:
        ses_char_df = ses_df[(coref_start >= ses_df['start']) & (coref_end <= ses_df['end'])]
        sentence = ses_char_df['text'].values[0]
        if sentence not in coref_sentences:
            coref_sentences.append(sentence)

    return coref_sentences

In [15]:
entities_df = pd.read_csv(entities_path, sep='\t')
tokens_df = pd.read_csv(tokens_path, delimiter = '\t')

characters_df = entities_df[entities_df['cat'] == 'PER']

# All corefs
corefs = characters_df['COREF'].unique()

# All valid corefs (no corefs with only pronouns or possessives)
coref2character = get_coref_character_names(corefs, characters_df) 
corefs = coref2character.keys()

corefs_df = characters_df[characters_df['COREF'].isin(corefs)][['COREF', 'start_token', 'end_token', 'text']].sort_values(by = 'COREF')

In [16]:
start_end_sentences = get_start_end_bytes_of_sentences(story_text_txt)
ses_df = pd.DataFrame(start_end_sentences).reset_index().rename(columns = {'index': 'sentence_id', 0: 'text', 1: 'start', 2: 'end'})

sentences_by_character = {}

In [17]:
print(len(ses_df))

242


In [18]:
story_text_txt[236:355]

'Every morning he went forth into the woods and hills wherever the bamboo reared its lithe green plumes against the sky.'

In [19]:
story_text_txt[30119:30220]

'So to this day people say there is smoke to be seen rising from the top of Mount Fuji to the clouds.'

In [20]:
corefs_df['COREF'].unique()

array([ 10,  11,  12,  13,  14,  15,  16,  17,  18,  25,  27,  28,  29,
        31,  32,  33,  34,  35,  36,  39,  40,  41,  42,  43,  44,  47,
        49,  50,  51,  53,  54,  55,  56,  58,  59,  62,  63,  64,  65,
        73,  74,  76,  77,  79,  80,  85,  86,  88,  89,  91,  94,  98,
       100, 101, 102, 103, 106, 109, 112, 113, 115, 117, 120, 121, 122,
       124, 128, 134, 137, 141, 144, 147, 150, 152, 154, 160, 164, 167,
       170, 174, 176, 177, 179, 185, 186, 187, 196, 198, 200, 201, 203,
       205, 208, 210, 212, 214, 219, 220, 222, 225, 229, 230, 231, 232,
       233, 236, 237, 238, 240, 254, 255, 256, 257, 262, 264, 269, 270,
       272, 274, 275, 283, 285, 289, 291, 297, 299, 300, 306, 308, 310,
       319, 320, 322, 323, 325, 329, 330, 331, 332, 336, 337, 338, 342,
       351, 354, 355, 362, 369, 370, 372, 373, 374, 375, 376, 377, 378,
       379, 381, 382, 388, 389, 402, 403, 404, 407])

In [21]:
corefs_df[corefs_df['COREF'] == 17]

Unnamed: 0,COREF,start_token,end_token,text
0,17,7,12,an old bamboo wood - cutter
21,17,146,146,his
26,17,230,230,he
18,17,126,126,his
25,17,219,219,his
17,17,124,124,he
13,17,86,86,he
11,17,80,80,he
8,17,59,59,he
6,17,47,47,he


In [22]:
start_end_bytes_coref_11 = get_start_end_bytes_of_coref(17, corefs_df, tokens_df)

Unnamed: 0,COREF,start_token,end_token,text
0,17,7,12,an old bamboo wood - cutter
21,17,146,146,his
26,17,230,230,he
18,17,126,126,his
25,17,219,219,his
17,17,124,124,he
13,17,86,86,he
11,17,80,80,he
8,17,59,59,he
6,17,47,47,he


In [23]:
for (start, end, reference) in start_end_bytes_coref_11:
    if (story_text_txt[start:end] != reference):
        print(start, end, reference, story_text_txt[start:end])

28 53 an old bamboo wood - cutter an old bamboo wood-cutter


In [24]:
def get_start_end_bytes_of_coref(coref, corefs_df, tokens_df, story_text):
    start_end_bytes_of_corefs = []
    char_df = corefs_df[corefs_df['COREF'] == coref]
    for i, row in char_df.iterrows():
        token_range = list(range(row['start_token'], row['end_token'] + 1))
        char_tokens_df = tokens_df[tokens_df['token_ID_within_document'].isin(token_range)]
        start_byte = int(char_tokens_df['byte_onset'].tolist()[0])
        end_byte = int(char_tokens_df['byte_offset'].tolist()[-1])
        if (story_text[start_byte:end_byte] != row['text']):
            start_end_bytes_of_corefs.append([start_byte, end_byte, story_text[start_byte:end_byte]]) 
        else:
            start_end_bytes_of_corefs.append([start_byte, end_byte, row['text']])
    return start_end_bytes_of_corefs

In [25]:
get_start_end_bytes_of_coref(17, corefs_df, tokens_df, story_text_txt)

[[28, 53, 'an old bamboo wood-cutter'],
 [686, 689, 'his'],
 [1079, 1081, 'he'],
 [590, 593, 'his'],
 [1025, 1028, 'his'],
 [583, 585, 'he'],
 [385, 387, 'he'],
 [361, 363, 'he'],
 [250, 252, 'he'],
 [195, 197, 'he'],
 [144, 147, 'his'],
 [373, 376, 'his'],
 [667, 669, 'he'],
 [55, 57, 'He'],
 [930, 932, 'he'],
 [124, 127, 'his']]

In [25]:
story_text_txt[28:53]

'an old bamboo wood-cutter'

In [59]:
def get_start_end_bytes_in_sentences(coref, coref2character, corefs_df, tokens_df, ses_df, text):
    start_end_coref = get_start_end_bytes_of_coref(coref, corefs_df, tokens_df, text)
    print(len(start_end_coref))
    
    start_end_bytes_in_sentence = []
    
    for (coref_start, coref_end, character_text) in start_end_coref:
        ses_char_df = ses_df[(coref_start >= ses_df['start']) & (coref_end <= ses_df['end'])]
        if len(ses_char_df) > 0:
            sentence_id = ses_char_df['sentence_id'].values[0]
            start_sentence = ses_char_df['start'].values[0]
            end_sentence = ses_char_df['end'].values[0]
            start_coref_in_sentence = coref_start - start_sentence
            end_coref_in_sentence = start_coref_in_sentence + len(character_text)
            character_text_in_sentence = ses_char_df['text'].values[0][start_coref_in_sentence:end_coref_in_sentence]
            start_end_bytes_in_sentence.append([coref, coref2character[coref], character_text, sentence_id, start_coref_in_sentence, end_coref_in_sentence])
    return start_end_bytes_in_sentence

In [27]:
def get_sentences_by_characters_table(corefs, coref2character, corefs_df, tokens_df, text):
    start_end_sentences = get_start_end_bytes_of_sentences(text)
    ses_df = pd.DataFrame(start_end_sentences).reset_index().rename(columns = {'index': 'sentence_id', 0: 'text', 1: 'start', 2: 'end'})
    
    sentences_by_character = []
    
    for coref in corefs:
        sentences_by_character += get_start_end_bytes_in_sentences(coref, coref2character, corefs_df, tokens_df, ses_df, text)
        
    return sentences_by_character

In [28]:
test = get_sentences_by_characters_table(corefs, coref2character, corefs_df, tokens_df, story_text_txt)

In [29]:
sentences_by_character_df = pd.DataFrame(test).rename(columns = {0: 'coref_id', 1: 'character_name', 2: 'character_token', 3: 'sentence_id', 4: 'start_byte_in_sentence', 5: 'end_byte_in_sentence'})

In [30]:
sentences_by_character_df

Unnamed: 0,coref_id,character_name,character_token,sentence_id,start_byte_in_sentence,end_byte_in_sentence
0,17,an old bamboo wood - cutter,an old bamboo wood-cutter,0,28,53
1,17,an old bamboo wood - cutter,his,4,40,43
2,17,an old bamboo wood - cutter,he,8,19,21
3,17,an old bamboo wood - cutter,his,3,234,237
4,17,an old bamboo wood - cutter,his,7,37,40
...,...,...,...,...,...,...
884,354,the messengers,the messengers,235,30,44
885,355,charioteers,charioteers,235,49,60
886,362,the receding Princess,the receding Princess,237,104,125
887,369,the Royal emissaries,the Royal emissaries,240,22,42


In [31]:
for i, row in sentences_by_character_df.iterrows():
    if row['character_token'] == 'his old wife':
        print(row)

coref_id                            25
character_name            his old wife
character_token           his old wife
sentence_id                          3
start_byte_in_sentence             234
end_byte_in_sentence               246
Name: 18, dtype: object


In [32]:
sentences_by_character_df[sentences_by_character_df['sentence_id'] == 3]

Unnamed: 0,coref_id,character_name,character_token,sentence_id,start_byte_in_sentence,end_byte_in_sentence
3,17,an old bamboo wood - cutter,his,3,234,237
5,17,an old bamboo wood - cutter,he,3,227,229
6,17,an old bamboo wood - cutter,he,3,29,31
7,17,an old bamboo wood - cutter,he,3,5,7
11,17,an old bamboo wood - cutter,his,3,17,20
18,25,his old wife,his old wife,3,234,246


In [33]:
def get_pos_from_character_tokens(character_tokens):
    pos = pos_tag(character_tokens.split(' '))
    if len(pos) == 1:
        if (pos[0][1] == 'PRP$') or (pos[0][0][-2:] == "'s") or (pos[0][0][-2:] == "'"):
            return 'possessive'
        else:
            return 'noun'
    else:
        return 'noun'

In [34]:
get_pos_from_character_tokens("his old wife")

'noun'

In [35]:
get_pos_from_character_tokens("Paulina's")

'possessive'

In [36]:
get_pos_from_character_tokens("his")

'possessive'

In [37]:
pos_tag("he killed his family".split(' '))

[('he', 'PRP'), ('killed', 'VBD'), ('his', 'PRP$'), ('family', 'NN')]

In [38]:
pos_tag("he killed her".split(' '))

[('he', 'PRP'), ('killed', 'VBD'), ('her', 'PRP')]

In [39]:
pos_tag("he killed her family".split(' '))

[('he', 'PRP'), ('killed', 'VBD'), ('her', 'PRP'), ('family', 'NN')]

In [40]:
def check_overlapping_spans(row, spans):
    curr_span = (row['start_byte_in_sentence'], row['end_byte_in_sentence'])
    spans_c = spans.copy()
    spans_c.remove(curr_span)
    for span in spans_c:
        if (curr_span[0] >= span[0]) and (curr_span[1] <= span[1]):
            return 1
    return 0

In [41]:
def check_if_coref_possessive(sentence_character_df: pd.DataFrame):
    if len(sentence_character_df) > 0:
        byte_start_ends_dict = sentence_character_df[['start_byte_in_sentence', 'end_byte_in_sentence']].to_dict('records')
        byte_start_ends = [(row['start_byte_in_sentence'], row['end_byte_in_sentence']) for row in byte_start_ends_dict]
        return sentence_character_df.apply(check_overlapping_spans, spans = byte_start_ends, axis = 1) 

In [42]:
test = sentences_by_character_df[sentences_by_character_df['sentence_id'] == 3]

In [43]:
check_if_coref_possessive(test)

3     1
5     0
6     0
7     0
11    0
18    0
dtype: int64

In [44]:
series = check_if_coref_possessive(test)

In [45]:
test = sentences_by_character_df[sentences_by_character_df['sentence_id'] == 4]

In [46]:
series_2 = check_if_coref_possessive(test)

In [47]:
pd.concat([series, series_2])

3     1
5     0
6     0
7     0
11    0
18    0
1     0
12    0
dtype: int64

In [48]:
sentences_by_character_df.loc[11, :]

coref_id                                           17
character_name            an old bamboo wood - cutter
character_token                                   his
sentence_id                                         3
start_byte_in_sentence                             17
end_byte_in_sentence                               20
Name: 11, dtype: object

In [49]:
sentences_by_character_df.iloc[855]

coref_id                                    306
character_name            two thousand warriors
character_token           two thousand warriors
sentence_id                                 199
start_byte_in_sentence                       63
end_byte_in_sentence                         84
Name: 855, dtype: object

In [50]:
sentences = split_sentences(story_text_txt)

In [51]:
sentences[241]

'So to this day people say there is smoke to be seen rising from the top of Mount Fuji to the clouds.'

In [52]:
sentences[241][15:21]

'people'

In [53]:
sentences_by_character_df.iloc[100]

coref_id                           29
character_name            the old man
character_token                   His
sentence_id                       239
start_byte_in_sentence              0
end_byte_in_sentence                3
Name: 100, dtype: object

In [54]:
sentences[221]

'"I pray you to look elsewhere."'

In [55]:
sentences_by_character_df[sentences_by_character_df['sentence_id'] == 0]

Unnamed: 0,coref_id,character_name,character_token,sentence_id,start_byte_in_sentence,end_byte_in_sentence
0,17,an old bamboo wood - cutter,an old bamboo wood-cutter,0,28,53


In [60]:
def bookNLP_to_sentences_csv(entities_file, tokens_file, story_txt_file, output_sentences_file, output_sentences_by_characters_file):
    entities_df = pd.read_csv(entities_file, sep='\t')
    tokens_df = pd.read_csv(tokens_file, delimiter = '\t')
    story_text = read_story_txt_file(story_txt_file)
    
    characters_df = entities_df[entities_df['cat'] == 'PER']
    
    # All corefs
    corefs = characters_df['COREF'].unique()
    
    # All valid corefs (no corefs with only pronouns or possessives)
    coref2character = get_coref_character_names(corefs, characters_df) 
    corefs = coref2character.keys()
        
    corefs_df = characters_df[characters_df['COREF'].isin(corefs)][['COREF', 'start_token', 'end_token', 'text']].sort_values(by = 'COREF')
    
    start_end_sentences = get_start_end_bytes_of_sentences(story_text)
    sentences_df = pd.DataFrame(start_end_sentences).reset_index().rename(columns = {'index': 'sentence_id', 0: 'text', 1: 'start', 2: 'end'})

    sentences_by_character = get_sentences_by_characters_table(corefs, coref2character, corefs_df, tokens_df, story_text)
    
    sentences_by_character_df = pd.DataFrame(sentences_by_character).rename(columns = {0: 'coref_id', 
                                                                     1: 'character_name', 
                                                                     2: 'character_token', 
                                                                     3: 'sentence_id', 
                                                                     4: 'start_byte_in_sentence', 
                                                                     5: 'end_byte_in_sentence'})
    
    overlaps_series_all = []
    for i in range(len(sentences)):
        sentence_by_character_df = sentences_by_character_df.loc[sentences_by_character_df['sentence_id'] == i]
        overlap_series = check_if_coref_possessive(sentence_by_character_df)
        overlaps_series_all.append(overlap_series)
                
    sentences_by_character_df = sentences_by_character_df.merge(pd.concat(overlaps_series_all).rename('overlap'), left_index=True, right_index=True)

    # sentences_df.to_csv(output_sentences_file, index = False)
    print("CSV of sentences saved to path: {}".format(output_sentences_file))
    # sentences_by_character_df.to_csv(output_sentences_by_characters_file, index = False)
    print("CSV of sentences by character saved to path: {}".format(output_sentences_by_characters_file))

### Running

In [57]:
story_path = '/Users/pti/challenges/4761_fair_fairytale/fair-fairytale-nlp/data/testing_subset/story_txts/bamboo-cutter-moon-child.txt'
tokens_path = '/Users/pti/challenges/4761_fair_fairytale/fair-fairytale-nlp/data/testing_subset/booknlp_GX/bamboo-cutter-moon-child/bamboo-cutter-moon-child.tokens'
entities_path = '/Users/pti/challenges/4761_fair_fairytale/fair-fairytale-nlp/data/testing_subset/booknlp_GX/bamboo-cutter-moon-child/bamboo-cutter-moon-child.entities'
output_characters_path = '/Users/pti/challenges/4761_fair_fairytale/fair-fairytale-nlp/data/pipeline/sentences_by_character/bamboo-cutter-moon-child.sentences_characters.csv'
output_sentences_path = '/Users/pti/challenges/4761_fair_fairytale/fair-fairytale-nlp/data/pipeline/sentences/bamboo-cutter-moon-child.sentences.csv'

In [61]:
bookNLP_to_sentences_csv(entities_path, tokens_path, story_path, output_sentences_path, output_characters_path)

165
889
16
1
1
1
1
3
105
1
4
3
1
10
1
1
1
1
13
1
5
1
1
2
1
1
1
99
1
1
1
1
1
154
1
1
10
1
1
1
2
1
14
2
1
7
1
22
1
4
13
1
2
1
2
1
1
1
1
2
3
1
1
1
1
2
1
1
4
1
4
2
2
2
19
1
1
10
3
1
64
3
9
1
1
1
1
32
1
1
2
1
5
1
1
8
1
3
4
1
2
1
1
1
4
1
1
1
2
6
1
1
2
4
1
3
52
1
1
1
1
1
1
1
1
1
1
1
1
3
1
1
3
1
1
8
1
3
2
1
1
2
1
1
1
1
3
1
1
1
1
5
1
1
1
1
2
1
1
2
1
1
1
1
1
1
1
CSV of sentences saved to path: /Users/pti/challenges/4761_fair_fairytale/fair-fairytale-nlp/data/pipeline/sentences/bamboo-cutter-moon-child.sentences.csv
CSV of sentences by character saved to path: /Users/pti/challenges/4761_fair_fairytale/fair-fairytale-nlp/data/pipeline/sentences_by_character/bamboo-cutter-moon-child.sentences_characters.csv


#### Testing Subset

In [59]:
book_ids = ['cinderella-or-the-little-glass-slipper', 'ali-baba-and-forty-thieves', 'old-dschang', 'leelinau-the-lost-daughter', 'the-dragon-princess']

In [61]:
for book_id in book_ids:
    story_path = '/Users/pti/challenges/4761_fair_fairytale/fair-fairytale-nlp/data/testing_subset/story_txts/' + book_id + '.txt'
    
    input_dir = '/Users/pti/challenges/4761_fair_fairytale/fair-fairytale-nlp/data/testing_subset/booknlp_GX/' + book_id + '/'
    tokens_path = input_dir + book_id + '.tokens'
    entities_path = input_dir + book_id + '.entities'
    
    output_characters_path = '/Users/pti/challenges/4761_fair_fairytale/fair-fairytale-nlp/data/pipeline/sentences_by_character/' + book_id + '.sentences_characters.csv'
    output_sentences_path = '/Users/pti/challenges/4761_fair_fairytale/fair-fairytale-nlp/data/pipeline/sentences/' + book_id + '.sentences.csv'
    
    bookNLP_to_sentences_csv(entities_path, tokens_path, story_path, output_sentences_path, output_characters_path)

CSV of sentences saved to path: /Users/pti/challenges/4761_fair_fairytale/fair-fairytale-nlp/data/pipeline/sentences/cinderella-or-the-little-glass-slipper.sentences.csv
CSV of sentences by character saved to path: /Users/pti/challenges/4761_fair_fairytale/fair-fairytale-nlp/data/pipeline/sentences_by_character/cinderella-or-the-little-glass-slipper.sentences_characters.csv
CSV of sentences saved to path: /Users/pti/challenges/4761_fair_fairytale/fair-fairytale-nlp/data/pipeline/sentences/ali-baba-and-forty-thieves.sentences.csv
CSV of sentences by character saved to path: /Users/pti/challenges/4761_fair_fairytale/fair-fairytale-nlp/data/pipeline/sentences_by_character/ali-baba-and-forty-thieves.sentences_characters.csv
CSV of sentences saved to path: /Users/pti/challenges/4761_fair_fairytale/fair-fairytale-nlp/data/pipeline/sentences/old-dschang.sentences.csv
CSV of sentences by character saved to path: /Users/pti/challenges/4761_fair_fairytale/fair-fairytale-nlp/data/pipeline/sentenc

In [62]:
sent_char_df = pd.read_csv('/Users/pti/challenges/4761_fair_fairytale/fair-fairytale-nlp/data/pipeline/sentences_by_character/cinderella-or-the-little-glass-slipper.sentences_characters.csv')

In [63]:
sent_df = pd.read_csv('/Users/pti/challenges/4761_fair_fairytale/fair-fairytale-nlp/data/pipeline/sentences/cinderella-or-the-little-glass-slipper.sentences.csv')

In [64]:
sent_char_df.head()

Unnamed: 0,coref_id,character_name,character_token,sentence_id,start_byte_in_sentence,end_byte_in_sentence,overlap
0,3,"a gentleman who married , for his second wife",He,2,0,2,0
1,3,"a gentleman who married , for his second wife",his,0,44,47,1
2,3,"a gentleman who married , for his second wife","a gentleman who married, for his second wife",0,15,59,0
3,4,his second wife,his second wife,0,44,59,1
4,4,his second wife,her,2,126,129,1


In [65]:
sent_char_df.shape

(460, 7)

In [66]:
sent_char_df.iloc[442]

coref_id                                            104
character_name            The guards at the palace gate
character_token                                    they
sentence_id                                         107
start_byte_in_sentence                               45
end_byte_in_sentence                                 49
overlap                                               0
Name: 442, dtype: object

In [67]:
sentences = sent_df['text'].tolist()

In [68]:
sentences[127]

'Cinderella, who was no less good than beautiful, gave her two sisters lodgings in the palace, and that very same day matched them with two great lords of the Court.'

In [69]:
sentences[127][135:163]

'two great lords of the Court'

In [70]:
for i, sentence in enumerate(sentences):
    if ';' in sentence:
        print(i, sentence)

5 She employed her in the meanest work of the house: she scoured the dishes, tables, etc., and scrubbed madam's chamber, and those of misses, her daughters; 
7 The poor girl bore all patiently, and dared not tell her father, who would have rattled her off; 
9 When she had done her work, she used to go into the chimney-corner, and sit down among cinders and ashes, which made her commonly be called Cinderwench; 
15 This was a new trouble to Cinderella; 
16  for it was she who ironed her sisters' linen, and plaited their ruffles; 
19 "And I," said the youngest, "shall have my usual petticoat; 
25 said she, "you only jeer me; 
27 "You are in the right of it," replied they; 
31 At last the happy day came; 
34 "I wish I could-I wish I could-";
36 This godmother of hers, who was a fairy, said to her, "You wish you could go to the ball; 
42 Her godmother scooped out all the inside of it, having left nothing but the rind; 
46 "You are in the right," replied her godmother; 
52 The Fairy then sai

In [71]:
sent_char_df[sent_char_df['sentence_id'] == 5]

Unnamed: 0,coref_id,character_name,character_token,sentence_id,start_byte_in_sentence,end_byte_in_sentence,overlap
15,12,the mother - in - law,She,5,0,3,0
16,12,the mother - in - law,her,5,140,143,1
18,12,the mother - in - law,she,5,51,54,0
21,12,the mother - in - law,her,5,13,16,0
28,17,madam,madam,5,102,107,0
29,19,misses,misses,5,132,138,0
30,20,her daughters,her daughters,5,140,153,0


In [72]:
sentences[5][0:3]

'She'

In [73]:
sentences[5][140:153]

'her daughters'

In [74]:
sentences[5][102:107]

'madam'

In [75]:
sent_char_df[sent_char_df['sentence_id'] == 102]

Unnamed: 0,coref_id,character_name,character_token,sentence_id,start_byte_in_sentence,end_byte_in_sentence,overlap
168,1,Cinderella,she,102,9,12,0
169,1,Cinderella,she,102,62,65,0


In [76]:
sentences[102][9:12]

'she'

In [77]:
sentences[102][62:65]

'she'