# Match quotes from original tokens file to whitespace-tokenized

In [89]:
# Load token files
import pandas as pd
import csv

def load_tokens_file(token_fpath):                                                   
    """ Load a BookNLP tokens file, return a pandas DataFrame """                    
    return pd.read_csv(token_fpath, sep='\t', quoting=csv.QUOTE_NONE)       

original_fpath = '/usr0/home/mamille2/book-nlp/data/tokens/annotated_10fandom_dev_original_tokenization/teenwolf_1145590.tokens'
whitespace_fpath = '/usr0/home/mamille2/book-nlp-whitespace-tok/data/tokens/annotated_10fandom_dev/teenwolf_1145590.tokens'
original_data = load_tokens_file(original_fpath)
whitespace_data = load_tokens_file(whitespace_fpath)
print(len(original_data))
print(len(whitespace_data))

3496
3493


In [77]:
pd.set_option('display.max_rows', None)
selected_columns = ['tokenId', 'originalWord', 'normalizedWord', 'lemma']
whitespace_data[selected_columns]

Unnamed: 0,tokenId,originalWord,normalizedWord,lemma
0,0,Two,Two,two
1,1,weeks,weeks,week
2,2,.,.,.
3,3,Two,Two,two
4,4,weeks,weeks,week
5,5,of,of,of
6,6,chasing,chasing,chase
7,7,a,a,a
8,8,HYDRA,HYDRA,hydra
9,9,weapons,weapons,weapon


In [78]:
original_data[selected_columns]

Unnamed: 0,tokenId,originalWord,normalizedWord,lemma
0,0,Two,Two,two
1,1,weeks,weeks,week
2,2,.,.,.
3,3,Two,Two,two
4,4,weeks,weeks,week
5,5,of,of,of
6,6,chasing,chasing,chase
7,7,a,a,a
8,8,HYDRA,HYDRA,hydra
9,9,weapons,weapons,weapon


In [91]:
# Change over quote tokens (using the next token)
import pdb

new_tokens = []
offset = 0 # how many tokens whitespace appears to be off from original
desired_quote_chars = ['``', '`', "''", "'"]
all_quote_chars = ['“', '``', '"', '«', '”', "''", '"', '»', "'"]
# transformations = {'-RRB-': ')',
#                    '-LRB_': '(',
#                    '?-': '-'
#                   }
# transformations_quotes = {**transformations, **{
#                     '-RRB-': ')',
#                    '-LRB_': '(',
#                    '-': '?-',
#                    "``": '"',
#                     "``": '“',
#                    "`": "'",
#                     "''": '"',
#                     "''": '”',
#                   }}
transformations = {')': '-RRB-',
                   '(': '-LRB_',
                   '-': '?-',
                   '…': '...'
                  }
transformations_quotes = {**transformations, **{
                   '"': "``",
                   '“': "``",
                   "'": "`",
                   '"': "''",
                   '”': "''",
                  }}

def token_matches(original_tok, whitespace_tok, quotes=False):
    if original_tok == whitespace_tok:
        return True
    if quotes:
        if whitespace_tok in transformations_quotes and transformations_quotes[whitespace_tok] == original_tok:
            return True
        else:
            return False
    else:
        if whitespace_tok in transformations and transformations[whitespace_tok] == original_tok:
            return True
        else:
            return False

for i in range(len(whitespace_data)):
    if i+offset >= len(original_data):
        pdb.set_trace()
    original_tok = original_data.loc[i+offset, 'normalizedWord']
    whitespace_tok = whitespace_data.loc[i, 'normalizedWord']
    tok_to_add = whitespace_tok
    
    if not token_matches(original_tok, whitespace_tok, quotes=False):
        if original_tok in quote_chars:
            # Add the original quote token
            tok_to_add = original_tok
        else:
            next_whitespace_tok = whitespace_data.loc[i+1, 'normalizedWord']
            if next_whitespace_tok == '?-': pdb.set_trace()
            # Find offset using the next non-quote character
            for j in range(1,5):
                next_original_tok = original_data.loc[i+offset+j, 'normalizedWord']
                if token_matches(next_original_tok, next_whitespace_tok, quotes=True):
                    offset += j-1
                    break
            else:
                pdb.set_trace()
    new_tokens.append(tok_to_add)
    
print(len(new_tokens))
print(len(whitespace_data))
# print('“' in whitespace_data['normalizedWord'].tolist())
print('"' in whitespace_data['normalizedWord'].tolist())
print('"' in new_tokens)
# whitespace_data['normalizedWord'] = new_tokens
# whitespace_data['lemma'] = new_tokens

3493
3493
True
False


# Check BookNLP token output

In [1]:
# Load existing tokens file
import pandas as pd

fpath = '/projects/book-nlp/data/tokens/annotated_10fandom_test/allmarvel_606106.tokens'

token_data = pd.read_csv(fpath, sep='\t')
token_data

Unnamed: 0,paragraphId,sentenceID,tokenId,beginOffset,endOffset,whitespaceAfter,headTokenId,originalWord,normalizedWord,lemma,pos,ner,deprel,inQuotation,characterId,supersense
0,0,0,0,0,7,,2,Anthony,Anthony,Anthony,NNP,PERSON,nn,O,-1,B-noun.person
1,0,0,1,8,14,,2,Edward,Edward,Edward,NNP,PERSON,nn,O,-1,I-noun.person
2,0,0,2,15,20,,6,Stark,Stark,Stark,NNP,PERSON,nsubj,O,-1,I-noun.person
3,0,0,3,21,24,,6,was,was,be,VBD,O,cop,O,-1,B-verb.stative
4,0,0,4,25,26,,6,a,a,a,DT,O,det,O,-1,O
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3145,0,147,4166,19443,19446,,4168,his,his,he,PRP$,O,poss,O,8,O
3146,0,147,4167,19447,19450,,4168,own,own,own,JJ,O,amod,O,-1,O
3147,0,147,4168,19451,19457,,4165,cheeks,cheeks,cheek,NNS,O,pobj,O,-1,B-noun.body
3148,0,147,4169,19458,19459,,4143,.,.,.,.,O,punct,O,-1,O


In [2]:
# Load original tokens file
import pandas as pd

fpath = '/projects/book-nlp/data/tokens/annotated_10fandom_test_old/allmarvel_606106.tokens'

token_data = pd.read_csv(fpath, sep='\t')
token_data

Unnamed: 0,paragraphId,sentenceID,tokenId,beginOffset,endOffset,whitespaceAfter,headTokenId,originalWord,normalizedWord,lemma,pos,ner,deprel,inQuotation,characterId,supersense
0,0,0,0,0,7,S,2,Anthony,Anthony,Anthony,NNP,PERSON,nn,O,-1,B-noun.person
1,0,0,1,8,14,S,2,Edward,Edward,Edward,NNP,PERSON,nn,O,-1,I-noun.person
2,0,0,2,15,20,S,6,Stark,Stark,Stark,NNP,PERSON,nsubj,O,-1,I-noun.person
3,0,0,3,21,24,S,6,was,was,be,VBD,O,cop,O,-1,B-verb.stative
4,0,0,4,25,26,S,6,a,a,a,DT,O,det,O,-1,O
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3147,60,149,4172,19443,19446,S,4174,his,his,he,PRP$,O,poss,O,8,O
3148,60,149,4173,19447,19450,S,4174,own,own,own,JJ,O,amod,O,-1,O
3149,60,149,4174,19451,19457,S,4171,cheeks,cheeks,cheek,NNS,O,pobj,O,-1,B-noun.body
3150,60,149,4175,19458,19459,NN,4149,.,.,.,.,O,punct,O,-1,O


# Modify tokens file with gold coref annotations

In [4]:
# Load existing tokens file
import pandas as pd

fpath = '/projects/book-nlp/data/tokens/annotated_10fandom_test/allmarvel_606106.tokens'

token_data = pd.read_csv(fpath, sep='\t')
token_data

Unnamed: 0,paragraphId,sentenceID,tokenId,beginOffset,endOffset,whitespaceAfter,headTokenId,originalWord,normalizedWord,lemma,pos,ner,deprel,inQuotation,characterId,supersense
0,0,0,0,0,7,,2,Anthony,Anthony,Anthony,NNP,PERSON,nn,O,-1,B-noun.person
1,0,0,1,8,14,,2,Edward,Edward,Edward,NNP,PERSON,nn,O,-1,I-noun.person
2,0,0,2,15,20,,6,Stark,Stark,Stark,NNP,PERSON,nsubj,O,-1,I-noun.person
3,0,0,3,21,24,,6,was,was,be,VBD,O,cop,O,-1,B-verb.stative
4,0,0,4,25,26,,6,a,a,a,DT,O,det,O,-1,O
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3145,0,147,4166,19443,19446,,4168,his,his,he,PRP$,O,poss,O,8,O
3146,0,147,4167,19447,19450,,4168,own,own,own,JJ,O,amod,O,-1,O
3147,0,147,4168,19451,19457,,4165,cheeks,cheeks,cheek,NNS,O,pobj,O,-1,B-noun.body
3148,0,147,4169,19458,19459,,4143,.,.,.,.,O,punct,O,-1,O


In [2]:
from collections import namedtuple

Span = namedtuple('Span', ['chapter_id', 'paragraph_id', 'start_token_id', 'end_token_id', 'character'])
test = Span(chapter_id=1, paragraph_id=1, start_token_id=1, end_token_id=3, character='Stark')
test

Span(chapter_id=1, paragraph_id=1, start_token_id=1, end_token_id=3, character='Stark')

In [5]:
for i in range(0,4):
    token_data.loc[(token_data['paragraphId']==0) & (token_data['tokenId']==i), 'characterId'] = 1
    
token_data

Unnamed: 0,paragraphId,sentenceID,tokenId,beginOffset,endOffset,whitespaceAfter,headTokenId,originalWord,normalizedWord,lemma,pos,ner,deprel,inQuotation,characterId,supersense
0,0,0,0,0,7,,2,Anthony,Anthony,Anthony,NNP,PERSON,nn,O,1,B-noun.person
1,0,0,1,8,14,,2,Edward,Edward,Edward,NNP,PERSON,nn,O,1,I-noun.person
2,0,0,2,15,20,,6,Stark,Stark,Stark,NNP,PERSON,nsubj,O,1,I-noun.person
3,0,0,3,21,24,,6,was,was,be,VBD,O,cop,O,1,B-verb.stative
4,0,0,4,25,26,,6,a,a,a,DT,O,det,O,-1,O
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3145,0,147,4166,19443,19446,,4168,his,his,he,PRP$,O,poss,O,8,O
3146,0,147,4167,19447,19450,,4168,own,own,own,JJ,O,amod,O,-1,O
3147,0,147,4168,19451,19457,,4165,cheeks,cheeks,cheek,NNS,O,pobj,O,-1,B-noun.body
3148,0,147,4169,19458,19459,,4143,.,.,.,.,O,punct,O,-1,O


# Old

In [15]:
import pandas as pd
import csv
import json

In [13]:
data = pd.read_csv('percy_jackson_1.tokens', error_bad_lines=False, sep='\t', engine='python', quoting=csv.QUOTE_NONE)
print(len(data))
print(data.columns)

226481
Index(['paragraphId', 'sentenceID', 'tokenId', 'beginOffset', 'endOffset',
       'whitespaceAfter', 'headTokenId', 'originalWord', 'normalizedWord',
       'lemma', 'pos', 'ner', 'deprel', 'inQuotation', 'characterId',
       'supersense'],
      dtype='object')


In [6]:
data['inQuotation'].count()

110913

In [7]:
data['inQuotation'].head()

0    False
1    False
2    False
3    False
4    False
Name: inQuotation, dtype: bool

In [10]:
pd.set_option('display.max_colwidth', -1)

In [14]:
data[data['inQuotation']==True]

Unnamed: 0,paragraphId,sentenceID,tokenId,beginOffset,endOffset,whitespaceAfter,headTokenId,originalWord,normalizedWord,lemma,pos,ner,deprel,inQuotation,characterId,supersense
178,6,10,178,798,799,,180,"""",``,``,``,O,punct,True,-1,O
179,6,10,179,799,804,S,180,Percy,Percy,Percy,NNP,PERSON,nsubj,True,13,B-noun.person
180,6,10,180,805,807,S,177,go,go,go,VB,O,ccomp,True,-1,B-verb.motion
181,6,10,181,808,812,S,180,away,away,away,RB,O,advmod,True,-1,O
182,6,10,182,813,814,,184,I,I,I,PRP,O,nsubj,True,-1,O
183,6,10,183,814,816,S,184,'m,'m,be,VBP,O,aux,True,-1,O
184,6,10,184,817,822,S,180,tying,tying,tie,VBG,O,ccomp,True,-1,B-verb.contact
185,6,10,185,823,829,S,184,myself,myself,myself,PRP,O,dobj,True,-1,O
186,6,10,186,830,832,S,184,to,to,to,TO,O,prep,True,-1,O
187,6,10,187,833,834,S,188,a,a,a,DT,O,det,True,-1,O


In [16]:
# Load JSON
with open('book.id.book', 'rb') as f:
    metadata = json.load(f)

In [17]:
metadata.keys()

dict_keys(['characters', 'id'])

In [24]:
for i,d in enumerate(metadata['characters']):
    print(i, d['names'])

0 [{'c': 13, 'n': 'Grover'}]
1 [{'c': 2, 'n': 'Walmart'}]
2 [{'c': 3, 'n': 'Mom'}]
3 [{'c': 19, 'n': 'Keily'}]
4 [{'c': 8, 'n': 'Nyssa'}]
5 [{'c': 5, 'n': 'Mr. D'}]
6 [{'c': 885, 'n': 'Annabeth'}, {'c': 10, 'n': 'Annabeth Chase'}, {'c': 8, 'n': 'ANNABETH'}]
7 [{'c': 2, 'n': 'Ms. Tojo'}]
8 [{'c': 672, 'n': 'Jason'}, {'c': 6, 'n': 'Jason Grace'}, {'c': 1, 'n': 'Grace'}]
9 [{'c': 6, 'n': 'Poseidon'}]
10 [{'c': 5, 'n': 'Mavis'}]
11 [{'c': 2, 'n': 'Bunker'}]
12 [{'c': 2, 'n': 'Flowers'}]
13 [{'c': 1295, 'n': 'Percy'}, {'c': 31, 'n': 'Percy Jackson'}, {'c': 9, 'n': 'Jackson'}, {'c': 8, 'n': 'PERCY'}]
14 [{'c': 2, 'n': 'Cameron'}]
15 [{'c': 2, 'n': 'Dr. Phil'}]
16 [{'c': 2, 'n': 'Google'}]
17 [{'c': 24, 'n': 'Thalia'}]
18 [{'c': 13, 'n': 'Connor'}]
19 [{'c': 2, 'n': 'Zeus'}]
20 [{'c': 2, 'n': 'Marina'}]
21 [{'c': 10, 'n': 'Chris'}]
22 [{'c': 15, 'n': 'Annie'}]
23 [{'c': 3, 'n': 'Calypso'}]
24 [{'c': 19, 'n': 'Athena'}]
25 [{'c': 7, 'n': 'Frank'}]
26 [{'c': 3, 'n': 'Gaia'}]
27 [{'c': 5, 'n': '

In [25]:
metadata['characters'][13]['speaking']

[{'i': 288,
  'w': "`` Hey , Hazel , can I borrow your brother for a minute ? '' "},
 {'i': 322,
  'w': "`` Go ahead . Piper and I have been hoping for some girl-time , I 'll go see if she 's free . '' "},
 {'i': 387, 'w': "`` Woah , dude , I just want to talk , '' "},
 {'i': 452, 'w': "`` Okay , '' "},
 {'i': 543,
  'w': "`` I - when Annabeth and I were in Tartarus , I realised - I 've not been a very good friend to you . Ever , really . I pretty much jumped from treating you like a little kid to taking for granted that you could take care of yourself , and - and I ` msorry . '' "},
 {'i': 696,
  'w': "`` You had more important things to do than make sure that I was okay . '' "},
 {'i': 785,
  'w': "`` Are you kidding ? Of course I was worried about you , '' "},
 {'i': 802,
  'w': "`` You 're my friend and I care about you . How could I have not been worried ? '' "},
 {'i': 856, 'w': "`` Okay , that 's - that 's all I had to say , '' "},
 {'i': 880, 'w': "`` Wait , no , I lied . One m

In [26]:
len(metadata['characters'][13]['speaking'])

602