In [1]:
import re

import pandas as pd
import spacy

In [2]:
# Keep only the "c119" column of the FAA maintenance CSV; it holds the
# free-text narratives processed in the rest of the notebook.
data = pd.read_csv(
    "../data/FAA_data/Maintenance_Text_data_nona.csv"
)["c119"]

1. Document ID: This is a variation on the document filename
2. Part number: Some files are divided into multiple parts numbered as 000, 001, 002, ... etc.
3. Word number
4. Word itself: This is the token as segmented/tokenized in the Treebank. Initially the *_skel files contain the placeholder [WORD], which gets replaced by the actual token from the Treebank that is part of the OntoNotes release.
5. Part-of-Speech
6. Parse bit: This is the bracketed structure broken before the first open parenthesis in the parse, and the word/part-of-speech leaf replaced with a *. The full parse can be created by substituting the asterisk with the "([pos] [word])" string (or leaf) and concatenating the items in the rows of that column.
7. Predicate lemma: The predicate lemma is mentioned for the rows for which we have semantic role information. All other rows are marked with a "-"
8. Predicate Frameset ID: This is the PropBank frameset ID of the predicate in Column 7.
9. Word sense: This is the word sense of the word in Column 4.
10. Speaker/Author: This is the speaker or author name where available. Mostly in Broadcast Conversation and Web Log data.
11. Named Entities: This column identifies the spans representing various named entities.

12 - N Predicate Arguments: There is one column each of predicate argument structure information for the predicate mentioned in Column 7.

N. Coreference: Coreference chain information encoded in a parenthesis structure.

In [5]:
def get_parse_bit(token):
    """Render ``token`` and its syntactic children as a bracketed string.

    The string is built recursively from ``token.children``, i.e. from the
    dependency tree; it is not a constituency parse.

    Parameters
    ----------
    token
        Object exposing ``dep_``, ``pos_``, ``text`` and an iterable
        ``children`` (a spaCy ``Token`` in this notebook).

    Returns
    -------
    str
        * the bare text for tokens whose ``dep_`` is ``'punct'``;
        * ``"(POS text)"`` for a token without children;
        * ``"(POS text child1 child2 ...)"`` otherwise, each child rendered
          by a recursive call.
    """
    if token.dep_ == 'punct':
        return token.text

    # spaCy exposes ``Token.children`` as a generator, and a generator object
    # is always truthy. Materialise it so the emptiness test really
    # distinguishes leaves; otherwise leaves come out as "(POS text )" with a
    # stray trailing space.
    children = list(token.children)
    if not children:
        return f"({token.pos_} {token.text})"

    nested = ' '.join(get_parse_bit(child) for child in children)
    return f"({token.pos_} {token.text} {nested})"

def convert_to_conll12(text, nlp=None, doc_id=None):
    """Print ``text`` as tab-separated, CoNLL-2012-style rows, one per token.

    A header line is printed first, followed by one 13-field row per token.
    Only the word number, word, part-of-speech and parse-bit fields carry
    real values; the part number is always "0" and every remaining
    annotation field is the placeholder "-".

    Parameters
    ----------
    text : str
        Raw text to analyse.
    nlp : callable, optional
        An already-loaded spaCy pipeline. Loading a model is expensive, so
        pass one in when converting many texts. When omitted,
        ``en_core_web_sm`` is loaded on every call.
    doc_id : str, optional
        Value written to the "Document ID" column. When omitted, the 1-based
        token number is written there, as in earlier versions.

    Returns
    -------
    None
        The table is written to standard output.
    """
    if nlp is None:
        nlp = spacy.load("en_core_web_sm")
    doc = nlp(text)

    header = ["# Document ID", "Part number", "Word number", "Word itself", "Part-of-Speech", "Parse bit",
              "Predicate lemma", "Predicate Frameset ID", "Word sense", "Speaker/Author", "Named Entities",
              "Predicate Arguments", "Coreference"]
    print("\t".join(header))

    # Columns after the parse bit are not computed by this notebook.
    placeholders = ["-"] * 7

    for i, token in enumerate(doc):
        # Token numbers are 1-based and run over the whole text; they do not
        # restart at sentence boundaries.
        word_number = str(i + 1)
        first_column = word_number if doc_id is None else str(doc_id)
        row = [first_column, "0", word_number, token.text, token.pos_, get_parse_bit(token)]
        print("\t".join(row + placeholders))


# Example usage: a two-sentence input, so the printed table also shows that
# token numbers keep counting across the sentence boundary.
text = "This is an example sentence. Another sentence follows."
convert_to_conll12(text)

# Document ID	Part number	Word number	Word itself	Part-of-Speech	Parse bit	Predicate lemma	Predicate Frameset ID	Word sense	Speaker/Author	Named Entities	Predicate Arguments	Coreference
1	0	1	This	PRON	(PRON This )	-	-	-	-	-	-	-
2	0	2	is	AUX	(AUX is (PRON This ) (NOUN sentence (DET an ) (NOUN example )) .)	-	-	-	-	-	-	-
3	0	3	an	DET	(DET an )	-	-	-	-	-	-	-
4	0	4	example	NOUN	(NOUN example )	-	-	-	-	-	-	-
5	0	5	sentence	NOUN	(NOUN sentence (DET an ) (NOUN example ))	-	-	-	-	-	-	-
6	0	6	.	PUNCT	.	-	-	-	-	-	-	-
7	0	7	Another	DET	(DET Another )	-	-	-	-	-	-	-
8	0	8	sentence	NOUN	(NOUN sentence (DET Another ))	-	-	-	-	-	-	-
9	0	9	follows	VERB	(VERB follows (NOUN sentence (DET Another )) .)	-	-	-	-	-	-	-
10	0	10	.	PUNCT	.	-	-	-	-	-	-	-


In [None]:
# Re-insert the space that is missing after ",", "." or ")" when the
# punctuation mark is squeezed between two capital letters
# (e.g. "SETTLED.STUDENT" -> "SETTLED. STUDENT").
# The capital that follows is matched with a lookahead so it is not consumed
# and can itself start the next match: "A.B.C" becomes "A. B. C" rather than
# "A. B.C".
pattern = re.compile(r'([A-Z])([,\.\)])(?=[A-Z])')

# Vectorised replacement: independent of the index labels being 0..n-1, and
# missing values pass through instead of raising inside re.sub.
data = data.str.replace(pattern, r'\1\2 ', regex=True)

In [None]:
# Display the Series to inspect the narratives (pandas abbreviates long output).
data

0       TAILWHEEL COCKED RIGHT PRIOR TO TKOF.         ...
1       TOW PLANE BECAME AIRBORNE THEN SETTLED. STUDEN...
2       2ND ILS APCH, ACFT'S G/S INOP. LOM TUNED TO WR...
3       PLT NOTED SOFT R BRAKE PEDAL DRG TAXI TO TKOF....
4       TAXI OFF HARD SFC DUE TFC R MAIN GR BROKE THRO...
                              ...                        
2743    (-23) A/C RELOCATED TO NEW HANGAR TO CHECK SIZ...
2744    (-23) ON 2/23/08 @ APPROXIMATELY 2130 DURING T...
2745    (-23) PILOT TOOK OFF FOR LEESBURG AIRPORT AND ...
2746    (-23) OWNER FORGOT TO FASTEN THE LOWER LEFT 4 ...
2747    (-23) THE AIRCRAFT EXPERIENCED SEVERE TURBULAN...
Name: c119, Length: 2748, dtype: object

In [None]:
# Preview of the tokenisation on the first narrative only: a period that ends
# a whitespace-separated word is split off into its own token, written as the
# escaped literal r'\.'.
for irow in range(len(data)):
    split_entry = data[irow].split()
    new_entry = []
    for word in split_entry:
        if word.endswith('.'):
            stem = word[:-1]
            # A free-standing "." has no stem; skip it so that no empty-string
            # token is emitted in front of the period token.
            if stem:
                new_entry.append(stem)
            new_entry.append(r'\.')
        else:
            new_entry.append(word)
    print(new_entry)
    break  # stop after the first row; this cell is only a preview

['TAILWHEEL', 'COCKED', 'RIGHT', 'PRIOR', 'TO', 'TKOF', '\\.']


In [8]:
docid = "faa"

# Print "<doc id>\t<zero-padded row number>\t<0-based word index>\t<word>" for
# the first five narratives only; use range(len(data)) to cover every row.
for irow in range(5):
    words = data[irow].split()
    for iword in range(len(words)):
        word = words[iword]
        print(f"{docid}\t{irow:04}\t{iword}\t{word}")

faa	0000	0	TAILWHEEL
faa	0000	1	COCKED
faa	0000	2	RIGHT
faa	0000	3	PRIOR
faa	0000	4	TO
faa	0000	5	TKOF.
faa	0001	0	TOW
faa	0001	1	PLANE
faa	0001	2	BECAME
faa	0001	3	AIRBORNE
faa	0001	4	THEN
faa	0001	5	SETTLED.STUDENT
faa	0001	6	THOUGHT
faa	0001	7	TOW
faa	0001	8	IN
faa	0001	9	TROUBLE
faa	0001	10	&
faa	0001	11	RELEASED.HIT
faa	0001	12	TREE.
faa	0002	0	2ND
faa	0002	1	ILS
faa	0002	2	APCH,ACFT'S
faa	0002	3	G/S
faa	0002	4	INOP.LOM
faa	0002	5	TUNED
faa	0002	6	TO
faa	0002	7	WRONG
faa	0002	8	FREQ.
faa	0003	0	PLT
faa	0003	1	NOTED
faa	0003	2	SOFT
faa	0003	3	R
faa	0003	4	BRAKE
faa	0003	5	PEDAL
faa	0003	6	DRG
faa	0003	7	TAXI
faa	0003	8	TO
faa	0003	9	TKOF.FLT
faa	0003	10	RTND
faa	0003	11	SPRINGFIELD
faa	0003	12	DUE
faa	0003	13	SOFT
faa	0003	14	BRAKE
faa	0003	15	STRONG
faa	0003	16	WINDS
faa	0003	17	BOS
faa	0004	0	TAXI
faa	0004	1	OFF
faa	0004	2	HARD
faa	0004	3	SFC
faa	0004	4	DUE
faa	0004	5	TFC
faa	0004	6	R
faa	0004	7	MAIN
faa	0004	8	GR
faa	0004	9	BROKE
faa	0004	10	THROUGH
faa	0004	11	ROOF
faa	0004	12	