# XML Parser

XML Parser for FCE dataset (https://www.ilexir.co.uk/datasets/index.html).

In [1]:
import pandas as pd
import os

In [39]:
# returns list of <p></p> objects for a given XML
def get_p_lst(s):
    ps = []
    s_len = len(s)
    for i in range(s_len):
        if s[i:i+3] == "<p>":
            end_i = i
            while s[end_i:end_i+4] != "</p>":
                end_i += 1
            end_i += 4
            ps.append(s[i:end_i])
    return ps

In [40]:
# returns the incorrect word and the end index of <i></i> object
def get_i(p_str, i):
    p_i = i
    NS_end = False
#     print("get i ", i)
    while p_str[p_i:p_i+3] != "<i>" and p_i != len(p_str):
        if p_str[p_i:p_i+5] == "</NS>":
#             print("</NS>!!")
            NS_end == True
            return "", i
        p_i += 1
    start_i = p_i + 3
    if p_i == len(p_str) or NS_end:
#         print("no i")
        return "", i
    
    p_i = start_i
    while p_str[p_i:p_i+4] != "</i>":
        p_i += 1
    end_i = p_i 
    
    word = p_str[start_i:end_i]
    
    return word, end_i+4

# returns the correct word and the end index of <c></c> object
def get_c(p_str, i):
    
    p_i = i
    NS_end = False
    
    while p_str[p_i:p_i+3] != "<c>" and p_i != len(p_str):
        if p_str[p_i:p_i+5] == "</NS>":
            NS_end == True
            return "", i
        p_i += 1
    start_i = p_i + 3
    if p_i == len(p_str) or NS_end:
#         print("no c")
        return "", i
    
    p_i = start_i
    while p_str[p_i:p_i+4] != "</c>":
        p_i += 1
    end_i = p_i 
    
    word = p_str[start_i:end_i]
    
    return word, end_i+4
    


# count cascaded <NS> objects
def num_NS_inside_NS(p_str):
    p_i = 0
    open_count = 0
    close_count = 0
    tags = 0
    start = 0
    end = 0
    while p_i != len(p_str):
        if p_str[p_i:p_i+4] == "<NS ":
            if open_count != close_count:
                tags += 1
                if tags == 1:
                    start = p_i
                
            open_count += 1
        elif p_str[p_i:p_i+5] == "</NS>":
            close_count -= 1
        p_i += 1
    return tags

# get the start and the end index of <NS> object
def find_NS(p_str,i_c):
    p_i = i_c
    
    count = 0
    
#     print(len(p_str))
    end = 0
    start = 0
    done = False
    
    while not done:
        if p_str[p_i:p_i+4] == "<NS ":
            if count == 0:
                start = p_i
            count += 1
        elif p_str[p_i:p_i+5] == "</NS>":
            if count == 1:
                end = p_i+5
                done = True
            count -= 1
        p_i += 1
    return start, end

In [41]:
# for a list of <p> objects, return a list of correct sentences and a list of corresponding incorrect sentences
def correct_and_incorrect_lists(ps):
    correct_sntnc = []
    incorrect_sntnc = []

    for i in range(len(ps)):
        correct = ""
        incorrect = ""
        p_tag = ps[i][3:-4]

        i_c = 0
        while i_c != len(p_tag):

            if p_tag[i_c:i_c+10] == '<NS type="':


                start_NS, end_NS = find_NS(p_tag,i_c)

                if num_NS_inside_NS(p_tag[start_NS: end_NS]) != 0:

                    start_subNS, end_subNS = find_NS(p_tag,i_c+1)
                    sliced_p_tag = p_tag[i_c:start_subNS]+p_tag[end_subNS:]

                    incorrect_w, end_i = get_i(sliced_p_tag, 0)

                    correct_w, end_i = get_c(sliced_p_tag, end_i)

                    incorrect += incorrect_w
                    correct += correct_w


                else:    
                    incorrect_w, end_i = get_i(p_tag, i_c)

                    correct_w, end_i = get_c(p_tag, end_i)

                    incorrect += incorrect_w
                    correct += correct_w

                i_c = end_NS
            else:
                correct += p_tag[i_c]
                incorrect += p_tag[i_c]
                i_c += 1


        correct.replace("  ", " ")
        incorrect.replace("  ", " ")
        correct_sntnc.append(correct)
        incorrect_sntnc.append(incorrect)
    
    return correct_sntnc, incorrect_sntnc
    

In [None]:
import xml.etree.ElementTree as ET
import os

data_dir = "/Users/sangencre/Downloads/fce-released-dataset/dataset"

def get_all_filepaths(data_dir):
    return [os.path.join(root, fname)
            for (root, dir_names, file_names) in os.walk(data_dir, followlinks=True)
            for fname in file_names
            if (fname.lower().endswith('.xml'))]

import signal

class TimeoutException(Exception):   # Custom exception class
    pass

def timeout_handler(signum, frame):   # Custom signal handler
    raise TimeoutException



def parse_xml(data_dir):
    filepaths = get_all_filepaths(data_dir)
    d = {"correct": [], "incorrect": []}
#     filepaths =['/Users/sangencre/Downloads/fce-released-dataset/dataset/0102_2000_6/doc838.xml']

    # Change the behavior of SIGALRM
    signal.signal(signal.SIGALRM, timeout_handler)


        
    for path in filepaths:
        signal.alarm(15)
        try:
#             print("parsing", path)
            tree = ET.parse(path)
            root = tree.getroot()
            xmlstr = ET.tostring(root, encoding='utf8', method='xml').decode("utf-8")

            p_list = get_p_lst(xmlstr)

            c_lst, i_lst = correct_and_incorrect_lists(p_list)

            d["correct"] += c_lst
            d["incorrect"] += i_lst
        except TimeoutException:
            continue # continue the for loop if function A takes more than 5 second
        else:
            # Reset the alarm
            signal.alarm(0)
            
    return d
        
parsed_d = parse_xml(data_dir)

In [48]:
delete_ind = []
d_i = 0
dq = '""'
while d_i != len(parsed_d['correct']):
    if parsed_d['correct'][d_i] == parsed_d['incorrect'][d_i]:
        del parsed_d['correct'][d_i]
        del parsed_d['incorrect'][d_i]
    elif "<NS" in parsed_d['correct'][d_i] or "<NS" in parsed_d['incorrect'][d_i]:
        del parsed_d['correct'][d_i]
        del parsed_d['incorrect'][d_i]
    elif dq in parsed_d['correct'][d_i] or dq in parsed_d['incorrect'][d_i]:
        del parsed_d['correct'][d_i]
        del parsed_d['incorrect'][d_i]
    else:
        d_i += 1



In [49]:
import pandas as pd

df = pd.DataFrame(data=parsed_d)

In [50]:
df.head(26)

Unnamed: 0,correct,incorrect
0,I WANT TO THANK YOU FOR PREPARING SUCH A GOOD ...,I WANT TO THAK YOU FOR PREPARING SUCH A GOOD P...
1,I WILL BE WAITING ANXIOUSLY FOR YOUR RESPONSE.,I WILL BE WRITING ANXIOUSLY FOR YOUR RESPONSE.
2,"FAMOUS PEOPLE SUCH AS SINGERS, FILM STARS, ETC...","FAMOUS PEOPLE SUCH AS SINGERS, FILM STARS, ETC..."
3,THIS SITUATION AFFECTS FAMOUS PEOPLE BECAUSE T...,THIS SITUATION AFFECTS FAMOUS PERSONAL LIVES B...
4,"THE MOST IMPORTANT THING IS THAT, ALTHOUGH THE...","THE MOST IMPORTANT THING IS THAT, ALTHOUGH THE..."
5,IN MY OPINION FAMOUS PEOPLE ARE BEING OBLIGED ...,IN MY OPINION FAMOUS PEOPLE ARE BEING OBLIGED ...
6,I have seen your programme for the trip and I ...,I have seen your programme for the trip and I ...
7,This show is on the 14th March. It is in the C...,This show is on the 14th March. It is in the C...
8,I think it is a great opportunity because it i...,I think it is a great opportunity because it i...
9,There will be lots of differences in our homes...,There will be lots of differences in our homes...


In [51]:
df.to_csv(path_or_buf="./parsed.tsv", sep='\t', header=False, index=False, index_label=None, mode='w')

In [2]:
data_dir = '/Users/sangencre/Desktop/NYU/spring_18/dl/XML_parser'
def get_all_filepaths(data_dir):
    return [os.path.join(root, fname)
            for (root, dir_names, file_names) in os.walk(data_dir, followlinks=True)
            for fname in file_names
            if (fname.lower().endswith('.txt'))]

filepaths = get_all_filepaths(data_dir)
print(filepaths)

['/Users/sangencre/Desktop/NYU/spring_18/dl/XML_parser/ref1.txt', '/Users/sangencre/Desktop/NYU/spring_18/dl/XML_parser/ref2.txt', '/Users/sangencre/Desktop/NYU/spring_18/dl/XML_parser/ref3.txt', '/Users/sangencre/Desktop/NYU/spring_18/dl/XML_parser/ref4.txt', '/Users/sangencre/Desktop/NYU/spring_18/dl/XML_parser/src.txt']


In [6]:
pd.read_csv('/Users/sangencre/Desktop/NYU/spring_18/dl/XML_parser/ref1.txt', sep='###', header=0)

  """Entry point for launching an IPython kernel.


TypeError: Passing a bool to header is invalid. Use header=None for no header or header=int or list-like of ints to specify the row(s) making up the column names

In [5]:
pd.read_csv('/Users/sangencre/Desktop/NYU/spring_18/dl/XML_parser/src.txt', sep='###')

  """Entry point for launching an IPython kernel.


Unnamed: 0,So I think we can not live if old people could not find siences and tecnologies and they did not developped .
0,For not use car .
1,Here was no promise of morning except that we ...
2,Thus even today sex is considered as the least...
3,image you salf you are wark in factory just to...
4,"They draw the consumers , like me , to purchas..."
5,I want to talk about nocive or bad products li...
6,For example they can play football whenever th...
7,It figures Diana Krall wearing a Rolex watch a...
8,There are several reason .
9,"Also the work of the media , television , maga..."
