#### Imports

In [3]:
import xml.etree.ElementTree as ET
import re
import json
import csv

#### Remove newline character ('\n') and HTML tags

In [4]:
def clean_text(text):
    """Strip newline characters and HTML-style tags from *text*.

    Args:
        text: The string to clean. ``None`` is accepted and treated as an
            empty string, since ``Element.text`` is ``None`` for an element
            that has no leading text.

    Returns:
        The input with every newline character removed and every
        ``<...>`` span (matched non-greedily) deleted.
    """
    if text is None:
        return ''

    # Drop newlines first: '.' in the pattern below does not match a
    # newline, so a tag broken across lines would otherwise survive.
    text = text.replace('\n', '')

    # Non-greedy so each tag is removed individually rather than everything
    # between the first '<' and the last '>'.
    return re.sub(r'<.*?>', '', text)

#### Process XML strings

In [5]:
# Extract Invention Title
def get_invention_title(text):
    """Return the cleaned invention title held in an XML string.

    Args:
        text: XML markup whose root element contains the title.

    Returns:
        All text found inside the root element (including text inside and
        after nested elements), passed through ``clean_text``.

    Raises:
        xml.etree.ElementTree.ParseError: If *text* is not well-formed XML.
    """
    root = ET.fromstring(text)

    # root.text only holds the text before the first child element and is
    # None for an empty element; itertext() walks the whole subtree, so
    # titles with inline markup are kept intact and empty titles yield ''.
    return clean_text(''.join(root.itertext()))

In [6]:
# Extract Abstract

# Extract the text of an element, including the text of its nested child tags
def get_sibling_tags(element):
    """Return all text contained in *element*, in document order.

    The result is the element's own leading text, followed by the text of
    every nested element and the tail text that follows each of them. The
    tail of *element* itself is not included.

    Args:
        element: An ``xml.etree.ElementTree.Element``.

    Returns:
        The concatenated text as a ``str`` (``''`` when there is none).
    """
    # itertext() yields each text and tail fragment exactly once. The
    # previous approach combined ET.tostring(child, method='text'), which
    # already emits the child's tail, with an explicit child.tail append,
    # so every tail appeared twice in the output.
    return ''.join(element.itertext())

# Concatenate sibling <p> tags and return result
def get_abstract(text):
    """Return the cleaned abstract text from an XML string.

    Every ``<p>`` descendant of the root is located, its text is gathered
    with ``get_sibling_tags``, and the pieces are joined without a
    separator before being passed through ``clean_text``.

    Args:
        text: XML markup containing the abstract.

    Returns:
        The cleaned, concatenated paragraph text.
    """
    root = ET.fromstring(text)
    paragraphs = root.findall('.//p')
    pieces = [get_sibling_tags(paragraph) for paragraph in paragraphs]
    return clean_text(''.join(pieces))

In [21]:
# Extract <claims> text
def get_claims(claims_text):
    """Return the cleaned text of all claims in an XML string.

    Args:
        claims_text: XML markup containing ``<claim-text>`` elements below
            the root.

    Returns:
        The text of every outermost ``<claim-text>`` element, concatenated
        in document order and passed through ``clean_text``.

    Raises:
        xml.etree.ElementTree.ParseError: If *claims_text* is not
            well-formed XML.
    """
    def outermost_claim_texts(parent):
        # Yield <claim-text> descendants without descending into them.
        # A nested <claim-text> is already covered by the text of the
        # element that encloses it, so visiting it again would emit its
        # text twice.
        for child in parent:
            if child.tag == 'claim-text':
                yield child
            else:
                yield from outermost_claim_texts(child)

    root = ET.fromstring(claims_text)
    parts = [get_sibling_tags(claim) for claim in outermost_claim_texts(root)]
    return clean_text(''.join(parts))

#### Process JSON Data

In [8]:
# Load JSON data and return [{ucid, title, abstract, claims}]
def process_json(json_data_dir):
    """Load a JSON Lines file and extract cleaned text fields per record.

    Each non-blank line of the file must hold one JSON object with a
    ``ucid`` key and ``invention_title``, ``abstract`` and ``claims``
    objects that each carry their XML under a ``text`` key.

    Args:
        json_data_dir: Path of the JSON Lines file to read.

    Returns:
        A list of dictionaries with the keys ``ucid``, ``title``,
        ``abstract`` and ``claims``, one per record, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record lacks one of the expected keys.
    """
    json_list = []  # a list of dictionaries, one per JSON object

    # Read as UTF-8 explicitly: the default encoding depends on the
    # platform locale, while the CSV writer in this file uses UTF-8.
    with open(json_data_dir, "r", encoding="utf-8") as json_file:
        # Iterate lazily instead of readlines() so the whole file is not
        # held in memory at once.
        for line in json_file:
            # A blank line (commonly a trailing one) is not a record.
            if not line.strip():
                continue

            data = json.loads(line)  # each line contains one JSON object

            ucid = data['ucid']
            invention_title_xml = data['invention_title']['text']
            abstract_xml = data['abstract']['text']
            claims_xml = data['claims']['text']

            json_list.append({
                'ucid': ucid,
                'title': get_invention_title(invention_title_xml),
                'abstract': get_abstract(abstract_xml),
                'claims': get_claims(claims_xml),
            })

    return json_list

#### Write JSON list to CSV file

In [10]:
# write to csv file
def write2csv(json_list, output_dir):
    """Write record dictionaries to a UTF-8 CSV file with a header row.

    Args:
        json_list: Iterable of dictionaries keyed by ``ucid``, ``title``,
            ``abstract`` and ``claims``.
        output_dir: Path of the CSV file to create or overwrite.
    """
    columns = ['ucid', 'title', 'abstract', 'claims']

    # newline='' leaves line-ending handling to the csv module.
    with open(output_dir, 'w', newline='', encoding='utf-8') as out_file:
        csv_writer = csv.DictWriter(out_file, fieldnames=columns)
        csv_writer.writeheader()
        for record in json_list:
            csv_writer.writerow(record)

#### Invoke the functions

In [22]:
# collect json data from the json file path
# json_data = process_json(json_data_dir='./data/case_data.json')

# write the json data to a csv file 
# write2csv(json_list=json_data, output_dir='./output_data/case_data_2.csv')