#### Imports

In [21]:
import json
import xml.etree.ElementTree as ET
import re
import csv
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, regexp_tokenize
from nltk.stem import WordNetLemmatizer
from spellchecker import SpellChecker
import unicodedata

#### Data Loader

In [39]:
# read json file and return processed json list
def process_json(json_data_dir):
    """Read a JSON-lines file and return the processed abstract of each record.

    Every non-blank line of the file is parsed as one JSON object. The XML
    markup found under ``data['abstract']['text']`` is run through
    ``process_abstract`` and stored in the result.

    Only the abstract is extracted; the record id, invention title and
    claims are not part of the output.

    Args:
        json_data_dir: Path of the input file (one JSON object per line).

    Returns:
        A list of dictionaries of the form ``{'abstract': <processed text>}``,
        in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record lacks the ``abstract`` or ``text`` key.
    """
    json_list = []  # a list of dictionaries, one per JSON object in the file

    # Decode explicitly as UTF-8 (the JSON interchange encoding) so the result
    # does not depend on the platform locale; write2csv writes UTF-8 as well.
    with open(json_data_dir, "r", encoding="utf-8") as json_file:
        # Iterate lazily instead of readlines() so large dumps are streamed.
        for line in json_file:
            # Blank lines (e.g. a trailing newline at end of file) carry no
            # record and would make json.loads raise.
            if not line.strip():
                continue

            data = json.loads(line)
            abstract = data['abstract']['text']
            json_list.append({'abstract': process_abstract(abstract)})

    return json_list

In [23]:
# write to csv file
def write2csv(json_dict, output_dir):
    """Write the records in *json_dict* to a header-less, one-column CSV file.

    Args:
        json_dict: Iterable of dictionaries, each holding an ``'abstract'``
            entry (the list returned by ``process_json``).
        output_dir: Path of the CSV file to create or overwrite.
    """
    columns = ['abstract']
    # newline='' lets the csv module control line endings itself.
    with open(output_dir, 'w', newline='', encoding='utf-8') as out_file:
        row_writer = csv.DictWriter(out_file, fieldnames=columns)
        # No header row is emitted: the file contains data rows only.
        for record in json_dict:
            row_writer.writerow(record)


#### Process XML Components

In [24]:
def process_invention_title(text):
    """Extract the plain text of an invention-title XML fragment.

    Args:
        text: XML string whose root element holds the title.

    Returns:
        The title text after ``preprocess``; an empty title yields the
        result of preprocessing an empty string.

    Raises:
        xml.etree.ElementTree.ParseError: If *text* is not well-formed XML.
    """
    xml = ET.fromstring(text)
    # Element.text is None for an empty element and stops at the first nested
    # tag; itertext() walks the whole subtree, so empty titles no longer crash
    # preprocess() and inline markup (e.g. <i>, <sub>) no longer truncates
    # the title.
    invention_title = preprocess(''.join(xml.itertext()))
    return invention_title

In [38]:
def get_text_within_tags(element):
    """Return all text contained in *element* with nested markup stripped.

    The result is the element's own leading text followed by the text and
    tail of every descendant, in document order. The tail of *element*
    itself (text after its closing tag) is not included.

    Args:
        element: An ``xml.etree.ElementTree.Element``.

    Returns:
        The concatenated text as a string; ``''`` when there is no text.
    """
    # ET.tostring(child, method='text') already emits the child's tail, so
    # appending child.tail on top of it repeated the text that follows every
    # inline tag. itertext() yields each text and tail exactly once.
    return ''.join(element.itertext())


def process_abstract(text):
    """Extract and preprocess the paragraph text of an abstract XML fragment.

    All ``<p>`` elements below the root are located, their text is taken
    with ``get_text_within_tags`` and concatenated without any separator,
    and the result is passed through ``preprocess``.

    Args:
        text: XML string of the abstract.

    Returns:
        The preprocessed abstract text.
    """
    root = ET.fromstring(text)
    paragraphs = root.findall('.//p')
    combined = ''.join(get_text_within_tags(node) for node in paragraphs)
    return preprocess(combined)

In [26]:
def process_claims(claims_text):
    """Extract and preprocess the claim text of a claims XML fragment.

    Only the leading text of each grandchild of the root element is used;
    text nested deeper, and tail text, is not collected.

    Args:
        claims_text: XML string of the claims section.

    Returns:
        The collected text after ``preprocess``.
    """
    root = ET.fromstring(claims_text)

    # Gather the leading text of every second-level element, in document order.
    pieces = []
    for outer in root:
        for inner in outer:
            if inner.text is None:
                continue
            pieces.append(inner.text)

    return preprocess(''.join(pieces))


#### Preprocess text

In [40]:
def preprocess(text):
    """Strip line breaks from *text* and return the result.

    Every line-feed and carriage-return character is deleted outright (it is
    not replaced by a space); all other characters are left untouched.

    Args:
        text: The string to clean.

    Returns:
        *text* with all line-feed and carriage-return characters removed.
    """
    # NOTE: the fuller cleaning pipeline is currently disabled. In order, it
    # consisted of: Unicode normalisation to ASCII, lowercasing, HTML tag
    # removal, regex tokenisation keeping dashed words, punctuation removal,
    # number removal, English stop-word removal, empty-token removal,
    # WordNet lemmatisation, and re-joining the tokens with spaces.

    # One translation pass that drops both kinds of line-break character.
    line_breaks = str.maketrans('', '', '\n\r')
    text_cleaned = text.translate(line_breaks)

    return text_cleaned
    

#### Test

In [41]:
# Run the pipeline on the sample file: parse the abstracts, then dump them to CSV.
collect_data = process_json(json_data_dir='./data/sample_data_2.json')
write2csv(json_dict=collect_data, output_dir='./output_data/abstracts_2.csv')