#### Imports

In [1]:
import json
import xml.etree.ElementTree as ET
import re
import csv
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, regexp_tokenize
from nltk.stem import WordNetLemmatizer
from spellchecker import SpellChecker
import unicodedata

#### Data Loader

In [19]:
# read json file and return processed json list
def process_json(json_data_dir):

    with open(json_data_dir, "r") as json_file:
        
        lines = json_file.readlines() # each line contains one json object as str
        
        json_list = [] # a list of dictonaries, where each dictionary is a json object

        for l in lines:
            data = json.loads(l)

            # ucid = data['ucid']
            
            # invention_title = data['invention_title']['text']
            abstract = data['abstract']['text']
            # claims = data['claims']['text']

            # invention_title_processed = process_invention_title(invention_title)
            abstract_processed = process_abstract(abstract)
            # claims_processed = process_claims(claims)

            # final_text = invention_title_processed + ' ' + abstract_processed + ' ' + claims_processed
        
            json_list.append({'abstract':abstract_processed})
    
    return json_list

In [3]:
# write to csv file
def write2csv(json_dict, output_dir):
    with open(output_dir, 'w', newline='', encoding='utf-8') as csv_file:
        field_names = ['abstract']
        writer = csv.DictWriter(csv_file, fieldnames=field_names)
        # writer.writeheader()
        writer.writerows(json_dict)


#### Process XML Components

In [4]:
def process_invention_title(text):
    xml = ET.fromstring(text)
    invention_title = preprocess(xml.text)
    return invention_title

In [45]:
def get_text_within_tags(element):
    text = element.text or ''
    for child in element:
        text += ET.tostring(child, encoding='unicode', method='text')
        if child.tail:
            text += child.tail
    return text


def process_abstract(text):
    p_tag_content = ""
    xml = ET.fromstring(text)
    p_tags = xml.findall('.//p')
    for p in p_tags:
        p_tag_content += get_text_within_tags(p)
    processed = preprocess(p_tag_content)
    return processed

In [40]:
a="<abstract><p>Periodic shutdown of the internal combustion engine (<b>12</b>) during operation of a hybrid electric vehicle (HEV) is achieved by shutdown of a vapor management valve (VMV) of the engine evaporative emission control system and an EGR valve (<b>150</b>) of the tailpipe emission control system at the time an engine shutdown command is provided to a controlled engine shutdown routine that, after closing of the VMV and EGR valves, then commands disabling of the engine fuel injectors (<b>160</b>) in a manner to stop engine operation.</p></abstract>"
xml = ET.fromstring(a)
p_cont = xml.findall('.//p')
print(p_cont[0].text)

Periodic shutdown of the internal combustion engine (


In [6]:
def process_claims(claims_text):
    text = ""
    xml = ET.fromstring(claims_text)

    # extract claims text out of xml
    for claims in xml:
        for claim in claims:
            if claim.text is not None:
                text += claim.text

    text_cleaned = preprocess(text)
    
    return text_cleaned    


#### Preprocess text

In [29]:
def preprocess(text):
    # Normalize Unicode characters
    # text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')
    # print("Normalized Text:", text)


    # Remove newline characters
    text = text.replace('\n', '').replace('\r', '')
    print('newline removed: ', text)

    


    # # Lowercasing
    # text = text.lower()

    # # Removing HTML tags
    # text = re.sub(r'<.*?>', '', text)
    # print("html_removed: ", text)

    # # Tokenization

    # pattern = r'\b\w+(?:-\w+)*\b' # \boundary \word (-\word)* \boundary
    # tokens = regexp_tokenize(text, pattern)
    # print("tokenized: ", tokens)

    # # Remove punctuation except dashed-words
    # tokens = [re.sub(r'[^\w-]', '', word) for word in tokens]
    # print("punc removed: ", tokens)
    
    # # Remove numbers
    # tokens = [word for word in tokens if not word.isdigit()]
    # print("num removed: ", tokens)

    # # Remove stop words
    # stop_words = set(stopwords.words('english'))
    # tokens = [word for word in tokens if word not in stop_words]
    # print("stop words removed: ", tokens)

    # # Remove empty strings
    # tokens = [word for word in tokens if word != ""]
    # print("empty strings removed: ", tokens)

    # # Lemmatization
    # lemmatizer = WordNetLemmatizer()
    # tokens = [lemmatizer.lemmatize(word) for word in tokens]
    # print("lematized: ", tokens)


    # # Join the tokens back
    # text_cleaned = ' '.join(tokens)
    # print("final: ", text_cleaned)

    return text
    

#### Test

In [46]:
collect_data = process_json('./data/sample_data_2.json')
write2csv(collect_data, './output_data/abstracts.csv')

newline removed:  A hydraulic regenerative braking and power support device for the hybrid vehicle is provided to improve the total efficiency of the hybrid vehicle since the unavailable energy generated in the internal combustion engine is utilized in order to maintain the isothermal condition in the compression and expansion of the compressed air. A hydraulic regenerative braking and power support device for the hybrid vehicle comprises a high pressure storage tank(10) in which the compressed air is charged on the top of the tank inside, and the working fluid is stored in the lower part of the tank inside, a heat exchanger which heat-exchanges he compressed air(11) on the top of the inside of the high pressure storage tank, a fluid compressor which raises the pressure of the working fluid flowed in from the low pressure storage tank, and transfers and stores the working fluid to the high pressure storage tank. The risen temperature of the compressed air is cooled by the heat exchange

In [13]:
# Download necessary NLTK data files
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Novojit\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Novojit\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Novojit\AppData\Roaming\nltk_data...


True

In [27]:
process_json('./data/sample_data_3.json')

html_removed:  a kind of dynamical system of planet row plug-in hybrid-power automobile
tokenized:  ['a', 'kind', 'of', 'dynamical', 'system', 'of', 'planet', 'row', 'plug-in', 'hybrid-power', 'automobile']
punc removed:  ['a', 'kind', 'of', 'dynamical', 'system', 'of', 'planet', 'row', 'plug-in', 'hybrid-power', 'automobile']
num removed:  ['a', 'kind', 'of', 'dynamical', 'system', 'of', 'planet', 'row', 'plug-in', 'hybrid-power', 'automobile']
stop words removed:  ['kind', 'dynamical', 'system', 'planet', 'row', 'plug-in', 'hybrid-power', 'automobile']
empty strings removed:  ['kind', 'dynamical', 'system', 'planet', 'row', 'plug-in', 'hybrid-power', 'automobile']
lematized:  ['kind', 'dynamical', 'system', 'planet', 'row', 'plug-in', 'hybrid-power', 'automobile']
final:  kind dynamical system planet row plug-in hybrid-power automobile
html_removed:  higher, the lower planet row plug-in hybrid-power automobile of oil consumption dynamical system that the invention discloses a kind of

[{'text': 'kind dynamical system planet row plug-in hybrid-power automobile higher lower planet row plug-in hybrid-power automobile oil consumption dynamical system invention discloses kind efficiency characterised thatouter shaft5further include second clutch hollow shaft3 inner shaft10and second sun gear11positioned planet carrier13within second planetary gear12it rotatably installed planet carrier13planet axis131on second planet row second clutch3external drum first clutch4external drum connect firmly input shaft2right end connected inner shaft10left right end stretch outer shaft respectively5left right end right end second sun gear11it connects firmly left end second clutch3interior hub connected first planetary gear first planet row7with second planetary gear second planet row12it connects firmly left right installed planet axis side side131on kind dynamical system planet row plug-in hybrid-power automobile including engine1 input shaft2 first clutch device4 outer shaft5 include r

In [32]:
test_str = "higher, the lower planet row plug-in hybrid-power automobile of oil consumption dynamical system that the invention discloses a kind of efficiency, it is characterised in that：outer shaft（5）further include second clutch for hollow shaft（3）"
pattern = r'\b\w+(?:-\w+)*\b'
tokens = regexp_tokenize(test_str, pattern)
print("Custom Tokenized:", tokens)


Custom Tokenized: ['higher', 'the', 'lower', 'planet', 'row', 'plug-in', 'hybrid-power', 'automobile', 'of', 'oil', 'consumption', 'dynamical', 'system', 'that', 'the', 'invention', 'discloses', 'a', 'kind', 'of', 'efficiency', 'it', 'is', 'characterised', 'in', 'that', 'outer', 'shaft', '5', 'further', 'include', 'second', 'clutch', 'for', 'hollow', 'shaft', '3']


In [None]:
preprocess('./output_data/sample_data_3.json')