#### Imports

In [35]:
import json
import xml.etree.ElementTree as ET
import re
import csv
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from spellchecker import SpellChecker

#### Data Loader

In [85]:
a = 'engine (1)'
c = '1.'
b = word_tokenize(c)
print(b)

['1', '.']


In [82]:
# read json file and return processed json list
def process_json(json_data_dir):

    with open(json_data_dir, "r") as json_file:
        
        lines = json_file.readlines() # each line contains one json object as str
        
        json_list = [] # a list of dictonaries, where each dictionary is a json object

        for l in lines:
            data = json.loads(l)

            ucid = data['ucid']
            
            invention_title = data['invention_title']['text']
            abstract = data['abstract']['text']
            claims = data['claims']['text']

            invention_title_processed = process_invention_title(invention_title)
            abstract_processed = process_abstract(abstract)
            claims_processed = process_claims(claims)

            final_text = invention_title_processed + ' ' + abstract_processed + ' ' + claims_processed
        
            json_list.append({'ucid':ucid, 'text':final_text})
    
    return json_list

In [83]:
process_json('./data/sample_data_2.json')

html_removed:  hydraulic regeneative braking and power assist apparatus for hev
tokenized:  ['hydraulic', 'regeneative', 'braking', 'and', 'power', 'assist', 'apparatus', 'for', 'hev']
punc removed:  ['hydraulic', 'regeneative', 'braking', 'and', 'power', 'assist', 'apparatus', 'for', 'hev']
num removed:  ['hydraulic', 'regeneative', 'braking', 'and', 'power', 'assist', 'apparatus', 'for', 'hev']
stop words removed:  ['hydraulic', 'regeneative', 'braking', 'power', 'assist', 'apparatus', 'hev']
empty strings removed:  ['hydraulic', 'regeneative', 'braking', 'power', 'assist', 'apparatus', 'hev']
lematized:  ['hydraulic', 'regeneative', 'braking', 'power', 'assist', 'apparatus', 'hev']
final:  hydraulic regeneative braking power assist apparatus hev
html_removed:  a hydraulic regenerative braking and power support device for the hybrid vehicle is provided to improve the total efficiency of the hybrid vehicle since the unavailable energy generated in the internal combustion engine is uti

[{'ucid': 'KR-100872632-B1',
  'text': 'hydraulic regeneative braking power assist apparatus hev hydraulic regenerative braking power support device hybrid vehicle provided improve total efficiency hybrid vehicle since unavailable energy generated internal combustion engine utilized order maintain isothermal condition compression expansion compressed air hydraulic regenerative braking power support device hybrid vehicle comprises high pressure storage tank compressed air charged top tank inside working fluid stored lower part tank inside heat exchanger heat-exchanges compressed air top inside high pressure storage tank fluid compressor raise pressure working fluid flowed low pressure storage tank transfer store working fluid high pressure storage tank risen temperature compressed air cooled heat exchanger working fluid storage high pressure storage tank hydraulic regenerative braking power assist device hybrid vehicle upper portion tank filled compressed air lower portion tank inside h

In [75]:
# write to csv file
def write2csv(json_dict):
    with open('./output_data/output.csv', 'w', newline='') as csv_file:
        field_names = ['ucid', 'text']
        writer = csv.DictWriter(csv_file, fieldnames=field_names)
        writer.writeheader()
        writer.writerows(json_dict)


#### Process XML Components

In [63]:
def process_invention_title(text):
    xml = ET.fromstring(text)
    invention_title = preprocess(xml.text)
    return invention_title

In [61]:
def process_abstract(text):
    xml = ET.fromstring(text)
    p_tag_content = xml.find('p').text
    processed = preprocess(p_tag_content)
    return processed

In [62]:
def process_claims(claims_text):
    text = ""
    xml = ET.fromstring(claims_text)

    # extract claims text out of xml
    for claims in xml:
        for claim in claims:
            if claim.text is not None:
                text += claim.text

    text_cleaned = preprocess(text)
    
    return text_cleaned    


#### Preprocess text

In [81]:
def preprocess(text):
    # Lowercasing
    text = text.lower()

    # Removing HTML tags
    text = re.sub(r'<.*?>', '', text)
    print("html_removed: ", text)

    # Tokenization
    tokens = word_tokenize(text)
    print("tokenized: ", tokens)

    # Remove punctuation except dashed-words
    tokens = [re.sub(r'[^\w-]', '', word) for word in tokens]
    print("punc removed: ", tokens)
    
    # Remove numbers
    tokens = [word for word in tokens if not word.isdigit()]
    print("num removed: ", tokens)

    # Remove stop words
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    print("stop words removed: ", tokens)

    # Remove empty strings
    tokens = [word for word in tokens if word != ""]
    print("empty strings removed: ", tokens)

    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    print("lematized: ", tokens)


    # Join the tokens back
    text_cleaned = ' '.join(tokens)
    print("final: ", text_cleaned)

    return text_cleaned
    

#### Test

In [70]:
collect_data = process_json('./data/sample_data_2.json')
write2csv(collect_data)

html_removed:  hydraulic regeneative braking and power assist apparatus for hev
tokenized:  ['hydraulic', 'regeneative', 'braking', 'and', 'power', 'assist', 'apparatus', 'for', 'hev']
punc removed:  ['hydraulic', 'regeneative', 'braking', 'and', 'power', 'assist', 'apparatus', 'for', 'hev']
num removed:  ['hydraulic', 'regeneative', 'braking', 'and', 'power', 'assist', 'apparatus', 'for', 'hev']
stop words removed:  ['hydraulic', 'regeneative', 'braking', 'power', 'assist', 'apparatus', 'hev']
empty strings removed:  ['hydraulic', 'regeneative', 'braking', 'power', 'assist', 'apparatus', 'hev']
lematized:  ['hydraulic', 'regeneative', 'braking', 'power', 'assist', 'apparatus', 'hev']
final:  hydraulic regeneative braking power assist apparatus hev
html_removed:  a hydraulic regenerative braking and power support device for the hybrid vehicle is provided to improve the total efficiency of the hybrid vehicle since the unavailable energy generated in the internal combustion engine is uti

NameError: name 'processed_claims_text' is not defined

In [13]:
# Download necessary NLTK data files
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Novojit\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Novojit\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Novojit\AppData\Roaming\nltk_data...


True