#### Imports

In [46]:
import csv
import json
import os
import re
import xml.etree.ElementTree as ET

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from spellchecker import SpellChecker

#### Data Loader

In [88]:
# Read a JSON-lines file and build the (ucid, processed_text) rows for the CSV export.
def json2csv(json_data_dir):
    """Parse a JSON-lines file into row dictionaries ready to be written as CSV.

    Despite its name, this function writes nothing: it only builds and
    returns the rows.

    Parameters
    ----------
    json_data_dir : str or path-like
        Path to a text file holding one JSON object per line. Every object
        must provide ``ucid`` and ``claims['text']``.

    Returns
    -------
    list of dict
        One ``{'ucid': ..., 'processed_text': ...}`` dictionary per non-blank
        line, in file order. An empty or blank-only file yields ``[]``.

    Raises
    ------
    json.JSONDecodeError
        If a non-blank line is not valid JSON.
    KeyError
        If a record lacks ``ucid`` or ``claims``/``text``.
    """
    json_list = []  # a list of dictionaries, one per JSON record

    # Decode explicitly as UTF-8 instead of relying on the platform default
    # codec, which differs between operating systems and breaks on the
    # non-ASCII characters present in the claims text.
    with open(json_data_dir, "r", encoding="utf-8") as json_file:
        # Iterate the file lazily rather than materialising it with readlines().
        for line in json_file:
            line = line.strip()
            if not line:
                # Blank lines (e.g. a trailing newline at EOF) are not records;
                # json.loads would raise on them.
                continue

            data = json.loads(line)
            json_list.append({
                'ucid': data['ucid'],
                'processed_text': process_claims_text(data['claims']['text']),
            })

    return json_list







        # for obj in lines:
        # print(type(lines[1]))
        # print(lines[1])
        # data = json.loads(lines[1])
        # print(type(data))


In [89]:
# Build the (ucid, processed_text) rows for the second sample file and print
# the whole list for inspection. `test` is reused by later cells.
test = json2csv('./data/sample_data_2.json')
print(test)



In [84]:
# Print the rows currently held in `test` (repeats the print in the cell that builds it).
print(test)



In [90]:
# write to csv file
def write2csv(json_dict, output_path='./output_data/output.csv'):
    """Write row dictionaries to a CSV file with a ``ucid,processed_text`` header.

    Parameters
    ----------
    json_dict : iterable of dict
        Rows to write; each must contain exactly the keys ``ucid`` and
        ``processed_text`` (``csv.DictWriter`` raises ``ValueError`` on
        unexpected keys).
    output_path : str or path-like, optional
        Destination file. Defaults to the location that was previously
        hard-coded, so existing one-argument calls behave as before.

    Returns
    -------
    None
    """
    # Create the target directory on first use instead of failing with
    # FileNotFoundError; a bare file name has no parent to create.
    parent_dir = os.path.dirname(output_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    field_names = ['ucid', 'processed_text']

    # newline='' lets the csv module control line endings; the explicit UTF-8
    # encoding keeps non-ASCII tokens from raising UnicodeEncodeError under a
    # narrower platform default codec.
    with open(output_path, 'w', newline='', encoding='utf-8') as csv_file:
        writer = csv.DictWriter(csv_file, fieldnames=field_names)
        writer.writeheader()
        writer.writerows(json_dict)


In [91]:
# Write the rows in `test` to write2csv's output file (./output_data/output.csv).
write2csv(test)

#### Process Claims Text

In [73]:
def process_claims_text(claims_text):
    """Turn a claims XML string into a normalised, space-separated token string.

    Pipeline: extract all text from the XML -> lowercase -> strip residual
    tags -> tokenise -> keep alphanumeric, non-numeric, non-stop-word tokens
    -> lemmatise -> spell-correct -> join with single spaces.

    Parameters
    ----------
    claims_text : str or None
        XML markup of the claims (e.g. ``<claims><claim><claim-text>...``).

    Returns
    -------
    str
        The cleaned text. Empty string when the input is ``None``, blank, or
        contains no text at all.

    Raises
    ------
    xml.etree.ElementTree.ParseError
        If ``claims_text`` is non-blank but not well-formed XML.
    """
    # Guard: ET.fromstring raises on None / empty input.
    if claims_text is None or not claims_text.strip():
        return ""

    root = ET.fromstring(claims_text)

    # itertext() walks the entire tree, so text in nested elements and text
    # following inline child elements is kept. Fragments are joined with a
    # space so the last word of one element is not fused with the first word
    # of the next.
    text = ' '.join(root.itertext())
    if not text.strip():
        return ""

    # Lowercasing
    text = text.lower()

    # Removing HTML tags
    text = re.sub(r'<.*?>', '', text)

    stop_words, lemmatizer, spell = _claims_nlp_resources()

    # Tokenise, then drop punctuation, pure numbers and stop words in one pass.
    tokens = [
        word for word in word_tokenize(text)
        if word.isalnum() and not word.isdigit() and word not in stop_words
    ]

    # Lemmatization
    tokens = [lemmatizer.lemmatize(word) for word in tokens]

    # Spellchecking. correction() may return None when it has no candidate;
    # keep the original token in that case rather than silently dropping
    # vocabulary the dictionary does not know. Each distinct token is
    # corrected only once because claims repeat the same words heavily.
    corrected = {}
    for word in tokens:
        if word not in corrected:
            corrected[word] = spell.correction(word) or word
    tokens = [corrected[word] for word in tokens]

    # Remove extra whitespace
    text_cleaned = ' '.join(tokens)
    text_cleaned = re.sub(r'\s+', ' ', text_cleaned).strip()

    return text_cleaned


# Holds the single (stop_words, lemmatizer, spell_checker) tuple once built.
_CLAIMS_NLP_RESOURCES = []


def _claims_nlp_resources():
    """Return the shared (stop-word set, lemmatizer, spell checker), building them on first use."""
    if not _CLAIMS_NLP_RESOURCES:
        _CLAIMS_NLP_RESOURCES.append((
            set(stopwords.words('english')),
            WordNetLemmatizer(),
            SpellChecker(),
        ))
    return _CLAIMS_NLP_RESOURCES[0]


In [61]:
# Run the loader on the first sample file; the return value is not stored.
# NOTE(review): the recorded output below (<class 'dict'>) does not look like
# a list of rows — presumably left over from an earlier debugging version of
# json2csv; re-run to refresh.
json2csv('./data/sample_data.json')

<class 'dict'>


In [32]:
# Pull the three text fields out of one parsed JSON record.
# NOTE(review): `data` is not assigned at notebook scope in any visible cell
# (json2csv only binds it locally) — presumably it survives from an earlier
# interactive session; load a record explicitly before running this on a
# fresh kernel.
invention_title = data['invention_title']['text']
abstract = data['abstract']['text']
claims = data['claims']['text']

In [39]:
# Show the raw claims markup exactly as stored in the record.
print(claims)

<claims><claim id="en-cl0001" num="0001"><claim-text>1. a kind of dynamical system of planet row plug-in hybrid-power automobile, including engine（1）, input shaft（2）, the first clutch
Device（4）, outer shaft（5）, include rotor（91）Motor（9）, output shaft（14）And its first sun gear（6）Positioned at row
Carrier（13）Within, the first planetary gear（7）It is rotatably installed on planet carrier（13）Planet axis（131）Upper, the first gear ring（8）
Positioned at planet carrier（13）Except the first planet row, the input shaft（2）Left end and engine（1）Bent axle be connected, institute
State outer shaft（5）Left end and right end respectively with first clutch（4）Interior hub and the first sun gear（6）It is connected, described first
Gear ring（8）With rotor（91）It connects firmly, the planet carrier（13）Right end and output shaft（14）It connects firmly, it is characterised in that：Institute
State outer shaft（5）Further include second clutch for hollow shaft（3）, inner shaft（10）And its second sun gear（11）Positioned at

In [34]:
# Parse the claims markup; `xml` is the root element of the resulting tree.
xml = ET.fromstring(claims)

In [40]:
# Walk two levels below the parsed root, echoing what is found and
# accumulating the direct text of every second-level element.
claims_text = ""

for claim_node in xml:
    print(claim_node.tag, claim_node.attrib, claim_node.text)
    for text_node in claim_node:
        node_text = text_node.text
        print(node_text)
        if node_text is None:
            # Elements without direct text contribute nothing.
            continue
        claims_text += node_text


    

claim {'id': 'en-cl0001', 'num': '0001'} None
1. a kind of dynamical system of planet row plug-in hybrid-power automobile, including engine（1）, input shaft（2）, the first clutch
Device（4）, outer shaft（5）, include rotor（91）Motor（9）, output shaft（14）And its first sun gear（6）Positioned at row
Carrier（13）Within, the first planetary gear（7）It is rotatably installed on planet carrier（13）Planet axis（131）Upper, the first gear ring（8）
Positioned at planet carrier（13）Except the first planet row, the input shaft（2）Left end and engine（1）Bent axle be connected, institute
State outer shaft（5）Left end and right end respectively with first clutch（4）Interior hub and the first sun gear（6）It is connected, described first
Gear ring（8）With rotor（91）It connects firmly, the planet carrier（13）Right end and output shaft（14）It connects firmly, it is characterised in that：Institute
State outer shaft（5）Further include second clutch for hollow shaft（3）, inner shaft（10）And its second sun gear（11）Positioned at planet

In [41]:
# Show the concatenated claim text, then bind it to `text`, the name the
# step-by-step preprocessing cell operates on.
print(claims_text)
text = claims_text

1. a kind of dynamical system of planet row plug-in hybrid-power automobile, including engine（1）, input shaft（2）, the first clutch
Device（4）, outer shaft（5）, include rotor（91）Motor（9）, output shaft（14）And its first sun gear（6）Positioned at row
Carrier（13）Within, the first planetary gear（7）It is rotatably installed on planet carrier（13）Planet axis（131）Upper, the first gear ring（8）
Positioned at planet carrier（13）Except the first planet row, the input shaft（2）Left end and engine（1）Bent axle be connected, institute
State outer shaft（5）Left end and right end respectively with first clutch（4）Interior hub and the first sun gear（6）It is connected, described first
Gear ring（8）With rotor（91）It connects firmly, the planet carrier（13）Right end and output shaft（14）It connects firmly, it is characterised in that：Institute
State outer shaft（5）Further include second clutch for hollow shaft（3）, inner shaft（10）And its second sun gear（11）Positioned at planet carrier
（13）Within, the second planetary gear

In [13]:
# Download the NLTK data used by this notebook: the tokenizer models
# (word_tokenize), the stop-word lists (stopwords) and WordNet (WordNetLemmatizer).
# NOTE(review): this cell sits below the cells that need the data; on a fresh
# kernel it has to run first.
# NOTE(review): recent NLTK releases reportedly load the tokenizer from
# 'punkt_tab' instead of 'punkt' — confirm against the installed version.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Novojit\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Novojit\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Novojit\AppData\Roaming\nltk_data...


True

In [45]:

# # Sample text with various preprocessing needs
# text = "Hello world! This is a sample text for NLP preprocessing using NLTK. Let's remove stop words, punctuation, and perform lemmatization. Also, handle Unicode characters like café, naïve, and coöperate. Plus, check numbers 123 and HTML tags <div>Hello</div>."

# # Normalize Unicode characters
# text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')
# print("Normalized Text:", text)

# Step-by-step walkthrough of the cleaning pipeline, printing every
# intermediate result. Expects `text` to already hold the extracted claims
# text; `text` and `tokens` are rebound at each stage.

# Lowercasing
text = text.lower()
print("Lowercased Text:", text)

# Removing HTML tags
text = re.sub(r'<.*?>', '', text)
print("Text without HTML tags:", text)

# Tokenization
tokens = word_tokenize(text)
print("Tokens:", tokens)

# Remove punctuation
tokens = [word for word in tokens if word.isalnum()]
print("Tokens without punctuation:", tokens)

# Remove stop words
stop_words = set(stopwords.words('english'))
tokens = [word for word in tokens if word not in stop_words]
print("Tokens without stop words:", tokens)

# Remove numbers
tokens = [word for word in tokens if not word.isdigit()]
print("Tokens without numbers:", tokens)

# Lemmatization
lemmatizer = WordNetLemmatizer()
tokens = [lemmatizer.lemmatize(word) for word in tokens]
print("Lemmatized tokens:", tokens)

# Initialize the spell checker
spell = SpellChecker()
# Spellchecking. correction() may return None when it has no candidate; fall
# back to the original token so the join below never receives None (which
# would raise TypeError) and unknown terms are not lost.
tokens = [spell.correction(word) or word for word in tokens]
print("Tokens after spellchecking:", tokens)

# Remove extra whitespace
text_cleaned = ' '.join(tokens)
text_cleaned = re.sub(r'\s+', ' ', text_cleaned).strip()
print("Text without extra whitespace:", text_cleaned)


Lowercased Text: 1. a kind of dynamical system of planet row plug-in hybrid-power automobile, including engine（1）, input shaft（2）, the first clutch
device（4）, outer shaft（5）, include rotor（91）motor（9）, output shaft（14）and its first sun gear（6）positioned at row
carrier（13）within, the first planetary gear（7）it is rotatably installed on planet carrier（13）planet axis（131）upper, the first gear ring（8）
positioned at planet carrier（13）except the first planet row, the input shaft（2）left end and engine（1）bent axle be connected, institute
state outer shaft（5）left end and right end respectively with first clutch（4）interior hub and the first sun gear（6）it is connected, described first
gear ring（8）with rotor（91）it connects firmly, the planet carrier（13）right end and output shaft（14）it connects firmly, it is characterised in that：institute
state outer shaft（5）further include second clutch for hollow shaft（3）, inner shaft（10）and its second sun gear（11）positioned at planet carrier
（13）within, the seco