### Import Libraries

In [1]:
import re
import pandas as pd
import string
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.chunk import RegexpParser
from nltk import word_tokenize, pos_tag
from nltk.corpus import stopwords
# Fetch every NLTK resource the notebook relies on. Each resource is requested
# exactly once (the stopword corpus used to be requested twice).
for nltk_resource in ('stopwords', 'averaged_perceptron_tagger', 'punkt', 'wordnet'):
    nltk.download(nltk_resource)

# Display the punctuation characters that the tokenizer step filters out
string.punctuation

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\oktav\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\oktav\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\oktav\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\oktav\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\oktav\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

### Preprocessing

In [2]:
# Read the data (the data used is a 500-row sample so that processing does not take long).
# NOTE(review): an output further down in this notebook reports 138785 rows — confirm which file was actually loaded.

data = pd.read_excel('data/patent.xlsx')
data.head()

Unnamed: 0,title_and_abstract
0,Recognizing typewritten and handwritten charac...
1,Aspect ratio conversion with machine learning....
2,Indoor scene structural estimation system and ...
3,"Soft robotic actuators for positioning, packag..."
4,System elasticity recovery method and system b...


In [3]:
# Check for missing values: print the number of NaN entries in each column

print(data.isna().sum())

title_and_abstract    0
dtype: int64


In [4]:
# # remove missing values (not needed because the title_and_abstract column has no missing values)

# data.dropna(inplace=True)
# data.head()

In [5]:
# Lower-case the text with the vectorized string accessor; unlike a per-element
# lambda, missing values pass through unchanged instead of raising.
data['title_and_abstract'] = data['title_and_abstract'].str.lower()

# Show the first rows of the lower-cased column
data['title_and_abstract'].head()

0    recognizing typewritten and handwritten charac...
1    aspect ratio conversion with machine learning....
2    indoor scene structural estimation system and ...
3    soft robotic actuators for positioning, packag...
4    system elasticity recovery method and system b...
Name: title_and_abstract, dtype: object

In [6]:
# Helper that loads the English stopword list once and reuses it afterwards
def _default_stop_words():
    """Return the English stopword set, reading it from NLTK only on first use."""
    cached = getattr(_default_stop_words, '_cache', None)
    if cached is None:
        cached = frozenset(stopwords.words('english'))
        _default_stop_words._cache = cached
    return cached


# Function to split text into clauses, tokenize each clause, then remove punctuation and stopwords
def tokenize_and_remove_punctuation(text, stop_words=None):
    """Split ``text`` into clauses and return the cleaned tokens of each clause.

    The text is split after every ``.``, ``!``, ``?``, ``,`` or ``;`` that is
    followed by whitespace. Each piece is tokenized with ``word_tokenize``.
    Tokens made up solely of punctuation characters, and tokens found in the
    stopword set, are dropped.

    Parameters
    ----------
    text : str
        Text to process. Stopword matching is an exact, case-sensitive
        comparison against ``stop_words``.
    stop_words : collection of str, optional
        Tokens to discard. When omitted, the English NLTK stopword list is
        used; it is loaded once and reused across calls.

    Returns
    -------
    list of list of str
        One token list per clause, in order of appearance. A clause whose
        tokens were all removed yields an empty list.
    """
    if stop_words is None:
        stop_words = _default_stop_words()
    punctuation_chars = set(string.punctuation)

    # Split into clauses: break on whitespace that follows . ! ? , or ;
    sentences = re.split(r'(?<=[.!?,;])\s+', text)

    tokenized_sentences = []
    for sentence in sentences:
        tokens = [
            token
            for token in word_tokenize(sentence)
            # Test character by character: `token in string.punctuation` is a
            # substring test and lets tokens such as '...' or '--' through.
            if not all(char in punctuation_chars for char in token)
            and token not in stop_words
        ]
        tokenized_sentences.append(tokens)
    return tokenized_sentences

# Apply the tokenizer to every row of 'title_and_abstract' and store the result in a new 'tokenized' column
data['tokenized'] = data['title_and_abstract'].apply(tokenize_and_remove_punctuation)

# Print the tokenized result for the row labelled 0 only
print(data['tokenized'][0])

[['recognizing', 'typewritten', 'handwritten', 'characters', 'using', 'end-to-end', 'deep', 'learning'], ['disclosed', 'herein', 'system'], ['method'], ['computer', 'program', 'product', 'embodiments', 'optical', 'character', 'recognition', 'using', 'end-to-end', 'deep', 'learning'], ['embodiment'], ['optical', 'character', 'recognition', 'system', 'may', 'train', 'neural', 'network', 'identify', 'characters', 'pixel', 'images'], ['assign', 'index', 'values', 'characters'], ['recognize', 'different', 'formatting', 'characters'], ['distinguishing', 'handwritten', 'typewritten', 'characters'], ['neural', 'network', 'may', 'also', 'trained', 'identify'], ['groups', 'characters', 'generate', 'bounding', 'boxes', 'group', 'characters'], ['optical', 'character', 'recognition', 'system', 'may', 'analyze', 'documents', 'identify', 'character', 'information', 'based', 'pixel', 'data', 'produce', 'segmentation', 'masks'], ['type', 'grid', 'segmentation', 'mask'], ['one', 'bounding', 'box', 'mask

### POS Tag

In [7]:
# Function to perform POS tagging on each token
def pos_tag_tokens(tokenized_sentences):
    """POS-tag every token list of one document.

    Parameters
    ----------
    tokenized_sentences : iterable of list of str
        Token lists, one per sentence or clause.

    Returns
    -------
    list of list of (str, str)
        For each input token list, the ``(token, tag)`` pairs produced by
        NLTK's default tagger, in the same order as the input.
    """
    sentences = list(tokenized_sentences)
    # Nothing to tag: skip obtaining the tagger altogether.
    if not sentences:
        return []
    # Tag the whole batch in one call so the tagger is obtained once per
    # document rather than once per sentence, as repeated pos_tag calls do.
    return nltk.pos_tag_sents(sentences)

# Perform POS tagging on every row of the 'tokenized' column and store the result in a new 'pos_tagged' column
data['pos_tagged'] = data['tokenized'].apply(pos_tag_tokens)

# Print the tagged result for the row labelled 0 only
print(data['pos_tagged'][0])

[[('recognizing', 'VBG'), ('typewritten', 'JJ'), ('handwritten', 'JJ'), ('characters', 'NNS'), ('using', 'VBG'), ('end-to-end', 'JJ'), ('deep', 'JJ'), ('learning', 'NN')], [('disclosed', 'VBN'), ('herein', 'NN'), ('system', 'NN')], [('method', 'NN')], [('computer', 'NN'), ('program', 'NN'), ('product', 'NN'), ('embodiments', 'VBZ'), ('optical', 'JJ'), ('character', 'NN'), ('recognition', 'NN'), ('using', 'VBG'), ('end-to-end', 'JJ'), ('deep', 'JJ'), ('learning', 'NN')], [('embodiment', 'NN')], [('optical', 'JJ'), ('character', 'NN'), ('recognition', 'NN'), ('system', 'NN'), ('may', 'MD'), ('train', 'VB'), ('neural', 'JJ'), ('network', 'NN'), ('identify', 'NN'), ('characters', 'NNS'), ('pixel', 'VBP'), ('images', 'NNS')], [('assign', 'JJ'), ('index', 'NN'), ('values', 'NNS'), ('characters', 'NNS')], [('recognize', 'VB'), ('different', 'JJ'), ('formatting', 'NN'), ('characters', 'NNS')], [('distinguishing', 'VBG'), ('handwritten', 'PRP'), ('typewritten', 'JJ'), ('characters', 'NNS')], [(

In [8]:
# import csv

# # Save the pos_tagged data to a CSV file with UTF-8 encoding
# with open('pos_tagged_data.csv', 'w', newline='', encoding='utf-8') as f:
#     writer = csv.writer(f)
#     writer.writerows(data['pos_tagged'])


In [9]:
# import ast

# pos_tagged = pd.read_excel('pos_tagged_data.xlsx')
# data['pos_tagged'] = pos_tagged

In [11]:
# Function to lemmatize tokens
def lemmatize_tokens(pos_tagged_sentences):
    """Lemmatize every tagged token while keeping its original POS tag.

    ``pos_tagged_sentences`` is an iterable of sentences, each an iterable of
    ``(token, tag)`` pairs. The result is a list with one list per sentence,
    holding ``(lemma, tag)`` pairs in the same order. The tag is mapped through
    ``get_wordnet_pos`` before it is handed to the WordNet lemmatizer.
    """
    wordnet_lemmatizer = WordNetLemmatizer()
    result = []
    for tagged_sentence in pos_tagged_sentences:
        lemma_pairs = []
        for word, tag in tagged_sentence:
            lemma = wordnet_lemmatizer.lemmatize(word, pos=get_wordnet_pos(tag))
            lemma_pairs.append((lemma, tag))
        result.append(lemma_pairs)
    return result

# Function to map POS tags to WordNet POS tags
def get_wordnet_pos(treebank_tag):
    """Translate a POS tag into the matching WordNet POS constant.

    Only the first character of ``treebank_tag`` is inspected: 'J' maps to
    adjective, 'V' to verb and 'R' to adverb. Every other tag, including the
    ones starting with 'N', maps to noun.
    """
    initial = treebank_tag[:1]
    if initial == 'J':
        return nltk.corpus.wordnet.ADJ
    if initial == 'V':
        return nltk.corpus.wordnet.VERB
    if initial == 'R':
        return nltk.corpus.wordnet.ADV
    # Noun tags and anything unrecognized both resolve to noun
    return nltk.corpus.wordnet.NOUN

# Lemmatize the tagged tokens in the 'pos_tagged' column and store the result in a new 'lemmatized' column
data['lemmatized'] = data['pos_tagged'].apply(lemmatize_tokens)

# Print the lemmatized result for the row labelled 0 only
print(data['lemmatized'][0])

MemoryError: 

### POS Chunk

In [None]:
# Chunk grammars passed to RegexpParser below; each one defines a single rule labelled "Chunk".
# NOTE(review): the variable names look like references to the authors of each pattern — confirm the sources.
# pattern_raharjana: one or more VB* tags, then any number of <.|IN|CC|PRP*> tags, then any number of DT tags,
# ending in one or more NN* or RB tags.
pattern_raharjana = r'''Chunk: {<VB\w*>+<.|IN|CC|PRP\w*>*<DT>*<NN\w*|RB>+}'''
# pattern_webb: a single VB* tag immediately followed by a single NN* tag.
pattern_webb = r'''Chunk: {<VB\w*><NN\w*>}'''
# pattern_johann: eight alternatives, each starting with a VB* tag and ending with an NN* tag:
# VB NN | VB PRP NN | VB NN NN | VB JJ NN | VB DT NN | VB NN IN NN | VB IN JJ NN | VB PRP JJ NN.
pattern_johann = r'''Chunk: {<VB\w*><NN\w*>|<VB\w*><PRP\w*><NN\w*>|<VB\w*><NN\w*><NN\w*>|<VB\w*><JJ\w*><NN\w*>|<VB\w*><DT><NN\w*>|<VB\w*><NN\w*><IN><NN\w*>|<VB\w*><IN><JJ\w*><NN\w*>|<VB\w*><PRP\w*><JJ\w*><NN\w*>}'''

def extract(tagged_texts, pattern):
    """Chunk every tagged sentence with ``pattern`` and return all chunks as word tuples.

    ``tagged_texts`` is an iterable of sentences, each a list of
    ``(word, tag)`` pairs. A RegexpParser is built from ``pattern`` and applied
    to each sentence. The chunks found in all sentences are returned as one
    flat list of word tuples, in sentence order.
    """
    parser = RegexpParser(pattern)
    flattened = []
    for sentence in tagged_texts:
        parsed = parser.parse(sentence)
        chunk_trees = extract_chunks(parsed)
        # Extending directly keeps the result flat across sentences
        flattened.extend(trees_to_tuples(chunk_trees))
    return flattened

def extract_chunks(tree):
    """Collect every subtree of ``tree`` whose label is not 'S'.

    The tree is walked depth-first. A node is listed before the subtrees found
    below it. Anything that is not an ``nltk.Tree`` yields an empty list.
    """
    # Leaves are not trees and contribute nothing
    if not isinstance(tree, nltk.Tree):
        return []

    # Keep the node itself unless it is a sentence-level 'S' node
    found = [tree] if tree.label() != 'S' else []
    for child in tree:
        found += extract_chunks(child)
    return found

def trees_to_tuples(tree_list):
    """Turn each tree into a tuple holding the first element of every leaf.

    Returns one tuple per tree, in the order the trees were given.
    """
    word_tuples = []
    for chunk in tree_list:
        words = [pair[0] for pair in chunk.leaves()]
        word_tuples.append(tuple(words))
    return word_tuples

# Apply chunking and extraction to the 'lemmatized' column, once per pattern; each result column holds a flat list of word tuples per row
data['pattern_webb'] = data['lemmatized'].apply(lambda x: extract(x, pattern_webb))
data['pattern_raharjana'] = data['lemmatized'].apply(lambda x: extract(x, pattern_raharjana))
data['pattern_johann'] = data['lemmatized'].apply(lambda x: extract(x, pattern_johann))

# Display the chunks extracted with pattern_webb
data['pattern_webb']





0         [(include, humidity), (electrode, layer), (ins...
1         [(distribute, weight), (establish, information...
2         [(relate, kind), (use, model), (disaggregated,...
3         [(put, default), (find, father), (define, sub-...
4         [(dictionary；based, language), (extend, form),...
                                ...                        
138780    [(improve, performance), (recognise, traffic),...
138781    [(perform, computer), (comprise, step), (acqui...
138782    [(comprise, head), (comprise, head), (submit, ...
138783    [(base, machine), (predict, method), (base, ma...
138784    [(microwave, source), (microwave, radiation), ...
Name: pattern_webb, Length: 138785, dtype: object

### Export Result

In [12]:
# The result

# patent_task = data[["title_and_abstract", "pattern_webb", "pattern_raharjana", "pattern_johann"]]
# Write the whole DataFrame (every column, no index column) to an Excel file.
# NOTE(review): the recorded run of this cell ended in MemoryError inside to_excel — confirm whether
# exporting only the columns selected in the disabled line above was the intent.
data.to_excel('data/keyword_extraction_patent.xlsx', index=False)

Unexpected exception formatting exception. Falling back to standard exception


Traceback (most recent call last):
  File "c:\Users\oktav\AppData\Local\Programs\Python\Python38-32\lib\site-packages\pandas\io\formats\excel.py", line 940, in write
    writer._write_cells(
  File "c:\Users\oktav\AppData\Local\Programs\Python\Python38-32\lib\site-packages\pandas\io\excel\_openpyxl.py", line 489, in _write_cells
    xcell.value, fmt = self._value_with_fmt(cell.val)
  File "c:\Users\oktav\AppData\Local\Programs\Python\Python38-32\lib\site-packages\pandas\io\excel\_base.py", line 1296, in _value_with_fmt
MemoryError

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "C:\Users\oktav\AppData\Roaming\Python\Python38\site-packages\IPython\core\interactiveshell.py", line 3508, in run_code
  File "C:\Users\oktav\AppData\Local\Temp\ipykernel_19440\268945012.py", line 4, in <module>
  File "c:\Users\oktav\AppData\Local\Programs\Python\Python38-32\lib\site-packages\pandas\core\generic.py", line 2252, in to_excel
  File 

: 