### Import Libraries

In [None]:
import re
import pandas as pd
import string
import spacy
from spacy.matcher import Matcher
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.chunk import RegexpParser
from nltk import word_tokenize, pos_tag
from nltk.corpus import stopwords
# Fetch the NLTK resources once each (the original requested 'stopwords' twice).
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('punkt')
nltk.download('wordnet')
# Display the characters that string.punctuation contains
string.punctuation

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [None]:
# Read the evaluation spreadsheet into a DataFrame and preview its first rows.
# The path is relative, so the file is resolved against the current working directory.

data = pd.read_excel('keyword_extraction_eval_data.xlsx')
data.head()

Unnamed: 0,data,expected_result
0,the present invention relates to a method for ...,"[('automatically', 'generate', 'vocabulary', '..."
1,"the present disclosure is directed to methods,...","[('provide', 'different', 'way', 'to', 'prepar..."
2,an anti-theft method for an oil tank truck bas...,"[('acquire', 'scene', 'picture'), ('predict', ..."
3,the invention provides an artificial intellige...,"[('categorize', 'event'), ('categorize', 'even..."
4,systems and methods for using machine learning...,"[('predict', 'risk'), ('access', 'a', 'set', '..."


### Preprocessing

In [None]:
# Count the missing values in each column and print the per-column totals

print(data.isna().sum())

data               0
expected_result    0
dtype: int64


In [None]:
# # remove missing values (not needed because the title and abstract columns have no missing values)

# data.dropna(inplace=True)
# data.head()

In [None]:
# Lowercase the text column with pandas' vectorized string accessor instead of
# a per-element lambda. Missing values propagate as NaN here rather than
# raising AttributeError as x.lower() would.
data['data'] = data['data'].str.lower()

# Show the first rows of the lowercased column
data['data'].head()

0    the present invention relates to a method for ...
1    the present disclosure is directed to methods,...
2    an anti-theft method for an oil tank truck bas...
3    the invention provides an artificial intellige...
4    systems and methods for using machine learning...
Name: data, dtype: object

In [None]:
# Function to split text into sentences, tokenize each sentence, then remove punctuation and stopwords
def tokenize_and_remove_punctuation(text):
    """Split ``text`` into sentences and return the filtered tokens of each.

    Sentences are split on whitespace that follows one of ``. ! ? ;``. Each
    sentence is tokenized with ``word_tokenize``; tokens made up solely of
    punctuation characters and tokens found in the English stopword list are
    dropped.

    Args:
        text: The string to tokenize.

    Returns:
        A list with one entry per sentence, each entry being the list of
        remaining tokens (possibly empty).
    """
    # Define stopword
    stop_words = set(stopwords.words('english'))
    # Individual punctuation characters. The previous check
    # (`token not in string.punctuation`) was a substring test against the
    # whole punctuation string, so multi-character punctuation tokens such as
    # '...', '--' or the `` and '' quote tokens slipped through.
    punctuation_chars = set(string.punctuation)

    # Tokenize each sentence
    sentences = re.split(r'(?<=[.!?;])\s+', text)

    # Tokenize each word in each sentence, remove punctuation, and remove stopword
    tokenized_sentences = []
    for sentence in sentences:
        tokens = [
            token
            for token in word_tokenize(sentence)
            # Drop tokens consisting only of punctuation; tokens that merely
            # contain punctuation (e.g. 'anti-theft') are kept.
            if not all(char in punctuation_chars for char in token)
            and token not in stop_words
        ]
        tokenized_sentences.append(tokens)
    return tokenized_sentences

# Store each row's per-sentence token lists (punctuation and stopwords removed) in a new 'tokenized' column
data['tokenized'] = data['data'].apply(tokenize_and_remove_punctuation)

# Preview the updated DataFrame
data.head()

Unnamed: 0,data,expected_result,tokenized
0,the present invention relates to a method for ...,"[('automatically', 'generate', 'vocabulary', '...","[[present, invention, relates, method, automat..."
1,"the present disclosure is directed to methods,...","[('provide', 'different', 'way', 'to', 'prepar...","[[present, disclosure, directed, methods, comp..."
2,an anti-theft method for an oil tank truck bas...,"[('acquire', 'scene', 'picture'), ('predict', ...","[[anti-theft, method, oil, tank, truck, based,..."
3,the invention provides an artificial intellige...,"[('categorize', 'event'), ('categorize', 'even...","[[invention, provides, artificial, intelligenc..."
4,systems and methods for using machine learning...,"[('predict', 'risk'), ('access', 'a', 'set', '...","[[systems, methods, using, machine, learning, ..."


In [None]:
# # Function to perform POS tagging on each token using NLTK
# def pos_tag_tokens(tokenized_sentences):
#     pos_tagged_sentences = []
#     for sentence_tokens in tokenized_sentences:
#         pos_tags = pos_tag(sentence_tokens)
#         pos_tagged_sentences.append(pos_tags)
#     return pos_tagged_sentences

# # Perform POS tagging on tokenized sentences
# data['pos_tagged'] = data['tokenized'].apply(pos_tag_tokens)

# # Print the updated DataFrame
# data.head()

In [None]:
import spacy

# Load spaCy's "en_core_web_sm" pipeline. The tagging function below reads each
# token's fine-grained tag (token.tag_) from it, e.g. JJ / NN / VBZ.
nlp = spacy.load("en_core_web_sm")

# Function to perform POS tagging on each token using spaCy
def pos_tag_spacy(text):
    """Tag an already-tokenized sentence without re-tokenizing it.

    The previous version joined the tokens with spaces and let spaCy tokenize
    the string again, which split tokens such as 'anti-theft' into
    'anti', '-', 'theft' and reintroduced punctuation removed earlier.
    Building the Doc from the given words keeps one output pair per input token.

    Args:
        text: A list of token strings for one sentence.

    Returns:
        A list of ``(token_text, fine_grained_tag)`` tuples, one per input token.
    """
    doc = spacy.tokens.Doc(nlp.vocab, words=list(text))
    # Run the loaded pipeline components on the pre-built Doc, skipping the tokenizer.
    for _, component in nlp.pipeline:
        doc = component(doc)
    return [(token.text, token.tag_) for token in doc]

# Tag every tokenized sentence of every row; each row becomes a list of
# per-sentence results returned by pos_tag_spacy.
data['pos_tagged_spacy'] = data['tokenized'].apply(lambda x: [pos_tag_spacy(sublist) for sublist in x])
data.head()


Unnamed: 0,data,expected_result,tokenized,pos_tagged_spacy
0,the present invention relates to a method for ...,"[('automatically', 'generate', 'vocabulary', '...","[[present, invention, relates, method, automat...","[[(present, JJ), (invention, NN), (relates, VB..."
1,"the present disclosure is directed to methods,...","[('provide', 'different', 'way', 'to', 'prepar...","[[present, disclosure, directed, methods, comp...","[[(present, JJ), (disclosure, NN), (directed, ..."
2,an anti-theft method for an oil tank truck bas...,"[('acquire', 'scene', 'picture'), ('predict', ...","[[anti-theft, method, oil, tank, truck, based,...","[[(anti, JJ), (-, JJ), (theft, JJ), (method, N..."
3,the invention provides an artificial intellige...,"[('categorize', 'event'), ('categorize', 'even...","[[invention, provides, artificial, intelligenc...","[[(invention, NN), (provides, VBZ), (artificia..."
4,systems and methods for using machine learning...,"[('predict', 'risk'), ('access', 'a', 'set', '...","[[systems, methods, using, machine, learning, ...","[[(systems, NNS), (methods, NNS), (using, VBG)..."


In [None]:
# Lemmatize every (token, tag) pair, sentence by sentence
def lemmatize_tokens(pos_tagged_sentences):
    """Replace each token with its WordNet lemma, keeping its tag.

    Args:
        pos_tagged_sentences: A list of sentences, each a list of
            ``(token, tag)`` pairs.

    Returns:
        A list of the same shape whose pairs are ``(lemma, tag)``. The lemma is
        looked up with the WordNet part of speech that ``get_wordnet_pos``
        returns for the tag.
    """
    wn_lemmatizer = WordNetLemmatizer()

    def _to_lemma_pair(word, tag):
        # The tag is carried through unchanged next to the lemma.
        return (wn_lemmatizer.lemmatize(word, pos=get_wordnet_pos(tag)), tag)

    return [
        [_to_lemma_pair(word, tag) for word, tag in tagged_sentence]
        for tagged_sentence in pos_tagged_sentences
    ]

# Translate a POS tag into the matching WordNet part-of-speech constant
def get_wordnet_pos(treebank_tag):
    """Return the WordNet POS constant for ``treebank_tag``.

    The decision is made on the tag's leading letter: ``J`` -> adjective,
    ``V`` -> verb, ``N`` -> noun, ``R`` -> adverb. Any other tag falls back
    to noun.
    """
    wordnet = nltk.corpus.wordnet
    prefix_to_constant = (
        ('J', 'ADJ'),
        ('V', 'VERB'),
        ('N', 'NOUN'),
        ('R', 'ADV'),
    )
    for prefix, constant_name in prefix_to_constant:
        if treebank_tag.startswith(prefix):
            # Look the constant up only for the matching prefix.
            return getattr(wordnet, constant_name)
    # Unrecognized tags are treated as nouns.
    return wordnet.NOUN

# Lemmatize the tokens in the 'pos_tagged_spacy' column and store them as 'lemmatized'
data['lemmatized'] = data['pos_tagged_spacy'].apply(lemmatize_tokens)

# Preview the updated DataFrame
data.head()

Unnamed: 0,data,expected_result,tokenized,pos_tagged_spacy,lemmatized
0,the present invention relates to a method for ...,"[('automatically', 'generate', 'vocabulary', '...","[[present, invention, relates, method, automat...","[[(present, JJ), (invention, NN), (relates, VB...","[[(present, JJ), (invention, NN), (relate, VBZ..."
1,"the present disclosure is directed to methods,...","[('provide', 'different', 'way', 'to', 'prepar...","[[present, disclosure, directed, methods, comp...","[[(present, JJ), (disclosure, NN), (directed, ...","[[(present, JJ), (disclosure, NN), (direct, VB..."
2,an anti-theft method for an oil tank truck bas...,"[('acquire', 'scene', 'picture'), ('predict', ...","[[anti-theft, method, oil, tank, truck, based,...","[[(anti, JJ), (-, JJ), (theft, JJ), (method, N...","[[(anti, JJ), (-, JJ), (theft, JJ), (method, N..."
3,the invention provides an artificial intellige...,"[('categorize', 'event'), ('categorize', 'even...","[[invention, provides, artificial, intelligenc...","[[(invention, NN), (provides, VBZ), (artificia...","[[(invention, NN), (provide, VBZ), (artificial..."
4,systems and methods for using machine learning...,"[('predict', 'risk'), ('access', 'a', 'set', '...","[[systems, methods, using, machine, learning, ...","[[(systems, NNS), (methods, NNS), (using, VBG)...","[[(system, NNS), (method, NNS), (use, VBG), (m..."


### POS Chunk

In [None]:
# Chunk grammars passed to RegexpParser in extract(). Each defines a single
# "Chunk" rule over POS tags; all four begin with verb tags (<VB\w*>).
# NOTE(review): the names presumably refer to the sources the patterns were taken from — confirm.
# NOTE(review): a bare `.` inside <...> presumably matches any single-character tag — confirm against NLTK's tag-pattern syntax.

# One or more verbs, any number of single-character/IN/CC/PRP* tags, any number of DT, then one or more NN*/RB tags.
pattern_siahaan = r'''Chunk: {<VB\w*>+<.|IN|CC|PRP\w*>*<DT>*<NN\w*|RB>+}'''
# Exactly one verb tag immediately followed by one noun tag.
pattern_webb = r'''Chunk: {<VB\w*><NN\w*>}'''
# Alternation of eight fixed tag sequences (two to four tags), each starting with a verb and ending with a noun.
# NOTE(review): alternatives are likely tried left to right, so the leading <VB\w*><NN\w*> may shadow the longer
# alternatives that start with the same two tags (<VB><NN><NN>, <VB><NN><IN><NN>) — confirm, and consider listing longer sequences first.
pattern_johann = r'''Chunk: {<VB\w*><NN\w*>|<VB\w*><PRP\w*><NN\w*>|<VB\w*><NN\w*><NN\w*>|<VB\w*><JJ\w*><NN\w*>|<VB\w*><DT><NN\w*>|<VB\w*><NN\w*><IN><NN\w*>|<VB\w*><IN><JJ\w*><NN\w*>|<VB\w*><PRP\w*><JJ\w*><NN\w*>}'''
# One or more verbs, any number of PRP*/JJ*/DT/IN/CC/single-character tags, then one or more noun tags.
new_pattern = r'''Chunk: {<VB\w*>+<PRP\w*|JJ\w*|DT|IN|CC|.>*<NN\w*>+}'''

def extract(tagged_texts, pattern):
    """Chunk every tagged sentence with ``pattern`` and collect the chunks.

    Args:
        tagged_texts: A list of sentences, each a list of ``(token, tag)`` pairs.
        pattern: A chunk grammar string for ``RegexpParser``.

    Returns:
        A single flat list of word tuples, one tuple per chunk found, in
        sentence order.
    """
    parser = RegexpParser(pattern)
    collected = []
    for sentence in tagged_texts:
        parsed = parser.parse(sentence)
        # Pull the chunk subtrees out of the parse, then reduce each to its words.
        chunk_trees = extract_chunks(parsed)
        collected.extend(trees_to_tuples(chunk_trees))
    return collected

def extract_chunks(tree):
    """Collect every subtree of ``tree`` whose label is not ``'S'``.

    The tree is walked depth-first, parents before children and children in
    their original order. Anything that is not an ``nltk.Tree`` (for example a
    ``(token, tag)`` leaf) contributes nothing.

    Returns:
        A list of the matching ``nltk.Tree`` nodes; empty when ``tree`` is not
        a tree.
    """
    found = []
    pending = [tree]
    while pending:
        node = pending.pop()
        if not isinstance(node, nltk.Tree):
            continue
        if node.label() != 'S':  # sentence-level nodes are not chunks
            found.append(node)
        # Push children reversed so the first child is visited next.
        pending.extend(reversed(list(node)))
    return found

def trees_to_tuples(tree_list):
    """Turn each tree into a tuple holding the first item of each of its leaves.

    Args:
        tree_list: Objects exposing ``leaves()``, whose leaves are indexable
            (``leaf[0]`` is kept).

    Returns:
        A list with one tuple per tree, in the same order.
    """
    converted = []
    for chunk_tree in tree_list:
        words = [leaf[0] for leaf in chunk_tree.leaves()]
        converted.append(tuple(words))
    return converted

# Apply chunking and extraction to the 'lemmatized' column, once per grammar;
# each new column holds the row's chunks as returned by extract().
data['pattern_webb'] = data['lemmatized'].apply(lambda x: extract(x, pattern_webb))
data['pattern_siahaan'] = data['lemmatized'].apply(lambda x: extract(x, pattern_siahaan))
data['pattern_johann'] = data['lemmatized'].apply(lambda x: extract(x, pattern_johann))
data['new_pattern'] = data['lemmatized'].apply(lambda x: extract(x, new_pattern))

# Display the chunks extracted with pattern_webb
data['pattern_webb']

0     [(relate, method), (base, word), (mean, clarif...
1     [(direct, method), (prepare, food), (provide, ...
2     [(base, machine), (follow, step), (acquire, sc...
3     [(ai, system), (categorize, event), (say, ai),...
4     [(use, machine), (predict, risk), (pose, patie...
5     [(require, lead), (analyze, conversation), (co...
6     [(relate, apparatus), (provide, surface), (pro...
7     [(provide, device), (recommend, content), (fol...
8     [(ai, system), (ai, system), (include, compute...
9     [(provide, method), (follow, step), (extract, ...
10    [(manage, director), (formulate, review), (sup...
11    [(oversee, mining), (quarry, operation), (supe...
12    [(advise, government), (oversee, interpretatio...
13              [(obtain, information), (cause, death)]
14    [(study, ass), (develop, plan), (protect, cons...
15                               [(construct, feature)]
16    [(physicists, astronomer), (conduct, research)...
17    [(disorder, teeth), (accord, care), (estab

In [None]:
# Words whose presence marks an extracted tuple as a common mistake
common_mistake = {'ai', 'artificial', 'intelligence', 'system', 'model', 'method', 'invention'}

def filter_tuples(tuple_list):
    """Return the tuples that contain none of the ``common_mistake`` words.

    A tuple is dropped when any of its elements equals one of the words;
    partial matches (e.g. 'models') do not count. Order is preserved.
    """
    kept = []
    for candidate in tuple_list:
        # Keep only tuples sharing no element with the blocklist.
        if common_mistake.isdisjoint(candidate):
            kept.append(candidate)
    return kept

# Overwrite each pattern column with its filtered version (tuples containing a common_mistake word are removed)
data['pattern_webb'] = data['pattern_webb'].apply(filter_tuples)
data['pattern_johann'] = data['pattern_johann'].apply(filter_tuples)
data['pattern_siahaan'] = data['pattern_siahaan'].apply(filter_tuples)
data['new_pattern'] = data['new_pattern'].apply(filter_tuples)

### Export Result

In [None]:
# Results: keep only the source text, the four pattern outputs and the expected result.
# The selection was previously built but never used; the whole `data` frame
# (including intermediate columns) was written instead.

patent_task = data[["data", "pattern_webb", "pattern_siahaan", "pattern_johann", "new_pattern", "expected_result"]]
patent_task.to_excel('keyword_extraction_eval_data (6).xlsx', index=False)