### Import Libraries

In [1]:
import ast
import re
import string

import nltk
import pandas as pd
from nltk import word_tokenize, pos_tag
from nltk.chunk import RegexpParser
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
# Fetch the NLTK resources used by this notebook; each package is requested once
# (the stopwords corpus was previously downloaded twice).
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('punkt')
nltk.download('wordnet')
# Display the punctuation characters that the tokenization step filters on.
string.punctuation

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\oktav\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\oktav\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\oktav\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\oktav\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\oktav\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

## Keyword Extraction by NLP

### Preprocessing

In [2]:
# Load the evaluation dataset from the Excel workbook and preview its first rows.

data = pd.read_excel('data/keyword_extraction_eval_data.xlsx')
data.head()

Unnamed: 0,data
0,The present invention relates to a method for ...
1,"The present disclosure is directed to methods,..."
2,An anti-theft method for an oil tank truck bas...
3,The invention provides an artificial intellige...
4,Systems and methods for using machine learning...


In [3]:
# Count the missing values in each column.

print(data.isna().sum())

data    0
dtype: int64


In [4]:
# # Remove missing values (not needed because the title and abstract columns have no missing values)

# data.dropna(inplace=True)
# data.head()
data

Unnamed: 0,data
0,The present invention relates to a method for ...
1,"The present disclosure is directed to methods,..."
2,An anti-theft method for an oil tank truck bas...
3,The invention provides an artificial intellige...
4,Systems and methods for using machine learning...
5,The present invention relates to an artificial...
6,The present invention relates to an apparatus ...
7,Provided are a device for updating a first art...
8,An artificial intelligence (AI) system for det...
9,The invention provides a method and a system f...


In [5]:
# Lowercase the raw text. The vectorized .str accessor replaces the per-row
# lambda and leaves missing values as NaN instead of raising on them.
data['lowered'] = data['data'].str.lower()

# Preview the lowercased column
data['lowered'].head()

0    the present invention relates to a method for ...
1    the present disclosure is directed to methods,...
2    an anti-theft method for an oil tank truck bas...
3    the invention provides an artificial intellige...
4    systems and methods for using machine learning...
Name: lowered, dtype: object

In [6]:
# Function to tokenize each sentence, tokenize each word from sentence, then remove punctuation and sopword
def tokenize_and_remove_punctuation(text):
    # Define stopword
    stop_words = set(stopwords.words('english'))

    # Tokenize each sentence
    sentences = re.split(r'(?<=[.!?,;])\s+', text)
    
    # Tokenize each word in each sentence, remove punctuation, and remove stopword
    tokenized_sentences = []
    for sentence in sentences:
        tokens = word_tokenize(sentence)
        # Remove punctuation
        tokens = [token for token in tokens if token not in string.punctuation]
        # Remove stopword
        tokens = [token for token in tokens if token not in stop_words]
        tokenized_sentences.append(tokens)
    return tokenized_sentences

# Split, tokenize, and strip punctuation/stopwords for every lowercased document
data['tokenized'] = data['lowered'].apply(tokenize_and_remove_punctuation)

# Inspect the tokenized result for the row labelled 3
print(data['tokenized'][3])

[['invention', 'provides', 'artificial', 'intelligence', 'ai', 'system', 'categorizing', 'events'], ['said', 'ai', 'system', 'comprising', 'first', 'state', 'second', 'state'], ['wherein', 'said', 'ai', 'system', 'first', 'state', 'categorizing', 'events', 'first', 'category', 'type'], ['upon', 'categorizing', 'first', 'event', 'predefined', 'category', 'said', 'first', 'category', 'type'], ['said', 'ai', 'system', 'set', 'said', 'second', 'state'], ['said', 'second', 'state', 'said', 'ai', 'system', 'set', 'categorizing', 'subsequent', 'events', 'second', 'category', 'type']]


### POS Tag

In [7]:
# POS-tag every token, one sentence at a time
def pos_tag_tokens(tokenized_sentences):
    """Run ``pos_tag`` on each token list and collect the results.

    Args:
        tokenized_sentences: Iterable of token lists, one per sentence.

    Returns:
        A list holding the ``pos_tag`` output for each sentence, in input order.
    """
    return [pos_tag(tokens) for tokens in tokenized_sentences]

# POS-tag the tokenized sentences of every document
data['pos_tagged'] = data['tokenized'].apply(pos_tag_tokens)

# Inspect the tagged result for the row labelled 3
print(data['pos_tagged'][3])

[[('invention', 'NN'), ('provides', 'VBZ'), ('artificial', 'JJ'), ('intelligence', 'NN'), ('ai', 'NN'), ('system', 'NN'), ('categorizing', 'VBG'), ('events', 'NNS')], [('said', 'VBD'), ('ai', 'NN'), ('system', 'NN'), ('comprising', 'VBG'), ('first', 'JJ'), ('state', 'NN'), ('second', 'JJ'), ('state', 'NN')], [('wherein', 'NN'), ('said', 'VBD'), ('ai', 'NN'), ('system', 'NN'), ('first', 'JJ'), ('state', 'NN'), ('categorizing', 'VBG'), ('events', 'NNS'), ('first', 'JJ'), ('category', 'NN'), ('type', 'NN')], [('upon', 'IN'), ('categorizing', 'VBG'), ('first', 'JJ'), ('event', 'NN'), ('predefined', 'VBD'), ('category', 'NN'), ('said', 'VBD'), ('first', 'JJ'), ('category', 'NN'), ('type', 'NN')], [('said', 'VBD'), ('ai', 'NN'), ('system', 'NN'), ('set', 'VBN'), ('said', 'VBD'), ('second', 'JJ'), ('state', 'NN')], [('said', 'VBD'), ('second', 'JJ'), ('state', 'NN'), ('said', 'VBD'), ('ai', 'JJ'), ('system', 'NN'), ('set', 'VBN'), ('categorizing', 'VBG'), ('subsequent', 'JJ'), ('events', 'NNS

In [8]:
# Lemmatize POS-tagged tokens while keeping their tags
def lemmatize_tokens(pos_tagged_sentences):
    """Replace each token with its WordNet lemma, leaving the POS tag as is.

    Args:
        pos_tagged_sentences: Iterable of sentences, each an iterable of
            ``(token, tag)`` pairs.

    Returns:
        A list of sentences, each a list of ``(lemma, tag)`` pairs. The lemma
        is looked up with the WordNet POS that ``get_wordnet_pos`` returns for
        the tag.
    """
    wn_lemmatizer = WordNetLemmatizer()
    result = []
    for tagged_sentence in pos_tagged_sentences:
        lemmatized_sentence = []
        for word, tag in tagged_sentence:
            lemma = wn_lemmatizer.lemmatize(word, pos=get_wordnet_pos(tag))
            lemmatized_sentence.append((lemma, tag))
        result.append(lemmatized_sentence)
    return result

# Translate a POS tag into the matching WordNet POS constant
def get_wordnet_pos(treebank_tag):
    """Return the WordNet POS constant for a POS tag, chosen by its prefix.

    Tags starting with ``J`` map to adjective, ``V`` to verb, ``N`` to noun and
    ``R`` to adverb. Any other tag falls back to noun.

    Args:
        treebank_tag: POS tag string, e.g. ``'JJ'`` or ``'VBZ'``.

    Returns:
        One of the ``nltk.corpus.wordnet`` POS constants.
    """
    wordnet = nltk.corpus.wordnet
    prefix_to_pos = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, wordnet_pos in prefix_to_pos:
        if treebank_tag.startswith(prefix):
            return wordnet_pos
    # Unrecognized tags are treated as nouns
    return wordnet.NOUN

# Lemmatize the tokens in the 'pos_tagged' column
data['lemmatized'] = data['pos_tagged'].apply(lemmatize_tokens)

# Inspect the lemmatized result for the row labelled 3
print(data['lemmatized'][3])

[[('invention', 'NN'), ('provide', 'VBZ'), ('artificial', 'JJ'), ('intelligence', 'NN'), ('ai', 'NN'), ('system', 'NN'), ('categorize', 'VBG'), ('event', 'NNS')], [('say', 'VBD'), ('ai', 'NN'), ('system', 'NN'), ('comprise', 'VBG'), ('first', 'JJ'), ('state', 'NN'), ('second', 'JJ'), ('state', 'NN')], [('wherein', 'NN'), ('say', 'VBD'), ('ai', 'NN'), ('system', 'NN'), ('first', 'JJ'), ('state', 'NN'), ('categorize', 'VBG'), ('event', 'NNS'), ('first', 'JJ'), ('category', 'NN'), ('type', 'NN')], [('upon', 'IN'), ('categorize', 'VBG'), ('first', 'JJ'), ('event', 'NN'), ('predefined', 'VBD'), ('category', 'NN'), ('say', 'VBD'), ('first', 'JJ'), ('category', 'NN'), ('type', 'NN')], [('say', 'VBD'), ('ai', 'NN'), ('system', 'NN'), ('set', 'VBN'), ('say', 'VBD'), ('second', 'JJ'), ('state', 'NN')], [('say', 'VBD'), ('second', 'JJ'), ('state', 'NN'), ('say', 'VBD'), ('ai', 'JJ'), ('system', 'NN'), ('set', 'VBN'), ('categorize', 'VBG'), ('subsequent', 'JJ'), ('event', 'NNS'), ('second', 'JJ'),

### POS Chunk

In [9]:
# New POS chunk patterns (Mr. Indra)

# Chunk grammars, each defining a single "Chunk" rule over POS tags.

# Raharjana: one or more verb tags, then any number of tags matching
# `.`, IN, CC or PRP*, then any number of DT tags, then one or more NN*/RB tags.
pattern_raharjana = r'Chunk: {<VB\w*>+<.|IN|CC|PRP\w*>*<DT>*<NN\w*|RB>+}'

# Webb: a verb tag immediately followed by a noun tag.
pattern_webb = r'Chunk: {<VB\w*><NN\w*>}'

# Johann: eight alternative verb-led tag sequences, one per line below.
pattern_johann = (
    r'Chunk: {'
    r'<VB\w*><NN\w*>'
    r'|<VB\w*><PRP\w*><NN\w*>'
    r'|<VB\w*><NN\w*><NN\w*>'
    r'|<VB\w*><JJ\w*><NN\w*>'
    r'|<VB\w*><DT><NN\w*>'
    r'|<VB\w*><NN\w*><IN><NN\w*>'
    r'|<VB\w*><IN><JJ\w*><NN\w*>'
    r'|<VB\w*><PRP\w*><JJ\w*><NN\w*>'
    r'}'
)

def extract(tagged_texts, pattern):
    """Chunk every tagged sentence with ``pattern`` and return all chunks found.

    Args:
        tagged_texts: Iterable of POS-tagged sentences, each a list of
            ``(token, tag)`` pairs.
        pattern: Chunk grammar string handed to ``RegexpParser``.

    Returns:
        A flat list of word tuples, one per chunk, in sentence order.
    """
    parser = RegexpParser(pattern)
    all_chunks = []
    for tagged_sentence in tagged_texts:
        parsed = parser.parse(tagged_sentence)
        # Pull the chunk subtrees out of the parse and keep only their words
        all_chunks.extend(trees_to_tuples(extract_chunks(parsed)))
    return all_chunks

def extract_chunks(tree):
    """Collect every labelled subtree of ``tree`` except those labelled ``'S'``.

    The tree is walked depth-first: a node is listed before its descendants.
    Anything that is not an ``nltk.Tree`` (such as a plain leaf) yields nothing.

    Args:
        tree: An ``nltk.Tree`` or a leaf value.

    Returns:
        A list of ``nltk.Tree`` objects.
    """
    if not isinstance(tree, nltk.Tree):
        return []

    # The sentence-level 'S' node is only a container, not a chunk
    found = [] if tree.label() == 'S' else [tree]
    for child in tree:
        found += extract_chunks(child)
    return found

def trees_to_tuples(tree_list):
    """Reduce each tree to a tuple holding the first item of every leaf.

    Args:
        tree_list: Iterable of objects exposing ``leaves()``; each leaf is
            indexable (here, ``(token, tag)`` pairs).

    Returns:
        A list with one tuple per tree, in input order.
    """
    tuples = []
    for chunk in tree_list:
        first_items = [leaf[0] for leaf in chunk.leaves()]
        tuples.append(tuple(first_items))
    return tuples

# Run each chunk grammar over the 'lemmatized' column; the grammar is passed
# to extract() as its second positional argument.
data['pattern_webb'] = data['lemmatized'].apply(extract, args=(pattern_webb,))
data['pattern_raharjana'] = data['lemmatized'].apply(extract, args=(pattern_raharjana,))
data['pattern_johann'] = data['lemmatized'].apply(extract, args=(pattern_johann,))

# Show the chunks extracted with the Webb grammar
data['pattern_webb']



0     [(give, problem), (correspond, word), (inputti...
1     [(direct, method), (prepare, food), (prepare, ...
2     [(base, machine), (learn, comprises), (follow,...
3     [(categorize, event), (say, ai), (say, ai), (c...
4     [(use, machine), (predict, risk), (pose, patie...
5     [(analyze, conversation), (collect, data), (an...
6     [(provide, surface), (mold, condition), (provi...
7     [(provide, device), (ai, model), (follow, step...
8     [(include, computer), (run, computer), (label,...
9     [(follow, step), (extract, target), (improve, ...
10    [(manage, director), (formulate, review), (ent...
11       [(quarry, operation), (supervise, coordinate)]
12      [(advise, government), (establish, government)]
13                              [(obtain, information)]
14               [(develop, plan), (minimize, prevent)]
15                               [(construct, feature)]
16    [(conduct, research), (improve, develop), (con...
17               [(diseases, disorder), (accord,

### Export Result

In [13]:
# Results: keep the source text and the three chunk-pattern columns, then write
# them to the same workbook path the data was loaded from (the file is overwritten).

data = data[["data", "pattern_webb", "pattern_raharjana", "pattern_johann"]]
data.to_excel('data/keyword_extraction_eval_data.xlsx', index=False)

## Add Expected Result from Annotators Label

In [15]:
# Reload the exported results. The workbook holds each list of tuples as its
# string representation, so the pattern columns are parsed back into lists.
data = pd.read_excel('data/keyword_extraction_eval_data.xlsx')
# ast.literal_eval accepts only Python literals, so unlike eval() it cannot
# execute code that happens to be stored in a workbook cell.
data['pattern_webb'] = data['pattern_webb'].apply(lambda x: list(ast.literal_eval(str(x))))
data['pattern_raharjana'] = data['pattern_raharjana'].apply(lambda x: list(ast.literal_eval(str(x))))
data['pattern_johann'] = data['pattern_johann'].apply(lambda x: list(ast.literal_eval(str(x))))
data.head()

Unnamed: 0,data,pattern_webb,pattern_raharjana,pattern_johann
0,The present invention relates to a method for ...,"[(give, problem), (correspond, word), (inputti...","[(give, problem, opportunity, deeply), (corres...","[(generate, vocabulary, problem), (accord, pre..."
1,"The present disclosure is directed to methods,...","[(direct, method), (prepare, food), (prepare, ...","[(direct, method), (prepare, food, dish), (pre...","[(direct, method), (include, robotic, kitchen)..."
2,An anti-theft method for an oil tank truck bas...,"[(base, machine), (learn, comprises), (follow,...","[(base, machine), (learn, comprises), (follow,...","[(base, machine), (learn, comprises), (follow,..."
3,The invention provides an artificial intellige...,"[(categorize, event), (say, ai), (say, ai), (c...","[(categorize, event), (say, ai, system), (say,...","[(provide, artificial, intelligence), (categor..."
4,Systems and methods for using machine learning...,"[(use, machine), (predict, risk), (pose, patie...","[(use, machine, learning, model), (predict, ri...","[(use, machine), (predict, risk), (pose, patie..."


In [18]:
# Load the annotators' keyword labels and keep only the rows whose 'labeling'
# value is at least 3.
# NOTE(review): 'labeling' looks like the number of respondents (out of five)
# who marked the keyword, making 3 a majority threshold — confirm.
annotator_data = pd.read_excel('data/keyword_extraction_eval_count.xlsx')
annotator_data = annotator_data[annotator_data['labeling'] >= 3]
annotator_data.head()

Unnamed: 0,index,keywords,Responden 1,Responden 2,Responden 3,Responden 4,Responden 5,labeling,not_labeling
0,0,automatically generating vocabulary problems,1.0,1.0,1.0,1.0,1.0,5,0
1,0,generate potential answers,1.0,1.0,1.0,1.0,1.0,5,0
5,0,automatically generating vocabulary problems,1.0,1.0,1.0,1.0,1.0,5,0
7,0,generate potential answers of high difficulty ...,1.0,1.0,1.0,1.0,1.0,5,0
8,0,inputting target words,0.0,1.0,1.0,0.0,1.0,3,2


In [24]:
# Gather the retained keywords of each 'index' value into one list.
grouped = annotator_data.groupby('index')['keywords'].agg(list)
# Split every keyword phrase on whitespace into a tuple of words.
grouped = grouped.apply(lambda x: [tuple(word.split()) for word in x])
# The assignment aligns on index labels.
# NOTE(review): rows of `data` whose label has no entry in `grouped` presumably become NaN — confirm.
data['expected_result'] = grouped
data.head()

Unnamed: 0,data,pattern_webb,pattern_raharjana,pattern_johann,expected_result
0,The present invention relates to a method for ...,"[(give, problem), (correspond, word), (inputti...","[(give, problem, opportunity, deeply), (corres...","[(generate, vocabulary, problem), (accord, pre...","[(automatically, generating, vocabulary, probl..."
1,"The present disclosure is directed to methods,...","[(direct, method), (prepare, food), (prepare, ...","[(direct, method), (prepare, food, dish), (pre...","[(direct, method), (include, robotic, kitchen)...","[(provide, different, ways, to, prepare, food,..."
2,An anti-theft method for an oil tank truck bas...,"[(base, machine), (learn, comprises), (follow,...","[(base, machine), (learn, comprises), (follow,...","[(base, machine), (learn, comprises), (follow,...","[(acquiring, scene, pictures), (predicting, wh..."
3,The invention provides an artificial intellige...,"[(categorize, event), (say, ai), (say, ai), (c...","[(categorize, event), (say, ai, system), (say,...","[(provide, artificial, intelligence), (categor...","[(categorizing, events), (categorizing, events..."
4,Systems and methods for using machine learning...,"[(use, machine), (predict, risk), (pose, patie...","[(use, machine, learning, model), (predict, ri...","[(use, machine), (predict, risk), (pose, patie...","[(predict, risks), (access, a, set, of, real-w..."


In [29]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import string

# Make sure the tokenizer, stopword and WordNet resources are available locally
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# English stopword set and the module-level WordNet lemmatizer
stop_words = {*stopwords.words('english')}
lemmatizer = WordNetLemmatizer()

# Define preprocessing function
def preprocess(text):
    """Normalize keyword tuples: lowercase, strip punctuation, lemmatize.

    Every word of every tuple is lowercased, has its punctuation characters
    removed, and is tokenized with ``word_tokenize``. Each resulting token is
    POS-tagged on its own and lemmatized with the module-level ``lemmatizer``
    when the tag is an adjective, noun, adverb or verb; other tokens are kept
    unchanged. Stopwords are NOT removed.

    Args:
        text: A list (or tuple) of word tuples. Any other value, such as the
            NaN/None of a row that has no keywords, is treated as empty.

    Returns:
        A list of tuples of normalized words, one per input tuple; ``[]`` when
        ``text`` is not a list or tuple.
    """
    # First letter of the POS tag -> WordNet POS code. Adjective tags start
    # with 'J' (e.g. JJ) while WordNet's adjective code is 'a', so comparing
    # the tag letter directly against 'a' never lemmatized adjectives.
    tag_to_wordnet = {'j': 'a', 'n': 'n', 'r': 'r', 'v': 'v'}

    # Rows without keywords arrive as NaN/None rather than a list
    if not isinstance(text, (list, tuple)):
        return []

    preprocessed_tuples = []
    for tuple_ in text:
        preprocessed_words = []
        for word in tuple_:
            # Lowercase and drop punctuation characters
            cleaned = ''.join(char for char in word.lower() if char not in string.punctuation)

            for token in word_tokenize(cleaned):
                # Tag the token in isolation; only the tag's first letter matters
                treebank_tag = nltk.pos_tag([token])[0][1]
                wordnet_pos = tag_to_wordnet.get(treebank_tag[:1].lower())
                if wordnet_pos is None:
                    # Tag has no WordNet equivalent: keep the token as is
                    preprocessed_words.append(token)
                else:
                    preprocessed_words.append(lemmatizer.lemmatize(token, pos=wordnet_pos))

        preprocessed_tuples.append(tuple(preprocessed_words))

    return preprocessed_tuples

# Normalize the annotators' keyword tuples in the 'expected_result' column
data['expected_result'] = data['expected_result'].apply(preprocess)

print(data['expected_result'])


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\oktav\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\oktav\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\oktav\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


0     [(automatically, generate, vocabulary, problem...
1     [(provide, different, way, to, prepare, food, ...
2     [(acquire, scene, picture), (predict, whether)...
3     [(categorize, event), (categorize, event), (ca...
4     [(predict, risk), (access, a, set, of, realwor...
5     [(collect, audio, and, video, data), (output, ...
6     [(predict, and, provide, the, surface, roughne...
7     [(recommend, content), (identify,), (transmit,...
8     [(determine, to, which, social, group, a, pers...
9     [(detect, and, position, a, move, object, unde...
10    [(formulate, and, review), (coordinate,), (eva...
11    [(directly, supervise, and, coordinate, the, a...
12    [(advise, government, on, policy, matter), (ov...
13    [(perform, clinical, test, on, specimen), (obt...
14    [(study, and, ass), (develop, plan, and, solut...
15    [(determine, the, exact, position), (prepare, ...
16    [(conduct, research), (improve, or, develop, c...
17    [(provide, basic, dental, care, service), 

In [25]:
# Results: write the frame, now including 'expected_result', back to the workbook.

data.to_excel('data/keyword_extraction_eval_data.xlsx', index=False)