### Import Libraries

In [31]:
import pandas as pd
import string
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk.chunk import RegexpParser
from nltk import sent_tokenize, word_tokenize, pos_tag
nltk.download('averaged_perceptron_tagger')
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
string.punctuation

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

### Preprocessing

In [32]:
# read data

data = pd.read_csv('data/patent.csv')
data = data[["title", "abstract_localized.text"]]
data.head()

Unnamed: 0,title,abstract_localized.text
0,Fire detection system based on artificial inte...,The present disclosure relates to a fire detec...
1,,The present disclosure relates to a fire detec...
2,Embedding artificial intelligence for balancin...,Responsive to a CPU load of a specific access ...
3,,Responsive to a CPU load of a specific access ...
4,Optimization techniques for artificial intelli...,"Methods, apparatuses and computer readable med..."


In [33]:
# add . in the end of title

def add_period(text):
    if pd.notna(text):
        return text + ". "
    else:
        return text

# Apply the function to the 'title' column
data['title'] = data['title'].apply(add_period)

In [34]:
# to repair json data, wrap with [all data] and add coma at the end of each row, like try.json
# import json

# with open('data/try.json', 'r') as json_file:
#     data_json = json.load(json_file)

In [35]:
data['title_and_abstract'] = data['title'].fillna('') + data['abstract_localized.text']
data.head()

Unnamed: 0,title,abstract_localized.text,title_and_abstract
0,Fire detection system based on artificial inte...,The present disclosure relates to a fire detec...,Fire detection system based on artificial inte...
1,,The present disclosure relates to a fire detec...,The present disclosure relates to a fire detec...
2,Embedding artificial intelligence for balancin...,Responsive to a CPU load of a specific access ...,Embedding artificial intelligence for balancin...
3,,Responsive to a CPU load of a specific access ...,Responsive to a CPU load of a specific access ...
4,Optimization techniques for artificial intelli...,"Methods, apparatuses and computer readable med...",Optimization techniques for artificial intelli...


In [36]:
# check missing value

print(data.isna().sum())

title                      234
abstract_localized.text      0
title_and_abstract           0
dtype: int64


In [37]:
# # remove missing value

# data.dropna(inplace=True)
# data.head()

In [38]:
# # remove punctuation

# def remove_punctuation(text):
#     punctuationfree="".join([i for i in text if i not in string.punctuation])
#     return punctuationfree

# data['cleaned'] = data['title_and_abstract'].apply(lambda x:remove_punctuation(str(x)))
# data['cleaned'].head()

In [39]:
# lowering text

data['title_and_abstract']= data['title_and_abstract'].apply(lambda x: x.lower())
data['title_and_abstract'].head()

0    fire detection system based on artificial inte...
1    the present disclosure relates to a fire detec...
2    embedding artificial intelligence for balancin...
3    responsive to a cpu load of a specific access ...
4    optimization techniques for artificial intelli...
Name: title_and_abstract, dtype: object

In [40]:
# # tokenization

# data['tokenized'] = data['title_and_abstract'].apply(nltk.word_tokenize)
# data['clean_tokenized'] = data['cleaned'].apply(nltk.word_tokenize)
# data['tokenized'].head

In [41]:
# # Tokenize each sentence
# data['sentences'] = data['title_and_abstract'].apply(sent_tokenize)

# # Tokenize each word within each sentence
# data['tokenized'] = data['sentences'].apply(lambda sentences: [word_tokenize(sentence) for sentence in sentences])

# # Print the updated DataFrame
# print(data[['sentences', 'tokenized']])

In [42]:
# Function to remove punctuation and tokenize each sentence
def tokenize_and_remove_punctuation(text):
    # Tokenize each sentence
    sentences = sent_tokenize(text)
    
    # Remove punctuation and tokenize each word in each sentence
    tokenized_sentences = []
    for sentence in sentences:
        tokens = word_tokenize(sentence)
        # Remove punctuation
        tokens = [token for token in tokens if token not in string.punctuation]
        tokenized_sentences.append(tokens)
    
    return tokenized_sentences

# Tokenize each sentence and remove punctuation
data['tokenized'] = data['title_and_abstract'].apply(tokenize_and_remove_punctuation)

# Print the updated DataFrame
print(data['tokenized'][0])

[['fire', 'detection', 'system', 'based', 'on', 'artificial', 'intelligence', 'and', 'fire', 'detection', 'method', 'based', 'on', 'artificial', 'intelligence'], ['the', 'present', 'disclosure', 'relates', 'to', 'a', 'fire', 'detection', 'system', 'based', 'on', 'ai', 'and', 'a', 'fire', 'detection', 'method', 'based', 'on', 'ai', 'which', 'detects', 'fire', 'by', 'using', 'deep', 'learning'], ['the', 'fire', 'detection', 'system', 'includes', 'a', 'camera', 'part', 'generating', 'video', 'images', 'an', 'object', 'extraction', 'part', 'extracting', 'a', 'motion', 'object', 'from', 'the', 'video', 'images', 'a', 'video', 'image', 'conversion', 'part', 'generating', 'a', 'first', 'background', 'removal', 'image', 'an', 'image', 'division', 'part', 'dividing', 'the', 'first', 'background', 'removal', 'image', 'with', 'a', 'plurality', 'of', 'division', 'lines', 'and', 'an', 'analyzing', 'part', 'generating', 'an', 'abnormal', 'signal', 'and', 'a', 'normal', 'signal']]


In [43]:
# # remove stopwords

# stopwords = nltk.corpus.stopwords.words('english')

# def remove_stopwords(text):
#     output= [i for i in text if i not in stopwords]
#     return output

# data['clean_tokenized'] = data['clean_tokenized'].apply(lambda x:remove_stopwords(x))
# data['clean_tokenized'].head

### POS Tag

In [44]:
# data['tagged'] = data['tokenized'].apply(nltk.pos_tag)
# data['clean_tagged'] = data['clean_tokenized'].apply(nltk.pos_tag)
# data['tagged'].head()

In [45]:
# Function to perform POS tagging on each token
def pos_tag_tokens(tokenized_sentences):
    pos_tagged_sentences = []
    for sentence_tokens in tokenized_sentences:
        pos_tags = pos_tag(sentence_tokens)
        pos_tagged_sentences.append(pos_tags)
    return pos_tagged_sentences

# Perform POS tagging on tokenized sentences
data['pos_tagged'] = data['tokenized'].apply(pos_tag_tokens)

# Print the updated DataFrame
print(data['pos_tagged'][0])

[[('fire', 'NN'), ('detection', 'NN'), ('system', 'NN'), ('based', 'VBN'), ('on', 'IN'), ('artificial', 'JJ'), ('intelligence', 'NN'), ('and', 'CC'), ('fire', 'NN'), ('detection', 'NN'), ('method', 'NN'), ('based', 'VBN'), ('on', 'IN'), ('artificial', 'JJ'), ('intelligence', 'NN')], [('the', 'DT'), ('present', 'JJ'), ('disclosure', 'NN'), ('relates', 'VBZ'), ('to', 'TO'), ('a', 'DT'), ('fire', 'NN'), ('detection', 'NN'), ('system', 'NN'), ('based', 'VBN'), ('on', 'IN'), ('ai', 'NN'), ('and', 'CC'), ('a', 'DT'), ('fire', 'NN'), ('detection', 'NN'), ('method', 'NN'), ('based', 'VBN'), ('on', 'IN'), ('ai', 'NN'), ('which', 'WDT'), ('detects', 'VBZ'), ('fire', 'NN'), ('by', 'IN'), ('using', 'VBG'), ('deep', 'JJ'), ('learning', 'NN')], [('the', 'DT'), ('fire', 'NN'), ('detection', 'NN'), ('system', 'NN'), ('includes', 'VBZ'), ('a', 'DT'), ('camera', 'NN'), ('part', 'NN'), ('generating', 'VBG'), ('video', 'NN'), ('images', 'NNS'), ('an', 'DT'), ('object', 'JJ'), ('extraction', 'NN'), ('part'

In [46]:
# Function to lemmatize tokens while preserving the POS tags
def lemmatize_tokens(pos_tagged_sentences):
    lemmatizer = WordNetLemmatizer()
    lemmatized_sentences = []
    for sentence_tags in pos_tagged_sentences:
        lemmatized_tokens = [(lemmatizer.lemmatize(token, pos=get_wordnet_pos(pos_tag)), pos_tag) for token, pos_tag in sentence_tags]
        lemmatized_sentences.append(lemmatized_tokens)
    return lemmatized_sentences

# Function to map POS tags to WordNet POS tags
def get_wordnet_pos(treebank_tag):
    if treebank_tag.startswith('J'):
        return nltk.corpus.wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return nltk.corpus.wordnet.VERB
    elif treebank_tag.startswith('N'):
        return nltk.corpus.wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return nltk.corpus.wordnet.ADV
    else:
        return nltk.corpus.wordnet.NOUN  # Default to noun if POS tag not recognized

# Lemmatize the tokens in 'pos_tagged_sentences' column
data['lemmatized'] = data['pos_tagged'].apply(lemmatize_tokens)
print(data['lemmatized'][0])

[[('fire', 'NN'), ('detection', 'NN'), ('system', 'NN'), ('base', 'VBN'), ('on', 'IN'), ('artificial', 'JJ'), ('intelligence', 'NN'), ('and', 'CC'), ('fire', 'NN'), ('detection', 'NN'), ('method', 'NN'), ('base', 'VBN'), ('on', 'IN'), ('artificial', 'JJ'), ('intelligence', 'NN')], [('the', 'DT'), ('present', 'JJ'), ('disclosure', 'NN'), ('relate', 'VBZ'), ('to', 'TO'), ('a', 'DT'), ('fire', 'NN'), ('detection', 'NN'), ('system', 'NN'), ('base', 'VBN'), ('on', 'IN'), ('ai', 'NN'), ('and', 'CC'), ('a', 'DT'), ('fire', 'NN'), ('detection', 'NN'), ('method', 'NN'), ('base', 'VBN'), ('on', 'IN'), ('ai', 'NN'), ('which', 'WDT'), ('detect', 'VBZ'), ('fire', 'NN'), ('by', 'IN'), ('use', 'VBG'), ('deep', 'JJ'), ('learning', 'NN')], [('the', 'DT'), ('fire', 'NN'), ('detection', 'NN'), ('system', 'NN'), ('include', 'VBZ'), ('a', 'DT'), ('camera', 'NN'), ('part', 'NN'), ('generate', 'VBG'), ('video', 'NN'), ('image', 'NNS'), ('an', 'DT'), ('object', 'JJ'), ('extraction', 'NN'), ('part', 'NN'), ('e

In [47]:
# lemmatizer = WordNetLemmatizer()

# def lemmatize_tuples(tuples):
#     lemmatizer = WordNetLemmatizer()
#     lemmatized_tuples = []
#     for tup in tuples:
#         lemmatized_words = [lemmatizer.lemmatize(word) for word in tup]
#         lemmatized_tuples.append(tuple(lemmatized_words))
#     return lemmatized_tuples

# data['lemmatized'] = data['tagged'].apply(lemmatize_tuples)
# data['clean_lemmatized'] = data['clean_tagged'].apply(lemmatize_tuples)
# data['lemmatized'].head()

In [48]:
# def get_wordnet_pos(treebank_tag):
#     if treebank_tag.startswith('J'):
#         return wordnet.ADJ
#     elif treebank_tag.startswith('V'):
#         return wordnet.VERB
#     elif treebank_tag.startswith('N'):
#         return wordnet.NOUN
#     elif treebank_tag.startswith('R'):
#         return wordnet.ADV
#     else:
#         return wordnet.NOUN  # Default to noun if POS tag not recognized

# lemmatizer = WordNetLemmatizer()

# data['lemmatized'] = data['tagged'].apply(
#     lambda tagged_sentences: [(lemmatizer.lemmatize(word, get_wordnet_pos(tag)), tag) for word, tag in tagged_sentences])
# print(data['lemmatized'])

In [49]:
# import pandas as pd
# import spacy

# # Load spaCy language model
# nlp = spacy.load('en_core_web_sm')

# # Function to lemmatize a list of tagged tokens
# def lemmatize_tagged_sentences(tagged_sentences):
#     lemmatized_sentences = []
#     for token in tagged_sentences:
#         word, pos = token
#         lemmatized_word = nlp(word)[0].lemma_
#         lemmatized_sentences.append((lemmatized_word, pos))
#     return lemmatized_sentences

# # Lemmatize the tagged sentences in each row of the DataFrame
# data['lemmatized'] = data['tagged'].apply(lemmatize_tagged_sentences)

# print(data['lemmatized'])


### POS Chunk

In [50]:
# # lemmatization

# lemmatizer = WordNetLemmatizer()

# def lemmatize_word(word, tag):
#     if tag.startswith('V'):
#         return lemmatizer.lemmatize(word, 'v')  # Verb
#     elif tag.startswith('N'):
#         return lemmatizer.lemmatize(word, 'n')  # Noun
#     else:
#         return word

In [51]:
# # pos chunk lama

# def filter_verb_noun_pairs(tagged_sentence):
#     verb_noun_pairs = []
#     for i in range(len(tagged_sentence) - 1):
#         word, tag = tagged_sentence[i]
#         next_word, next_tag = tagged_sentence[i + 1]
#         if tag.startswith('VB') and next_tag.startswith('NN'):
#             verb = lemmatize_word(word, tag)
#             noun = lemmatize_word(next_word, next_tag)
#             verb_noun_pairs.append((verb, noun))
#     return verb_noun_pairs

# data['chunked_verb_noun'] = data['clean_lemmatized'].apply(filter_verb_noun_pairs)

In [52]:
# pos chunk lama

def extract_verb_noun_pairs(tagged_texts):
    chunks = []
    for tagged_sentence in tagged_texts:
        result = filter_verb_noun_pairs(tagged_sentence)
        chunks.append(result)
    chunks = [item for sublist in chunks for item in sublist]
    return chunks

def filter_verb_noun_pairs(tagged_sentence):
    verb_noun_pairs = []
    for i in range(len(tagged_sentence) - 1):
        word, tag = tagged_sentence[i]
        next_word, next_tag = tagged_sentence[i + 1]
        if tag.startswith('VB') and next_tag.startswith('NN'):
            verb = word
            noun = next_word
            verb_noun_pairs.append((verb, noun))
    return verb_noun_pairs

data['chunked_verb_noun'] = data['lemmatized'].apply(extract_verb_noun_pairs)
data['chunked_verb_noun']

0      [(detect, fire), (generate, video), (analyze, ...
1      [(detect, fire), (generate, video), (analyze, ...
2      [(balance, access), (process, load), (enable, ...
3                [(threshold, value), (determine, rssi)]
4      [(receive, annotation), (determine, document),...
                             ...                        
495                [(learn, model), (further, comprise)]
496    [(optimize, hyperparameter), (optimize, hyperp...
497    [(optimize, hyperparameter), (ai, model), (eva...
498    [(provide, system), (provide, ai), (compute, e...
499    [(provide, system), (provide, ai), (compute, e...
Name: chunked_verb_noun, Length: 500, dtype: object

In [53]:
# # pos chunk baru

# pattern = r'''Chunk: {(<ADJ|VB\w*>+<.|RP|IN|CC|PRP\w*>*<DET>*<NN\w*|RB\w*>+)+}'''
# chunk_parser = RegexpParser(pattern)

# def extract(tagged_text):
#     tree = chunk_parser.parse(tagged_text)
#     chunks = extract_chunks(tree)
#     return chunks

# def extract_chunks(tree):
#     chunks = []
    
#     if isinstance(tree, nltk.Tree):
#         if tree.label() != 'S':  # Exclude sentence-level chunks if any
#             chunks.append(tree)
#         for subtree in tree:
#             chunks.extend(extract_chunks(subtree))
    
#     return chunks

# def trees_to_tuples(tree_list):
#     tuple_list = [tuple(leaf[0] for leaf in tree.leaves()) for tree in tree_list]
#     return tuple_list

# data['chunked'] = data['lemmatized'].apply(extract)
# data['chunked'] = data['chunked'].apply(trees_to_tuples)

# data['clean_chunked'] = data['clean_lemmatized'].apply(extract)
# data['clean_chunked'] = data['clean_chunked'].apply(trees_to_tuples)

In [54]:
# Define the chunking pattern
pattern = r'''Chunk: {(<ADJ|VB\w*>+<\.|RP|IN|CC|PRP\w*>*<DT>*<NN\w*|RB\w*>+)+}'''
chunk_parser = RegexpParser(pattern)

def extract(tagged_texts):
    chunks = []
    for tagged_text in tagged_texts:
        tree = chunk_parser.parse(tagged_text)
        tree = extract_chunks(tree)
        tree = trees_to_tuples(tree)
        chunks.append(tree)
    chunks = [item for sublist in chunks for item in sublist]
    return chunks

def extract_chunks(tree):
    chunks = []
    
    if isinstance(tree, nltk.Tree):
        if tree.label() != 'S':  # Exclude sentence-level chunks if any
            chunks.append(tree)
        for subtree in tree:
            chunks.extend(extract_chunks(subtree))
    
    return chunks

def trees_to_tuples(tree_list):
    tuple_list = [tuple(leaf[0] for leaf in tree.leaves()) for tree in tree_list]
    return tuple_list

# Apply chunking and extraction to the 'tagged_sentences' column
data['chunked'] = data['lemmatized'].apply(extract)
# data['chunked'] = data['chunked'].apply(trees_to_tuples)

# Print the resulting DataFrame with chunked data
data['chunked']

0      [(base, on, ai), (base, on, ai), (detect, fire...
1      [(base, on, ai), (base, on, ai), (detect, fire...
2      [(balance, access, point, process, load), (ena...
3      [(threshold, value), (determine, rssi, value),...
4      [(comprise, select, from, a, pool), (be, annot...
                             ...                        
495    [(comprise, receive, from, a, client, device),...
496    [(optimize, hyperparameter, tuples), (optimize...
497    [(optimize, hyperparameter, tuples), (ai, mode...
498    [(be, provide, system), (provide, ai, system),...
499    [(be, provide, system), (provide, ai, system),...
Name: chunked, Length: 500, dtype: object

### Make the Result Better

In [55]:
# # lowering text

# data['msg_lower']= data['clean_msg'].apply(lambda x: x.lower())
# data['msg_lower'].head()

In [56]:
# def lowercase_tuples(tuples):
#     lowercased_tuples = []
#     for tup in tuples:
#         lowercased_words = tuple(word.lower() for word in tup)
#         lowercased_tuples.append(lowercased_words)
#     return lowercased_tuples

# data['chunked'] = data['chunked'].apply(lowercase_tuples)
# data['clean_chunked'] = data['clean_chunked'].apply(lowercase_tuples)

In [57]:
# def lowercase_lists(lists):
#     chunks = []
#     for list in lists:
#         result = lowercase_tuples(list)
#         chunks.append(result)
#     return chunks


# def lowercase_tuples(tuples):
#     lowercased_tuples = []
#     for tup in tuples:
#         lowercased_words = tuple(word.lower() for word in tup)
#         lowercased_tuples.append(lowercased_words)
#     return lowercased_tuples

# data['chunked'] = data['chunked'].apply(lowercase_tuples)
# data['chunked_verb_noun'] = data['chunked_verb_noun'].apply(lowercase_tuples)

In [58]:
# # remove stopwords

# stopwords = nltk.corpus.stopwords.words('english')

# def remove_stopwords(text):
#     output= [i for i in text if i not in stopwords]
#     return output

# data['no_stopwords']= data['msg_tokenized'].apply(lambda x:remove_stopwords(x))
# data['no_stopwords'].head

In [59]:
# # lemmatization

# lemmatizer = WordNetLemmatizer()

# def lemmatize_word(word, tag):
#     if tag.startswith('V'):
#         return lemmatizer.lemmatize(word, 'v')  # Verb
#     elif tag.startswith('N'):
#         return lemmatizer.lemmatize(word, 'n')  # Noun
#     else:
#         return word

### Export Result

In [60]:
# hasile

patent_task = data[["title_and_abstract", "chunked_verb_noun", "chunked"]]
patent_task.to_excel('export_result/chunk1.xlsx', index=False)