### Import Libraries

In [3]:
import re
import pandas as pd
import string
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.chunk import RegexpParser
from nltk import word_tokenize, pos_tag
nltk.download('averaged_perceptron_tagger')
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
string.punctuation

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

### Preprocessing

In [4]:
# read data (data yang digunakan adalah data sample 500 baris agar pemrosesannya tidak lama)

data = pd.read_excel('data/new_paten.xlsx')
data = data[["title", "abstract"]]
data.head()

Unnamed: 0,title,abstract
0,A kind of artificial intelligence skin and its...,The invention discloses a kind of artificial i...
1,A kind of image-recognizing method based on na...,The invention discloses a kind of image-recogn...
2,The pedestrian detection method merged based o...,The present invention relates to a kind of ped...
3,A kind of Knowledge Base based on natural lang...,The invention discloses a kind of Knowledge Ba...
4,Text emotion analysis method and device,The invention discloses a kind of text emotion...


In [5]:
# add . in the end of title
def add_period(text):
    if pd.notna(text):
        return text + ". "
    else:
        return text

# Apply the function to the 'title' column
data['title'] = data['title'].apply(add_period)

# merge title and abstract column
data['title_and_abstract'] = data['title'].fillna('') + data['abstract']

# Print the updated DataFrame
data.head()

Unnamed: 0,title,abstract,title_and_abstract
0,A kind of artificial intelligence skin and its...,The invention discloses a kind of artificial i...,A kind of artificial intelligence skin and its...
1,A kind of image-recognizing method based on na...,The invention discloses a kind of image-recogn...,A kind of image-recognizing method based on na...
2,The pedestrian detection method merged based o...,The present invention relates to a kind of ped...,The pedestrian detection method merged based o...
3,A kind of Knowledge Base based on natural lang...,The invention discloses a kind of Knowledge Ba...,A kind of Knowledge Base based on natural lang...
4,Text emotion analysis method and device.,The invention discloses a kind of text emotion...,Text emotion analysis method and device. The i...


In [6]:
# check missing value

print(data.isna().sum())

title                 0
abstract              0
title_and_abstract    0
dtype: int64


In [7]:
# # remove missing value (tidak perlu karena kolom title and abstract tidak memiliki missing value)

# data.dropna(inplace=True)
# data.head()

In [8]:
# lowering text
data['title_and_abstract']= data['title_and_abstract'].apply(lambda x: x.lower())

# Print the updated DataFrame
data['title_and_abstract'].head()

0    a kind of artificial intelligence skin and its...
1    a kind of image-recognizing method based on na...
2    the pedestrian detection method merged based o...
3    a kind of knowledge base based on natural lang...
4    text emotion analysis method and device. the i...
Name: title_and_abstract, dtype: object

In [9]:
# Function to tokenize each sentence, tokenize each word from sentence, then remove stopwords
def tokenize_and_remove_punctuation(text):
    # Tokenize each sentence
    sentences = re.split(r'(?<=[.!?,;])\s+', text)
    
    # Remove punctuation and tokenize each word in each sentence
    tokenized_sentences = []
    for sentence in sentences:
        tokens = word_tokenize(sentence)
        # Remove punctuation
        tokens = [token for token in tokens if token not in string.punctuation]
        tokenized_sentences.append(tokens)
    
    return tokenized_sentences

# Tokenize each sentence and remove punctuation
data['tokenized'] = data['title_and_abstract'].apply(tokenize_and_remove_punctuation)

# Print the updated DataFrame
print(data['tokenized'][0])

[['a', 'kind', 'of', 'artificial', 'intelligence', 'skin', 'and', 'its', 'method', 'for', 'detection', 'humiture', 'and', 'pressure'], ['the', 'invention', 'discloses', 'a', 'kind', 'of', 'artificial', 'intelligence', 'skin'], ['it', 'includes', 'signal', 'acquisition', 'part', 'and', 'circuit', 'part'], ['signal', 'acquisition', 'part', 'point', 'includes', 'humidity', 'sensitive', 'material', 'layer'], ['first', 'electrode', 'layer'], ['insulating', 'layer'], ['the', 'second', 'electrode', 'lay'], ['thermistor', 'material', 'layer'], ['the', '3rd', 'electrode', 'layer'], ['piezoelectric', 'material', 'layer', 'and', 'the', '4th', 'electrode', 'layer', 'set', 'gradually', 'from', 'top', 'to', 'bottom'], ['and', 'circuit', 'part', 'includes', 'front-end', 'circuit', 'and', 'back-end', 'circuit.the', 'invention', 'also', 'discloses', 'the', 'method', 'using', 'above-mentioned', 'artificial', 'intelligence', 'skin', 'detection', 'humiture', 'and', 'pressure.the', 'configuration', 'of', '

### POS Tag

In [10]:
# Function to perform POS tagging on each token
def pos_tag_tokens(tokenized_sentences):
    pos_tagged_sentences = []
    for sentence_tokens in tokenized_sentences:
        pos_tags = pos_tag(sentence_tokens)
        pos_tagged_sentences.append(pos_tags)
    return pos_tagged_sentences

# Perform POS tagging on tokenized sentences
data['pos_tagged'] = data['tokenized'].apply(pos_tag_tokens)

# Print the updated DataFrame
print(data['pos_tagged'][0])

[[('a', 'DT'), ('kind', 'NN'), ('of', 'IN'), ('artificial', 'JJ'), ('intelligence', 'NN'), ('skin', 'NN'), ('and', 'CC'), ('its', 'PRP$'), ('method', 'NN'), ('for', 'IN'), ('detection', 'NN'), ('humiture', 'NN'), ('and', 'CC'), ('pressure', 'NN')], [('the', 'DT'), ('invention', 'NN'), ('discloses', 'VBZ'), ('a', 'DT'), ('kind', 'NN'), ('of', 'IN'), ('artificial', 'JJ'), ('intelligence', 'NN'), ('skin', 'NN')], [('it', 'PRP'), ('includes', 'VBZ'), ('signal', 'JJ'), ('acquisition', 'NN'), ('part', 'NN'), ('and', 'CC'), ('circuit', 'NN'), ('part', 'NN')], [('signal', 'JJ'), ('acquisition', 'NN'), ('part', 'NN'), ('point', 'NN'), ('includes', 'VBZ'), ('humidity', 'NN'), ('sensitive', 'JJ'), ('material', 'NN'), ('layer', 'NN')], [('first', 'RB'), ('electrode', 'VBZ'), ('layer', 'NN')], [('insulating', 'VBG'), ('layer', 'NN')], [('the', 'DT'), ('second', 'JJ'), ('electrode', 'NN'), ('lay', 'VBD')], [('thermistor', 'NN'), ('material', 'NN'), ('layer', 'NN')], [('the', 'DT'), ('3rd', 'CD'), ('

In [11]:
# Function to lemmatize tokens
def lemmatize_tokens(pos_tagged_sentences):
    lemmatizer = WordNetLemmatizer()
    lemmatized_sentences = []
    for sentence_tags in pos_tagged_sentences:
        lemmatized_tokens = [(lemmatizer.lemmatize(token, pos=get_wordnet_pos(pos_tag)), pos_tag) for token, pos_tag in sentence_tags]
        lemmatized_sentences.append(lemmatized_tokens)
    return lemmatized_sentences

# Function to map POS tags to WordNet POS tags
def get_wordnet_pos(treebank_tag):
    if treebank_tag.startswith('J'):
        return nltk.corpus.wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return nltk.corpus.wordnet.VERB
    elif treebank_tag.startswith('N'):
        return nltk.corpus.wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return nltk.corpus.wordnet.ADV
    else:
        return nltk.corpus.wordnet.NOUN  # Default to noun if POS tag not recognized

# Lemmatize the tokens in 'pos_tagged_sentences' column
data['lemmatized'] = data['pos_tagged'].apply(lemmatize_tokens)

# Print the updated DataFrame
print(data['lemmatized'][0])

[[('a', 'DT'), ('kind', 'NN'), ('of', 'IN'), ('artificial', 'JJ'), ('intelligence', 'NN'), ('skin', 'NN'), ('and', 'CC'), ('it', 'PRP$'), ('method', 'NN'), ('for', 'IN'), ('detection', 'NN'), ('humiture', 'NN'), ('and', 'CC'), ('pressure', 'NN')], [('the', 'DT'), ('invention', 'NN'), ('disclose', 'VBZ'), ('a', 'DT'), ('kind', 'NN'), ('of', 'IN'), ('artificial', 'JJ'), ('intelligence', 'NN'), ('skin', 'NN')], [('it', 'PRP'), ('include', 'VBZ'), ('signal', 'JJ'), ('acquisition', 'NN'), ('part', 'NN'), ('and', 'CC'), ('circuit', 'NN'), ('part', 'NN')], [('signal', 'JJ'), ('acquisition', 'NN'), ('part', 'NN'), ('point', 'NN'), ('include', 'VBZ'), ('humidity', 'NN'), ('sensitive', 'JJ'), ('material', 'NN'), ('layer', 'NN')], [('first', 'RB'), ('electrode', 'VBZ'), ('layer', 'NN')], [('insulate', 'VBG'), ('layer', 'NN')], [('the', 'DT'), ('second', 'JJ'), ('electrode', 'NN'), ('lay', 'VBD')], [('thermistor', 'NN'), ('material', 'NN'), ('layer', 'NN')], [('the', 'DT'), ('3rd', 'CD'), ('electr

### POS Chunk

In [12]:
# pos chunk verb noun pairs

def extract_verb_noun_pairs(tagged_texts):
    chunks = []
    for tagged_sentence in tagged_texts:
        result = filter_verb_noun_pairs(tagged_sentence)
        chunks.append(result)
    chunks = [item for sublist in chunks for item in sublist]
    return chunks

def filter_verb_noun_pairs(tagged_sentence):
    verb_noun_pairs = []
    for i in range(len(tagged_sentence) - 1):
        word, tag = tagged_sentence[i]
        next_word, next_tag = tagged_sentence[i + 1]
        if tag.startswith('VB') and next_tag.startswith('NN'):
            verb = word
            noun = next_word
            verb_noun_pairs.append((verb, noun))
    return verb_noun_pairs

# Apply chunking and extraction to the 'tagged_sentences' column
data['chunked_verb_noun'] = data['lemmatized'].apply(extract_verb_noun_pairs)

# Print the resulting DataFrame with chunked data
data['chunked_verb_noun']

0         [(include, humidity), (electrode, layer), (ins...
1         [(distribute, weight), (establish, information...
2                    [(train, stage), (reside, background)]
3         [(find, father), (define, sub-topicses), (perf...
4         [(enhance, sentiment), (save, cost；sentiment),...
                                ...                        
138780    [(improve, performance), (recognise, traffic),...
138781    [(acquire, surface), (derive, correction), (ca...
138782                                                   []
138783    [(predict, method), (predict, method), (lst, f...
138784    [(microwave, source), (microwave, radiation), ...
Name: chunked_verb_noun, Length: 138785, dtype: object

In [13]:
# POS chunk new pattern (Pak Indra)

# define pattern
pattern = r'''Chunk: {(<ADJ|VB\w*>+<\.|RP|IN|CC|PRP\w*>*<DT>*<NN\w*|RB\w*>+)+}'''
chunk_parser = RegexpParser(pattern)

def extract(tagged_texts):
    chunks = []
    for tagged_text in tagged_texts:
        tree = chunk_parser.parse(tagged_text)
        tree = extract_chunks(tree)
        tree = trees_to_tuples(tree)
        chunks.append(tree)
    chunks = [item for sublist in chunks for item in sublist]
    return chunks

def extract_chunks(tree):
    chunks = []
    
    if isinstance(tree, nltk.Tree):
        if tree.label() != 'S':  # Exclude sentence-level chunks if any
            chunks.append(tree)
        for subtree in tree:
            chunks.extend(extract_chunks(subtree))
    
    return chunks

def trees_to_tuples(tree_list):
    tuple_list = [tuple(leaf[0] for leaf in tree.leaves()) for tree in tree_list]
    return tuple_list

# Apply chunking and extraction to the 'tagged_sentences' column
data['chunked_new_pattern'] = data['lemmatized'].apply(extract)

# Print the resulting DataFrame with chunked data
data['chunked_new_pattern']



0         [(disclose, a, kind), (include, humidity), (el...
1         [(disclose, a, kind), (be, distribute, weight)...
2         [(train, stage), (scene, simultaneously, mark,...
3         [(disclose, a, kind), (be, put, into, default,...
4         [(disclose, a, kind), (dictionary；based, on, t...
                                ...                        
138780    [(improve, performance), (recognise, traffic, ...
138781    [(provide, a, method), (be, perform, by, a, co...
138782    [(comprise, a, head, mask, structure), (protec...
138783    [(predict, method), (base, on, machine, learni...
138784    [(microwave, source), (microwave, radiation, s...
Name: chunked_new_pattern, Length: 138785, dtype: object

### Export Result

In [14]:
# hasile

patent_task = data[["title_and_abstract", "chunked_verb_noun", "chunked_new_pattern"]]
patent_task.to_excel('export_result/paten_task_new.xlsx', index=False)