### Import Libraries

In [None]:
import re
import ast
import pandas as pd
import string
import nltk
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from nltk.stem import WordNetLemmatizer
from nltk.chunk import RegexpParser
from nltk import word_tokenize, pos_tag
from nltk.corpus import stopwords
# Fetch the NLTK resources this notebook relies on. Each resource is requested
# exactly once (the original cell asked for 'stopwords' twice).
for resource in ('stopwords', 'averaged_perceptron_tagger', 'punkt', 'wordnet'):
    nltk.download(resource)

# Display the punctuation characters used by the punctuation filter further down.
string.punctuation

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...


'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [None]:
# Read the patent dataset from the Excel workbook into a DataFrame.
# Later cells take their text from its 'title_and_abstract' column.

data = pd.read_excel('patent.xlsx')
# Preview the first rows to confirm the file was read as expected.
data.head()

Unnamed: 0,title_and_abstract
0,Recognizing typewritten and handwritten charac...
1,Aspect ratio conversion with machine learning....
2,Indoor scene structural estimation system and ...
3,"Soft robotic actuators for positioning, packag..."
4,System elasticity recovery method and system b...


### Preprocessing

In [None]:
# Count the missing values in every column before any text processing.

print(data.isna().sum())

title_and_abstract    0
dtype: int64


In [None]:
# # remove missing values (not needed: the title_and_abstract column has no missing values)

# data.dropna(inplace=True)
# data.head()

In [None]:
# Lower-case the text with pandas' vectorised string accessor. Unlike a
# per-row lambda, it passes missing values through instead of raising.
data['title_and_abstract'] = data['title_and_abstract'].str.lower()

# Show the first rows of the lower-cased column
data['title_and_abstract'].head()

0    recognizing typewritten and handwritten charac...
1    aspect ratio conversion with machine learning....
2    indoor scene structural estimation system and ...
3    soft robotic actuators for positioning, packag...
4    system elasticity recovery method and system b...
Name: title_and_abstract, dtype: object

In [None]:
# Split a text into sentences, tokenize each sentence, then drop punctuation and stop words
_PUNCTUATION_CHARS = frozenset(string.punctuation)
_STOP_WORD_CACHE = {}


def _english_stop_words():
    """Return NLTK's English stop words, reading the corpus only on first use."""
    if 'english' not in _STOP_WORD_CACHE:
        _STOP_WORD_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOP_WORD_CACHE['english']


def tokenize_and_remove_punctuation(text, stop_words=None):
    """Tokenize ``text`` sentence by sentence and clean the tokens.

    Args:
        text: The string to process.
        stop_words: Optional collection of words to discard. When omitted, the
            NLTK English stop-word list is used (loaded once and cached, rather
            than re-read for every row the function is applied to).

    Returns:
        A list with one entry per sentence; each entry is the list of tokens
        left after removing punctuation-only tokens and stop words.
    """
    if stop_words is None:
        stop_words = _english_stop_words()

    # A sentence ends at '.', '!', '?' or ';' followed by whitespace.
    sentences = re.split(r'(?<=[.!?;])\s+', text)

    tokenized_sentences = []
    for sentence in sentences:
        tokens = [
            token
            for token in word_tokenize(sentence)
            # Drop tokens made up solely of punctuation characters. The former
            # `token not in string.punctuation` was a substring test, so
            # multi-character tokens such as "''", "``" or "..." survived it.
            if not _PUNCTUATION_CHARS.issuperset(token)
            and token not in stop_words
        ]
        tokenized_sentences.append(tokens)
    return tokenized_sentences

# Build the 'tokenized' column: one list of cleaned tokens per sentence of each text
data['tokenized'] = data['title_and_abstract'].map(tokenize_and_remove_punctuation)

# Preview the DataFrame with the new column
data.head()

Unnamed: 0,title_and_abstract,tokenized
0,recognizing typewritten and handwritten charac...,"[[recognizing, typewritten, handwritten, chara..."
1,aspect ratio conversion with machine learning....,"[[aspect, ratio, conversion, machine, learning..."
2,indoor scene structural estimation system and ...,"[[indoor, scene, structural, estimation, syste..."
3,"soft robotic actuators for positioning, packag...","[[soft, robotic, actuators, positioning, packa..."
4,system elasticity recovery method and system b...,"[[system, elasticity, recovery, method, system..."


### POS Tag

In [None]:
# # Function to perform POS tagging on each token
# def pos_tag_tokens(tokenized_sentences):
#     pos_tagged_sentences = []
#     for sentence_tokens in tokenized_sentences:
#         pos_tags = pos_tag(sentence_tokens)
#         pos_tagged_sentences.append(pos_tags)
#     return pos_tagged_sentences

# # Perform POS tagging on tokenized sentences
# data['pos_tagged'] = data['tokenized'].apply(pos_tag_tokens)

# # Print the updated DataFrame
# data.head()

In [None]:
import spacy

# Load spaCy's small English pipeline. The tagging helper below reads each
# token's fine-grained tag (`token.tag_`), which uses Penn Treebank-style
# labels (e.g. VBG, JJ, NN in the output of this cell).
nlp = spacy.load("en_core_web_sm")

# Function to perform POS tagging on each token using spaCy
def pos_tag_spacy(text):
    """Return ``(token, tag)`` pairs for one tokenised sentence.

    ``text`` is a list of token strings. The tokens are joined with single
    spaces and parsed by the module-level ``nlp`` pipeline, so the tokens in
    the result are the ones spaCy produces for that joined string.
    """
    sentence = " ".join(text)
    tagged = []
    for tok in nlp(sentence):
        tagged.append((tok.text, tok.tag_))
    return tagged

# Tag every sentence of every document; the nesting (document -> sentence -> token) is kept
data['pos_tagged_spacy'] = data['tokenized'].map(
    lambda sentences: list(map(pos_tag_spacy, sentences))
)
data.head()


Unnamed: 0,title_and_abstract,tokenized,pos_tagged_spacy
0,recognizing typewritten and handwritten charac...,"[[recognizing, typewritten, handwritten, chara...","[[(recognizing, VBG), (typewritten, JJ), (hand..."
1,aspect ratio conversion with machine learning....,"[[aspect, ratio, conversion, machine, learning...","[[(aspect, NNP), (ratio, NN), (conversion, NN)..."
2,indoor scene structural estimation system and ...,"[[indoor, scene, structural, estimation, syste...","[[(indoor, JJ), (scene, NN), (structural, JJ),..."
3,"soft robotic actuators for positioning, packag...","[[soft, robotic, actuators, positioning, packa...","[[(soft, JJ), (robotic, JJ), (actuators, NNS),..."
4,system elasticity recovery method and system b...,"[[system, elasticity, recovery, method, system...","[[(system, NN), (elasticity, NN), (recovery, N..."


In [None]:
# Lemmatize every (token, tag) pair, sentence by sentence
def lemmatize_tokens(pos_tagged_sentences):
    """Replace each token by its WordNet lemma while keeping its POS tag.

    Args:
        pos_tagged_sentences: A list of sentences, each a list of
            ``(token, tag)`` pairs.

    Returns:
        A list with the same nesting whose pairs are ``(lemma, tag)``; the tag
        is passed through ``get_wordnet_pos`` to choose the lemmatizer's ``pos``.
    """
    wnl = WordNetLemmatizer()

    def _lemmatize_sentence(tagged_sentence):
        pairs = []
        for word, tag in tagged_sentence:
            lemma = wnl.lemmatize(word, pos=get_wordnet_pos(tag))
            pairs.append((lemma, tag))
        return pairs

    return [_lemmatize_sentence(sentence) for sentence in pos_tagged_sentences]

# Translate a treebank-style POS tag into the matching WordNet POS constant
def get_wordnet_pos(treebank_tag):
    """Map ``treebank_tag`` to a WordNet part-of-speech constant.

    The decision is made on the tag's first letter: ``J`` -> adjective,
    ``V`` -> verb, ``N`` -> noun, ``R`` -> adverb. Any other tag falls back
    to noun.
    """
    wordnet = nltk.corpus.wordnet
    prefix_to_pos = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, wordnet_pos in prefix_to_pos:
        if treebank_tag.startswith(prefix):
            return wordnet_pos
    return wordnet.NOUN  # unrecognised tags are treated as nouns

# Lemmatize the tagged tokens of the 'pos_tagged_spacy' column
data['lemmatized'] = data['pos_tagged_spacy'].map(lemmatize_tokens)

# Preview the DataFrame with the new column
data.head()

Unnamed: 0,title_and_abstract,tokenized,pos_tagged_spacy,lemmatized
0,recognizing typewritten and handwritten charac...,"[[recognizing, typewritten, handwritten, chara...","[[(recognizing, VBG), (typewritten, JJ), (hand...","[[(recognize, VBG), (typewritten, JJ), (handwr..."
1,aspect ratio conversion with machine learning....,"[[aspect, ratio, conversion, machine, learning...","[[(aspect, NNP), (ratio, NN), (conversion, NN)...","[[(aspect, NNP), (ratio, NN), (conversion, NN)..."
2,indoor scene structural estimation system and ...,"[[indoor, scene, structural, estimation, syste...","[[(indoor, JJ), (scene, NN), (structural, JJ),...","[[(indoor, JJ), (scene, NN), (structural, JJ),..."
3,"soft robotic actuators for positioning, packag...","[[soft, robotic, actuators, positioning, packa...","[[(soft, JJ), (robotic, JJ), (actuators, NNS),...","[[(soft, JJ), (robotic, JJ), (actuator, NNS), ..."
4,system elasticity recovery method and system b...,"[[system, elasticity, recovery, method, system...","[[(system, NN), (elasticity, NN), (recovery, N...","[[(system, NN), (elasticity, NN), (recovery, N..."


### POS Chunk

In [None]:
# Chunk grammars for nltk's RegexpParser. Each one labels its matches "Chunk"
# and starts with a verb tag (VB*). The variable names presumably refer to the
# sources the patterns were taken from -- TODO confirm.
# NOTE(review): `.` inside a tag pattern presumably matches single-character
# (punctuation) tags -- confirm against NLTK's tag-pattern rules.

# One or more verbs, then any number of `.`/IN/CC/PRP* tags, then any number of
# determiners, then one or more NN* or RB tags.
pattern_siahaan = r'''Chunk: {<VB\w*>+<.|IN|CC|PRP\w*>*<DT>*<NN\w*|RB>+}'''
# Exactly one verb immediately followed by one noun.
pattern_webb = r'''Chunk: {<VB\w*><NN\w*>}'''
# Alternation of eight fixed verb-led sequences (verb + noun, verb + pronoun +
# noun, verb + noun + noun, verb + adjective + noun, ...).
pattern_johann = r'''Chunk: {<VB\w*><NN\w*>|<VB\w*><PRP\w*><NN\w*>|<VB\w*><NN\w*><NN\w*>|<VB\w*><JJ\w*><NN\w*>|<VB\w*><DT><NN\w*>|<VB\w*><NN\w*><IN><NN\w*>|<VB\w*><IN><JJ\w*><NN\w*>|<VB\w*><PRP\w*><JJ\w*><NN\w*>}'''
# One or more verbs, any number of PRP*/JJ*/DT/IN/CC/`.` tags, then one or more nouns.
new_pattern = r'''Chunk: {<VB\w*>+<PRP\w*|JJ\w*|DT|IN|CC|.>*<NN\w*>+}'''

def extract(tagged_texts, pattern):
    """Collect every chunk that ``pattern`` matches in a document.

    Args:
        tagged_texts: A list of sentences, each a list of ``(token, tag)`` pairs.
        pattern: A chunk grammar string for ``RegexpParser``.

    Returns:
        One flat list with the chunks of all sentences, each chunk being a
        list of ``(token, tag)`` pairs.
    """
    parser = RegexpParser(pattern)
    found = []
    for tagged_sentence in tagged_texts:
        parsed = parser.parse(tagged_sentence)
        # Extend instead of append so the per-sentence lists are flattened as we go.
        found.extend(trees_to_tuples(extract_chunks(parsed)))
    return found

def extract_chunks(tree):
    """Return every labelled subtree of ``tree`` except those labelled ``'S'``.

    The tree is walked depth-first, parents before their children. Anything
    that is not an ``nltk.Tree`` (for example a leaf) contributes nothing.
    """
    if not isinstance(tree, nltk.Tree):
        return []

    # The sentence-level node itself is not a chunk, but its children may be.
    collected = [] if tree.label() == 'S' else [tree]
    for child in tree:
        collected += extract_chunks(child)
    return collected

def trees_to_tuples(tree_list):
    """Convert each tree into the list of its leaves as ``(token, tag)`` pairs.

    Only the first two items of every leaf are kept, so the result is one
    list of 2-tuples per tree, in the order of ``tree_list``.
    """
    converted = []
    for tree in tree_list:
        pairs = []
        for leaf in tree.leaves():
            pairs.append((leaf[0], leaf[1]))
        converted.append(pairs)
    return converted

# Run every chunk grammar over the 'lemmatized' column, one result column per grammar
for col_name, grammar in (
    ('pattern_webb', pattern_webb),
    ('pattern_siahaan', pattern_siahaan),
    ('pattern_johann', pattern_johann),
    ('new_pattern', new_pattern),
):
    # Bind the grammar as a default argument so each lambda keeps its own pattern.
    data[col_name] = data['lemmatized'].apply(
        lambda sentences, grammar=grammar: extract(sentences, grammar)
    )

# Show the chunks found by the Webb pattern
data['pattern_webb']



0         [[(use, VBG), (end, NN)], [(disclose, VBN), (h...
1         [[(convert, VBG), (image, NN)], [(predict, VBG...
2         [[(include, VBZ), (2d, NNP)], [(receive, VBZ),...
3         [[(position, VBG), (packaging, NN)], [(relate,...
4         [[(disclose, VBZ), (system, NN)], [(follow, VB...
                                ...                        
110264    [[(provide, VBZ), (translation, NN)], [(link, ...
110265    [[(curve, VBN), (surface, NN)], [(climb, VBG),...
110266    [[(include, VBP), (device, NN)], [(comprise, V...
110267    [[(mount, VBG), (seat, NN)], [(mount, VBG), (s...
110268    [[(teach, VBG), (training, NN)], [(teach, VBG)...
Name: pattern_webb, Length: 110269, dtype: object

In [None]:
# Words that mark a chunk as a generic, uninformative match
common_mistake = {'ai', 'artificial', 'intelligence', 'system', 'model', 'method', 'invention'}

def filter_tuples(tuple_list):
    """Drop every chunk that contains one of the ``common_mistake`` words.

    Args:
        tuple_list: A list of chunks. The items of a chunk are either
            ``(word, tag)`` pairs (what ``trees_to_tuples`` produces) or plain
            word strings.

    Returns:
        The chunks in which no word belongs to ``common_mistake``.

    The previous test, ``keyword in tup``, compared each keyword with whole
    ``(word, tag)`` pairs, so it never matched and nothing was filtered.
    """
    def _word(item):
        # Accept both chunk layouts: a bare word, or a (word, tag) pair.
        return item if isinstance(item, str) else item[0]

    return [
        chunk
        for chunk in tuple_list
        if not any(_word(item) in common_mistake for item in chunk)
    ]

# Filter the generic chunks out of every pattern column
for col_name in ('pattern_webb', 'pattern_johann', 'pattern_siahaan', 'new_pattern'):
    data[col_name] = data[col_name].apply(filter_tuples)

### Visualize the Result

In [None]:
# Load the saved extraction results.
# NOTE(review): this file name differs from the one written by the export cell
# ('extraction_result_patent.xlsx') -- confirm which file is intended.

patent_task = pd.read_excel('keyword_extraction_result_patent.xlsx')

# The pattern columns are parsed from their text form back into Python lists.
for col_name in ('pattern_siahaan', 'pattern_webb', 'pattern_johann', 'new_pattern'):
    patent_task[col_name] = patent_task[col_name].apply(ast.literal_eval)

patent_task.head()

Unnamed: 0,title_and_abstract,tokenized,pos_tagged_spacy,lemmatized,pattern_webb,pattern_siahaan,pattern_johann,new_pattern
0,recognizing typewritten and handwritten charac...,"[['recognizing', 'typewritten', 'handwritten',...","[[('recognizing', 'VBG'), ('typewritten', 'JJ'...","[[('recognize', 'VBG'), ('typewritten', 'JJ'),...","[[(use, VBG), (end, NN)], [(disclose, VBN), (h...","[[(use, VBG), (end, NN)], [(disclose, VBN), (h...","[[(use, VBG), (end, NN)], [(disclose, VBN), (h...","[[(recognize, VBG), (typewritten, JJ), (handwr..."
1,aspect ratio conversion with machine learning....,"[['aspect', 'ratio', 'conversion', 'machine', ...","[[('aspect', 'NNP'), ('ratio', 'NN'), ('conver...","[[('aspect', 'NNP'), ('ratio', 'NN'), ('conver...","[[(convert, VBG), (image, NN)], [(predict, VBG...","[[(disclose, VBD), (convert, VBG), (image, NN)...","[[(convert, VBG), (image, NN)], [(predict, VBG...","[[(disclose, VBD), (convert, VBG), (image, NN)..."
2,indoor scene structural estimation system and ...,"[['indoor', 'scene', 'structural', 'estimation...","[[('indoor', 'JJ'), ('scene', 'NN'), ('structu...","[[('indoor', 'JJ'), ('scene', 'NN'), ('structu...","[[(include, VBZ), (2d, NNP)], [(receive, VBZ),...","[[(include, VBZ), (2d, NNP), (encoder, NN), (2...","[[(base, VBN), (deep, JJ), (learning, NN)], [(...","[[(base, VBN), (deep, JJ), (learning, NN), (ne..."
3,"soft robotic actuators for positioning, packag...","[['soft', 'robotic', 'actuators', 'positioning...","[[('soft', 'JJ'), ('robotic', 'JJ'), ('actuato...","[[('soft', 'JJ'), ('robotic', 'JJ'), ('actuato...","[[(position, VBG), (packaging, NN)], [(relate,...","[[(position, VBG), (packaging, NN), (assemblin...","[[(position, VBG), (packaging, NN)], [(relate,...","[[(position, VBG), (packaging, NN), (assemblin..."
4,system elasticity recovery method and system b...,"[['system', 'elasticity', 'recovery', 'method'...","[[('system', 'NN'), ('elasticity', 'NN'), ('re...","[[('system', 'NN'), ('elasticity', 'NN'), ('re...","[[(disclose, VBZ), (system, NN)], [(follow, VB...","[[(disclose, VBZ), (system, NN), (elasticity, ...","[[(improve, VBN), (genetic, JJ), (algorithm, N...","[[(base, VBN), (improve, VBN), (genetic, JJ), ..."


In [None]:
# Flatten each pattern column (one list of chunks per patent) into a single
# list of chunks. A plain comprehension yields the same list as
# `.apply(pd.Series).stack().tolist()` without building a wide intermediate
# DataFrame for every column.

ai_task_siahaan = [chunk for chunks in patent_task['pattern_siahaan'] for chunk in chunks]
ai_task_webb = [chunk for chunks in patent_task['pattern_webb'] for chunk in chunks]
ai_task_johann = [chunk for chunks in patent_task['pattern_johann'] for chunk in chunks]
ai_task_new_pattern = [chunk for chunks in patent_task['new_pattern'] for chunk in chunks]

In [None]:
# Wordcloud AI Abilities Siahaan

# Each chunk is a list of (word, tag) pairs, so keep only the word of every
# pair. Joining the pairs themselves raised
# "TypeError: sequence item 0: expected str instance, tuple found".
words = [
    item if isinstance(item, str) else item[0]
    for chunk in ai_task_siahaan
    for item in chunk
]

# Generate the word cloud
wordcloud = WordCloud(width=800, height=400, background_color='white').generate(' '.join(words))

# Display the word cloud using matplotlib
plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.title('AI abilities - Siahaan pattern')
plt.show()


TypeError: sequence item 0: expected str instance, tuple found

In [None]:
# Wordcloud AI Abilities Johann

# Each chunk is a list of (word, tag) pairs, so keep only the word of every
# pair; joining the pairs themselves fails with a TypeError in str.join.
words = [
    item if isinstance(item, str) else item[0]
    for chunk in ai_task_johann
    for item in chunk
]

# Generate the word cloud
wordcloud = WordCloud(width=800, height=400, background_color='white').generate(' '.join(words))

# Display the word cloud using matplotlib
plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.title('AI abilities - Johann pattern')
plt.show()


In [None]:
# Wordcloud AI Abilities Webb

# Each chunk is a list of (word, tag) pairs, so keep only the word of every
# pair; joining the pairs themselves fails with a TypeError in str.join.
words = [
    item if isinstance(item, str) else item[0]
    for chunk in ai_task_webb
    for item in chunk
]

# Generate the word cloud
wordcloud = WordCloud(width=800, height=400, background_color='white').generate(' '.join(words))

# Display the word cloud using matplotlib
plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.title('AI abilities - Webb pattern')
plt.show()


In [None]:
# Wordcloud AI Abilities new_pattern

# Each chunk is a list of (word, tag) pairs, so keep only the word of every
# pair; joining the pairs themselves fails with a TypeError in str.join.
words = [
    item if isinstance(item, str) else item[0]
    for chunk in ai_task_new_pattern
    for item in chunk
]

# Generate the word cloud
wordcloud = WordCloud(width=800, height=400, background_color='white').generate(' '.join(words))

# Display the word cloud using matplotlib
plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.title('AI abilities - new pattern')
plt.show()


### Export Result

In [None]:
# The result

# patent_task = data[["title_and_abstract", "pattern_webb", "pattern_siahaan", "pattern_johann"]]
# Write every column of `data` to an Excel workbook, leaving out the index.
data.to_excel('extraction_result_patent.xlsx', index=False)