### Import Libraries

In [1]:
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk.chunk import RegexpParser
from nltk import word_tokenize, pos_tag
nltk.download('averaged_perceptron_tagger')
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
import pandas as pd
import string
string.punctuation

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

### Preprocessing

In [2]:
# read data

data = pd.read_csv('data/patent.csv')
data = data[["title", "abstract_localized.text"]]
data.head()

Unnamed: 0,title,abstract_localized.text
0,Fire detection system based on artificial inte...,The present disclosure relates to a fire detec...
1,,The present disclosure relates to a fire detec...
2,Embedding artificial intelligence for balancin...,Responsive to a CPU load of a specific access ...
3,,Responsive to a CPU load of a specific access ...
4,Optimization techniques for artificial intelli...,"Methods, apparatuses and computer readable med..."


In [3]:
# to repair json data, wrap with [all data] and add coma at the end of each row, like try.json
# import json

# with open('data/try.json', 'r') as json_file:
#     data_json = json.load(json_file)

In [4]:
data['title_and_abstract'] = data['title'].fillna('') + ' ' + data['abstract_localized.text']
data.head()

Unnamed: 0,title,abstract_localized.text,title_and_abstract
0,Fire detection system based on artificial inte...,The present disclosure relates to a fire detec...,Fire detection system based on artificial inte...
1,,The present disclosure relates to a fire detec...,The present disclosure relates to a fire dete...
2,Embedding artificial intelligence for balancin...,Responsive to a CPU load of a specific access ...,Embedding artificial intelligence for balancin...
3,,Responsive to a CPU load of a specific access ...,Responsive to a CPU load of a specific access...
4,Optimization techniques for artificial intelli...,"Methods, apparatuses and computer readable med...",Optimization techniques for artificial intelli...


In [5]:
# check missing value

print(data.isna().sum())

title                      234
abstract_localized.text      0
title_and_abstract           0
dtype: int64


In [6]:
# # remove missing value

# data.dropna(inplace=True)
# data.head()

In [7]:
# remove punctuation

def remove_punctuation(text):
    punctuationfree="".join([i for i in text if i not in string.punctuation])
    return punctuationfree

data['cleaned'] = data['title_and_abstract'].apply(lambda x:remove_punctuation(str(x)))
data['cleaned'].head()

0    Fire detection system based on artificial inte...
1     The present disclosure relates to a fire dete...
2    Embedding artificial intelligence for balancin...
3     Responsive to a CPU load of a specific access...
4    Optimization techniques for artificial intelli...
Name: cleaned, dtype: object

In [8]:
# tokenization

data['tokenized'] = data['title_and_abstract'].apply(nltk.word_tokenize)
data['clean_tokenized'] = data['cleaned'].apply(nltk.word_tokenize)
data['tokenized'].head

<bound method NDFrame.head of 0      [Fire, detection, system, based, on, artificia...
1      [The, present, disclosure, relates, to, a, fir...
2      [Embedding, artificial, intelligence, for, bal...
3      [Responsive, to, a, CPU, load, of, a, specific...
4      [Optimization, techniques, for, artificial, in...
                             ...                        
495    [In, an, intelligent, system, for, providing, ...
496    [Method, and, server, for, optimizing, hyperpa...
497    [A, method, and, server, for, optimizing, hype...
498    [Fallback, artificial, intelligence, system, f...
499    [There, are, provided, systems, and, methods, ...
Name: tokenized, Length: 500, dtype: object>

In [9]:
# remove stopwords

stopwords = nltk.corpus.stopwords.words('english')

def remove_stopwords(text):
    output= [i for i in text if i not in stopwords]
    return output

data['clean_tokenized'] = data['clean_tokenized'].apply(lambda x:remove_stopwords(x))
data['clean_tokenized'].head

<bound method NDFrame.head of 0      [Fire, detection, system, based, artificial, i...
1      [The, present, disclosure, relates, fire, dete...
2      [Embedding, artificial, intelligence, balancin...
3      [Responsive, CPU, load, specific, access, poin...
4      [Optimization, techniques, artificial, intelli...
                             ...                        
495    [In, intelligent, system, providing, recording...
496    [Method, server, optimizing, hyperparameter, t...
497    [A, method, server, optimizing, hyperparameter...
498    [Fallback, artificial, intelligence, system, r...
499    [There, provided, systems, methods, fallback, ...
Name: clean_tokenized, Length: 500, dtype: object>

### POS Tag

In [10]:
data['tagged'] = data['tokenized'].apply(nltk.pos_tag)
data['clean_tagged'] = data['clean_tokenized'].apply(nltk.pos_tag)
data['tagged'].head()

0    [(Fire, NNP), (detection, NN), (system, NN), (...
1    [(The, DT), (present, JJ), (disclosure, NN), (...
2    [(Embedding, VBG), (artificial, JJ), (intellig...
3    [(Responsive, JJ), (to, TO), (a, DT), (CPU, NN...
4    [(Optimization, NN), (techniques, NNS), (for, ...
Name: tagged, dtype: object

In [24]:
# lemmatizer = WordNetLemmatizer()

# def lemmatize_tuples(tuples):
#     lemmatizer = WordNetLemmatizer()
#     lemmatized_tuples = []
#     for tup in tuples:
#         lemmatized_words = [lemmatizer.lemmatize(word) for word in tup]
#         lemmatized_tuples.append(tuple(lemmatized_words))
#     return lemmatized_tuples

# data['lemmatized'] = data['tagged'].apply(lemmatize_tuples)
# data['clean_lemmatized'] = data['clean_tagged'].apply(lemmatize_tuples)
# data['lemmatized'].head()

In [27]:
# def get_wordnet_pos(treebank_tag):
#     if treebank_tag.startswith('J'):
#         return wordnet.ADJ
#     elif treebank_tag.startswith('V'):
#         return wordnet.VERB
#     elif treebank_tag.startswith('N'):
#         return wordnet.NOUN
#     elif treebank_tag.startswith('R'):
#         return wordnet.ADV
#     else:
#         return wordnet.NOUN  # Default to noun if POS tag not recognized

# lemmatizer = WordNetLemmatizer()

# data['lemmatized'] = data['tagged'].apply(
#     lambda tagged_sentences: [(lemmatizer.lemmatize(word, get_wordnet_pos(tag)), tag) for word, tag in tagged_sentences])
# print(data['lemmatized'])

0      [(Fire, NNP), (detection, NN), (system, NN), (...
1      [(The, DT), (present, JJ), (disclosure, NN), (...
2      [(Embedding, VBG), (artificial, JJ), (intellig...
3      [(Responsive, JJ), (to, TO), (a, DT), (CPU, NN...
4      [(Optimization, NN), (technique, NNS), (for, I...
                             ...                        
495    [(In, IN), (an, DT), (intelligent, NN), (syste...
496    [(Method, NNP), (and, CC), (server, NN), (for,...
497    [(A, DT), (method, NN), (and, CC), (server, NN...
498    [(Fallback, NNP), (artificial, JJ), (intellige...
499    [(There, EX), (be, VBP), (provide, VBN), (syst...
Name: lemmatized, Length: 500, dtype: object


In [30]:
# import pandas as pd
# import spacy

# # Load spaCy language model
# nlp = spacy.load('en_core_web_sm')

# # Function to lemmatize a list of tagged tokens
# def lemmatize_tagged_sentences(tagged_sentences):
#     lemmatized_sentences = []
#     for token in tagged_sentences:
#         word, pos = token
#         lemmatized_word = nlp(word)[0].lemma_
#         lemmatized_sentences.append((lemmatized_word, pos))
#     return lemmatized_sentences

# # Lemmatize the tagged sentences in each row of the DataFrame
# data['lemmatized'] = data['tagged'].apply(lemmatize_tagged_sentences)

# print(data['lemmatized'])


### POS Chunk

In [12]:
# # lemmatization

# lemmatizer = WordNetLemmatizer()

# def lemmatize_word(word, tag):
#     if tag.startswith('V'):
#         return lemmatizer.lemmatize(word, 'v')  # Verb
#     elif tag.startswith('N'):
#         return lemmatizer.lemmatize(word, 'n')  # Noun
#     else:
#         return word

In [13]:
# # pos chunk lama

# def filter_verb_noun_pairs(tagged_sentence):
#     verb_noun_pairs = []
#     for i in range(len(tagged_sentence) - 1):
#         word, tag = tagged_sentence[i]
#         next_word, next_tag = tagged_sentence[i + 1]
#         if tag.startswith('VB') and next_tag.startswith('NN'):
#             verb = lemmatize_word(word, tag)
#             noun = lemmatize_word(next_word, next_tag)
#             verb_noun_pairs.append((verb, noun))
#     return verb_noun_pairs

# data['chunked_verb_noun'] = data['clean_lemmatized'].apply(filter_verb_noun_pairs)

In [14]:
# pos chunk baru

pattern = r'''Chunk: {(<ADJ|VB\w*>+<.|RP|IN|CC|PRP\w*>*<DET>*<NN\w*|RB\w*>+)+}'''
chunk_parser = RegexpParser(pattern)

def extract(tagged_text):
    tree = chunk_parser.parse(tagged_text)
    chunks = extract_chunks(tree)
    return chunks

def extract_chunks(tree):
    chunks = []
    
    if isinstance(tree, nltk.Tree):
        if tree.label() != 'S':  # Exclude sentence-level chunks if any
            chunks.append(tree)
        for subtree in tree:
            chunks.extend(extract_chunks(subtree))
    
    return chunks

def trees_to_tuples(tree_list):
    tuple_list = [tuple(leaf[0] for leaf in tree.leaves()) for tree in tree_list]
    return tuple_list

data['chunked'] = data['lemmatized'].apply(extract)
data['chunked'] = data['chunked'].apply(trees_to_tuples)

data['clean_chunked'] = data['clean_lemmatized'].apply(extract)
data['clean_chunked'] = data['clean_chunked'].apply(trees_to_tuples)

### Make the Result Better

In [15]:
# # lowering text

# data['msg_lower']= data['clean_msg'].apply(lambda x: x.lower())
# data['msg_lower'].head()

In [16]:
def lowercase_tuples(tuples):
    lowercased_tuples = []
    for tup in tuples:
        lowercased_words = tuple(word.lower() for word in tup)
        lowercased_tuples.append(lowercased_words)
    return lowercased_tuples

data['chunked'] = data['chunked'].apply(lowercase_tuples)
data['clean_chunked'] = data['clean_chunked'].apply(lowercase_tuples)

In [17]:
# # remove stopwords

# stopwords = nltk.corpus.stopwords.words('english')

# def remove_stopwords(text):
#     output= [i for i in text if i not in stopwords]
#     return output

# data['no_stopwords']= data['msg_tokenized'].apply(lambda x:remove_stopwords(x))
# data['no_stopwords'].head

In [18]:
# # lemmatization

# lemmatizer = WordNetLemmatizer()

# def lemmatize_word(word, tag):
#     if tag.startswith('V'):
#         return lemmatizer.lemmatize(word, 'v')  # Verb
#     elif tag.startswith('N'):
#         return lemmatizer.lemmatize(word, 'n')  # Noun
#     else:
#         return word

### Export Result

In [19]:
# hasile

patent_task = data[["title_and_abstract", 'chunked_verb_noun', "chunked", 'clean_chunked']]
patent_task.to_excel('export_result/chunk.xlsx', index=False)

KeyError: "['chunked_verb_noun'] not in index"