### Import Libraries

In [1]:
import re
import pandas as pd
import string
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.chunk import RegexpParser
from nltk import word_tokenize, pos_tag
from nltk.corpus import stopwords
# Fetch the NLTK resources this notebook relies on. Each resource is requested
# exactly once (the previous version requested 'stopwords' twice).
for nltk_resource in ('stopwords', 'averaged_perceptron_tagger', 'punkt', 'wordnet'):
    nltk.download(nltk_resource)

# Display the punctuation characters that the tokenization step filters on
string.punctuation

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\oktav\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\oktav\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\oktav\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\oktav\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\oktav\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

### Preprocessing

In [4]:
# Read the ISCO occupation table
data = pd.read_excel('data/isco - Copy.xlsx')

# Optional filter on the length of the "ISCO 08 Code" value (left disabled):
# data = data[data["ISCO 08 Code"].apply(lambda x: len(str(x)) > 3)]

# Keep only the text columns used by the rest of the notebook. The explicit
# .copy() guarantees that `data` is an independent DataFrame, so the column
# assignments made in later cells never act on (or warn about) a slice of
# the frame returned by read_excel.
data = data[["Title EN", "Definition", "Tasks include"]].copy()
data.head()

Unnamed: 0,Title EN,Definition,Tasks include
0,Commissioned Armed Forces Officers,Commissioned armed forces officers provide lea...,
1,Non-commissioned Armed Forces Officers,Non-commissioned armed forces officers enforce...,
2,"Armed Forces Occupations, Other Ranks","Armed forces occupations, other ranks include ...",


In [5]:
# Terminate a text value with ". " so that another text can be appended to it
def add_period(text):
    """Return `text` with ". " appended; missing values (None/NaN) are returned unchanged."""
    return text + ". " if pd.notna(text) else text

# Merge the 'Definition' and 'Tasks include' columns into one text column:
# the definition terminated with ". ", followed by the task list (a missing
# task list contributes an empty string).
# The terminated definition is computed on the fly rather than written back
# into 'Definition', so re-running this cell no longer appends another ". "
# to that column on every execution.
data['definition_and_task'] = (
    data['Definition'].apply(add_period) + data['Tasks include'].fillna('')
)

# Preview the updated DataFrame
data.head()

Unnamed: 0,Title EN,Definition,Tasks include,definition_and_task
0,Commissioned Armed Forces Officers,Commissioned armed forces officers provide lea...,,Commissioned armed forces officers provide lea...
1,Non-commissioned Armed Forces Officers,Non-commissioned armed forces officers enforce...,,Non-commissioned armed forces officers enforce...
2,"Armed Forces Occupations, Other Ranks","Armed forces occupations, other ranks include ...",,"Armed forces occupations, other ranks include ..."


In [4]:
# Count the missing values per column; as the last expression of the cell the
# result is displayed directly, so no print() is needed.

data.isna().sum()

Title EN               0
Definition             0
Tasks include          6
definition_and_task    0
dtype: int64


In [5]:
# # remove missing values (not needed because the title and abstract columns have no missing values)
# NOTE(review): the missing-value check in this notebook reports 6 missing
# 'Tasks include' entries, and the recorded output of this cell starts at
# index 3 - it looks like the dropna below was executed at some point.
# Confirm whether rows with missing values should be dropped.

# data.dropna(inplace=True)
# data.head()
data

Unnamed: 0,Title EN,Definition,Tasks include,definition_and_task
3,Legislators,"Legislators determine, formulate, and direct p...",Tasks include -\n(a) presiding over or partic...,"Legislators determine, formulate, and direct p..."
4,Senior Government Officials,Senior government officials advise governments...,"Tasks include -\n(a) advising national, state...",Senior government officials advise governments...
5,Traditional Chiefs and Heads of Villages,Traditional chiefs and heads of villages perfo...,Tasks include -\n(a) allocating the use of co...,Traditional chiefs and heads of villages perfo...
6,Senior Officials of Special-interest Organizat...,Senior officials of special-interest organizat...,Tasks include -\n(a) determining and formulat...,Senior officials of special-interest organizat...
8,Managing Directors and Chief Executives,Managing directors and chief executives formul...,"Tasks include -\n(a) planning, directing and ...",Managing directors and chief executives formul...
...,...,...,...,...
604,"Messengers, Package Deliverers and Luggage Por...","Messengers, package deliverers and luggage por...","Tasks include -\n(a) delivering messages, pac...","Messengers, package deliverers and luggage por..."
605,Odd Job Persons,"Odd job persons clean, paint and maintain buil...",Tasks include -\n(a) repairing broken windows...,"Odd job persons clean, paint and maintain buil..."
606,Meter Readers and Vending-machine Collectors,Meter readers and vending-machine collectors s...,Tasks include -\n(a) filling storage areas of...,Meter readers and vending-machine collectors s...
607,Water and Firewood Collectors,Water and firewood collectors collect water an...,Tasks include -\n(a) cutting and collecting w...,Water and firewood collectors collect water an...


In [6]:
# Lower-case the merged text with the vectorized string accessor. Unlike
# calling x.lower() on every element, a missing value stays missing instead
# of raising AttributeError.
data['lowered'] = data['definition_and_task'].str.lower()

# Preview the lower-cased text
data['lowered'].head()

0    commissioned armed forces officers provide lea...
1    non-commissioned armed forces officers enforce...
2    armed forces occupations, other ranks include ...
Name: lowered, dtype: object

In [8]:
# Function to split text into clauses, tokenize each clause into words, then remove punctuation and stopwords
def tokenize_and_remove_punctuation(text):
    """Split `text` into clauses and return the cleaned tokens of each clause.

    The text is split at whitespace that follows one of ``. ! ? , ;``. Every
    piece is tokenized with ``word_tokenize``; tokens that consist only of
    punctuation characters, or that are in NLTK's English stop-word list, are
    dropped.

    Parameters
    ----------
    text : str
        Text to process. Stop words are compared verbatim, and this notebook
        passes lower-cased text.

    Returns
    -------
    list of list of str
        One token list per clause, in order of appearance. A clause with no
        remaining tokens yields an empty list.
    """
    stop_words = set(stopwords.words('english'))
    punctuation_chars = set(string.punctuation)

    # Split into clauses: whitespace preceded by . ! ? , or ;
    sentences = re.split(r'(?<=[.!?,;])\s+', text)

    tokenized_sentences = []
    for sentence in sentences:
        tokens = [
            token
            for token in word_tokenize(sentence)
            # Drop tokens made up solely of punctuation characters. The
            # previous test, `token not in string.punctuation`, was a
            # substring check and therefore kept multi-character punctuation
            # tokens such as '--' or '...'.
            if not all(char in punctuation_chars for char in token)
            and token not in stop_words
        ]
        tokenized_sentences.append(tokens)
    return tokenized_sentences

# Tokenize the lower-cased text of every row and remove punctuation & stopwords
data['tokenized'] = data['lowered'].apply(tokenize_and_remove_punctuation)

# Print the updated DataFrame
# print(data['tokenized'][3])

### POS Tag

In [9]:
# Function to perform POS tagging on each token
def pos_tag_tokens(tokenized_sentences):
    """POS-tag every token list in `tokenized_sentences`.

    Parameters
    ----------
    tokenized_sentences : list of list of str
        One token list per sentence/clause.

    Returns
    -------
    list of list of (str, str)
        For each input token list, the list of (token, tag) pairs.
    """
    # pos_tag_sents tags all sentences in a single call and yields the same
    # tags as calling pos_tag on each sentence separately, without the
    # per-call tagger set-up cost.
    return nltk.pos_tag_sents(tokenized_sentences)

# Perform POS tagging on tokenized sentences
data['pos_tagged'] = data['tokenized'].apply(pos_tag_tokens)

# Print the updated DataFrame
# print(data['pos_tagged'][3])

In [10]:
# Function to lemmatize tokens
def lemmatize_tokens(pos_tagged_sentences):
    """Lemmatize every (token, tag) pair, keeping the original tag.

    Takes a list of sentences, each a list of (token, tag) pairs, and returns
    the same structure with each token replaced by its WordNet lemma. The tag
    is translated with `get_wordnet_pos` to pick the lemmatizer's part of
    speech.
    """
    wordnet_lemmatizer = WordNetLemmatizer()
    lemmatized_sentences = []
    for tagged_sentence in pos_tagged_sentences:
        lemma_pairs = []
        for word, tag in tagged_sentence:
            lemma = wordnet_lemmatizer.lemmatize(word, pos=get_wordnet_pos(tag))
            lemma_pairs.append((lemma, tag))
        lemmatized_sentences.append(lemma_pairs)
    return lemmatized_sentences

# Function to map POS tags to WordNet POS tags
def get_wordnet_pos(treebank_tag):
    """Return the WordNet part-of-speech constant for a POS tag.

    The decision is made on the tag's first letter: J -> adjective,
    V -> verb, N -> noun, R -> adverb. Any other tag defaults to noun.
    """
    wordnet = nltk.corpus.wordnet
    first_letter_to_pos = {
        'J': wordnet.ADJ,
        'V': wordnet.VERB,
        'N': wordnet.NOUN,
        'R': wordnet.ADV,
    }
    # Default to noun if the POS tag is not recognized
    return first_letter_to_pos.get(treebank_tag[:1], wordnet.NOUN)

# Lemmatize the tokens in the 'pos_tagged' column
data['lemmatized'] = data['pos_tagged'].apply(lemmatize_tokens)

# Print the updated DataFrame
# print(data['lemmatized'][3])

### POS Chunk

In [11]:
# POS chunk new pattern (Mr. Indra)

# Define the chunk grammars. Each one is a single-rule grammar string that is
# handed to RegexpParser by extract(); matches are labelled "Chunk" and every
# rule starts with a verb tag (<VB\w*>).
# NOTE(review): the suffixes (raharjana, webb, johann) presumably name the
# sources the patterns were taken from - confirm and add citations.
#
# pattern_raharjana: one or more verbs, then any number of <.>, IN, CC or
#   PRP* tags, then any number of determiners, then one or more noun or RB tags.
#   NOTE(review): "<.>" looks like a match for single-character tags - confirm intent.
# pattern_webb: a verb immediately followed by a noun.
# pattern_johann: a verb followed by one of: noun; PRP* + noun; noun + noun;
#   JJ* + noun; DT + noun; noun + IN + noun; IN + JJ* + noun; PRP* + JJ* + noun.
pattern_raharjana = r'''Chunk: {<VB\w*>+<.|IN|CC|PRP\w*>*<DT>*<NN\w*|RB>+}'''
pattern_webb = r'''Chunk: {<VB\w*><NN\w*>}'''
pattern_johann = r'''Chunk: {<VB\w*><NN\w*>|<VB\w*><PRP\w*><NN\w*>|<VB\w*><NN\w*><NN\w*>|<VB\w*><JJ\w*><NN\w*>|<VB\w*><DT><NN\w*>|<VB\w*><NN\w*><IN><NN\w*>|<VB\w*><IN><JJ\w*><NN\w*>|<VB\w*><PRP\w*><JJ\w*><NN\w*>}'''

def extract(tagged_texts, pattern):
    """Chunk every tagged sentence with `pattern` and return all chunks found.

    Parameters
    ----------
    tagged_texts : list of list of (str, str)
        Sentences given as lists of (token, tag) pairs.
    pattern : str
        Chunk grammar passed to ``RegexpParser``.

    Returns
    -------
    list of tuple of str
        One tuple of tokens per chunk, flattened across all sentences and
        kept in order of appearance.
    """
    chunk_parser = RegexpParser(pattern)
    chunks = []
    for tagged_text in tagged_texts:
        # An empty sentence cannot contain a chunk. Skipping it leaves the
        # result unchanged and avoids NLTK's "Warning: parsing empty text"
        # message being printed for every such sentence.
        if not tagged_text:
            continue
        parsed_tree = chunk_parser.parse(tagged_text)
        chunks.extend(trees_to_tuples(extract_chunks(parsed_tree)))
    return chunks

def extract_chunks(tree):
    """Collect, in pre-order, every subtree of `tree` whose label is not 'S'.

    Anything that is not an ``nltk.Tree`` (e.g. a leaf) yields an empty list.
    """
    if not isinstance(tree, nltk.Tree):
        return []

    # The sentence-level node itself is not a chunk, but its children may be
    found = [] if tree.label() == 'S' else [tree]
    for child in tree:
        found += extract_chunks(child)
    return found

def trees_to_tuples(tree_list):
    """Turn each tree into a tuple holding the first element of each of its leaves."""
    tuple_list = []
    for tree in tree_list:
        words = [leaf[0] for leaf in tree.leaves()]
        tuple_list.append(tuple(words))
    return tuple_list

# Apply chunking and extraction to the 'lemmatized' column, once per pattern;
# each result is stored in a column named after its pattern
chunk_patterns = {
    'pattern_webb': pattern_webb,
    'pattern_raharjana': pattern_raharjana,
    'pattern_johann': pattern_johann,
}
for column_name, chunk_pattern in chunk_patterns.items():
    data[column_name] = data['lemmatized'].apply(lambda tagged: extract(tagged, chunk_pattern))

# Show the chunks found with the Webb pattern
data['pattern_webb']



0    [(provide, leadership), (arm, force), (perform...
1    [(arm, force), (arm, force), (perform, variety...
2     [(arm, force), (arm, force), (perform, variety)]
Name: pattern_webb, dtype: object

### Export Result

In [13]:
# the results

# Optional: export only a subset of the columns (left disabled)
# patent_task = data[["Title EN", "definition_and_task", "pattern_webb", "pattern_raharjana", "pattern_johann"]]
# Write every column of `data` to an Excel file, without the index column
data.to_excel('data/keyword_extraction_isco - Copy.xlsx', index=False)