# README

### Purpose of this notebook
- Tokenize comments with different tokenizers, including:
    - CKIP
    - Articut
    - MONPA (under development)

## Steps
1. Read the comment dataframe.
2. Import the tokenizer module and tokenize the comments accordingly.
3. (Optional) Term frequency observation.

In [None]:
import pandas as pd
from importlib import reload

# Utility variable
import sys
sys.path.insert(0, '../..')

# utils
import utils.data as D
import utils.articut as A

## Read the comment data

In [None]:
# Load the comment dataframe through the project data helper (utils.data).
df_comments = D.read_df_comments()

In [None]:
# Keep the metadata columns plus the comment text as the base frame that the
# tokenizer outputs are attached to. The explicit .copy() makes this an
# independent DataFrame: new columns are assigned to it in later cells, and
# without the copy pandas raises SettingWithCopyWarning and the result would
# depend on view-vs-copy semantics of the column selection.
df_tokenized_comments = df_comments[
    ['year', 'id', 'group', 'committee_number', 'comment', 'comment_length']
].copy()

In [None]:
# df_tokenized_comments = D.read_df_tokenized_comments()
# df_tokenized_comments.head()

## Word segmentation, POS tagging, and NER with CKIP

In [None]:
from ckip_transformers.nlp import CkipWordSegmenter, CkipPosTagger, CkipNerChunker

In [None]:
# Base checkpoint name; each task-specific checkpoint name is the base name
# followed by "-ws", "-pos" or "-ner".
MODEL = 'ckiplab/albert-tiny-chinese'
WS_MODEL, POS_MODEL, NER_MODEL = (
    f"{MODEL}-{task}" for task in ("ws", "pos", "ner")
)

In [None]:
# Build the three CKIP drivers, one per task-specific checkpoint.
# Every driver gets the same level=3 / device=0 settings.
# NOTE(review): device=0 presumably selects the first GPU -- confirm against
# the installed ckip_transformers version before running on a CPU-only host.
ws_driver = CkipWordSegmenter(
    model_name=WS_MODEL,
    level=3,
    device=0,
)
pos_driver = CkipPosTagger(
    model_name=POS_MODEL,
    level=3,
    device=0,
)
ner_driver = CkipNerChunker(
    model_name=NER_MODEL,
    level=3,
    device=0,
)

In [None]:
# Inference batch size for each checkpoint this notebook has been run with.
_BATCH_SIZE_BY_MODEL = {
    'ckiplab/albert-tiny-chinese': 256,
    'ckiplab/albert-base-chinese': 128,
    'ckiplab/bert-base-chinese': 128,
}

# Always (re)assign BATCH_SIZE. A bare if/elif chain without a fallback leaves
# BATCH_SIZE undefined for an unlisted MODEL -- or, when the cell is re-run in
# a live kernel, silently keeps the value chosen for the previous model.
# Unlisted checkpoints fall back to the smallest listed batch size.
BATCH_SIZE = _BATCH_SIZE_BY_MODEL.get(MODEL, min(_BATCH_SIZE_BY_MODEL.values()))

In [None]:
# Plain Python list of the comment texts, in dataframe row order; this is the
# input handed to the CKIP drivers.
comments = df_comments['comment'].tolist()

In [None]:
# Word segmentation over the full comment list, batched with the
# model-specific BATCH_SIZE.
comments_ws = ws_driver(comments, batch_size=BATCH_SIZE)

In [None]:
# Attach the CKIP word-segmentation output as a new column.
df_tokenized_comments['ckip_comment_ws'] = comments_ws

In [None]:
# Part-of-speech tagging on the already-segmented comments.
# Uses the model-specific BATCH_SIZE, as the word-segmentation step does,
# instead of a hard-coded 128, so the batch size follows the MODEL selection.
comments_pos = pos_driver(comments_ws, batch_size=BATCH_SIZE)

In [None]:
# Attach the CKIP part-of-speech output as a new column.
df_tokenized_comments['ckip_comment_pos'] = comments_pos

In [None]:
# Named entity recognition on the raw (unsegmented) comment texts.
# Uses the model-specific BATCH_SIZE, as the word-segmentation step does,
# instead of a hard-coded 128, so the batch size follows the MODEL selection.
comment_ner = ner_driver(comments, batch_size=BATCH_SIZE)

In [None]:
# Attach the CKIP named-entity output as a new column.
df_tokenized_comments['ckip_comment_ner'] = comment_ner

In [None]:
# Preview the first rows to sanity-check the newly added CKIP columns.
df_tokenized_comments.head()

In [None]:
# Checkpoint the frame after the CKIP step, once per output format
# (CSV first, then pickle).
for _fmt in ('csv', 'pkl'):
    D.write_df_tokenized_comments(df_tokenized_comments, file=_fmt)

## Sentence Tokenization and POS tagging with Articut

### Articut lv2

In [None]:
# Run Articut at level "lv2" on every comment, with a progress bar.
# NOTE(review): progress_apply only exists on a Series after tqdm.pandas() has
# been registered; that call is not visible in this notebook, so presumably
# utils.articut does it on import -- confirm.
buf_lv2 = df_tokenized_comments['comment'].progress_apply(A.articut_cut, lv="lv2")

In [None]:
# Attach the Articut lv2 results as a new column.
df_tokenized_comments['articut_lv2'] = buf_lv2

In [None]:
# Checkpoint the frame after the Articut lv2 step, once per output format
# (CSV first, then pickle).
for _fmt in ('csv', 'pkl'):
    D.write_df_tokenized_comments(df_tokenized_comments, file=_fmt)

### Articut lv3

In [None]:
# Run Articut at level "lv3" on every comment, with a progress bar.
# NOTE(review): progress_apply only exists on a Series after tqdm.pandas() has
# been registered; that call is not visible in this notebook, so presumably
# utils.articut does it on import -- confirm.
buf_lv3 = df_tokenized_comments['comment'].progress_apply(A.articut_cut, lv="lv3")

In [None]:
# Attach the Articut lv3 results as a new column.
df_tokenized_comments['articut_lv3'] = buf_lv3

In [None]:
# Checkpoint the frame after the Articut lv3 step, once per output format
# (CSV first, then pickle).
for _fmt in ('csv', 'pkl'):
    D.write_df_tokenized_comments(df_tokenized_comments, file=_fmt)

### Articut wiki lv2

In [None]:
# Run Articut at level "lv2" with wikiDataBOOL=True on every comment, with a
# progress bar.
# NOTE(review): progress_apply only exists on a Series after tqdm.pandas() has
# been registered; that call is not visible in this notebook, so presumably
# utils.articut does it on import -- confirm.
buf_wiki_lv2 = df_tokenized_comments['comment'].progress_apply(
    A.articut_cut,
    lv="lv2",
    wikiDataBOOL=True,
)

In [None]:
# Attach the Articut lv2 (wikiDataBOOL=True) results as a new column.
df_tokenized_comments['articut_wiki_lv2'] = buf_wiki_lv2

In [None]:
# Checkpoint the frame after the Articut wiki lv2 step, once per output format
# (CSV first, then pickle).
for _fmt in ('csv', 'pkl'):
    D.write_df_tokenized_comments(df_tokenized_comments, file=_fmt)

### Articut wiki lv3

In [None]:
# Run Articut at level "lv3" with wikiDataBOOL=True on every comment, with a
# progress bar.
# NOTE(review): progress_apply only exists on a Series after tqdm.pandas() has
# been registered; that call is not visible in this notebook, so presumably
# utils.articut does it on import -- confirm.
buf_wiki_lv3 = df_tokenized_comments['comment'].progress_apply(
    A.articut_cut,
    lv="lv3",
    wikiDataBOOL=True,
)

In [None]:
# Attach the Articut lv3 (wikiDataBOOL=True) results as a new column.
df_tokenized_comments['articut_wiki_lv3'] = buf_wiki_lv3

In [None]:
# Checkpoint the frame after the Articut wiki lv3 step, once per output format
# (CSV first, then pickle).
for _fmt in ('csv', 'pkl'):
    D.write_df_tokenized_comments(df_tokenized_comments, file=_fmt)

In [None]:
# Display the frame with all tokenizer output columns attached.
df_tokenized_comments

## Term frequency observation

In [None]:
from collections import Counter

In [None]:
# POS-tag fragments of interest for the filtered term-frequency count.
pos_filter = [
    'UserDefined',
    'ENTITY_noun',
    'ENTITY_oov',
]

In [None]:
# Term-frequency counters over the Articut lv2 results, keyed by (text, pos)
# so the same surface form with different tags is counted separately:
#   cnt              -- every non-punctuation token
#   cnt_pos_filtered -- only tokens whose POS tag contains a `pos_filter` entry
cnt = Counter()
cnt_pos_filtered = Counter()

for res in df_tokenized_comments['articut_lv2']:
    # Skip results whose 'status' field is falsy; only results with a truthy
    # status are read for 'result_obj'.
    if not res['status']:
        continue

    # 'result_obj' is iterated as a list of sentences, each a list of token
    # dicts carrying 'text' and 'pos'.
    for sent_tokens in res['result_obj']:
        for token in sent_tokens:
            tag = token['pos']
            if tag == 'PUNCTUATION':
                continue

            key = (token['text'], tag)
            cnt[key] += 1

            # Substring match against the filter entries. any() counts a token
            # at most once even when several entries match its tag; a plain
            # loop over the entries would increment once per matching entry.
            if any(wanted in tag for wanted in pos_filter):
                cnt_pos_filtered[key] += 1

In [None]:
# Total number of counted (non-punctuation) tokens across all comments.
sum(cnt.values())

In [None]:
# Show the 3000 most frequent (text, pos) pairs, most frequent first.
cnt.most_common(3000)

In [None]:
# Show the 500 most frequent (text, pos) pairs among the POS-filtered tokens.
cnt_pos_filtered.most_common(500)