# README

### Purpose of this notebook
- Split comments into sentences.

## Steps
1. Finely split sentence with punctuation.
2. Use BERT next sentence prediction to concatenate sentence back (a bottom-up approach).
3. Perform EDA to observe the split results.

In [1]:
import pandas as pd

import re
import os
import json
from importlib import reload

from itertools import chain
from collections import defaultdict
import matplotlib.pyplot as plt

from tqdm import tqdm
tqdm.pandas(desc="progress: ")

# Utility variable
import sys
sys.path.insert(0, '../..')

# var
import var.var as V

# utils
import utils.articut as A
import utils.data as D
import utils.io as IO
import utils.preprocess as PP

In [2]:
with open("../../var/articut_dict.json") as f:
    keyword_dict = json.load(f)

## Read the dataframe

In [3]:
df_comments = D.read_df_comments()

In [None]:
df_comments.grade.value_counts()

In [None]:
df_comments['comment_length'] = df_comments['comment'].apply(
    lambda s: len(s) if not PP.is_empty_sent(s) else np.NaN
)

## Sentences Preprocess

### Split sentence with Chinese and english punctuation
- Can not use nltk to tokenize Chinese sentences

In [7]:
import string
from zhon import hanzi # Chinese text processing package

In [8]:
hanzi.non_stops

'＂＃＄％＆＇（）＊＋，－／：；＜＝＞＠［＼］＾＿｀｛｜｝～｟｠｢｣､\u3000、〃〈〉《》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏﹑﹔·'

In [9]:
hanzi.stops

'！？｡。'

In [10]:
V.EN_PUNC_STOPS

'!;?.'

In [11]:
V.NO_DIGIT_SURROUNDING_PERIOD

'(?<=[^\\d])(\\.)(?=[^\\d])'

In [12]:
re_split_punc = "[！？｡。，、；,!;?\n]|(?<=\D)\.(?= )"
re_split_punc

'[！？｡。，、；,!;?\n]|(?<=\\D)\\.(?= )'

In [13]:
def split_sentence(s):
    s = '\n' + str(s)
    
    ## remove grade comment
    p = '|'.join(PP.GRADE_COMMENT).replace('(', '\(').replace(')', '\)')
    s = re.sub(p, '', s)
    ## remove time token
    p = '20\d\d[年]*|10[1-9][ ]*[年|年度|學年|學年度]*|11\d[ ]*[年|年度|學年|學年度]*'
    s = re.sub(p, '', s)
    ## remove competition rank
    ch_number = "一二三四五六七八九十"
    p = '第[{}\d]+名|{}'.format(ch_number, '|'.join(keyword_dict['prize']))
    s = re.sub(p, '', s)
    ## remove ordinal hint
    p = '第[{}\d]+屆'.format(ch_number)
    s = re.sub(p, '', s)
    ## Add \n spliter before number bullet
    p = '((?<!\d)\d+\.(?!\d)|★)'
    s = re.sub(p, r'\n',s)
    ## replace english comma surrounded by Chinese characters with Chinese comma
    re_ch_p = '[{}]'.format(hanzi.characters + hanzi.punctuation)
    p = "(?<={}),|,(?={})".format(re_ch_p, re_ch_p)
    s = re.sub(p, '，', s)
    ## replace english period surrounded by Chinese characters with Chinese period
    p = "(?<={})\.(?=\D)|(?<=\D)\.(?={})".format(re_ch_p, re_ch_p)
    s = re.sub(p, '。', s)
    
    ## split sentence with punctuation
    punc_list = re.findall(re_split_punc, s.strip())
    sent = re.split(re_split_punc, s.strip())

    ## combine split sentence with punctuation
    buf = []
    for i in range(len(sent)):
        try:
            buf.append(sent[i] + punc_list[i])
        except:
            ## end of the list
            buf.append(sent[i])
    sent = buf
    
    ## split whitespace between chinese character (except for english sentence)
    sent = list(chain.from_iterable([split_whitespace(_s) for _s in sent]))
    ## remove empty string
    sent = [_s for _s in sent if not PP.is_empty_sent(_s)]
    ## remove preceeding punctuation
    sent = [_s if _s[0] not in (hanzi.non_stops + "。" + V.EN_PUNC_NON_STOPS) else _s[1:] for _s in sent]
    sent = [_s.strip() for _s in sent]
    sent = [_s for _s in sent if not PP.is_empty_sent(_s)]
    ## remove trailing punctuation if it is none stop punctuation
    sent = [_s if _s[-1] not in (hanzi.non_stops + "。.;" + V.EN_PUNC_NON_STOPS) else _s[:-1] for _s in sent]
    sent = [_s.strip() for _s in sent]
    sent = [_s for _s in sent if not PP.is_empty_sent(_s)]
    
    return sent

def split_whitespace(s):
    ## remove multiple whitespaces
    s = re.sub('\s{2,}', ' ', s)
    
    ## All Chinese characters and punctuations
    re_ch_p = '[{}]'.format(hanzi.characters + hanzi.punctuation)
    ## Whitespaces between Chinese characters and punctuations
    ws_btn_ch = '(?<={})\s(?={})'.format(re_ch_p, re_ch_p)
    
    sent = re.split(ws_btn_ch, s)
    
    return sent

### Testing

In [14]:
"# The content is removed due to confidential concerns."

### Split comments

In [None]:
df_comments.head()

In [34]:
df_comments['split_comment'] = df_comments['comment'].apply(split_sentence)

In [None]:
for _, row in df_comments.sample(100, random_state=42).iterrows():
    _comment = row['comment']
    _split_comment = row['split_comment']
    
    if not PP.is_empty_sent(_comment):
        print(_comment)
        print(_split_comment)
        IO.print_dividing_line()

## Merge sentences with BERT next sentence prediction

### Prepare BERT model

In [36]:
import utils.next_sentence_prediction as NSP

In [36]:
from transformers import BertTokenizer, BertForNextSentencePrediction

In [37]:
model_name = "hfl/chinese-macbert-base"
# model_name = "bert-base-multilingual-cased"

In [38]:
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForNextSentencePrediction.from_pretrained(model_name)

Some weights of the model checkpoint at hfl/chinese-macbert-base were not used when initializing BertForNextSentencePrediction: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForNextSentencePrediction from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForNextSentencePrediction from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


### Merge utility function

In [39]:
def merge_split_sentence(split_sent):
    if len(split_sent) <= 1:
        return split_sent
    
    merged_sent = [split_sent[0]]
    
    for i in range(1, len(split_sent)):
        prompt = merged_sent[-1]
        next_sentence = split_sent[i]
        
        ## merge if predicted as next sentence
        if NSP.is_next_sentence(model, tokenizer, prompt, next_sentence):
            merged_sent[-1] = prompt + next_sentence
        ## not merge
        else:
            merged_sent.append(split_sent[i])
            
    return merged_sent

In [None]:
df_comments['split_comment_nsp'] = df_comments['split_comment'].progress_apply(merge_split_sentence)

In [None]:
for _, row in df_comments.sample(100, random_state=42).iterrows():
    _comment = row['comment']
    _split_comment = row['split_comment']
    _split_comment_nsp = row['split_comment_nsp']
    
    if not PP.is_empty_sent(_comment):
        print(_comment)
        print(_split_comment)
        print(_split_comment_nsp)
        IO.print_dividing_line()

## Dataframe for split comments

In [None]:
split_comments_row_data = []
split_comments_original_comment_dict = defaultdict(list)
split_comments_applicant_dict = defaultdict(list)
split_comments_committee_dict = defaultdict(list)

for _, row in df_comments.iterrows():
    _year = row['year']
    _id = row['id']
    _committee_number = row['committee_number']
    original_comment = row['comment']
    grade = row['grade']
    split_comment = row['split_comment']
    
    if not split_comment:
        continue
    
    for sc in split_comment:
        split_comments_row_data.append({
            "split_comment": sc,
            "grade": grade,
        })
        
        split_comments_original_comment_dict[sc].append(((_year, _id, _committee_number), original_comment))
        split_comments_applicant_dict[sc].append((_year, _id))
        split_comments_committee_dict[sc].append((_year, _id, _committee_number))
        
df_split_comments = pd.DataFrame(split_comments_row_data)
df_split_comments

In [37]:
split_comments_list = df_comments['split_comment'].to_list()
split_comments_list = [_list for _list in split_comments_list if _list]
split_comments = list(chain.from_iterable(split_comments_list))

In [None]:
len(split_comments)

In [39]:
df_split_comments['split_comment_length'] = df_split_comments['split_comment'].apply(len)

### Draw the distribution of the number of split sentences in each comment

In [46]:
# BINS = 10

In [None]:
# plt.hist([len(l) for l in split_comments_list], bins=BINS)

## Draw the length distribution of split sentences

In [None]:
# plt.hist([len(c) for c in split_comments], bins=BINS)

#### Length below 40

In [None]:
# plt.hist([len(c) for c in split_comments if len(c) < 40], bins=10)

## Content of short sentences

In [48]:
from collections import Counter

In [None]:
# counter = Counter([len(c) for c in split_comments])
# sorted(counter.items(), key=lambda x:x[0])

### 在不同的 split comment 長度中 sample split comment

In [53]:
# df_split_comments_length_group = df_split_comments.groupby(['split_comment_length'])

In [None]:
# for length, group in df_split_comments_length_group:
#     print(length)
    
#     num_samples = min(group.shape[0], 30)
    
#     print(group['split_comment'].sample(num_samples, random_state=42))

### 在不同的 bin 中 sample split_comment

In [55]:
# bins = pd.cut(df_split_comments['split_comment_length'], BINS)
# df_split_comments_bin_group = df_split_comments.groupby(bins)

In [None]:
# for _bin, group in df_split_comments_bin_group:
#     print(_bin)
    
#     num_samples = min(group.shape[0], 50)
    
#     print(group['split_comment'].sample(num_samples, random_state=42))

### Filter out comments below length threshold

In [40]:
LEN_THRESHOLD = 1

In [41]:
keep_idx = df_split_comments['split_comment_length'].apply(lambda l: l > LEN_THRESHOLD)
df_split_comments = df_split_comments[keep_idx]

split_comments = [c for c in split_comments if len(c) > LEN_THRESHOLD]

## Split comments with no duplicate data

In [43]:
df_split_comments_no_duplicate = df_split_comments.drop_duplicates()
sc_duplicate_check_group = df_split_comments_no_duplicate.groupby('split_comment')

In [None]:
df_split_comments_no_duplicate.shape

In [None]:
## if same split comment receive different grade, then change the grade label to unknown
new_grade_list = []

for _, row in df_split_comments_no_duplicate.iterrows():
    sc = row['split_comment']
    grade = row['grade']
    g = sc_duplicate_check_group.get_group(sc)
    
    if g.shape[0] > 1:
        print(g)
        new_grade_list.append('P')
    else:
        new_grade_list.append(grade)
        
df_split_comments_no_duplicate['grade'] = new_grade_list
df_split_comments_no_duplicate = df_split_comments_no_duplicate.drop_duplicates()

In [47]:
df_split_comments_no_duplicate.reset_index(drop=True, inplace=True)

In [48]:
split_comments_no_duplicate = df_split_comments_no_duplicate['split_comment'].to_list()

In [None]:
len(split_comments_no_duplicate)

In [None]:
len(split_comments)

## Append applicant and committee information for calculating uniqueness

In [None]:
D.read_df_split_comments_no_duplicate()

In [82]:
df_split_comments_no_duplicate['applicants'] = df_split_comments_no_duplicate['split_comment'].apply(
    lambda sc: split_comments_applicant_dict[sc]
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_split_comments_no_duplicate['applicants'] = df_split_comments_no_duplicate['split_comment'].apply(


In [83]:
df_split_comments_no_duplicate['committee'] = df_split_comments_no_duplicate['split_comment'].apply(
    lambda sc: split_comments_committee_dict[sc]
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_split_comments_no_duplicate['committee'] = df_split_comments_no_duplicate['split_comment'].apply(


In [54]:
df_split_comments_no_duplicate['original_comment'] = df_split_comments_no_duplicate['split_comment'].apply(
    lambda sc: split_comments_original_comment_dict[sc]
)

## Split comments tokenization with Articut

### Gather texts to create request batch

In [177]:
def is_valid_split_token(sents, split_token='＄'):
    for sent in sents:
        if split_token in sent:
            return False
        
    return True

In [178]:
def create_articut_requests(sents, split_token='＄', max_len=2000):
    ## check if the split token is valid
    if not is_valid_split_token(sents, split_token):
        print("Not valid split token!")
        return []
    
    request_str_buffer = []
    str_buf = ""
    
    for sent in sents:
        append_str = sent + split_token
        if len(sent) >= max_len:
            print("Too long sentence detected！")
            return []
        
        if len(str_buf) + len(sent) + 1 < max_len:
            ## append the sent into the string buffer
            str_buf = str_buf + append_str
        else:
            ## flush the string buffer to request
            request_str_buffer.append(str_buf)
            str_buf = append_str
    
    request_str_buffer.append(str_buf)
    
    ## chech if missing any sent
    assert len(sents) == sum(
        [len(request_str.split(split_token)) - 1 for request_str in request_str_buffer]
    )
    
    return request_str_buffer

In [None]:
SPLITTER = '＄'
request_str_buffer = create_articut_requests(split_comments_no_duplicate, split_token=SPLITTER)
print("Number of requests: {}".format(len(request_str_buffer)))

In [None]:
articut_res_buffer = []

for request_str in tqdm(request_str_buffer):
    res = A.articut_cut(request_str, wikiDataBOOL=True, sleep=False)
    articut_res_buffer.append(res)

In [181]:
split_comment_articut_res_buffer = []

for i, (request_str, res) in enumerate(zip(request_str_buffer, articut_res_buffer)):
    num_str = len(request_str.split(SPLITTER)) - 1
    
    if res['status'] != True:
        print(i)
    
    exec_time = res['exec_time']
    result_pos = res['result_pos']
    result_segmentation = res['result_segmentation']
    result_obj = res['result_obj']
    level = res['level']
    version = res['version']
    msg = res['msg']
    word_count_balance = res['word_count_balance']
    
    ## split result_pos
    split_result_pos = []
    splitter = SPLITTER
    buf = []
    
    for pos in result_pos:
        if pos != splitter:
            buf.append(pos)
        elif pos == splitter and len(buf) > 0:
            ## flush the buffer
            split_result_pos.append(buf)
            buf = []
            
    ## split result_segmentation
    splitter = SPLITTER
    split_result_segmentation = [seg for seg in result_segmentation.split(splitter) if seg != '']
    ## remove preceeding '/'
    split_result_segmentation = [seg[1:] if seg[0] == '/' else seg for seg in split_result_segmentation ]
    ## remove trailing '/'
    split_result_segmentation = [seg[:-1] if seg[-1] == '/' else seg for seg in split_result_segmentation]
    
    ## split result_obj
    split_result_obj = []
    splitter = [{'text': SPLITTER, 'pos': 'PUNCTUATION'}]
    buf = []
    
    for obj in result_obj:
        if obj != splitter:
            buf.append(obj)
        elif obj == splitter and len(buf) > 0:
            ## flush the buffer
            split_result_obj.append(buf)
            buf = []
            
    assert len(split_result_pos) == num_str
    assert len(split_result_segmentation) == num_str
    assert len(split_result_obj) == num_str
    
    for pos, seg, obj in zip(split_result_pos, split_result_segmentation, split_result_obj):
        split_comment_articut_res_buffer.append({
            'exec_time': exec_time / len(split_result_pos),
            'result_pos': pos,
            'result_segmentation': seg,
            'result_obj': obj,
            'level': level,
            'version': version,
            'status': True,
            'msg': msg,
            'word_count_balance': word_count_balance,
        })
        
assert len(split_comment_articut_res_buffer) == len(split_comments_no_duplicate)

In [182]:
df_split_comments_no_duplicate['articut_wiki_lv2'] = split_comment_articut_res_buffer

In [None]:
df_split_comments_no_duplicate.tail()

In [None]:
df_split_comments_no_duplicate

## Observe split comment with next sentence prediction

In [115]:
split_comments_nsp_list = df_comments['split_comment_nsp'].to_list()
split_comments_nsp_list = [_list for _list in split_comments_nsp_list if _list != ['0']]
split_comments_nsp = list(chain.from_iterable(split_comments_nsp_list))

In [None]:
# len(split_comments_nsp)

In [None]:
# df_split_comments_nsp = pd.DataFrame({"split_comment_nsp": split_comments_nsp})
# df_split_comments_nsp

In [64]:
# df_split_comments_nsp['split_comment_nsp_length'] = \
#     df_split_comments_nsp['split_comment_nsp'].apply(len)

### Draw the distribution of the number of split sentences in each comment

In [65]:
# BINS = 10

In [None]:
# plt.hist([len(l) for l in split_comments_nsp_list], bins=BINS)

## Draw the length distribution of split sentences

In [None]:
# plt.hist([len(c) for c in split_comments_nsp], bins=BINS)

#### Length below 50

In [None]:
# plt.hist([len(c) for c in split_comments_nsp if len(c) < 50], bins=10)

## Content of short sentences

In [69]:
from collections import Counter

In [None]:
# counter = Counter([len(c) for c in split_comments_nsp])
# sorted(counter.items(), key=lambda x:x[0])

### 在不同的 split comment 長度中 sample split comment

In [None]:
# df_split_comments_nsp_length_group = df_split_comments_nsp.groupby(['split_comment_nsp_length'])

In [None]:
# for length, group in df_split_comments_nsp_length_group:
#     print(length)
    
#     num_samples = min(group.shape[0], 30)
    
#     print(group['split_comment_nsp'].sample(num_samples, random_state=42))

### 在不同的 bin 中 sample split_comment

In [None]:
# bins = pd.cut(df_split_comments_nsp['split_comment_nsp_length'], BINS)
# df_split_comments_nsp_bin_group = df_split_comments_nsp.groupby(bins)

In [None]:
# for _bin, group in df_split_comments_nsp_bin_group:
#     print(_bin)
    
#     num_samples = min(group.shape[0], 50)
    
#     print(group['split_comment_nsp'].sample(num_samples, random_state=42))

### Filter out comments with lenght threshold

In [75]:
LEN_THRESHOLD = 1

In [76]:
split_comments_nsp = [c for c in split_comments_nsp if len(c) > LEN_THRESHOLD]

## Split comments with no duplicate data

In [77]:
split_comments_nsp_no_duplicate = list(set(split_comments_nsp))

In [None]:
len(split_comments_nsp_no_duplicate)

## Write split comments

In [79]:
## High frequency used
D.write_df_comments(df_comments)
D.write_df_split_comments_no_duplicate(df_split_comments_no_duplicate)
D.write_split_comments_no_duplicate(split_comments_no_duplicate)

## Not used
D.write_df_split_comments(df_split_comments)
D.write_split_comments(split_comments)
D.write_split_comments_nsp(split_comments_nsp)
D.write_split_comments_nsp_no_duplicate(split_comments_nsp_no_duplicate)