In [None]:
import pandas as pd
import numpy as np
import os
from collections import defaultdict
import matplotlib.pyplot as plt
from itertools import product, chain
from joblib import Parallel, delayed
from collections import Counter

from tqdm import tqdm
tqdm.pandas(desc="progress: ")

from importlib import reload
import pickle

# Utility variable
import sys, getopt
sys.path.insert(0, '../..')

# var
import var.var as V
import var.path as P

# utils
import utils.data as D
import utils.io as IO
import utils.preprocess as PP
import utils.torch as Tor

In [None]:
from torch.utils.data import Dataset, DataLoader

In [None]:
# Set TOKENIZERS_PARALLELISM to 'false' in this process's environment.
# NOTE(review): presumably this silences the HuggingFace tokenizers
# parallelism/fork warning -- confirm.
os.environ['TOKENIZERS_PARALLELISM']= 'false'

## Process Command Line Arguments

In [None]:
# Parse command-line options: `-a` is a bare flag; `-f`, `-r` and `-n` each take a value.
# NOTE(review): `-f` is accepted but never read by the option loop in the next cell --
# presumably it only absorbs the `-f <connection file>` argument Jupyter passes to a
# kernel; confirm.
opts, args = getopt.getopt(sys.argv[1:], "af:r:n:")

In [None]:
# Defaults, used whenever the corresponding option is absent.
TRAIN_OR_ALL = 'train'
BATCH_SIZE = 70
RADIUS = 7.5
N_NEIGHBORS = 30

# -a          : switch from the 'train' subset to 'all'
# -r <float>  : neighbor search radius
# -n <int>    : number of nearest neighbors
# Any other parsed option is ignored.
for flag, value in opts:
    if flag == '-a':
        TRAIN_OR_ALL = 'all'
        continue
    if flag == '-r':
        RADIUS = float(value)
        continue
    if flag == '-n':
        N_NEIGHBORS = int(value)

## Read data
- We need to know which comment chunk belongs to which applicant

In [None]:
# Load the inputs through the project's data helpers. TRAIN_OR_ALL is passed to every
# reader except read_df_comments, which is called without arguments.
df_applicants = D.read_df_applicants(TRAIN_OR_ALL)
df_comments = D.read_df_comments()
# The same de-duplicated split comments, once as a DataFrame and once as a plain
# sequence (later cells call .index() on it, so presumably a list -- confirm).
df_split_comments = D.read_df_split_comments_no_duplicate(TRAIN_OR_ALL)
split_comments = D.read_split_comments_no_duplicate(TRAIN_OR_ALL)

In [None]:
# Map each split-comment string to its index label in df_split_comments.
# If a string occurs more than once, the last occurrence wins.
split_comment_to_id = {
    text: label
    for label, text in df_split_comments['split_comment'].items()
}

## Find original applicant for each split comment

In [None]:
# %%time
# Per-split-comment 'applicants' and 'committee' columns. Later cells index these
# positionally with .iloc and iterate over each entry, so every cell is expected to
# hold an iterable of hashable items.
sc_applicant_lists = df_split_comments['applicants']
sc_committee_lists = df_split_comments['committee']

## Load the embedding and the topics of each split comment

In [None]:
from bertopic import BERTopic
import utils.bertopic as BT

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# Bind the tokenizer helpers from utils.bertopic to bare names in this namespace.
# NOTE(review): presumably required so the saved BERTopic model can resolve these
# callables when it is loaded -- confirm.
_pass = BT._pass
topic_doc_tokenizer = BT.topic_doc_tokenizer
# Count vectorizer that uses the project tokenizer and leaves case untouched.
vectorizer = CountVectorizer(tokenizer=topic_doc_tokenizer, lowercase=False)

In [None]:
SBERT_MODEL_NAME = 'ckiplab/bert-base-chinese'
SPLITTER = '＄'

# Select the BERTopic model name that matches TRAIN_OR_ALL.
# No name is bound for any other value of TRAIN_OR_ALL.
if TRAIN_OR_ALL == 'all':
    BERTOPIC_MODEL_NAME = "BERTopic_custom_mcs_100_ckip_diversified_low_all"
elif TRAIN_OR_ALL == 'train':
    BERTOPIC_MODEL_NAME = "BERTopic_custom_mcs_100_ckip_diversified_low_train"

In [None]:
# Load the saved BERTopic model named BERTOPIC_MODEL_NAME from the project's
# comment-clustering model directory.
topic_model = BERTopic.load(os.path.join(P.FP_COMMENT_CLUSTERING_MODEL_DIR, BERTOPIC_MODEL_NAME))
print("Load BERTopic model success.")

In [None]:
# Inner embedding model held by BERTopic's embedding backend.
# NOTE(review): not referenced again in this notebook; `sentence_bert` binds the same object.
sbert_model = topic_model.embedding_model.embedding_model

In [None]:
# Alias of df_split_comments (same object, no copy).
# NOTE(review): not referenced again in the visible notebook.
df_tokenization_database = df_split_comments

In [None]:
# Inner embedding model held by BERTopic's embedding backend; used below to encode
# both the split comments and the full comments.
sentence_bert = topic_model.embedding_model.embedding_model

In [None]:
# Encode every split comment with the sentence encoder (progress bar disabled).
# The result is what the nearest-neighbor search below is fitted on.
split_comments_embeds = sentence_bert.encode(split_comments, show_progress_bar=False)

In [None]:
# Take the embedding stored on the fitted 'umap' step of the topic model and pass it
# through the 'norm' step's transform.
# NOTE(review): presumably umap_model is a pipeline-like object with 'umap' and 'norm'
# steps and embedding_ holds the reduced training embeddings -- confirm they align
# row-for-row with split_comments.
reduced_split_comments_embeds = topic_model.umap_model['umap'].embedding_
reduced_split_comments_embeds = topic_model.umap_model['norm'].transform(reduced_split_comments_embeds)
reduced_split_comments_embeds.shape

In [None]:
import hdbscan

In [None]:
# Only the probabilities returned by approximate_predict are kept; the labels it
# predicts are discarded.
_, probs = hdbscan.approximate_predict(
    topic_model.hdbscan_model, reduced_split_comments_embeds
)
# Topic labels are read from the fitted HDBSCAN model's labels_ instead.
topics = topic_model.hdbscan_model.labels_

# Map labels and probabilities through BERTopic's own mapping helpers.
# NOTE(review): _map_predictions / _map_probabilities are private BERTopic methods and
# may change between library versions.
topics = topic_model._map_predictions(topics)
probs = topic_model._map_probabilities(probs, original_topics=True)
topic_labels = topics

In [None]:
def get_topic(s):
    """Return the topic assigned to split comment ``s``.

    The comment is located by its first position in the module-level
    ``split_comments`` sequence, and the entry at the same position in
    ``topics`` is returned. Raises ``ValueError`` when ``s`` is not present.
    """
    return topics[split_comments.index(s)]

In [None]:
import torch

In [None]:
from sentence_transformers.util import cos_sim

In [None]:
# %%time
split_comments_sim_mat = cos_sim(split_comments_embeds, split_comments_embeds)

## comments embed

In [None]:
# Collect every non-empty comment and remember, for each (year, id) applicant,
# the positions of that applicant's comments inside `comments`.
comments = []
app_comments_idx_dict = defaultdict(list)

for _, row in df_comments.iterrows():
    comment = row['comment']

    if not PP.is_empty_sent(comment):
        # len(comments) is the position the comment is about to occupy.
        app_comments_idx_dict[(row['year'], row['id'])].append(len(comments))
        comments.append(comment)

In [None]:
# Encode the non-empty full comments with the same sentence encoder (progress bar disabled).
comments_embeds = sentence_bert.encode(comments, show_progress_bar=False)

In [None]:
# Cosine similarity of every split comment (rows) against every full comment (columns).
# NOTE(review): not referenced again in the visible notebook.
split_comments_and_comments_sim_mat = cos_sim(split_comments_embeds, comments_embeds)

## Calculate chunk consensus for each applicant

In [None]:
# Flatten df_split_comments into one record per (split comment, committee entry).
# Each committee entry is indexed as (year, id, committee_number).
chunk_row_data_list = [
    {
        "year": com[0],
        "id": com[1],
        "committee_number": com[2],
        "split_comment": sc,
    }
    for sc, committee in zip(
        df_split_comments['split_comment'], df_split_comments['committee']
    )
    for com in committee
]

In [None]:
# One row per record built above: columns year, id, committee_number, split_comment.
df_chunk = pd.DataFrame(chunk_row_data_list)
df_chunk.head()

## Find the committees that do not write comments

In [None]:
# Group the comments per committee member, keyed by (year, group, committee_number).
df_comment_committee_group = df_comments.groupby(['year', 'group', 'committee_number'])

In [None]:
# A committee member whose share of empty comments is strictly greater than this
# is put on the empty-comment list in the next cell.
empty_comment_rate_threshold = 0.9

In [None]:
# For every (year, group, committee_number) key, compute the fraction of its
# comments that PP.is_empty_sent() classifies as empty, and list the keys whose
# fraction exceeds empty_comment_rate_threshold.
committee_empty_comment_rate_dict = {}
empty_comment_committee_list = []

for committee, g in df_comment_committee_group:
    empty_comment_cnt = sum(
        1 for comment in g['comment'] if PP.is_empty_sent(comment)
    )
    empty_comment_rate = empty_comment_cnt / g.shape[0]

    print(committee, "empty_comment_rate: {:.3f}".format(empty_comment_rate))
    committee_empty_comment_rate_dict[committee] = empty_comment_rate

    if empty_comment_rate > empty_comment_rate_threshold:
        empty_comment_committee_list.append(committee)

In [None]:
# Committee keys ordered from highest to lowest empty-comment rate
# (ties keep their insertion order).
sorted(committee_empty_comment_rate_dict.items(), key=lambda item: item[1], reverse=True)

In [None]:
# Display the committee keys whose empty-comment rate exceeded the threshold.
empty_comment_committee_list

## Calculate the number of committees per group

In [None]:
# Group the comments per applicant group, keyed by (year, group).
df_applicant_group = df_comments.groupby(['year', 'group'])

In [None]:
# Count, per (year, group), the committee numbers whose (year, group, committee_number)
# key is NOT on the empty-comment list.
app_group_committee_count = {}

for app_group, g in df_applicant_group:
    group_committee = g.groupby(['committee_number']).groups.keys()
    commenting_committee = [
        com
        for com in group_committee
        if (*app_group, com) not in empty_comment_committee_list
    ]

    app_group_committee_count[app_group] = len(commenting_committee)

In [None]:
# Display the number of commenting committee members per (year, group).
app_group_committee_count

## Find neighbors by BERTScore
1. Find the neighbors based on r or k
2. Calculate BERTScore and filter neighbors

In [None]:
# RADIUS and N_NEIGHBORS are configured once, in the command-line options cell near
# the top of the notebook (defaults 7.5 and 30, overridable with -r / -n).
# They must not be reassigned here: doing so silently discards the values passed
# on the command line. Echo the effective settings instead.
print("RADIUS = {}, N_NEIGHBORS = {}".format(RADIUS, N_NEIGHBORS))

In [None]:
from sklearn.neighbors import NearestNeighbors

In [None]:
# Nearest-neighbor index fitted on the sentence-encoder embeddings of the split
# comments (not the UMAP-reduced ones).
# NOTE(review): metric='minkowski' with scikit-learn's default p=2 is Euclidean
# distance -- confirm against the installed version.
neigh = NearestNeighbors(metric='minkowski') ## or 'cosine'
neigh.fit(split_comments_embeds)

In [None]:
# %%time
r_neigh_dist, r_neighbor_ind = neigh.radius_neighbors(split_comments_embeds, RADIUS)

In [None]:
# # %%time
k_neigh_dist, k_neighbor_ind = neigh.kneighbors(split_comments_embeds, N_NEIGHBORS)

In [None]:
# Display the k-nearest-neighbor index array.
k_neighbor_ind

In [None]:
# Number of neighbors found within RADIUS for each split comment.
neigh_count_within_r = np.array(list(map(len, r_neighbor_ind)))

In [None]:
# Plot the radius-neighbor counts in ascending order.
plt.plot(np.sort(neigh_count_within_r))

In [None]:
# Final neighbor lists per split comment: the radius-search result is taken as-is.
# NOTE(review): the previous loop branched on `len(rni) < N_NEIGHBORS`, but both
# branches stored the radius result, so k_neigh_dist / k_neighbor_ind and
# N_NEIGHBORS never influenced the outcome. Confirm whether a k-nearest-neighbor
# fallback (or cap) was intended here.
sc_neighbor_distance = list(r_neigh_dist)
sc_neighbor_index = list(r_neighbor_ind)

In [None]:
# Number of neighbors finally kept for each split comment.
neighbor_count = np.array(list(map(len, sc_neighbor_index)))

In [None]:
# Plot the kept-neighbor counts in ascending order.
plt.plot(np.sort(neighbor_count))

## Aggregate the referred applicants of all neighbors

In [None]:
# %%time
# For every split comment, take the union of the applicants and of the committee
# entries referred to by all of its neighbors.
print("aggregate the referred applicants of all neighbors...")

applicants_of_neighbor = []
committees_of_neighbor = []

# The notebook never defines `filtered_sc_neighbor_index` (no BERTScore filtering
# step is present), so iterating it raised NameError on a fresh kernel. The
# neighbor lists built above are used directly.
for nind in tqdm(sc_neighbor_index):
    applicants = set()
    committees = set()

    for nidx in nind:
        # nidx is a row position, hence .iloc; each cell holds an iterable of
        # hashable entries that is merged into the running set.
        applicants.update(sc_applicant_lists.iloc[nidx])
        committees.update(sc_committee_lists.iloc[nidx])

    applicants_of_neighbor.append(applicants)
    committees_of_neighbor.append(committees)

In [None]:
def get_neighbor(sc, debug=False):
    """Print the neighbors of split comment ``sc`` for manual inspection.

    Parameters
    ----------
    sc : str
        A split comment; it must be present in the module-level ``split_comments``
        (otherwise ``ValueError`` is raised by ``.index``).
    debug : bool, default False
        When True, print each neighbor as a quoted string followed by a comma.
        When False, print the neighbor's position, its applicants and its text.

    Returns
    -------
    None
        Everything is written to stdout.
    """
    idx = split_comments.index(sc)
    print(idx)
    # `@sc` refers to this function's local variable of the same name.
    row = df_chunk.query("`split_comment` == @sc")
    print(row)

    print("split comment:", split_comments[idx])
    print("Neighbors:")

    # `filtered_sc_neighbor_index` is never defined in this notebook, so calling the
    # function raised NameError; the neighbor lists in `sc_neighbor_index` are used.
    for nidx in sc_neighbor_index[idx]:
        if debug:
            print("\"{}\",".format(split_comments[nidx]))
        else:
            print('  ', nidx, sc_applicant_lists.iloc[nidx], split_comments[nidx])

## Calculate uniqueness score
- inverse applicant frequency
- consensus rate

In [None]:
# Total number of applicants (rows of df_applicants); numerator of the inverse
# applicant frequency computed below.
num_applicants = df_applicants.shape[0]
# Comments grouped per applicant, keyed by (year, id).
# NOTE(review): not referenced again in the visible notebook.
df_comment_applicant_group = df_comments.groupby(['year', 'id'])

In [None]:
# Map every (year, id) applicant to the commenting-committee count of its
# (year, group). When an applicant appears on several rows, the last row wins.
app_committee_count = {
    (year, app_id): app_group_committee_count[(year, group)]
    for year, app_id, group in zip(
        df_comments['year'], df_comments['id'], df_comments['group']
    )
}

In [None]:
# neigh_app, neigh_com

In [None]:
print("calculate uniqueness score...")

split_comments_uniqueness = []
split_comments_iaf = []
# Must be initialised here: the loop below appends to it and the plotting and
# report cells read it. It was previously commented out, causing a NameError.
split_comments_iccr = []
split_comments_ccr = []

split_comments_iaf_dict = {}

neighbor_triples = zip(split_comments, applicants_of_neighbor, committees_of_neighbor)

# Passing `total` lets tqdm show a real progress bar; zip() alone has no length.
for sc, neigh_app, neigh_com in tqdm(neighbor_triples, total=len(split_comments)):
    ## inverse applicant frequency:
    ## log(total applicants / applicants referred to by the neighborhood)
    iaf = np.log(num_applicants / len(neigh_app))

    ## committee consensus rate
    # Denominator: commenting committee members summed over all hit applicants.
    all_hit_applicant_committee = sum(app_committee_count[app] for app in neigh_app)
    # Numerator: distinct committee entries that appear in the neighborhood.
    mention_hit_applicant_committee = len(neigh_com)

    ccr = mention_hit_applicant_committee / all_hit_applicant_committee
    # Inverse variant: 1 / (log(1 / ccr) + 1). Recorded for plotting only.
    iccr = (np.log(all_hit_applicant_committee / mention_hit_applicant_committee) + 1) ** -1

    # Uniqueness uses the plain consensus rate (the iaf * iccr variant is not used).
    uniqueness = iaf * ccr

    split_comments_uniqueness.append(uniqueness)
    split_comments_iaf.append(iaf)
    split_comments_iccr.append(iccr)
    split_comments_ccr.append(ccr)

    split_comments_iaf_dict[sc] = iaf

### Plot uniqueness

In [None]:
fig, axs = plt.subplots(4, 1, figsize=(5, 10), constrained_layout=True)

# One panel per score list, each drawn in descending order.
score_panels = [
    (split_comments_uniqueness, "Sorted uniqueness"),
    (split_comments_iaf, "Sorted inverse applicant frequency"),
    (split_comments_iccr, "Sorted inverse comittee concensus rate"),
    (split_comments_ccr, "Sorted comittee concensus rate"),
]

for ax, (scores, title) in zip(axs, score_panels):
    _ = ax.plot(np.sort(scores)[::-1])
    _ = ax.set_title(title)

In [None]:
# Print every split comment from most to least unique, with its component scores.
for i, idx in enumerate(np.argsort(split_comments_uniqueness)[::-1]):
#     if split_comments_iaf[idx] > 7:
#         continue

    print(
        # The iccr field previously read "iccr, " -- it now uses the same
        # "label: value" form as every other field.
        "{} {}, uniqueness: {:.3f}, iaf: {:.3f}, iccr: {:.3f}, ccr: {:.3f}".format(
        i,
        split_comments[idx],
        split_comments_uniqueness[idx],
        split_comments_iaf[idx],
        split_comments_iccr[idx],
        split_comments_ccr[idx],
    ))

In [None]:
# Print every split comment from highest to lowest inverse applicant frequency.
iaf_report_template = "{} {}, uniqueness: {:.3f}, iaf: {:.3f}, cr: {:.3f}"
descending_iaf_order = np.argsort(split_comments_iaf)[::-1]

for i, idx in enumerate(descending_iaf_order):
    report_line = iaf_report_template.format(
        i,
        split_comments[idx],
        split_comments_uniqueness[idx],
        split_comments_iaf[idx],
        split_comments_ccr[idx],
    )
    print(report_line)

## Filter comment by inverse applicant frequency

In [None]:
# Split comments with an inverse applicant frequency below this value are dropped
# by the filter in the next cell.
iaf_threshold = 5

In [None]:
# Keep the split comments whose iaf reaches iaf_threshold and file each of them
# under the first element of every entry in its 'original_comment' list.
iaf_filtered_comment_dict = defaultdict(list)
iaf_sc_cnt = 0

for sc, original_comment in zip(
    df_split_comments['split_comment'], df_split_comments['original_comment']
):
    if split_comments_iaf_dict[sc] < iaf_threshold:
        continue

    iaf_sc_cnt += 1

    for oc in original_comment:
        iaf_filtered_comment_dict[oc[0]].append(sc)

# Number of split comments that passed the filter.
iaf_sc_cnt

In [None]:
# Number of distinct keys that received at least one filtered split comment.
len(iaf_filtered_comment_dict)

In [None]:
# Rebuild one filtered comment string per df_comments row: the kept split comments
# for that (year, id, committee_number) joined together, or '' when none was kept.
iaf_filtered_comment = []

for year, app_id, committee_number in zip(
    df_comments['year'], df_comments['id'], df_comments['committee_number']
):
    # Plain indexing on the defaultdict: a missing key yields (and stores) an empty list.
    kept_chunks = iaf_filtered_comment_dict[(year, app_id, committee_number)]
    iaf_f_comment = "，".join(kept_chunks)

    if iaf_f_comment:
        iaf_f_comment += "。"
        print(iaf_f_comment)

    iaf_filtered_comment.append(iaf_f_comment)

In [None]:
# Attach the filtered text as a new column. iaf_filtered_comment holds exactly one
# entry per df_comments row, appended in row order by the loop above.
df_comments['iaf_filtered_comment'] = iaf_filtered_comment

In [None]:
# Preview the first rows, now including the 'iaf_filtered_comment' column.
df_comments.head()

In [None]:
# Persist df_comments (with the new 'iaf_filtered_comment' column) through the
# project's writer, once per format.
# NOTE(review): presumably this overwrites the files read by D.read_df_comments() -- confirm.
D.write_df_comments(df_comments, file='csv')
D.write_df_comments(df_comments, file='pkl')