In [None]:
import pandas as pd
import numpy as np
import os
from collections import defaultdict
import matplotlib.pyplot as plt
from itertools import product, chain
from joblib import Parallel, delayed

from tqdm import tqdm
tqdm.pandas(desc="progress: ")

from importlib import reload
import pickle

# Utility variable
import sys, getopt
sys.path.insert(0, '../..')

# var
import var.var as V
import var.path as P

# utils
import utils.data as D
import utils.io as IO
import utils.preprocess as PP
import utils.torch as Tor

In [None]:
from torch.utils.data import Dataset, DataLoader

In [None]:
# Set the TOKENIZERS_PARALLELISM environment variable to 'false' for this process.
# NOTE(review): presumably silences the tokenizers fork/parallelism warning — confirm.
os.environ['TOKENIZERS_PARALLELISM']= 'false'

## Process Command Line Arguments

In [None]:
# Parse the command-line flags. In the option string a letter followed by ':'
# takes a value (b, s, e, f, r, n); 'a' and 'd' are plain switches.
# The parsed (flag, value) pairs are interpreted by the option loop in the next cell.
# NOTE(review): '-f <value>' presumably matches the connection-file argument a
# Jupyter kernel is launched with, so interactive runs end up in debug mode — confirm.
opts, args = getopt.getopt(sys.argv[1:], "ab:s:e:f:r:n:d")

In [None]:
# Run configuration. Each default below is used when its flag is absent.
TRAIN_OR_ALL = 'train'   # data split, 'train' or 'all' (-a switches to 'all')
BATCH_SIZE = 70          # -b
START_IDX = 2080         # first application index to process (-s)
END_IDX = 2091           # end of the application slice, exclusive (-e)
RADIUS = 7.5             # radius for the radius-neighbor search (-r)
N_NEIGHBORS = 30         # k for the k-nearest-neighbor search (-n)
debug = False            # enabled by -f or -d

for opt, arg in opts:
    if opt == '-a':
        TRAIN_OR_ALL = 'all'
    elif opt == '-b':
        # This branch used to be a bare `BATCH_SIZE` expression, so the flag
        # was parsed but never applied.
        BATCH_SIZE = int(arg)
    elif opt == '-s':
        START_IDX = int(arg)
    elif opt == '-e':
        END_IDX = int(arg)
    elif opt == '-r':
        RADIUS = float(arg)
    elif opt == '-n':
        N_NEIGHBORS = int(arg)
    elif opt in ('-f', '-d'):
        # Both flags switch on debug mode; the value passed with -f is ignored.
        debug = True

## Read data
- We need to know which comment chunk belongs to which applicant.

In [None]:
# Load the inputs for the selected split (TRAIN_OR_ALL) through utils.data.
# Later cells rely on df_split_comments having 'split_comment', 'applicants' and
# 'committee' columns, and on df_comments having 'year', 'id', 'group',
# 'committee_number' and 'comment' columns.
# NOTE(review): split_comments is used below as a list of texts aligned with the
# rows of df_split_comments — confirm against utils.data.
df_applicants = D.read_df_applicants(TRAIN_OR_ALL)
df_comments = D.read_df_comments()
df_split_comments = D.read_df_split_comments_no_duplicate(TRAIN_OR_ALL)
split_comments = D.read_split_comments_no_duplicate(TRAIN_OR_ALL)

In [None]:
# Map each split-comment text to its row label in df_split_comments.
# If a text occurs more than once, the label of its last occurrence is kept.
split_comment_to_id = dict(
    zip(
        df_split_comments['split_comment'].values,
        df_split_comments['split_comment'].index,
    )
)

## Find original applicant for each split comment

In [None]:
# %%time
# Per split comment: the collection of applicants it refers to and the
# collection of committee entries that wrote it (positionally indexed later
# with .iloc).
sc_applicant_lists = df_split_comments['applicants']
sc_committee_lists = df_split_comments['committee']

## Load the embedding and the topic of each split comment

In [None]:
from bertopic import BERTopic
import utils.bertopic as BT

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# Bind the helpers from utils.bertopic to module-level names.
# NOTE(review): presumably required so the saved BERTopic model can resolve
# these callables by name when it is loaded — confirm.
_pass = BT._pass
topic_doc_tokenizer = BT.topic_doc_tokenizer
# Count vectorizer that delegates tokenization to topic_doc_tokenizer and does
# not lowercase the text.
vectorizer = CountVectorizer(tokenizer=topic_doc_tokenizer, lowercase=False)

In [None]:
# Name of the sentence-embedding checkpoint.
SBERT_MODEL_NAME = 'ckiplab/bert-base-chinese'

# Pick the saved BERTopic model that matches the data split.
# NOTE(review): BERTOPIC_MODEL_NAME stays undefined for any other TRAIN_OR_ALL
# value; the option loop only ever produces 'train' or 'all'.
if TRAIN_OR_ALL == 'train':
    BERTOPIC_MODEL_NAME = "BERTopic_custom_mcs_100_ckip_diversified_low_train"
elif TRAIN_OR_ALL == 'all':
    BERTOPIC_MODEL_NAME = "BERTopic_custom_mcs_100_ckip_diversified_low_all"
    
# Full-width dollar sign used as a separator character.
SPLITTER = '＄'

In [None]:
# Load the fitted BERTopic model from the comment-clustering model directory.
topic_model = BERTopic.load(os.path.join(P.FP_COMMENT_CLUSTERING_MODEL_DIR, BERTOPIC_MODEL_NAME))
print("Load BERTopic model success.")

In [None]:
# The model object wrapped by BERTopic's embedding backend; it is used below
# through its .encode() method.
sbert_model = topic_model.embedding_model.embedding_model

In [None]:
# Second name for the same DataFrame object (no copy is made).
df_tokenization_database = df_split_comments

In [None]:
# Same object as sbert_model, bound to a second name.
sentence_bert = topic_model.embedding_model.embedding_model

In [None]:
# Embed every split comment; row i of the result belongs to split_comments[i].
split_comments_embeds = sentence_bert.encode(split_comments, show_progress_bar=False)

In [None]:
# topic_model.umap_model is indexed by step name here: take the coordinates
# stored on the fitted 'umap' step (embedding_) and pass them through the
# 'norm' step, instead of re-running the reduction.
# NOTE(review): assumes the model was fitted on exactly these split comments,
# in the same order — confirm.
reduced_split_comments_embeds = topic_model.umap_model['umap'].embedding_
reduced_split_comments_embeds = topic_model.umap_model['norm'].transform(reduced_split_comments_embeds)
reduced_split_comments_embeds.shape

In [None]:
import hdbscan

In [None]:
# Run approximate_predict on the reduced embeddings. Only its second return
# value (probs) is kept; the predicted labels are discarded.
_, probs = hdbscan.approximate_predict(
    topic_model.hdbscan_model, reduced_split_comments_embeds
)
# Cluster labels come from the fitted clusterer itself (its labels_ attribute).
topics = topic_model.hdbscan_model.labels_

# Translate labels and probabilities through BERTopic's mapping helpers.
# NOTE(review): _map_predictions / _map_probabilities are private BERTopic
# methods; their signatures may differ between BERTopic versions.
topics = topic_model._map_predictions(topics)
probs = topic_model._map_probabilities(probs, original_topics=True)
# Training labels for the topic classifier fitted in get_topic_prediction.
topic_labels = topics

In [None]:
def get_topic(s):
    """Return the topic assigned to the split comment `s`.

    `s` is located in the module-level `split_comments` list (first match) and
    the entry at the same position of `topics` is returned. Raises ValueError
    if `s` is not in `split_comments`.
    """
    return topics[split_comments.index(s)]

In [None]:
import torch

In [None]:
from sentence_transformers.util import cos_sim

In [None]:
# %%time
# Pairwise cosine similarity between all split-comment embeddings
# (one row and one column per split comment).
split_comments_sim_mat = cos_sim(split_comments_embeds, split_comments_embeds)

## Calculate chunk consensus for each applicant

In [None]:
# Flatten df_split_comments to one record per (split comment, committee entry).
# Each committee entry is indexed as (year, id, committee_number).
chunk_row_data_list = [
    {
        "year": member[0],
        "id": member[1],
        "committee_number": member[2],
        "split_comment": comment_text,
    }
    for comment_text, members in zip(
        df_split_comments['split_comment'], df_split_comments['committee']
    )
    for member in members
]

In [None]:
# One row per (split comment, committee entry) record; show the first rows.
df_chunk = pd.DataFrame(chunk_row_data_list)
df_chunk.head()

## Find the committees that do not write comments

In [None]:
# Group the raw comments by committee member: (year, group, committee_number).
df_comment_committee_group = df_comments.groupby(['year', 'group', 'committee_number'])

In [None]:
# A committee member whose share of empty comments is strictly above this value
# is treated as not writing comments.
empty_comment_rate_threshold = 0.9

In [None]:
# Share of empty comments per (year, group, committee_number) key. Keys whose
# share exceeds the threshold are also collected in a separate list.
committee_empty_comment_rate_dict = {}
empty_comment_committee_list = []

for committee, g in df_comment_committee_group:
    empty_comment_cnt = sum(1 for comment in g['comment'] if PP.is_empty_sent(comment))
    empty_comment_rate = empty_comment_cnt / g.shape[0]

    print(committee, "empty_comment_rate: {:.3f}".format(empty_comment_rate))
    committee_empty_comment_rate_dict[committee] = empty_comment_rate

    if empty_comment_rate > empty_comment_rate_threshold:
        empty_comment_committee_list.append(committee)

In [None]:
# Committee keys ordered from the highest to the lowest empty-comment rate.
sorted(committee_empty_comment_rate_dict.items(), key=lambda item: -item[1])

In [None]:
# Display the committee keys whose empty-comment rate exceeded the threshold.
empty_comment_committee_list

## Calculate the number of committees per group

In [None]:
# Group the raw comments by application group: (year, group).
df_applicant_group = df_comments.groupby(['year', 'group'])

In [None]:
# For each (year, group): count the committee numbers that are NOT in
# empty_comment_committee_list.
app_group_committee_count = {}

for app_group, g in df_applicant_group:
#     num_committee = g.groupby(['committee_number']).ngroups
    # NOTE(review): grouping by a one-element list — confirm that the keys of
    # .groups are scalar committee numbers on the pandas version in use; if
    # they are 1-tuples, the membership test below never matches.
    group_committee = g.groupby(['committee_number']).groups.keys()
    # (*app_group, com) rebuilds the (year, group, committee_number) key used
    # in empty_comment_committee_list.
    num_committee = sum([1 for com in group_committee if (*app_group, com) not in empty_comment_committee_list ])
    
    app_group_committee_count[app_group] = num_committee

In [None]:
# Display the committee count per (year, group).
app_group_committee_count

## Find nearest neighbors 

In [None]:
from sklearn.neighbors import NearestNeighbors

In [None]:
# Neighbor index over the full (non-reduced) split-comment embeddings.
neigh = NearestNeighbors(metric='minkowski') ## or 'cosine'
neigh.fit(split_comments_embeds)

In [None]:
# RADIUS and N_NEIGHBORS are set once in the configuration cell (defaults 7.5
# and 30, overridable with -r / -n). They used to be re-assigned here, which
# silently discarded the command-line values; this cell now only validates them.
assert RADIUS > 0, "RADIUS must be positive"
assert N_NEIGHBORS > 0, "N_NEIGHBORS must be positive"

In [None]:
# %%time
# For every split comment: distances and indices of all comments within RADIUS.
# The per-comment results differ in length.
r_neigh_dist, r_neighbor_ind = neigh.radius_neighbors(split_comments_embeds, RADIUS)

In [None]:
# # %%time
# For every split comment: distances and indices of its N_NEIGHBORS nearest comments.
# NOTE(review): the query points are the fitted points themselves, so each
# comment is expected to appear among its own neighbors — confirm this is intended.
k_neigh_dist, k_neighbor_ind = neigh.kneighbors(split_comments_embeds, N_NEIGHBORS)

In [None]:
# Display the k-nearest-neighbor index matrix.
k_neighbor_ind

In [None]:
# Number of neighbors found within RADIUS for each split comment.
neigh_count_within_r = np.array([len(neighbor_ind) for neighbor_ind in r_neighbor_ind])

In [None]:
# Average number of radius neighbors per split comment.
mean_neigh_count_within_r = np.mean(neigh_count_within_r)

In [None]:
# Radius-neighbor counts in ascending order, to inspect their distribution.
plt.plot(np.sort(neigh_count_within_r))

In [None]:
# for i, idx in enumerate(np.argsort(neigh_count_within_r)):
#     print(i, split_comments[idx])
#     print(len(r_neighbor_ind[idx]))
    
#     for nidx in r_neighbor_ind[idx][-10:]:
#         print('    ', split_comments[nidx])

## Combine radius neighbors and k nearest neighbors

In [None]:
# Choose the neighborhood of each split comment: the radius result when it holds
# at least N_NEIGHBORS entries, the k-nearest-neighbor result otherwise.
sc_neighbor_distance = []
sc_neighbor_index = []

for radius_dist, radius_idx, knn_dist, knn_idx in zip(
    r_neigh_dist, r_neighbor_ind, k_neigh_dist, k_neighbor_ind
):
    too_sparse = len(radius_idx) < N_NEIGHBORS
    sc_neighbor_distance.append(knn_dist if too_sparse else radius_dist)
    sc_neighbor_index.append(knn_idx if too_sparse else radius_idx)

In [None]:
# Size of the combined neighborhood selected for each split comment.
neighbor_count = np.array([len(nind) for nind in sc_neighbor_index])

## Aggregate the referred applicants of all neighbors

In [None]:
# %%time
print("aggregate the referred applicants of all neighbors...")

# For each split comment, the union of applicants / committee entries attached
# to the comments in its neighborhood.
applicants_of_neighbor = []
committees_of_neighbor = []

for nind in tqdm(sc_neighbor_index):
    applicants = set()
    committees = set()

    for nidx in nind:
        applicants.update(sc_applicant_lists.iloc[nidx])
        committees.update(sc_committee_lists.iloc[nidx])

    applicants_of_neighbor.append(applicants)
    committees_of_neighbor.append(committees)

In [None]:
def get_neighbor(sc, debug=False):
    """Print the neighborhood of the split comment `sc` (inspection helper).

    Prints the position of `sc` in `split_comments`, its rows in `df_chunk`,
    the comment itself, and then one line per neighbor.

    Args:
        sc: text of a split comment; must be present in `split_comments`.
        debug: when True, each neighbor is printed as a quoted string followed
            by a comma; otherwise as position, applicant list and text.

    Returns:
        None.
    """
    position = split_comments.index(sc)
    print(position)
    # `@sc` in the query string refers to the `sc` parameter of this function.
    print(df_chunk.query("`split_comment` == @sc"))

    print("split comment:", split_comments[position])
    print("Neighbors:")

    for neighbor_pos in sc_neighbor_index[position]:
        neighbor_text = split_comments[neighbor_pos]
        if not debug:
            print('  ', neighbor_pos, sc_applicant_lists.iloc[neighbor_pos], neighbor_text)
            continue
        print("\"{}\",".format(neighbor_text))

## Calculate uniqueness score
- inverse applicant frequency
- consensus rate

In [None]:
# Total number of applicants; the numerator of the inverse applicant frequency.
num_applicants = df_applicants.shape[0]
# Raw comments grouped per applicant: (year, id).
df_comment_applicant_group = df_comments.groupby(['year', 'id'])

In [None]:
# For every applicant (year, id): the committee count of the (year, group) the
# applicant's comments belong to. If an applicant appears in several rows, the
# value of the last row is kept.
app_committee_count = {
    (year, applicant_id): app_group_committee_count[(year, group)]
    for year, applicant_id, group in zip(
        df_comments['year'], df_comments['id'], df_comments['group']
    )
}

In [None]:
print("calculate uniqueness score...")

# Per split comment (same order as split_comments): the combined score and its
# individual factors.
split_comments_uniqueness = []
split_comments_iaf = []
split_comments_iccr = []
split_comments_ccr = []

# NOTE(review): `idx` is never used in the loop body, and tqdm receives an
# enumerate object, so it cannot know the total number of iterations.
for idx, (neigh_app, neigh_com) in tqdm(enumerate(zip(applicants_of_neighbor, committees_of_neighbor))):
    ## inverse applicant frequency
    # log(total applicants / applicants referred to by the neighborhood)
    iaf = np.log(num_applicants / len(neigh_app))
    ## committee consensus rate
    # Denominator: committee count summed over the referred applicants.
    # NOTE(review): raises KeyError if an applicant is missing from
    # app_committee_count, and divides by zero if every count is 0.
    all_hit_applicant_committee = sum([
        app_committee_count[app] for app in neigh_app
    ])
    # Numerator: distinct committee entries that wrote a comment in the neighborhood.
    mention_hit_applicant_committee = len(neigh_com)
    ccr = mention_hit_applicant_committee / all_hit_applicant_committee
    # NOTE(review): this is 1 / log(ccr). It is negative for 0 < ccr < 1 and not
    # finite when ccr == 1 (log is 0) — confirm that this is the intended
    # "inverse consensus" formula.
    iccr = np.log(ccr) ** -1
    
    
    uniqueness = iaf * iccr
#     uniqueness = iaf * ccr
    
    split_comments_uniqueness.append(uniqueness)
    split_comments_iaf.append(iaf)
    split_comments_iccr.append(iccr)
    split_comments_ccr.append(ccr)

### Plot uniqueness

In [None]:
if debug:
    # Three stacked panels, each showing one score sorted in descending order.
    fig, axs = plt.subplots(3, 1, figsize=(4, 7), constrained_layout=True)

    panels = (
        (split_comments_uniqueness, "Sorted uniqueness"),
        (split_comments_iaf, "Sorted inverse applicant frequency"),
        (split_comments_ccr, "Sorted comittee concensus rate"),
    )

    for ax, (values, title) in zip(axs, panels):
        ax.plot(np.sort(values)[::-1])
        ax.set_title(title)

In [None]:
if debug:
    # List every split comment from the highest to the lowest uniqueness score.
    line_template = "{} {}, uniqueness: {:.3f}, iaf: {:.3f}, cr: {:.3f}"
    ranking = np.argsort(split_comments_uniqueness)[::-1]

    for i, idx in enumerate(ranking):
        line = line_template.format(
            i,
            split_comments[idx],
            split_comments_uniqueness[idx],
            split_comments_iaf[idx],
            split_comments_ccr[idx],
        )
        print(line)

In [None]:
if debug:
    # List every split comment from the highest to the lowest inverse applicant frequency.
    line_template = "{} {}, uniqueness: {:.3f}, iaf: {:.3f}, cr: {:.3f}"
    ranking = np.argsort(split_comments_iaf)[::-1]

    for i, idx in enumerate(ranking):
        line = line_template.format(
            i,
            split_comments[idx],
            split_comments_uniqueness[idx],
            split_comments_iaf[idx],
            split_comments_ccr[idx],
        )
        print(line)

## Calculate the uniqueness score of each sentence inside an application

In [None]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
# Progress marker: from here on the scores are computed for application text.
print("start calculate uniqueness score...")

In [None]:
# Re-read the applicant table without a split argument; this replaces the
# split-specific df_applicants loaded earlier.
df_applicants = D.read_df_applicants()
df_applications = D.read_df_applications()
# NOTE(review): hard-coded file name, resolved relative to the current working
# directory.
test_df = pd.read_csv("112_F.csv")

In [None]:
# Append the rows read from the extra CSV to the applicant table.
# NOTE(review): the original row labels are kept (no ignore_index), and
# re-running this cell appends test_df a second time.
df_applicants = pd.concat([df_applicants, test_df])
df_applicants

In [None]:
# Attach the applicant name to every application row (left join on year and id,
# so applications without a matching applicant keep a missing name).
# NOTE(review): not idempotent — re-running merges onto a frame that already
# has a 'name' column.
df_applications = pd.merge(
    df_applications, df_applicants[['year', 'id', 'name']], how='left', on=['year', 'id']
)

In [None]:
# Show the first rows of the merged application table.
df_applications.head()

In [None]:
# Replace missing applicant names with the placeholder '?'.
df_applications.name = df_applications.name.fillna('?')

In [None]:
# One (year, id, name) tuple per application row, in row order.
tuples = df_applications.apply(lambda row: (row['year'], row['id'], row['name']), axis=1).to_list()

In [None]:
# Turn every (year, id, name) tuple into a dict with those three keys.
tuples = [dict(zip(('year', 'id', 'name'), info)) for info in tuples]

In [None]:
def dict_info_to_tuple_info(dict_info):
    """Return the (year, id, name) tuple for an applicant info mapping.

    Args:
        dict_info: mapping with the keys 'year', 'id' and 'name'.

    Returns:
        A 3-tuple (year, id, name).

    Raises:
        KeyError: if one of the three keys is missing.
    """
    return tuple(dict_info[key] for key in ('year', 'id', 'name'))

In [None]:
# Restrict processing to the [START_IDX, END_IDX) slice of the applications.
tuples = tuples[START_IDX:END_IDX]

In [None]:
# In debug mode the slice is replaced by a hand-written list.
# NOTE(review): the placeholder entry is a plain string, while the processing
# loops below read 'year' / 'id' / 'name' from each entry — restore dict
# entries before running with debug enabled.
if debug:
    tuples = [
        "# The content is removed due to confidential concerns."
    ]

In [None]:
# Number of applications that will be processed in this run.
len(tuples)

In [None]:
# Source tables for the data-sheet and recommendation-letter extractors below.
df_achievements = D.read_df_achievements()
df_recommendation_letters = D.read_df_recommendation_letters()

In [None]:
def get_chunks_and_sents_from_data_sheet(_year, _id):
    """Collect the data-sheet achievements of one applicant.

    Args:
        _year: value matched against the 'year' column of df_achievements.
        _id: value matched against the 'id' column of df_achievements.

    Returns:
        (chunks, sents): `chunks` holds the 'achievement' values of the
        applicant's rows; `sents` holds "<achievement>，<achievement_result>"
        for the same rows. Both are empty lists when the applicant has no rows
        or a required column is missing.
    """
    row = df_achievements.query('`year` == {} and `id` == {}'.format(_year, _id))

    try:
        achievements = row['achievement'].to_list()
        results = row['achievement_result'].to_list()
    except KeyError:
        # A required column is absent: treat the applicant as having no entries
        # (other errors are no longer swallowed by a bare except).
        return [], []

    ## [TODO] deal with nan achievement result
    sents = ["{}，{}".format(a, r) for a, r in zip(achievements, results)]

    return achievements, sents

In [None]:
def get_chunks_and_sents_from_self_statement(_year, _id):
    """Fetch the self-statement chunks and sentences of one applicant.

    Args:
        _year: value matched against the 'year' column of df_applications.
        _id: value matched against the 'id' column of df_applications.

    Returns:
        (chunks, sents): the 'self_statement_chunk' and 'self_statement_sent'
        values of the first matching row. Either one is replaced by an empty
        list when it is None; both are empty lists when no row matches or a
        column is missing.
    """
    row = df_applications.query('`year` == {} and `id` == {}'.format(_year, _id))

    try:
        chunks = row['self_statement_chunk'].to_list()[0]
        sents = row['self_statement_sent'].to_list()[0]
    except (KeyError, IndexError):
        # KeyError: column missing; IndexError: no application row for this applicant.
        chunks = []
        sents = []

    # Identity test instead of `== None`: the stored value may be an array, for
    # which `==` is element-wise and cannot be used as a condition.
    if chunks is None:
        chunks = []
    if sents is None:
        sents = []

    return chunks, sents

In [None]:
def get_chunks_and_sents_from_recommendation_letter(_year, _id):
    """Collect chunks and sentences from all recommendation letters of one applicant.

    Args:
        _year: value matched against the 'year' column of df_recommendation_letters.
        _id: value matched against the 'id' column of df_recommendation_letters.

    Returns:
        (chunks, sents): the per-letter 'all_paragraph_chunk' and
        'all_paragraph_sent' values of every matching row, flattened into one
        list each, in row order. Letters whose value is None are skipped. Both
        lists are empty when no row matches or a column is missing.
    """
    rows = df_recommendation_letters.query('`year` == {} and `id` == {}'.format(_year, _id))

    try:
        rls_chunks = rows['all_paragraph_chunk'].to_list()
        rls_sents = rows['all_paragraph_sent'].to_list()
    except KeyError:
        # A required column is absent: treat the applicant as having no letters.
        rls_chunks = []
        rls_sents = []

    # A letter without extracted paragraphs (None) would make the flattening fail.
    chunks = list(chain.from_iterable(c for c in rls_chunks if c is not None))
    sents = list(chain.from_iterable(s for s in rls_sents if s is not None))

    return chunks, sents

In [None]:
def get_topic_prediction(topic_model, chunks, n_neighbors, method="k", radius=0.02):
    """Predict a topic for each chunk and look up its nearest split comments.

    The chunks are embedded with the topic model's embedding backend, reduced
    with its `umap_model`, and classified by a neighbor classifier fitted on
    `reduced_split_comments_embeds` / `topic_labels`.

    Args:
        topic_model: fitted BERTopic model (provides `embedding_model` and `umap_model`).
        chunks: texts to classify.
        n_neighbors: number of neighbors used by the "k" classifier and
            returned per chunk in the neighbor index matrix.
        method: "k" for a k-nearest-neighbor classifier, "r" for a
            radius-based classifier.
        radius: neighborhood radius for method "r"; chunks without any
            neighbor inside it receive the label -1.

    Returns:
        (predicted_topics, predicted_confs, predicted_neighbors_idx), where the
        last item holds, per chunk, the positions of its `n_neighbors` nearest
        split comments in the reduced space.

    Raises:
        ValueError: if `method` is neither "k" nor "r".
    """
    # Local import: these two classes are only needed for the "r" branch.
    from sklearn.neighbors import NearestNeighbors, RadiusNeighborsClassifier

    if method == "k":
        neigh = KNeighborsClassifier(n_neighbors=n_neighbors)
        neigh.fit(reduced_split_comments_embeds, topic_labels)
        neighbor_index = neigh
    elif method == "r":
        # KNeighborsClassifier accepts neither `radius` nor `outlier_label`;
        # the radius-based variant is a separate class.
        neigh = RadiusNeighborsClassifier(radius=radius, outlier_label=-1)
        neigh.fit(reduced_split_comments_embeds, topic_labels)
        # The radius classifier has no kneighbors(), so the fixed-size neighbor
        # lookup uses its own index over the same points.
        neighbor_index = NearestNeighbors(n_neighbors=n_neighbors)
        neighbor_index.fit(reduced_split_comments_embeds)
    else:
        raise ValueError("method must be 'k' or 'r', got {!r}".format(method))

    ## get reduce chunk embeddings
    chunk_embeds = topic_model.embedding_model.embed(chunks)
    chunk_reduced_embeds = topic_model.umap_model.transform(chunk_embeds)

    ## predict topic and confidence
    predicted_topics = neigh.predict(chunk_reduced_embeds)
    predicted_confs = neigh.predict_proba(chunk_reduced_embeds)
    predicted_neighbors_idx = neighbor_index.kneighbors(
        chunk_reduced_embeds, n_neighbors=n_neighbors, return_distance=False
    )

    return predicted_topics, predicted_confs, predicted_neighbors_idx

In [None]:
def calculate_uniqueness_score(chunks, predicted_neighbors_sc_idx):
    """Score each chunk from the scores of its neighboring split comments.

    For every chunk, each of the four per-comment scores (uniqueness, iaf, ccr,
    iccr) of its neighbors is combined as a dot product with the cosine
    similarity between the chunk and each neighbor. The weights are not
    normalized.

    Args:
        chunks: texts to score.
        predicted_neighbors_sc_idx: per chunk, the positions of its neighbors
            in `split_comments` (and therefore in the per-comment score lists
            and in `split_comments_embeds`).

    Returns:
        (uniqueness_score, iaf_score, ccr_score, iccr_score): four lists with
        one value per chunk.
    """
    chunks_embed = sbert_model.encode(chunks, batch_size=128, show_progress_bar=False)
    uniqueness_score = []
    iaf_score = []
    ccr_score = []
    iccr_score = []

    for chunk_embed, pred_neigh_idx in zip(chunks_embed, predicted_neighbors_sc_idx):
        ## find neighbors and their corresponding uniqueness score
        neighbors_uniquenss = [split_comments_uniqueness[idx] for idx in pred_neigh_idx]
        neighbors_iaf = [split_comments_iaf[idx] for idx in pred_neigh_idx]
        neighbors_ccr = [split_comments_ccr[idx] for idx in pred_neigh_idx]
        neighbors_iccr = [split_comments_iccr[idx] for idx in pred_neigh_idx]

        ## calculate the semantic similarity between chunk and neighbor as uniqueness weight
        # The neighbors are split comments, whose embeddings were already
        # computed once with the same model (split_comments_embeds); look them
        # up instead of encoding the same texts again for every chunk.
        neighbors_embed = split_comments_embeds[np.asarray(pred_neigh_idx, dtype=int)]
        uniqueness_weight = cos_sim(chunk_embed, neighbors_embed).reshape(-1)

        ## calculate chunk uniqueness with uniqueness weight
        uniqueness_score.append(np.dot(neighbors_uniquenss, uniqueness_weight))
        iaf_score.append(np.dot(neighbors_iaf, uniqueness_weight))
        ccr_score.append(np.dot(neighbors_ccr, uniqueness_weight))
        iccr_score.append(np.dot(neighbors_iccr, uniqueness_weight))

    return uniqueness_score, iaf_score, ccr_score, iccr_score

In [None]:
def calculate_candidate_sents_score(
    sents,
    chunks,
    topic_model,
    n_neighbors = 25
):
    """Score the chunks of one document and bundle them with their sentences.

    Args:
        sents: sentences of the document; returned unchanged.
        chunks: chunks to score; may be None or empty.
        topic_model: fitted BERTopic model passed on to get_topic_prediction.
        n_neighbors: number of neighboring split comments used per chunk.

    Returns:
        A dict with the keys 'sents', 'chunks', 'uniqueness_score',
        'iaf_score', 'ccr_score' and 'iccr_score'. Every value is an empty list
        when there is nothing to score.
    """
    # `is None` rather than `== None`: for an array the equality test is
    # element-wise and cannot be used in a boolean condition.
    if chunks is None or len(chunks) == 0:
        return {
            "sents": [],
            "chunks": [],
            "uniqueness_score": [],
            "iaf_score": [],
            "ccr_score": [],
            "iccr_score": [],
        }

    # Only the neighbor indices are needed; predicted topics and confidences are ignored.
    _, _, predicted_neighbors_sc_idx = get_topic_prediction(
        topic_model, chunks, n_neighbors
    )

    uniqueness_score, iaf_score, ccr_score, iccr_score = calculate_uniqueness_score(
        chunks, predicted_neighbors_sc_idx
    )

    return {
        "sents": sents,
        "chunks": chunks,
        "uniqueness_score": uniqueness_score,
        "iaf_score": iaf_score,
        "ccr_score": ccr_score,
        "iccr_score": iccr_score,
    }

In [None]:
def find_summary_candidate_pipe(info, get_chunks_and_sents_and_refs_func, debug=False):
    """Extract one kind of document for an applicant and score its chunks.

    Args:
        info: mapping with the keys 'year', 'id' and 'name'.
        get_chunks_and_sents_and_refs_func: callable taking (year, id) and
            returning (chunks, sents).
        debug: accepted for interface compatibility; not used.

    Returns:
        The dict produced by calculate_candidate_sents_score for the extracted
        sentences and chunks, using the module-level topic_model.

    Raises:
        KeyError: if `info` lacks 'year', 'id' or 'name'.
    """
    # All three identifying fields are read up front, so an incomplete record
    # fails here, before any extraction work is done.
    applicant_key = (info['year'], info['id'], info['name'])
    year_value, id_value = applicant_key[0], applicant_key[1]

    ## get chunks and sents
    chunks, sents = get_chunks_and_sents_and_refs_func(year_value, id_value)

    ## [TODO] calculate importance score for each summary
    return calculate_candidate_sents_score(sents, chunks, topic_model)

In [None]:
def merge_chunk_debug_info(old_info, new_info):
    """Append the scored chunks in `new_info` to those in `old_info`.

    Args:
        old_info: previously accumulated result dict, or an empty dict.
        new_info: result dict with the keys 'sents', 'chunks',
            'uniqueness_score', 'iaf_score', 'ccr_score' and 'iccr_score'.

    Returns:
        `new_info` itself when `old_info` holds no chunks, `old_info` itself
        when `new_info` holds no chunks, otherwise a new dict in which 'chunks'
        is a list and every other value is a concatenated numpy array (old
        entries first).
    """
    # Emptiness is tested with len() so that chunk containers other than plain
    # lists (e.g. arrays) are handled as well.
    ## if old info is empty, return new info
    if not old_info or len(old_info['chunks']) == 0:
        return new_info

    ## if new info is empty, return old info
    if len(new_info['chunks']) == 0:
        return old_info

    info = {}

    info['sents'] = np.concatenate((old_info['sents'], new_info['sents']))
    # list(...) keeps this a concatenation even if one side is an array, for
    # which `+` would mean element-wise addition.
    info['chunks'] = list(old_info['chunks']) + list(new_info['chunks'])
    for score_key in ('uniqueness_score', 'iaf_score', 'ccr_score', 'iccr_score'):
        info[score_key] = np.concatenate((old_info[score_key], new_info[score_key]))

    return info

In [None]:
# Accumulated results per applicant, keyed by (year, id, name). A missing key
# starts out as an empty dict, which merge_chunk_debug_info treats as "no data yet".
chunk_debug_info_buffer = defaultdict(dict)

In [None]:
IO.print_dividing_line()
IO.print_dividing_line("Processing data sheet ...")

# Pass 1 of 3: score the data-sheet achievements of every selected applicant and
# merge them into the per-applicant buffer.
for dict_info in tqdm(tuples):
    idx = dict_info_to_tuple_info(dict_info)
    chunk_debug_info = find_summary_candidate_pipe(
        dict_info, get_chunks_and_sents_from_data_sheet
    )

    chunk_debug_info_buffer[idx] = merge_chunk_debug_info(chunk_debug_info_buffer[idx], chunk_debug_info)
#     IO.print_dividing_line()

In [None]:
IO.print_dividing_line()
IO.print_dividing_line("Processing self-statement ...")

# Pass 2 of 3: score the self-statement of every selected applicant and append
# the result to what the buffer already holds for that applicant.
for dict_info in tqdm(tuples):
    idx = dict_info_to_tuple_info(dict_info)
    chunk_debug_info = find_summary_candidate_pipe(
        dict_info, get_chunks_and_sents_from_self_statement
    )

    chunk_debug_info_buffer[idx] = merge_chunk_debug_info(chunk_debug_info_buffer[idx], chunk_debug_info)
#     IO.print_dividing_line()

In [None]:
IO.print_dividing_line()
IO.print_dividing_line("Processing recommendation letter ...")

# Pass 3 of 3: score the recommendation letters of every selected applicant and
# append the result to what the buffer already holds for that applicant.
for dict_info in tqdm(tuples):
    idx = dict_info_to_tuple_info(dict_info)
    chunk_debug_info = find_summary_candidate_pipe(
        dict_info, get_chunks_and_sents_from_recommendation_letter
    )

    chunk_debug_info_buffer[idx] = merge_chunk_debug_info(chunk_debug_info_buffer[idx], chunk_debug_info)
#     IO.print_dividing_line()

In [None]:
# NOTE(review): a bare expression nested inside `if` is presumably not rendered
# as cell output (only a top-level trailing expression is) — confirm, and wrap
# it in print()/display() if output is wanted.
if debug:
    chunk_debug_info_buffer

In [None]:
# Sanity check: for every applicant, each score list has exactly one entry per chunk.
for _, _dict in chunk_debug_info_buffer.items():
    expected_len = len(_dict['chunks'])
    for score_key in ('uniqueness_score', 'iaf_score', 'ccr_score', 'iccr_score'):
        assert len(_dict[score_key]) == expected_len

In [None]:
if debug:
    # Per applicant: print the chunks from the highest to the lowest uniqueness score.
    for info, chunk_uni_dict in chunk_debug_info_buffer.items():
        print(info)

        chunks = chunk_uni_dict['chunks']
        uniqueness_score = chunk_uni_dict['uniqueness_score']

        descending = np.argsort(uniqueness_score)[::-1]
        for idx in descending:
            print(chunks[idx], uniqueness_score[idx])

        IO.print_dividing_line()

In [None]:
## store data
# Debug runs write one fixed file; regular runs write one file per index range.
if debug:
    fn = "{}_chunk_uniqueness_debug.pkl".format(TRAIN_OR_ALL)
else:
    fn = "{}_chunk_uniqueness_{:04d}_to_{:04d}.pkl".format(TRAIN_OR_ALL, START_IDX, END_IDX)

# Both modes share the same output directory.
_dir = os.path.join(P.FP_UNIQUENESS_PSEUDO_SUMMARY_DIR, 'custom_bertopic', TRAIN_OR_ALL, 'all_data')

# exist_ok=True replaces the separate existence check, which could fail with
# FileExistsError when two runs (different index ranges) create the directory
# at the same time.
os.makedirs(_dir, exist_ok=True)

all_data_fp = os.path.join(_dir, fn)

with open(all_data_fp, 'wb') as f:
    pickle.dump(chunk_debug_info_buffer, f)

In [None]:
# chunk load test
# Read back the file this debug run has just written. Debug runs store their
# result as "<split>_chunk_uniqueness_debug.pkl"; the index-range file name used
# here before is only written by non-debug runs.
if debug:
    fn = "{}_chunk_uniqueness_debug.pkl".format(TRAIN_OR_ALL)
    _dir = os.path.join(P.FP_UNIQUENESS_PSEUDO_SUMMARY_DIR, 'custom_bertopic', TRAIN_OR_ALL, 'all_data')

    all_data_fp = os.path.join(_dir, fn)

    # pickle.load executes code embedded in the file; only load files produced
    # by this pipeline.
    with open(all_data_fp, 'rb') as f:
        _dict = pickle.load(f)

In [None]:
# Load the stored result for the fixed index range 0..100 for inspection.
# NOTE(review): the range is hard-coded and independent of START_IDX / END_IDX,
# so this raises FileNotFoundError unless that file was written by an earlier
# run — confirm whether this cell should stay in a batch run.
fn = "{}_chunk_uniqueness_{:04d}_to_{:04d}.pkl".format(TRAIN_OR_ALL, 0, 100)
_dir = os.path.join(P.FP_UNIQUENESS_PSEUDO_SUMMARY_DIR, 'custom_bertopic', TRAIN_OR_ALL, 'all_data')

all_data_fp = os.path.join(_dir, fn)

with open(all_data_fp, 'rb') as f:
    _dict = pickle.load(f)

In [None]:
# Display the loaded result dict.
_dict