In [1]:
import os
import json
import pandas as pd
from math import log
from tqdm import tqdm
from kss import split_sentences
from transformers import ElectraModel, ElectraTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import defaultdict
from pprint import pprint

In [2]:
# Lift pandas' display limits: a value of None means "no limit", so every
# column and every row is shown when a DataFrame is rendered.
pd.options.display.max_columns = None
pd.options.display.max_rows = None

In [3]:
# Upper bound on tokens per input; extract_words passes it to the tokenizer
# together with truncation=True.
max_length = 512

# Tokenizer shared by the word-extraction helpers.
# NOTE(review): 'brainelectra-base-discriminator' is presumably a local
# directory or a hub model id — confirm it resolves in a fresh environment.
tokenizer = ElectraTokenizer.from_pretrained('brainelectra-base-discriminator')

In [4]:
# Directory whose entries are read as JSON files by the cells below, and the
# names of those entries (os.listdir returns them in arbitrary order).
paragraph_data_dir = '../valid_data_question_seperated_paragraph/'
data_list = os.listdir(paragraph_data_dir)

# Shared accumulators filled in by the cells below.
tf_vocab = {}            # id -> vocabulary words collected for that id
tf_table = {}            # id -> list of TF rows, one row per file with that id
word_book = {}           # not used by the visible cells
paragraph_keywords = []  # loaded JSON objects with a 'keywords' entry added
valid_id = []            # ids that have already been processed by the TF-IDF cell

In [5]:
def tf(t, d):
	"""Return the raw term frequency of ``t`` in ``d``.

	``d`` is any object with a ``count`` method (e.g. a list of words);
	the result is ``d.count(t)``.
	"""
	occurrences = d.count(t)
	return occurrences

def idf(df):
	"""Compute one smoothed inverse document frequency per column of ``df``.

	Iterating ``df`` must yield column keys and ``df[key]`` must be iterable
	over that column's term counts. For each column the result is
	``log((len(df) + 1) / (k + 1))`` where ``k`` is the number of entries
	greater than zero.

	Returns:
		list of floats, in the iteration order of ``df``.
	"""
	total = len(df) + 1
	weights = []
	for column in df:
		# Number of entries in which the term occurs at least once.
		hits = sum(1 for value in df[column] if value > 0)
		weights.append(log(total / (hits + 1)))
	return weights

In [6]:
def _extract_words(input_ids):
	"""Decode every token id in ``input_ids['input_ids']`` and keep the words.

	Each id is decoded on its own with the module-level ``tokenizer``. A
	decoded string is dropped when it equals ``'[ C L S ]'`` or
	``'[ S E P ]'``, when it contains ``'#'``, or when it is at most one
	character long.

	Returns:
		list of the decoded strings that were kept, in encounter order.
	"""
	skipped_markers = ('[ C L S ]', '[ S E P ]')
	collected = []
	for sequence in input_ids['input_ids']:
		for token_id in sequence:
			piece = tokenizer.decode(token_id)
			if piece in skipped_markers:
				continue
			# Pieces containing '#' and single-character pieces are discarded.
			if '#' in piece or len(piece) <= 1:
				continue
			collected.append(piece)
	return collected

In [7]:
def extract_words(content: str) -> list:
	"""Tokenize ``content`` and return the words kept by ``_extract_words``.

	The text is encoded with the module-level ``tokenizer`` (PyTorch tensors,
	padding enabled, truncated to ``max_length`` tokens).
	"""
	encoded = tokenizer(
		content,
		return_tensors='pt',
		padding=True,
		truncation=True,
		max_length=max_length,
	)
	words = _extract_words(encoded)
	return words

In [8]:
def make_tf_vocab():
	"""Build the per-id vocabulary in ``tf_vocab`` and save it to ``tf_vocab.json``.

	Every entry of ``data_list`` is loaded as JSON from ``paragraph_data_dir``;
	the words extracted from its ``'answer'`` field are collected under its
	``'id'``. Each id's word list is then de-duplicated and sorted, and the
	whole mapping is written to ``tf_vocab.json`` in the working directory.

	Side effects:
		Mutates the module-level ``tf_vocab`` dict and writes ``tf_vocab.json``.
	"""
	for d in tqdm(data_list, desc='TF_VOCAB_LISTING'):
		file_name = paragraph_data_dir + d
		# Explicit UTF-8: the locale default is platform dependent.
		with open(file_name, encoding='utf-8') as file:
			json_file = json.load(file)
		doc_id = json_file['id']
		tf_vocab.setdefault(doc_id, []).extend(extract_words(content=json_file['answer']))
	for k in tqdm(tf_vocab.keys(), desc='TF_VOCAB_UINQUE'):
		# sorted() rather than list(set(...)): set iteration order of strings
		# changes between interpreter runs, which would make the vocabulary
		# (and therefore the TF column order) non-reproducible.
		tf_vocab[k] = sorted(set(tf_vocab[k]))
	# ensure_ascii=False writes non-ASCII characters verbatim, so the file
	# encoding must be able to represent them.
	with open('tf_vocab.json', 'w', encoding='utf-8') as f:
		json.dump(tf_vocab, f, ensure_ascii=False, indent=4, sort_keys=True)

In [9]:
# Populate tf_vocab from the paragraph files and write tf_vocab.json.
make_tf_vocab()

TF_VOCAB_LISTING: 100%|██████████| 31268/31268 [01:07<00:00, 463.17it/s]
TF_VOCAB_UINQUE: 100%|██████████| 6597/6597 [00:00<00:00, 42011.31it/s]


In [10]:
# Build the term-frequency table: for every paragraph file, one row of counts
# whose columns follow the order of tf_vocab[<id>].
# Start from an empty table so that re-running this cell does not append
# duplicate rows (later cells index rows by position).
tf_table.clear()

for d in tqdm(data_list, desc='TF'):
	file_name = paragraph_data_dir + d
	# Explicit UTF-8: the locale default is platform dependent.
	with open(file_name, encoding='utf-8') as file:
		json_file = json.load(file)
	doc_id = json_file['id']
	docs = extract_words(content=json_file['answer'])

	# TF: count of each vocabulary word of this id in the current paragraph.
	tf_table.setdefault(doc_id, []).append([tf(t, docs) for t in tf_vocab[doc_id]])

	

TF: 100%|██████████| 31268/31268 [01:15<00:00, 412.36it/s]


In [16]:
# Pick the keywords of every paragraph: the vocabulary words whose TF-IDF
# score is the highest in that paragraph's row.
#
# Reset the accumulators so that re-running this cell neither duplicates
# results nor reuses state left over from a previous run.
paragraph_keywords.clear()
valid_id.clear()

keywords_by_id = dict()      # id -> list of keyword lists, one per TF row
next_row = defaultdict(int)  # id -> position of the next unconsumed TF row

for d in tqdm(data_list, desc='TF-IDF'):
	file_name = paragraph_data_dir + d
	# Explicit UTF-8: the locale default is platform dependent.
	with open(file_name, encoding='utf-8') as file:
		keyword_json = json.load(file)
	doc_id = keyword_json['id']

	if doc_id not in keywords_by_id:
		valid_id.append(doc_id)
		# IDF: one weight per vocabulary word of this id.
		tf_frame = pd.DataFrame(tf_table[doc_id], columns=tf_vocab[doc_id])
		idf_ = pd.Series(idf(tf_frame), index=tf_frame.columns, dtype=float)
		# TF-IDF: scale each column exactly once by (idf + 1).
		tfidf = tf_frame.mul(idf_ + 1, axis='columns')

		row_keywords = []
		for _, scores in tfidf.iterrows():
			max_val = scores.max()
			if max_val > 0:
				row_keywords.append([c for c, score in scores.items() if score == max_val])
			else:
				# No vocabulary word occurs in this paragraph (or the
				# vocabulary is empty): it has no keywords.
				row_keywords.append([])
		keywords_by_id[doc_id] = row_keywords

	# Rows of tf_table[doc_id] were appended in data_list order, so the n-th
	# file seen for an id corresponds to row n — independent of whether the
	# files of one id are adjacent in data_list.
	cur = next_row[doc_id]
	next_row[doc_id] += 1
	keyword_json['keywords'] = keywords_by_id[doc_id][cur]
	paragraph_keywords.append(keyword_json)


TF-IDF: 100%|██████████| 31268/31268 [37:54<00:00, 13.75it/s] 


In [17]:
# Persist the paragraphs with their keywords. UTF-8 is requested explicitly
# because ensure_ascii=False writes non-ASCII characters verbatim and the
# locale's default encoding may not be able to represent them.
with open('tf-idf_keyword_paragraph.json', 'w', encoding='utf-8') as f:
	json.dump(paragraph_keywords, f, ensure_ascii=False, indent=4, sort_keys=True)