In [17]:
from tqdm.notebook import tqdm
import re
from collections import Counter
import math
import pickle

from sqlalchemy import text
from collections import defaultdict

from konlpy.tag import Okt
# Single shared Okt tagger instance; preprocess_txt() calls okt.pos() on it.
okt = Okt()

from utils.DB_connect import get_engine
# Database engine used by the cell that loads library_reviews.
engine = get_engine()

In [3]:
def preprocess_txt(corpus: str, use_pos=('Noun', 'Hashtag')):
    """Turn one raw review into a ``{keyword: count}`` dict.

    Steps: normalise whitespace, drop zero-width spaces and HTML tags,
    POS-tag with the module-level ``okt`` tagger, keep only tokens whose POS
    is in ``use_pos``, strip every character that is not Hangul, an ASCII
    letter or whitespace, and discard tokens shorter than two characters.

    Args:
        corpus: Raw review text. ``None`` or an empty string is tolerated
            (e.g. a NULL database column) and yields an empty dict.
        use_pos: POS tags to keep. Defaults to nouns and hashtags; pass e.g.
            ``('Noun', 'Verb', 'Adjective', 'Hashtag')`` to widen the filter.

    Returns:
        dict mapping each kept keyword to its number of occurrences, in
        order of first appearance.
    """
    # A NULL/empty review has no keywords; bail out before touching the tagger.
    if not corpus:
        return {}

    # Collapse whitespace runs to a single space. In Python 3 `\s` already
    # matches NBSP (U+00A0), so no separate NBSP replacement is needed.
    # Doing this first also lets the non-DOTALL tag regex below match tags
    # that were originally split across lines.
    txt_clean = re.sub(r'\s+', ' ', corpus)
    # Remove zero-width spaces (not covered by `\s`).
    txt_clean = txt_clean.replace('\u200b', '')
    # Replace HTML elements with a space (not '') so the words on either
    # side of a tag such as <br> are not glued into one token.
    txt_clean = re.sub(r'<.*?>', ' ', txt_clean)
    # Tag removal can leave doubled/edge spaces; normalise once more.
    txt_clean = re.sub(r'\s+', ' ', txt_clean).strip()
    if not txt_clean:
        return {}

    allowed_pos = frozenset(use_pos)
    keywords = []
    # norm=True applies orthographic normalization
    # stem=True reduces to word stem
    for token, pos in okt.pos(txt_clean, norm=True, stem=True):
        if pos not in allowed_pos:
            continue
        # Remove special characters (anything but Hangul / ASCII letters / whitespace).
        token = re.sub(r'[^가-힣a-zA-Z\s]', '', token)
        # Drop single-character (or now-empty) tokens.
        if len(token) > 1:
            keywords.append(token)

    return dict(Counter(keywords))

In [4]:
# Fetch every (library_id, review) row while the connection is still open.
# A Result must be consumed inside the `with` block: once the connection is
# closed the underlying cursor may no longer be readable, depending on driver.
with engine.connect() as conn:
    review_rows = conn.execute(text(
        "SELECT `library_id`, `review` FROM library_reviews"
    )).all()

# Derive the progress-bar total from the rows actually fetched rather than a
# separate COUNT(*) query, which costs a round-trip and can disagree with them.
data_len = len(review_rows)

# One [library_id, {keyword: count}] entry per review.
review_processed = [
    [library_id, preprocess_txt(review)]
    for library_id, review in tqdm(review_rows, total=data_len)
]

  0%|          | 0/17313 [00:00<?, ?it/s]

In [11]:
# Peek at the first entry: [library_id, {keyword: count}].
review_processed[0]

[5,
 {'용흥': 1,
  '중앙': 1,
  '용흥동': 2,
  '분양': 2,
  '전환': 1,
  '아파트': 1,
  '가이드': 1,
  '포항시': 1,
  '북구': 1,
  '사업': 1,
  '골프': 1,
  '연습장': 1,
  '도서관': 1,
  '사우나': 1,
  '여러': 1,
  '가지': 1,
  '공간': 1,
  '계획': 1,
  '코인': 1,
  '세탁실': 1}]

In [12]:
def get_tf_idf(docs: list[dict], top_k: int = 5, per_doc_top: int = 10) -> list[tuple[str, float]]:
    """Return the highest-scoring TF-IDF keywords across a group of documents.

    Each document is a ``{word: count}`` dict. For every document the score of
    a word is ``count * log(N / (1 + df))`` where ``N`` is the number of
    documents and ``df`` the number of documents containing the word. Note
    that with this smoothing a word present in every document gets a slightly
    negative score.

    The best ``per_doc_top`` words of each document are pooled, each word is
    kept once with its highest score, and the ``top_k`` best are returned.
    Ties are broken by first appearance (document order, then key order
    inside the document), so the result is reproducible across runs.

    Args:
        docs: List of ``{word: count}`` dicts, one per document.
        top_k: Number of (word, score) pairs to return.
        per_doc_top: Number of top-scoring words taken from each document
            before pooling.

    Returns:
        List of at most ``top_k`` ``(word, score)`` tuples, sorted by
        descending score. Empty when ``docs`` is empty.
    """
    doc_len = len(docs)
    if doc_len == 0:
        return []

    # Document frequency in a single pass: keys are unique within a dict, so
    # counting keys counts documents. Insertion order is deterministic,
    # unlike iterating a set of strings.
    df_counts = Counter(word for blog in docs for word in blog)
    idf = {word: math.log(doc_len / (1 + df)) for word, df in df_counts.items()}

    best_score = {}
    for blog in docs:  # TF-IDF scores are computed per document (blog).
        # Only words that occur in this document can have a non-zero TF.
        scored = [(word, tf * idf[word]) for word, tf in blog.items() if tf > 0]
        scored.sort(key=lambda pair: pair[1], reverse=True)
        for word, score in scored[:per_doc_top]:
            # Keep one entry per word (its best score) so a keyword cannot
            # occupy several slots of the final ranking.
            if word not in best_score or score > best_score[word]:
                best_score[word] = score

    ranked = sorted(best_score.items(), key=lambda pair: pair[1], reverse=True)
    return ranked[:top_k]

In [13]:
# Smoke-test the scorer on the keyword counts of the first five reviews.
get_tf_idf([counts for _lib_id, counts in review_processed[:5]])

[('중개사', 2.7488721956224653),
 ('공인', 2.7488721956224653),
 ('사무소', 2.7488721956224653),
 ('신태인', 2.7488721956224653),
 ('용흥동', 1.8325814637483102)]

In [14]:
# Bucket the per-review keyword-count dicts under their library id,
# giving {library_id: [counts_of_review_1, counts_of_review_2, ...]}.
grouped = defaultdict(list)
for lib_id, review in review_processed:
    grouped[lib_id].append(review)

In [25]:
# Score each library on its own: get_tf_idf() only sees that library's
# reviews, so document frequencies are computed per library.
tfidf_per_lib = {
    lib_key: get_tf_idf(review_counts)
    for lib_key, review_counts in grouped.items()
}

In [27]:
# Preview the top keywords of library ids 1..24, one line per id.
# A plain loop replaces the side-effect list comprehension, so no trailing
# print() is needed to hide a list of None.
# .get() prints an empty list for an id with no reviews instead of raising
# KeyError.
for preview_id in range(1, 25):
    print(tfidf_per_lib.get(preview_id, []))

[('버드', 2.7488721956224653), ('개나리', 1.8325814637483102), ('중앙', 1.8325814637483102), ('잇님들', 0.9162907318741551), ('이번', 0.9162907318741551)]
[('강동', 1.8325814637483102), ('이용', 1.8325814637483102), ('성내', 1.0216512475319814), ('별관', 1.0216512475319814), ('본관', 1.0216512475319814)]
[('설치', 1.8325814637483102), ('참여', 1.8325814637483102), ('부대', 1.8325814637483102), ('해병대', 1.8325814637483102), ('자원봉사', 1.8325814637483102)]
[('여기', 1.8325814637483102), ('서울시', 1.8325814637483102), ('호공', 1.0216512475319814), ('한강', 0.9162907318741551), ('정류장', 0.9162907318741551)]
[('재개발', 1.8325814637483102), ('세대', 1.8325814637483102), ('대수', 1.8325814637483102), ('설명', 1.8325814637483102), ('주민', 1.5324768712979722)]
[('아주', 1.8325814637483102), ('동작', 0.9162907318741551), ('동사무소', 0.9162907318741551), ('부슬부슬', 0.9162907318741551), ('진성', 0.9162907318741551)]
[('얼굴', 2.7488721956224653), ('전쟁', 2.7488721956224653), ('도서', 1.8325814637483102), ('진행', 1.8325814637483102), ('수업', 1.8325814637483102)]
[

In [28]:
# Serialize tfidf_per_lib to ./data/tfidf_score.pkl.
# NOTE(review): the path is relative to the kernel's working directory, and
# open() does not create a missing ./data directory — confirm it exists.
with open('./data/tfidf_score.pkl', 'wb') as f:
    pickle.dump(tfidf_per_lib, f)