블로그 리뷰 샘플 10개를 가지고 시험

In [None]:
from tqdm.notebook import tqdm
import re
from collections import Counter
import math

# Load the whole review corpus into memory as a single UTF-8 string.
with open('./data/lib1.txt', encoding='utf-8') as corpus_file:
    txt = corpus_file.read()

In [None]:
# Normalize the raw text before sentence splitting / tokenization.
# Drop zero-width spaces (U+200B) first: `\s` does not match them, so removing
# them after the whitespace collapse could leave doubled or trailing spaces.
txt_clean = txt.replace('\u200b', '')
# Collapse every whitespace run (tabs, newlines, NBSP, ...) into one space and
# trim both ends; for str patterns `\s` already covers U+00A0, so no separate
# NBSP replacement is needed.
txt_clean = re.sub(r'\s+', ' ', txt_clean).strip()
# print(txt_clean)

In [None]:
from transformers import pipeline

# Sentiment classifier backed by a Korean BERT checkpoint.
classifier = pipeline(
    task="sentiment-analysis",
    model="WhitePeak/bert-base-cased-Korean-sentiment",
)

In [None]:
import spacy

# small model
nlp = spacy.load("ko_core_news_sm")
# large model for desktop use (nvidia-smi)
# nlp = spacy.load("ko_core_news_lg")

# Append spaCy's sentencizer component to the end of the loaded pipeline.
# NOTE(review): presumably the loaded model already assigns sentence
# boundaries through its parser, in which case a sentencizer added last may
# not change them — TODO confirm against the spaCy Sentencizer docs.
nlp.add_pipe("sentencizer")  # rule-based

In [None]:
# Run the loaded spaCy pipeline over the whole cleaned corpus.
doc = nlp(txt_clean)

In [None]:
# Preview the first ten detected sentences, one per line.
# A plain loop is used because the prints are side effects, not list elements.
for sent in list(doc.sents)[:10]:
    print(sent.text)
print()

In [None]:
# Label every sentence of the document with the sentiment classifier.
doc_sents = [span.text for span in doc.sents]
sentiment_cat = []
for sentence in tqdm(doc_sents):
    # Only the first prediction is used; 'LABEL_1' is mapped to 'positive'
    # and any other label to 'negative'.
    top_label = classifier(sentence)[0]['label']
    polarity = 'positive' if top_label == 'LABEL_1' else 'negative'
    sentiment_cat.append((polarity, sentence))

블로그 내용 자체의 연관성과 문장 분리부터 좀 불안한 관계로 sentiment analysis 또한 일관적이지 않은 편임.
sentiment analysis는 사용하지 않기로 판정

In [None]:
# Display the first ten labelled sentences and the remainder as a 2-tuple.
(sentiment_cat[:10], sentiment_cat[10:])

---

단순 count 기반 키워드 추출

In [None]:
from konlpy.tag import Okt
okt = Okt()


def preprocess_txt(corpus: str):
    """Tokenize ``corpus`` with Okt and return cleaned ``(token, pos)`` pairs.

    Tokens are produced with ``norm=True`` (orthographic normalization) and
    ``stem=True`` (reduction to the word stem). Every character other than
    Hangul syllables, ASCII letters and whitespace is then stripped from each
    token, and tokens left with fewer than two characters are dropped.
    """
    cleaned = []
    for token, pos in okt.pos(corpus, norm=True, stem=True):
        # Strip special characters from the token.
        token = re.sub(r'[^가-힣a-zA-Z\s]', '', token)
        # Skip single-character (and emptied) tokens.
        if len(token) > 1:
            cleaned.append((token, pos))

    return cleaned

In [None]:
# Tokenize the cleaned corpus and peek at the first 25 (token, pos) pairs.
lemmas_clean = preprocess_txt(txt_clean)
lemmas_clean[:25]

In [None]:
# Tally how often each part-of-speech tag occurs among the cleaned tokens.
poses = Counter(pos for _, pos in lemmas_clean)
poses

In [None]:
# Parts of speech kept as keyword candidates ('Adverb' is currently excluded).
use_pos = ['Noun', 'Verb', 'Adjective', 'Hashtag']
# Keep only the token text of entries whose tag is in the selection.
lemmas_clean_fin = [token for token, tag in lemmas_clean if tag in use_pos]
len(lemmas_clean_fin)

In [None]:
# Count the selected keywords once and reuse the counter for both views below.
_kw_counts = Counter(lemmas_clean_fin)

# NOTE(review): okt.pos(w)[0] re-tags each keyword out of context and keeps
# only the first resulting token — presumably fine for already-stemmed tokens,
# but verify the tags agree with the ones stored in lemmas_clean.
# (token, pos, count) for keywords seen more than once; rarer ones are skipped.
word_list = [(*okt.pos(w)[0], c)
             for w, c in _kw_counts.items()
             if c > 1]

# Display the 25 most frequent keywords as (token, pos, count).
[(*okt.pos(w)[0], c)
 for w, c in _kw_counts.most_common(25)]

TF-IDF를 통해서 키워드 추출

블로그 하나를 document로 정의

블로그별 top n단어 추출해서 집합하기?

```python
target = '목표 단어'
TF = Counter(blogs['blog A'])[target]  # target count for blog A
IDF = math.log(len(blogs) / (1 + len([blog for blog in blogs.values() if target in blog])))

# tf-idf for blog A is:
TF * IDF
```

나중에는 집합한 TF-IDF들에 대하여 또 TF-IDF를 계산해서 도서관별로 돋보이는 keyword를 찾을 수도? (충분히 vary한다면 될 듯)

In [None]:
# Everything done so far, bundled into a single step.
def preprocess_txt_ex(corpus: str,
                      use_pos=('Noun', 'Verb', 'Adjective', 'Hashtag'),
                      min_count: int = 2):
    """Return ``{keyword: count}`` for the frequent keywords of ``corpus``.

    Tokenization and cleaning are delegated to ``preprocess_txt`` so the two
    functions cannot drift apart.

    Args:
        corpus: Text of one document.
        use_pos: POS tags (as returned by ``preprocess_txt``) whose tokens are
            kept as keywords.
        min_count: Minimum number of occurrences a keyword needs to be kept;
            the default of 2 reproduces the former ``count > 1`` rule.

    Returns:
        Dict mapping each kept keyword to its occurrence count in ``corpus``.
    """
    allowed = frozenset(use_pos)
    keywords = [kw for kw, pos in preprocess_txt(corpus) if pos in allowed]

    word_counts = {w: c
                   for w, c in Counter(keywords).items()
                   if c >= min_count}

    return word_counts

In [None]:
# Group the text per blog for TF-IDF: '[STOP]' is the blog delimiter, and the
# piece after the final delimiter is discarded.
txt_clean_grouped = txt_clean.split('[STOP]')
txt_clean_grouped.pop()
# One {keyword: count} dict per blog.
blogs_clean_grouped = list(map(preprocess_txt_ex, txt_clean_grouped))
print('no. group:', len(blogs_clean_grouped))

In [None]:
# TF-IDF pipeline over the per-blog keyword counts.
# Only the top 10 terms of each blog are kept, then all are ranked together.
# NOTE(review): tfidf_top is never used in the visible code; kept in case a
# later cell reads it.
tfidf_top = list()

# Vocabulary: every distinct keyword of every blog, in first-seen order so the
# ranking of tied scores is reproducible (a set would reorder it per process).
_word_list = list(dict.fromkeys(
    word for blog in blogs_clean_grouped for word in blog
))

# Document frequency: number of blogs whose keyword counts contain the word.
_df_counter = Counter(word for blog in blogs_clean_grouped for word in blog)
df_scores = [(w, _df_counter[w]) for w in _word_list]

doc_len = len(blogs_clean_grouped)
# Smoothed IDF: log(N / (1 + df)). The score is zero or negative for words
# present in all-but-one or all blogs.
idf_scores = [(w, math.log(doc_len / (1 + df))) for w, df in df_scores]
# Lookup by word, so TF and IDF are paired by key rather than by list position.
_idf_by_word = dict(idf_scores)

total_tf_idf_top = list()
for blog in blogs_clean_grouped:  # one TF-IDF score list per document (blog)
    # Only the blog's own keywords can have a non-zero term frequency.
    tfidf_scores = [(w, tf * _idf_by_word[w]) for w, tf in blog.items() if tf > 0]

    tfidf_scores_top = sorted(tfidf_scores, key=lambda x: x[1], reverse=True)[:10]  # top 10
    total_tf_idf_top += tfidf_scores_top

# Overall ranking across blogs; show the 25 highest-scoring (word, score) pairs.
total_tf_idf_top.sort(key=lambda x: x[1], reverse=True)
for entry in total_tf_idf_top[:25]:
    print(entry)
print()