## util.py

In [10]:
%%writefile ./util.py
import urllib.request
import numpy as np
from tqdm import tqdm

def downloadByURL(url, output_path):
    """
    HTTP 파일 다운로드
    """
    class DownloadProgressBar(tqdm):
        def update_to(self, b=1, bsize=1, tsize=None):
            if tsize is not None:
                self.total = tsize
            self.update(b * bsize - self.n)
        
    with DownloadProgressBar(unit='B', unit_scale=True,
                             miniters=1, desc=url.split('/')[-1]) as t:
        urllib.request.urlretrieve(url, filename=output_path, reporthook=t.update_to)
        
def oneHotEncoding(label, classNum):
    """
    라벨링된 int를 oneHot 인코딩한다
    ex) oneHotEncoding(0, 2) -> [1, 0]
    ex) oneHotEncoding(1, 2) -> [0, 1]
    """
    oneHot = [0]*classNum
    oneHot[label] = 1
    return oneHot

def reshape(series, embedding_dim):
    """
    shape 변경
    """
    result = np.array(series.tolist())
    result = result.reshape(result.shape[0], embedding_dim, 1)
    return result

Overwriting ./util.py


## analysis.py

In [2]:
# %%writefile ./analysis.py
import os
import numpy as np
import matplotlib.pyplot as plt
from wordcloud import WordCloud

class Analysis():
     
    def __init__(self, data):
        """
        전체 분석
        """
        self.countAnalysis(data)
        print()
        self.textAnalysis(data)
        print()
        self.showWordCloud(data.text)
        
    def countAnalysis(self, data):
        """
        데이터 수량 조사
        """
        
        labeled_data = data.loc[data.label != -1]
        total_count = len(data) # 전체 데이터 수
        labeled_count = len(labeled_data) # 라벨링 된 데이터 수

        print('> 데이터 수량 조사')
        print(f'전체 데이터 수: {total_count}개')
        print(f'라벨링된 데이터 수: {labeled_count}개')
        for label, count in data.label.value_counts().iteritems():
            print(f'class {label} : {count}개')
    
    def textAnalysis(self, data):
        """
        text 길이 분석
        """
        text_len = data.text.apply(len)
        plt.figure(figsize=(12, 5))
        plt.hist(text_len, bins=200, alpha=0.5, color= 'r', label='length of text')
        plt.legend(fontsize='x-large')
        plt.yscale('log', nonposy='clip')
        plt.title('Log-Histogram of length of text')
        plt.xlabel('Length of text')
        plt.ylabel('Number of text')

        print('> 문장 길이 분석')
        print('문장 길이 최대 값: {}'.format(np.max(text_len)))
        print('문장 길이 최소 값: {}'.format(np.min(text_len)))
        print('문장 길이 평균 값: {:.2f}'.format(np.mean(text_len)))
        print('문장 길이 표준편차: {:.2f}'.format(np.std(text_len)))
        print('문장 길이 중간 값: {}'.format(np.median(text_len)))

        # 사분위의 대한 경우는 0~100 스케일로 되어있음
        print('문장 길이 제 1 사분위: {}'.format(np.percentile(text_len, 25)))
        print('문장 길이 제 3 사분위: {}'.format(np.percentile(text_len, 75)))
            
    def showWordCloud(self, text):
        """
        WordCloud
        """
        # 한글 폰트 깨짐방지
        for font in ["/Library/Fonts/NanumGothic.ttf", "/Library/Fonts/NotoSansCJKkr-Light.otf"]:
            if os.path.isfile(font):
                FONT_PATH = font
                break
        cloud = WordCloud(font_path=FONT_PATH).generate(" ".join(text))
        plt.figure(figsize=(20, 15))
        plt.imshow(cloud)
        plt.axis('off')

## document.py
- [awesome-devblog : feeds](https://awesome-devblog.now.sh/api/korean/people/feeds)

In [2]:
%%writefile ./document.py
import os, re, csv, requests, json
import numpy as np
import pandas as pd
from enum import Enum
from tqdm import trange
from bs4 import BeautifulSoup

class KEYS(Enum):
    # -1 : 아직 라벨링 안함 (default)
    # 0  : 개발과 관련없는 문서
    # 1  : 개발과 관련있는 문서
    LABEL = 'label'
    
    # TAGS + TITLE + DESC
    TEXT = 'text'
    
    # DATA_URL 결과 파싱용 Keys(Beans)
    ID = '_id'
    TITLE = 'title'
    DESC = 'description'
    TAGS = 'tags'
    LINK = 'link'
    
    def getDocKeys():
        return [KEYS.ID.value, KEYS.TITLE.value, KEYS.DESC.value, KEYS.TAGS.value, KEYS.LINK.value]
    
    def getTitleBlackList():
        return ['', 'about']
    
    def getTextKeys():
        return [KEYS.TAGS.value, KEYS.TITLE.value, KEYS.DESC.value]

class Document():
    
    def __init__(self, update=False):
        
        # Constant
        self.DATA_URL = 'https://awesome-devblog.now.sh/api/korean/people/feeds'
        self.DOCUMENTS_PATH = './data/documents.csv'
        self.MAX_REQ_SIZE = 5000
        
        # 기본 폴더 생성
        for path in ['./data', './model', './wv_model']:
            if not os.path.isdir(path):
                os.makedirs(path)
        
        if update:
            self.updateDocs()
        
    def _getTotal(self):
        """
        전체 문서 개수 요청
        """
        res = requests.get(self.DATA_URL, { 'size': 1 })
        res.raise_for_status()
        doc = res.json()
        return doc['total'][0]['count']

    def _reqDoc(self, page, size, preprocessing=False):
        """
        문서 요청
        - page는 0 부터 시작
        - 전처리(self._preprocessing) 후 반환
        """
        page += 1
        params = {
            'sort': 'date.asc',
            'page': page,
            'size': size
        }
        res = requests.get(self.DATA_URL, params)
        res.raise_for_status()
        doc = res.json()
        
        # json to dataframe
        doc = pd.DataFrame(doc['data'], columns=KEYS.getDocKeys())
        
        # add label
        doc.insert(0, KEYS.LABEL.value, -1)
        
        if preprocessing:
            return self._preprocessing(doc)
        else:
            return doc
    
    def _preprocessing(self, doc, joinTags=True):
        """
        문서 전처리
        - KEYS 이외의 key 삭제
        - [tag] list join to string
        - [title / description / tags] 영어, 한글, 공백 이외의 것들 모두 삭제
        - html tag 삭제
        - \n, \r 삭제
        - 2번 이상의 공백 1개로 통합
        - 영어 대문자 소문자로 변환
        - 앞뒤 공백 삭제
        - text 컬럼 생성 : text = tags + title + description
        """
        
        # title, description, tags
        def textPreprocessing(x):
            x = BeautifulSoup(str(x), "html.parser").get_text()
            x = re.sub('[^가-힣a-zA-Z\s]', '', x)
            return x
        
        # all
        def docPreprocessing(x):
            x = re.sub('[\n\r]', '', x)
            x = re.sub('\s{2,}', ' ', x)
            x = x.lower()
            x = x.strip()
            return x
        
        for key in doc.columns:
            if joinTags and KEYS(key) == KEYS.TAGS:
                doc[key] = doc[key].apply(lambda x: ' '.join(x))
            if key in KEYS.getTextKeys():
                doc[key] = doc[key].apply(textPreprocessing)
                
            if key in KEYS.getDocKeys():
                doc[key] = doc[key].apply(docPreprocessing)
            
        # remove blacklist
        doc = doc.drop(doc[doc[KEYS.TITLE.value].isin(KEYS.getTitleBlackList())].index).reset_index()
                        
        # create text column
        join_with = lambda x: ' '.join(x.dropna().astype(str))
        doc[KEYS.TEXT.value] = doc[KEYS.getTextKeys()].apply(
            join_with,
            axis=1
        )
        return doc
        

    def _reqDocs(self, size, start_page=0):
        """
        전체 문서 요청
        """
        total = self._getTotal()
        if size > self.MAX_REQ_SIZE: size = self.MAX_REQ_SIZE
        total_req = round(total/size + 0.5)
        docs = pd.DataFrame()
        for i in trange(start_page, total_req):
            doc = self._reqDoc(i, size)
            if docs.empty:
                docs = doc
            else:
                docs = docs.append(doc)
        return self._preprocessing(docs)
    
    def getDocs(self, labeled_only=True):
        """
        전체 문서 조회
        labeled
        :True = 라벨링 된 데이터만 가져오기
        :False = 전체 데이터 가져오기
        """
        if not os.path.isfile(self.DOCUMENTS_PATH):
            print('> 문서가 없으므로 서버에 요청합니다.')
            self.updateDocs()
        data = pd.read_csv(self.DOCUMENTS_PATH, delimiter=',', dtype={KEYS.LABEL.value: np.int64})
        if not labeled_only:
            return data
        else:
            return data.loc[data.label != -1]
    
    def updateDocs(self):
        """
        최신 문서 추가
        - 데이터가 없는 경우, 전체 데이터를 가져옴
        - 기존 데이터가 있는 경우, 없는 데이터만 추가
        """
        size = self.MAX_REQ_SIZE
        
        if not os.path.isfile(self.DOCUMENTS_PATH):
            # 데이터가 없는 경우
            docs = self._reqDocs(size)
            docs.to_csv(self.DOCUMENTS_PATH, sep=",", index=False)
        else:
            # 기존 데이터가 있는 경우
            num_new_docs = 0
            docs = pd.read_csv(self.DOCUMENTS_PATH, delimiter=',')
            total = self._getTotal()
            total_docs = len(docs)
            new_docs_num = total - total_docs
            new_docs = self._reqDocs(size, total_docs // size)
            
            # _id가 기존 데이터에 존재하지 않는 경우에만 추가
            docs = docs.append(new_docs[~new_docs[KEYS.ID.value].isin(docs[KEYS.ID.value])])
            docs.to_csv(self.DOCUMENTS_PATH, sep=",", index=False)
            
            if total_docs == len(docs):
                print('> 문서가 최신 상태입니다.')
            else:
                print(f'> 신규 문서 {len(docs) - total_docs}개 추가')
    
    def syncDocLabel(self, old_document_path, sep, override=False):
        """
        기존 라벨링한 데이터를 신규 문서에 반영
        - title, link 기준으로 일치하는 문서 검색
        """
        
        document = pd.read_csv(self.DOCUMENTS_PATH, delimiter=',')
        old_document = pd.read_csv(old_document_path, delimiter=sep)
        self._preprocessing(old_document, joinTags=False)
        for index, row in old_document.iterrows():
            link = row.link
            title = row.title
            label = int(row.label)
            if not len(document.loc[document.title.str.strip() == title.strip()]) and not len(document.loc[document.link == link]):
                print(f'not found : {row.title}')
            elif len(document.loc[document.title.str.strip() == title.strip()]):
                document.loc[document.title.str.strip() == title.strip(), KEYS.LABEL.value] = label
            elif len(document.loc[document.link == link]):
                document.loc[document.link == link, KEYS.LABEL.value] = label
        
        # save synchronized document
        if override:
            document.to_csv(self.DOCUMENTS_PATH, sep=",", index=False)
        print('done')

Overwriting ./document.py


## word_vector.py
- [FastText wiki 한국어 데이터](https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ko.300.bin.gz)

In [3]:
%%writefile ./word_vector.py
import os
import numpy as np
from util import downloadByURL
from gensim.models import FastText, fasttext # 둘이 다름 주의!

"""
FastText base word embedding
"""
class WordVector():
    
    def __init__(self):
        # corpus
        self.WIKI_KO_DATA = './data/cc.ko.300.bin.gz'
        self.WIKI_KO_DATA_URL = 'https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ko.300.bin.gz'

        # pretrained model
        self.WIKI_KO_MODEL_PATH = f'./wv_model/ko.wiki'

    def getCustomModel(self, text, size=4, window=3, min_count=1, epochs=10):
        """
        FastText 기반 모델 학습
        """
        model = FastText(size=size, window=window, min_count=min_count)
        model.build_vocab(sentences=text)
        model.train(sentences=text, total_examples=len(text), epochs=epochs)
        return model
    
    def getWikiModel(self):
        """
        위키 한국어 데이터 기반 모델 학습
        """
        model = None
        if not os.path.isfile(self.WIKI_KO_MODEL_PATH):
            print('학습된 모델이 없습니다.')
            
            if not os.path.isfile(self.WIKI_KO_DATA):
                print('모델 학습에 필요한 데이터를 다운로드를 시작합니다.')
                downloadByURL(self.WIKI_KO_DATA_URL, self.WIKI_KO_DATA)
            
            print('모델 학습을 시작합니다.')
            model = fasttext.load_facebook_model(self.WIKI_KO_DATA)
            model.save(self.WIKI_KO_MODEL_PATH)
            
        else:
            model = FastText.load(self.WIKI_KO_MODEL_PATH)
        
        print(f'vocab size : {len(model.wv.vocab)}')
        return model
    
    def getSimilarWords(self, wv_model, word, topn=5):
        """
        유사단어 조회
        """
        return wv_model.wv.similar_by_word(word, topn)
    
    def vectorization(self, wv_model, text, embedding_dim=300):
        """
        주어진 문장을 단어별로 벡터화한 뒤 평균값을 문장의 벡터로 반환
        embedding_dim : wv_model vector의 차원 수 (wiki 기반 fasttext는 300차원)
        """
        words = text.split(' ')
        words_num = len(words)
        
        # model dimension (wiki festtext의 경우 300)
        vector = np.zeros(embedding_dim)
        for word in words:
            vector += wv_model[word]
        return vector/words_num


Overwriting ./word_vector.py


## main.py

In [2]:
# %%writefile ./main.py
from util import oneHotEncoding
from document import Document
from analysis import Analysis
from word_vector import WordVector

def ready_data():
    doc = Document()
    wv = WordVector()
    wv_model = wv.getWikiModel()
    
    embedding_dim = 300
    
    # 라벨링 된 데이터만 가져오기
    data = doc.getDocs(True) 
    
    # 전체 데이터에 대해서 분석
    # Analysis(doc.getDocs(labeled_only=False))
    
    # 임베딩 테스트
    # wv.getSimilarWords(wv_model, '파이썬', 5)
    
    # one hot encoding
    data.label = data.label.apply(lambda x: oneHotEncoding(x, 2))
    
    # vectorization
    data['vector'] = data.text.apply(lambda x: wv.vectorization(wv_model, x, embedding_dim))
    return data

In [3]:
data = ready_data()

vocab size : 2000000


## classifier.py

In [7]:
# %%writefile ./classifier.py
import numpy as np
import matplotlib.pyplot as plt
from util import reshape
from keras import backend as K
from tensorflow.keras.models import Sequential
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import SimpleRNN, Embedding, Dense

class Classifier():
    
    def __init__(self):
        return
    
    def _dataSeperator(self, data, embedding_dim):
        X_train, X_test, y_train, y_test = train_test_split(data.vector,
                                                            data.label,
                                                            test_size=0.33,
                                                            random_state=321)
        X_train = reshape(X_train, embedding_dim)
        X_test = reshape(X_test, embedding_dim)
        return X_train, X_test, y_train, y_test
        
    def train(self,
              data,
              embedding_dim=300,
              epochs=75,
              batch_size=100,
              validation_split=0.3,
              verbose=0):
        # seperate data
        X_train, X_test, y_train, y_test = self._dataSeperator(data, embedding_dim)
        print(type(X_train), type(X_test), type(y_train), type(y_test))
        
        # layer
        K.clear_session()
        model = Sequential()
        model.add(Dense(embedding_dim, input_shape=(X_train.shape[1], 1), activation='relu'))
        model.add(SimpleRNN(32))
        model.add(Dense(1, activation='sigmoid'))

        model.compile(optimizer='rmsprop',
                      loss='binary_crossentropy',
                      metrics=['acc',
                               self.f1_m,
                               self.precision_m,
                               self.recall_m])

        history = model.fit(X_train,
                            np.asarray(y_train),
                            epochs=epochs,
                            batch_size=batch_size,
                            validation_split=validation_split)
        
        loss, accuracy, f1_score, precision, recall = model.evaluate(X_test, np.asarray(y_test), verbose=verbose)
        print(f'loss : {loss}')
        print(f'accuracy : {accuracy}')
        print(f'f1_score : {f1_score}')
        print(f'precision : {precision}')
        print(f' recall : {recall}')
        
    def showHistory(self, history):
        fig, loss_ax = plt.subplots()
        acc_ax = loss_ax.twinx()

        acc_ax.plot(history.history['acc'], 'b', label='train acc')
        acc_ax.plot(history.history['val_acc'], 'g', label='val acc')
        acc_ax.set_ylabel('accuracy')
        acc_ax.legend(loc='upper left')

        plt.show()
    
    def predict(self):
        return
    
    def recall_m(self, y_true, y_pred):
        true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
        recall = true_positives / (possible_positives + K.epsilon())
        return recall

    def precision_m(self, y_true, y_pred):
            true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
            predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
            precision = true_positives / (predicted_positives + K.epsilon())
            return precision

    def f1_m(self, y_true, y_pred):
        precision = self.precision_m(y_true, y_pred)
        recall = self.recall_m(y_true, y_pred)
        return 2 * ((precision * recall)/(precision + recall + K.epsilon()))

In [8]:
cf = Classifier()

In [9]:
cf.train(data,
         embedding_dim=300,
         epochs=2,
         batch_size=100,
         validation_split=0.3,
         verbose=0)

<class 'numpy.ndarray'> <class 'numpy.ndarray'> <class 'pandas.core.series.Series'> <class 'pandas.core.series.Series'>


ValueError: Failed to convert a NumPy array to a Tensor (Unsupported object type list).

In [11]:
X_train.shape

(6955, 300, 1)