In [3]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import re

In [2]:
petition = pd.read_csv('petition.csv', parse_dates=['start','end'], index_col=0)
petition.head(3)

Unnamed: 0_level_0,start,end,answered,votes,category,title,content
article_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
21,2017-08-19,2017-11-17,0,9,안전/환경,스텔라 데이지호에 대한 제안입니다.,스텔라 데이지호에 대한 제안입니다.\n3월31일 스텔라 데이지호가 침몰하고 5달째가...
22,2017-08-19,2017-11-17,0,17,기타,비리제보처를 만들어주세요.,현 정부에 국민들이 가장 원하는 것은 부패척결입니다. 우리 사회에 각종 비리들이 ...
23,2017-08-19,2017-09-03,0,0,미래,제2의 개성공단,"만일 하시는 대통령님 및 각 부처 장관님,주무관님들 안녕하세요!!\n전남 목포에서 ..."


In [5]:
p = r'.*(P2P|은행|금융|주식|증권|공매도).*'
finance = petition[petition['title'].str.match(p) |
           petition['content'].str.match(p, flags=re.MULTILINE)]
finance.shape

(20619, 7)

In [6]:
def preprocessing(text):
    # 개행문자 제거
    text = re.sub('\\\\n', ' ', text)
    # 특수문자 제거
    # text = re.sub('[?.,;:|\)*~`’!^\-_+<>@\#$%&-=#}※]', '', text)
    # 한글, 영문, 숫자만 남기고 모두 제거하도록 합니다.
    # text = re.sub('[^가-힣ㄱ-ㅎㅏ-ㅣa-zA-Z0-9]', ' ', text)
    # 한글, 영문만 남기고 모두 제거하도록 합니다.
    text = re.sub('[^가-힣ㄱ-ㅎㅏ-ㅣa-zA-Z]', ' ', text)
    # 공백이 여러개 있는 것을 하나로 줄여준다.
    text = re.sub(' +', ' ', text)
    return text

In [7]:
def remove_stopwords(text):
    stops = ['수', '있는', '있습니다', '그', '년도', '에', '합니다', 
             '하는', '및', '제', '할', '하고', '더', '대한', '한', 
             '그리고', '월', '저는', '없는', '것입니다', '등', '일', 
             '많은', '이런', '것은', '왜', '같은', 
             '없습니다', '위해', '한다']
    meaningful_words = [w for w in text if not w in stops]
    return ''.join(meaningful_words)

In [9]:
finance['content_preprocessing'] = finance['content'].apply(preprocessing)
finance['content_preprocessing'] = finance['content_preprocessing'].apply(remove_stopwords)
sentences = finance['content_preprocessing']
sentences.shape

(20619,)

## BOW (Bag of Words)

In [10]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(analyzer = 'word', # 캐릭터 단위로 벡터화 할 수도 있습니다.
                             tokenizer = None, # 토크나이저를 따로 지정해 줄 수도 있습니다.
                             preprocessor = None, # 전처리 도구
                             stop_words = None, # 불용어 nltk등의 도구를 사용할 수도 있습니다.
                             min_df = 2, # 토큰이 나타날 최소 문서 개수로 오타나 자주 나오지 않는 특수한 전문용어 제거에 좋다. 
                             ngram_range=(1, 3), # BOW의 단위를 1~3개로 지정합니다.
                             max_features = 1000 # 만들 피처의 수, 단어의 수가 된다.
                            )
vectorizer

CountVectorizer(max_features=1000, min_df=2, ngram_range=(1, 3))

In [11]:
feature_vector = vectorizer.fit_transform(finance['content_preprocessing'])
feature_vector.shape

(20619, 1000)

In [14]:
# 각 row에서 전체 단어가방에 있는 어휘에서 등장하는 단어에 대한 one-hot-vector
vocab = vectorizer.get_feature_names()
pd.DataFrame(feature_vector[:10].toarray(), columns=vocab).head()

Unnamed: 0,articleview,articleview html,articleview html idxno,co,co kr,co kr news,com,html,html idxno,http,...,회사가,회사는,회사를,회사의,회장,훨씬,희망을,힘든,힘들게,힘을
0,0,0,0,0,0,0,0,0,0,0,...,2,1,0,7,0,1,1,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,1,1,1,0,0,0,1,1,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## TF-IDF

In [15]:
from sklearn.feature_extraction.text import TfidfTransformer
transformer = TfidfTransformer(smooth_idf=False)
transformer

TfidfTransformer(smooth_idf=False)

In [16]:
feature_tfidf = transformer.fit_transform(feature_vector)
feature_tfidf.shape

(20619, 1000)

In [17]:
pd.DataFrame(feature_tfidf[:10].toarray(), columns=vocab).head()

Unnamed: 0,articleview,articleview html,articleview html idxno,co,co kr,co kr news,com,html,html idxno,http,...,회사가,회사는,회사를,회사의,회장,훨씬,희망을,힘든,힘들게,힘을
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.07372,0.040718,0.0,0.262616,0.0,0.039119,0.038407,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.17182,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.074964,0.071954,0.06907,0.0,0.0,0.0,0.065831,0.068161,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


이를 통해 
- 예측 모델링의 변수로 이용할 수 있고 (예를 들어, 해당 문서에 답글이 달렸는지, 긍부정)
- 각 문서끼리의 유사성
등등을 할 수 있다.

In [19]:
tmp = pd.DataFrame(feature_tfidf[:100].toarray(), columns=vocab)

## Cosine similarity

In [28]:
from numpy.linalg import norm

In [31]:
def cos_sim(A, B):
       return np.sum((A*B)/(norm(A)*norm(B)))

In [32]:
cos_sim(tmp.iloc[0,:], tmp.iloc[1,:])

0.17921459782955934

In [66]:
def cosRecom(x):
    sim = []
    for i in range(len(tmp)):
        sim.append(cos_sim(tmp.iloc[i,:],tmp.iloc[x,:]))
    return sim

In [86]:
sim = cosRecom(0)
sim[:5]

[1.0000000000000002,
 0.17921459782955934,
 0.19334289325928156,
 0.18503145292476175,
 0.1108967599964287]