In [2]:
import pandas as pd
import numpy as np
import re

df = pd.read_csv('petition.csv', parse_dates=['start', 'end'])

p = r'.*(P2P|은행|금융|주식|증권|공매도).*'
finance = df[df['title'].str.match(p) |
           df['content'].str.match(p, flags=re.MULTILINE)]
finance.shape

(20619, 8)

In [32]:
def preprocessing(text):
    # 개행문자 제거
    text = re.sub('\\\\n', ' ', text)
    # 특수문자 제거
    # 특수문자나 이모티콘 등은 때로는 의미를 갖기도 하지만 여기에서는 제거했습니다.
    # text = re.sub('[?.,;:|\)*~`’!^\-_+<>@\#$%&-=#}※]', '', text)
    # 한글, 영문, 숫자만 남기고 모두 제거하도록 합니다.
    # text = re.sub('[^가-힣ㄱ-ㅎㅏ-ㅣa-zA-Z0-9]', ' ', text)
    # 한글, 영문만 남기고 모두 제거하도록 합니다.
    text = re.sub('[^가-힣ㄱ-ㅎㅏ-ㅣa-zA-Z]', ' ', text)
    return text

In [33]:
def remove_stopwords(text):
    stops = ['수', '있는', '있습니다', '그', '년도', '에', '합니다', 
             '하는', '및', '제', '할', '하고', '더', '대한', '한', 
             '그리고', '월', '저는', '없는', '것입니다', '등', '일', 
             '많은', '이런', '것은', '왜', '같은', 
             '없습니다', '위해', '한다']
    meaningful_words = [w for w in text if not w in stops]
    return ''.join(meaningful_words)

In [34]:
finance['content_preprocessing'] = finance['content'].apply(preprocessing)
finance['content_preprocessing'] = finance['content_preprocessing'].apply(remove_stopwords)
sentences = finance['content_preprocessing']
sentences.shape

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


(20619,)

In [35]:
from konlpy.tag import Mecab

tokenizer = Mecab()

tokens = sentences.apply(tokenizer.morphs)
tokens.shape

(20619,)

In [36]:
from gensim.models import Word2Vec

model = Word2Vec(tokens, min_count=2)

In [37]:
model_name = 'myFirstWord2Vec'
model.save(model_name)

In [38]:
print(f'How many words : {len(model.wv.vocab)}')

How many words : 45614


In [40]:
model.wv['주식'].shape

(100,)

In [41]:
model.wv['주식']

array([ 2.3918262e+00,  1.7614355e+00, -9.3036705e-01,  1.6763264e-01,
       -2.0786109e+00,  1.3726033e+00, -2.4452315e-01,  2.2072434e+00,
        8.5448869e-02,  1.8817682e+00, -2.6107504e+00, -2.8198178e+00,
        1.2302750e+00, -2.3065550e+00, -1.3445963e+00,  1.0059332e+00,
        7.7189493e-01, -5.7391918e-01,  1.6814463e+00,  6.4651412e-01,
       -9.5726478e-01,  3.7930554e-01,  1.7085050e+00,  1.0098234e+00,
        2.7693987e+00, -2.8526764e+00, -6.3369548e-01, -1.0701298e+00,
       -1.6354696e+00,  6.2112623e-01, -8.2971275e-01,  6.4071447e-01,
        1.3107079e+00, -1.3701034e+00, -1.4352335e-01,  1.5358517e+00,
       -1.9609819e-01, -1.6610689e+00,  5.8598530e-01,  2.5405486e+00,
        4.4289160e+00,  1.3315256e+00,  2.4916465e+00,  7.2838467e-01,
       -5.3586406e-01,  2.2291210e+00, -2.8378940e+00,  4.0778880e+00,
        8.2217228e-01,  1.8327168e-01, -6.5454990e-01, -2.4170539e+00,
       -2.3304131e+00,  3.2586048e+00, -1.0530858e+00, -4.2208109e+00,
      

In [42]:
model.wv.most_similar('주식')

[('코스닥', 0.7152674794197083),
 ('코인', 0.6482715010643005),
 ('증권', 0.6475867033004761),
 ('종목', 0.5747853517532349),
 ('유가증권', 0.5464624166488647),
 ('주가', 0.5442466735839844),
 ('현물', 0.5314433574676514),
 ('갭', 0.5297795534133911),
 ('개미', 0.52653568983078),
 ('연기금', 0.5231413841247559)]

In [43]:
model.wv.most_similar(positive=['주식', '증권'], negative=['현금'])

[('코스닥', 0.6901332139968872),
 ('오세훈', 0.5590704679489136),
 ('삼성증권', 0.5430185794830322),
 ('유가증권', 0.5267007350921631),
 ('박원순', 0.5166797637939453),
 ('조성자', 0.5089632272720337),
 ('주직', 0.5025186538696289),
 ('은미', 0.5014826655387878),
 ('ㅉㅉㅉ', 0.49340736865997314),
 ('연기금', 0.48443490266799927)]

In [44]:
model.wv.similarity('삼성증권', '이재용')

0.3414917

## 시각화

In [45]:
from sklearn.manifold import TSNE
import matplotlib as mpl
import matplotlib.pyplot as plt
import gensim 
import gensim.models as g

# 그래프에서 마이너스 폰트 깨지는 문제에 대한 대처
mpl.rcParams['axes.unicode_minus'] = False

model_name = 'myFirstWord2Vec'
model = g.Doc2Vec.load(model_name)

In [46]:
# Doc2Vec으로 만든 모델을 가져옴
vocab = list(model.wv.vocab)
# 모델의 단어를 피처로 지정해 준다.
X = model[vocab]

print('Feature의 길이 {}'.format(len(X)))
print(X[0][:10])
tsne = TSNE(n_components=2)

# 모든 단어를 그래프에 출력하면 글자가 너무 많이 겹치기 때문에 일부 단어에 대해서만 시각화
X_tsne = tsne.fit_transform(X[:150,:])

  after removing the cwd from sys.path.


Feature의 길이 45614
[-0.88079286  0.955231   -2.0075145   1.0422472   2.2242877   1.3103964
  1.7685103   1.9248633   1.947339    1.6871921 ]


In [48]:
df = pd.DataFrame(X_tsne, index=vocab[:150], columns=['x', 'y'])
df.shape

(150, 2)